diff --git a/.travis.yml b/.travis.yml index 37a40eb2..f8460d1e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,9 +9,9 @@ os: - osx install: - - wget 'https://googletest.googlecode.com/files/gtest-1.6.0.zip' - - unzip gtest-1.6.0.zip - - ln -s gtest-1.6.0 gtest + - wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip' + - unzip gtest-1.7.0.zip + - ln -s gtest-1.7.0 gtest - sudo pip install BeautifulSoup - sudo pip install html5lib==0.95 - ln -s `python -c 'import html5lib, os; print os.path.dirname(html5lib.__file__)'`/tests/testdata . diff --git a/DEBUGGING.md b/DEBUGGING.md index 262ba1f1..8b8a56df 100644 --- a/DEBUGGING.md +++ b/DEBUGGING.md @@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core The same goes for core dumps in other example binaries. +To run only a single unit test, pass the --gtest_filter='TestName' flag to the +lt-gumbo_test binary. + Assertions ========== diff --git a/Makefile.am b/Makefile.am index 59cc2bb1..ea914116 100644 --- a/Makefile.am +++ b/Makefile.am @@ -37,6 +37,15 @@ clean-local: endif !HAVE_SHARED_LIBGTEST +src/tag_strings.h: src/tag.in + @sed 's/\(.*\)/"\1",/g' <$< >$@ + +src/tag_enum.h: src/tag.in + @sed 's/\(.*\)/GUMBO_TAG_\U\1,/g;s/-/_/g' <$< >$@ + +python/gumbo/gumboc_tags.py: src/tag.in + @sed -e '1i TagNames = [' -e 's/\(.*\)/\t"\U\1",/g' -e 's/-/_/g' -e "\$$a]" <$< >$@ + lib_LTLIBRARIES = libgumbo.la libgumbo_la_CFLAGS = -Wall libgumbo_la_LDFLAGS = -version-info 1:0:0 -no-undefined @@ -55,6 +64,8 @@ libgumbo_la_SOURCES = \ src/string_piece.c \ src/string_piece.h \ src/tag.c \ + src/tag_enum.h \ + src/tag_strings.h \ src/token_type.h \ src/tokenizer.c \ src/tokenizer.h \ diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 9c2c1c86..31d2ab42 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -62,7 +62,7 @@ int main(int argc, char** argv) { clock_t start_time = clock(); for (int i = 0; i < kNumReps; ++i) { GumboOutput* output = gumbo_parse(contents.c_str()); - 
gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); } clock_t end_time = clock(); std::cout << filename << ": " diff --git a/examples/clean_text.cc b/examples/clean_text.cc index 25113118..2e01b080 100644 --- a/examples/clean_text.cc +++ b/examples/clean_text.cc @@ -66,5 +66,5 @@ int main(int argc, char** argv) { GumboOutput* output = gumbo_parse(contents.c_str()); std::cout << cleantext(output->root) << std::endl; - gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); } diff --git a/examples/find_links.cc b/examples/find_links.cc index d84231d3..c1b56e7a 100644 --- a/examples/find_links.cc +++ b/examples/find_links.cc @@ -62,5 +62,5 @@ int main(int argc, char** argv) { GumboOutput* output = gumbo_parse(contents.c_str()); search_for_links(output->root); - gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); } diff --git a/examples/get_title.c b/examples/get_title.c index 15f2e294..e6dcdece 100644 --- a/examples/get_title.c +++ b/examples/get_title.c @@ -88,6 +88,6 @@ int main(int argc, const char** argv) { &kGumboDefaultOptions, input, input_length); const char* title = find_title(output->root); printf("%s\n", title); - gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); free(input); } diff --git a/examples/positions_of_class.cc b/examples/positions_of_class.cc index 646ea23b..01ba4f7e 100644 --- a/examples/positions_of_class.cc +++ b/examples/positions_of_class.cc @@ -88,5 +88,5 @@ int main(int argc, char** argv) { GumboOutput* output = gumbo_parse_with_options( &kGumboDefaultOptions, contents.data(), contents.length()); search_for_class(output->root, contents, cls); - gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); } diff --git a/examples/prettyprint.cc b/examples/prettyprint.cc index 95c6eccc..02e59afa 100644 --- a/examples/prettyprint.cc +++ b/examples/prettyprint.cc @@ -210,7 +210,7 @@ static 
std::string prettyprint_contents(GumboNode* node, int lvl, const std::str contents.append(val); - } else if (child->type == GUMBO_NODE_ELEMENT) { + } else if ((child->type == GUMBO_NODE_ELEMENT) || (child->type == GUMBO_NODE_TEMPLATE)) { std::string val = prettyprint(child, lvl, indent_chars); @@ -351,5 +351,5 @@ int main(int argc, char** argv) { GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length()); std::string indent_chars = " "; std::cout << prettyprint(output->document, 0, indent_chars) << std::endl; - gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); } diff --git a/examples/serialize.cc b/examples/serialize.cc index b6da9fa7..d1a41611 100644 --- a/examples/serialize.cc +++ b/examples/serialize.cc @@ -188,7 +188,7 @@ static std::string serialize_contents(GumboNode* node) { contents.append(substitute_xml_entities_into_text(std::string(child->v.text.text))); } - } else if (child->type == GUMBO_NODE_ELEMENT) { + } else if (child->type == GUMBO_NODE_ELEMENT || child->type == GUMBO_NODE_TEMPLATE) { contents.append(serialize(child)); } else if (child->type == GUMBO_NODE_WHITESPACE) { @@ -283,5 +283,5 @@ int main(int argc, char** argv) { GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length()); std::cout << serialize(output->document) << std::endl; - gumbo_destroy_output(&kGumboDefaultOptions, output); + gumbo_destroy_output(output); } diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index d377d58c..b2e40abd 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -26,6 +26,7 @@ import contextlib import ctypes import os.path +import gumboc_tags _name_of_lib = 'libgumbo.so' if sys.platform.startswith('darwin'): @@ -246,158 +247,12 @@ def to_url(self): class Tag(Enum): - _values_ = [ - 'HTML', - 'HEAD', - 'TITLE', - 'BASE', - 'LINK', - 'META', - 'STYLE', - 'SCRIPT', - 'NOSCRIPT', - 'TEMPLATE', - 'BODY', - 'ARTICLE', - 'SECTION', - 'NAV', - 
'ASIDE', - 'H1', - 'H2', - 'H3', - 'H4', - 'H5', - 'H6', - 'HGROUP', - 'HEADER', - 'FOOTER', - 'ADDRESS', - 'P', - 'HR', - 'PRE', - 'BLOCKQUOTE', - 'OL', - 'UL', - 'LI', - 'DL', - 'DT', - 'DD', - 'FIGURE', - 'FIGCAPTION', - 'MAIN', - 'DIV', - 'A', - 'EM', - 'STRONG', - 'SMALL', - 'S', - 'CITE', - 'Q', - 'DFN', - 'ABBR', - 'DATA', - 'TIME', - 'CODE', - 'VAR', - 'SAMP', - 'KBD', - 'SUB', - 'SUP', - 'I', - 'B', - 'U', - 'MARK', - 'RUBY', - 'RT', - 'RP', - 'BDI', - 'BDO', - 'SPAN', - 'BR', - 'WBR', - 'INS', - 'DEL', - 'IMAGE', - 'IMG', - 'IFRAME', - 'EMBED', - 'OBJECT', - 'PARAM', - 'VIDEO', - 'AUDIO', - 'SOURCE', - 'TRACK', - 'CANVAS', - 'MAP', - 'AREA', - 'MATH', - 'MI', - 'MO', - 'MN', - 'MS', - 'MTEXT', - 'MGLYPH', - 'MALIGNMARK', - 'ANNOTATION_XML', - 'SVG', - 'FOREIGNOBJECT', - 'DESC', - 'TABLE', - 'CAPTION', - 'COLGROUP', - 'COL', - 'TBODY', - 'THEAD', - 'TFOOT', - 'TR', - 'TD', - 'TH', - 'FORM', - 'FIELDSET', - 'LEGEND', - 'LABEL', - 'INPUT', - 'BUTTON', - 'SELECT', - 'DATALIST', - 'OPTGROUP', - 'OPTION', - 'TEXTAREA', - 'KEYGEN', - 'OUTPUT', - 'PROGRESS', - 'METER', - 'DETAILS', - 'SUMMARY', - 'MENU', - 'MENUITEM', - 'APPLET', - 'ACRONYM', - 'BGSOUND', - 'DIR', - 'FRAME', - 'FRAMESET', - 'NOFRAMES', - 'ISINDEX', - 'LISTING', - 'XMP', - 'NEXTID', - 'NOEMBED', - 'PLAINTEXT', - 'RB', - 'STRIKE', - 'BASEFONT', - 'BIG', - 'BLINK', - 'CENTER', - 'FONT', - 'MARQUEE', - 'MULTICOL', - 'NOBR', - 'SPACER', - 'TT', - 'UNKNOWN', - ] + @staticmethod + def from_str(tagname): + text_ptr = ctypes.c_char_p(tagname.encode('utf-8')) + return _tag_enum(text_ptr) + + _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST'] class Element(ctypes.Structure): @@ -444,7 +299,8 @@ def __repr__(self): class NodeType(Enum): - _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA', 'COMMENT', 'WHITESPACE'] + _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA', + 'COMMENT', 'WHITESPACE', 'TEMPLATE'] class NodeUnion(ctypes.Union): @@ -463,7 +319,7 @@ def _contents(self): # __getattr__, so we factor 
it out to a helper. if self.type == NodeType.DOCUMENT: return self.v.document - elif self.type == NodeType.ELEMENT: + elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE): return self.v.element else: return self.v.text @@ -496,11 +352,6 @@ def __repr__(self): class Options(ctypes.Structure): _fields_ = [ - # TODO(jdtang): Allow the Python API to set the allocator/deallocator - # function. Right now these are treated as opaque void pointers. - ('allocator', ctypes.c_void_p), - ('deallocator', ctypes.c_void_p), - ('userdata', ctypes.c_void_p), ('tab_stop', ctypes.c_int), ('stop_on_first_error', ctypes.c_bool), ('max_errors', ctypes.c_int), @@ -515,10 +366,10 @@ class Output(ctypes.Structure): ('errors', Vector), ] - @contextlib.contextmanager def parse(text, **kwargs): options = Options() + container = kwargs.get("inner_html", Tag.LAST) for field_name, _ in Options._fields_: try: setattr(options, field_name, kwargs[field_name]) @@ -529,11 +380,11 @@ def parse(text, **kwargs): # call, it creates a temporary buffer which is destroyed when the call # completes, and then the original_text pointers point into invalid memory. 
text_ptr = ctypes.c_char_p(text.encode('utf-8')) - output = _parse_with_options(ctypes.byref(options), text_ptr, len(text)) + output = _parse_fragment(ctypes.byref(options), text_ptr, len(text), container) try: yield output finally: - _destroy_output(ctypes.byref(options), output) + _destroy_output(output) _DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions') @@ -541,6 +392,10 @@ def parse(text, **kwargs): _parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t] _parse_with_options.restype = _Ptr(Output) +_parse_fragment = _dll.gumbo_parse_fragment +_parse_fragment.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag] +_parse_fragment.restype = _Ptr(Output) + _tag_from_original_text = _dll.gumbo_tag_from_original_text _tag_from_original_text.argtypes = [_Ptr(StringPiece)] _tag_from_original_text.restype = None @@ -550,13 +405,17 @@ def parse(text, **kwargs): _normalize_svg_tagname.restype = ctypes.c_char_p _destroy_output = _dll.gumbo_destroy_output -_destroy_output.argtypes = [_Ptr(Options), _Ptr(Output)] +_destroy_output.argtypes = [_Ptr(Output)] _destroy_output.restype = None _tagname = _dll.gumbo_normalized_tagname _tagname.argtypes = [Tag] _tagname.restype = ctypes.c_char_p +_tag_enum = _dll.gumbo_tag_enum +_tag_enum.argtypes = [ctypes.c_char_p] +_tag_enum.restype = Tag + __all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute', 'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document', 'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node', diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py new file mode 100644 index 00000000..c715b8ad --- /dev/null +++ b/python/gumbo/gumboc_tags.py @@ -0,0 +1,152 @@ +TagNames = [ + "HTML", + "HEAD", + "TITLE", + "BASE", + "LINK", + "META", + "STYLE", + "SCRIPT", + "NOSCRIPT", + "TEMPLATE", + "BODY", + "ARTICLE", + "SECTION", + "NAV", + "ASIDE", + "H1", + "H2", + "H3", + "H4", + "H5", + "H6", + "HGROUP", + "HEADER", + "FOOTER", 
+ "ADDRESS", + "P", + "HR", + "PRE", + "BLOCKQUOTE", + "OL", + "UL", + "LI", + "DL", + "DT", + "DD", + "FIGURE", + "FIGCAPTION", + "MAIN", + "DIV", + "A", + "EM", + "STRONG", + "SMALL", + "S", + "CITE", + "Q", + "DFN", + "ABBR", + "DATA", + "TIME", + "CODE", + "VAR", + "SAMP", + "KBD", + "SUB", + "SUP", + "I", + "B", + "U", + "MARK", + "RUBY", + "RT", + "RP", + "BDI", + "BDO", + "SPAN", + "BR", + "WBR", + "INS", + "DEL", + "IMAGE", + "IMG", + "IFRAME", + "EMBED", + "OBJECT", + "PARAM", + "VIDEO", + "AUDIO", + "SOURCE", + "TRACK", + "CANVAS", + "MAP", + "AREA", + "MATH", + "MI", + "MO", + "MN", + "MS", + "MTEXT", + "MGLYPH", + "MALIGNMARK", + "ANNOTATION_XML", + "SVG", + "FOREIGNOBJECT", + "DESC", + "TABLE", + "CAPTION", + "COLGROUP", + "COL", + "TBODY", + "THEAD", + "TFOOT", + "TR", + "TD", + "TH", + "FORM", + "FIELDSET", + "LEGEND", + "LABEL", + "INPUT", + "BUTTON", + "SELECT", + "DATALIST", + "OPTGROUP", + "OPTION", + "TEXTAREA", + "KEYGEN", + "OUTPUT", + "PROGRESS", + "METER", + "DETAILS", + "SUMMARY", + "MENU", + "MENUITEM", + "APPLET", + "ACRONYM", + "BGSOUND", + "DIR", + "FRAME", + "FRAMESET", + "NOFRAMES", + "ISINDEX", + "LISTING", + "XMP", + "NEXTID", + "NOEMBED", + "PLAINTEXT", + "RB", + "STRIKE", + "BASEFONT", + "BIG", + "BLINK", + "CENTER", + "FONT", + "MARQUEE", + "MULTICOL", + "NOBR", + "SPACER", + "TT", + "RTC", +] diff --git a/python/gumbo/html5lib_adapter.py b/python/gumbo/html5lib_adapter.py index 2a968640..54d4fc17 100644 --- a/python/gumbo/html5lib_adapter.py +++ b/python/gumbo/html5lib_adapter.py @@ -58,7 +58,7 @@ def maybe_namespace(attr): def _convert_element(source_node): - if source_node.type != gumboc.NodeType.ELEMENT: + if source_node.type not in ( gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE): # If-statement instead of assert so it runs with -O raise AssertionError( '_convert_element only works with elements; found %r' % @@ -70,12 +70,12 @@ def _convert_element(source_node): } -def _insert_root(treebuilder, source_node): +def 
_insert_root(treebuilder, source_node, pop_element = True): treebuilder.insertRoot(_convert_element(source_node)) for child_node in source_node.children: _insert_node(treebuilder, child_node) - treebuilder.openElements.pop() - + if pop_element: + treebuilder.openElements.pop() def _insert_node(treebuilder, source_node): assert source_node.type != gumboc.NodeType.DOCUMENT @@ -110,8 +110,24 @@ def parse(self, text_or_file, **kwargs): if node.type == gumboc.NodeType.COMMENT: self.tree.insertComment({'data': node.v.text.text.decode('utf-8')}, self.tree.document) - elif node.type == gumboc.NodeType.ELEMENT: + elif node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE): _insert_root(self.tree, output.contents.root.contents) else: assert 'Only comments and nodes allowed at the root' return self.tree.getDocument() + + def parseFragment(self, text_or_file, inner_html, **kwargs): + try: + text = text_or_file.read() + except AttributeError: + # Assume a string. + text = text_or_file + inner_html = gumboc.Tag.from_str(inner_html) + + with gumboc.parse(text, inner_html=inner_html, **kwargs) as output: + for node in output.contents.document.contents.children: + if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE): + _insert_root(self.tree, output.contents.root.contents, False) + else: + assert 'Malformed fragment parse (??)' + return self.tree.getFragment() diff --git a/python/gumbo/html5lib_adapter_test.py b/python/gumbo/html5lib_adapter_test.py index 2ab8c619..16908f92 100644 --- a/python/gumbo/html5lib_adapter_test.py +++ b/python/gumbo/html5lib_adapter_test.py @@ -91,6 +91,22 @@ def convertExpected(data, stripChars): rv.append(line) return "\n".join(rv) +def reformatTemplateContents(expected): + lines = expected.split('\n') + retval = [] + template_indents = [] + for line in lines: + indent = len(line) - len(line.strip()) + if 'content' in line: + template_indents.append(indent) + continue + elif template_indents and indent <= template_indents[-1]: + 
template_indents.pop() + elif template_indents: + line = line[2 * len(template_indents):] + retval.append(line) + return '\n'.join(retval) + class Html5libAdapterTest(unittest.TestCase): """Adapter between Gumbo and the html5lib tests. @@ -106,11 +122,11 @@ class Html5libAdapterTest(unittest.TestCase): def impl(self, inner_html, input, expected, errors): p = html5lib_adapter.HTMLParser( tree=TREEBUILDER(namespaceHTMLElements=True)) - if not inner_html: - # TODO(jdtang): Need to implement fragment parsing. - document = p.parse(StringIO.StringIO(input)) + + if inner_html: + document = p.parseFragment(StringIO.StringIO(input), inner_html) else: - return + document = p.parse(StringIO.StringIO(input)) with warnings.catch_warnings(): # Etree serializer in html5lib uses a deprecated getchildren() API. diff --git a/python/gumbo/soup_adapter.py b/python/gumbo/soup_adapter.py index 089f8918..9bfaed66 100644 --- a/python/gumbo/soup_adapter.py +++ b/python/gumbo/soup_adapter.py @@ -80,6 +80,7 @@ def add_text_internal(soup, element): _add_text(BeautifulSoup.CData), _add_text(BeautifulSoup.Comment), _add_text(BeautifulSoup.NavigableString), + _add_element, ] diff --git a/setup.py b/setup.py index 4f06749e..1c20cef5 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,22 @@ #!/usr/bin/env python +import sys from setuptools import setup from setuptools.command.sdist import sdist +_name_of_lib = 'libgumbo.so' +if sys.platform.startswith('darwin'): + _name_of_lib = 'libgumbo.dylib' +elif sys.platform.startswith('win'): + _name_of_lib = 'gumbo.dll' + class CustomSdistCommand(sdist): """Customized Sdist command, to copy libgumbo.so into the Python directory so that it can be installed with `pip install`.""" def run(self): try: import shutil - shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so') + shutil.copyfile('.libs/' + _name_of_lib, + 'python/gumbo/' + _name_of_lib) sdist.run(self) except IOError as e: print(e) @@ -172,6 +180,6 @@ def run(self): classifiers=CLASSIFIERS, 
packages=['gumbo'], package_dir={'': 'python'}, - package_data={'gumbo': ['libgumbo.so']}, + package_data={'gumbo': [_name_of_lib]}, cmdclass={ 'sdist': CustomSdistCommand }, zip_safe=False) diff --git a/src/attribute.c b/src/attribute.c index a008403d..9dc0eeed 100644 --- a/src/attribute.c +++ b/src/attribute.c @@ -22,6 +22,7 @@ #include #include "util.h" +#include "vector.h" struct GumboInternalParser; @@ -36,9 +37,38 @@ GumboAttribute* gumbo_get_attribute( return NULL; } -void gumbo_destroy_attribute( - struct GumboInternalParser* parser, GumboAttribute* attribute) { - gumbo_parser_deallocate(parser, (void*) attribute->name); - gumbo_parser_deallocate(parser, (void*) attribute->value); - gumbo_parser_deallocate(parser, (void*) attribute); +void gumbo_set_attribute_value(GumboAttribute *attr, const char *value) +{ + gumbo_free((void *)attr->value); + attr->value = gumbo_strdup(value); + attr->original_value = kGumboEmptyString; + attr->value_start = kGumboEmptySourcePosition; + attr->value_end = kGumboEmptySourcePosition; +} + +void gumbo_set_attribute( + GumboVector *attributes, const char *name, const char *value) +{ + GumboAttribute *attr = gumbo_get_attribute(attributes, name); + + if (!attr) { + attr = gumbo_malloc(sizeof(GumboAttribute)); + attr->value = NULL; + attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; + + attr->name = gumbo_strdup(name); + attr->original_name = kGumboEmptyString; + attr->name_start = kGumboEmptySourcePosition; + attr->name_end = kGumboEmptySourcePosition; + + gumbo_vector_add(attr, attributes); + } + + gumbo_set_attribute_value(attr, value); +} + +void gumbo_destroy_attribute(GumboAttribute* attribute) { + gumbo_free((void*) attribute->name); + gumbo_free((void*) attribute->value); + gumbo_free((void*) attribute); } diff --git a/src/attribute.h b/src/attribute.h index f9b8aea5..67dd12a0 100644 --- a/src/attribute.h +++ b/src/attribute.h @@ -25,10 +25,28 @@ extern "C" { struct GumboInternalParser; -// Release the memory used for an 
GumboAttribute, including the attribute -// itself. -void gumbo_destroy_attribute( - struct GumboInternalParser* parser, GumboAttribute* attribute); +/* + * Search for a GumboAttribute object in a vector of + * attributes. Matching is performed case-insensitively + */ +GumboAttribute* gumbo_get_attribute(const GumboVector* attributes, const char* name); + +/* + * Set the "value" of the given GumboAttribute object + */ +void gumbo_set_attribute_value(GumboAttribute *attr, const char *value); + +/* + * Add an attribute to an existing vector of attributes; + * if the attribute already exists, it will be updated in-place + */ +void gumbo_set_attribute(GumboVector *attributes, const char *name, const char *value); + +/* + * Release the memory used for an GumboAttribute, including the attribute + * itself. + */ +void gumbo_destroy_attribute(GumboAttribute* attribute); #ifdef __cplusplus } diff --git a/src/error.c b/src/error.c index 3239a0b6..a0274d28 100644 --- a/src/error.c +++ b/src/error.c @@ -32,8 +32,7 @@ static const size_t kMessageBufferSize = 256; // Prints a formatted message to a StringBuffer. This automatically resizes the // StringBuffer as necessary to fit the message. Returns the number of bytes // written. -static int print_message(GumboParser* parser, GumboStringBuffer* output, - const char* format, ...) { +static int print_message(GumboStringBuffer* output, const char* format, ...) { va_list args; va_start(args, format); int remaining_capacity = output->capacity - output->length; @@ -46,7 +45,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, // enough. In this case, we'll double the buffer size and hope it fits when // we retry (letting it fail and returning 0 if it doesn't), since there's // no way to smartly resize the buffer. 
- gumbo_string_buffer_reserve(parser, output->capacity * 2, output); + gumbo_string_buffer_reserve(output->capacity * 2, output); int result = vsnprintf(output->data + output->length, remaining_capacity, format, args); va_end(args); @@ -61,8 +60,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, #endif if (bytes_written > remaining_capacity) { - gumbo_string_buffer_reserve( - parser, output->capacity + bytes_written, output); + gumbo_string_buffer_reserve(output->capacity + bytes_written, output); remaining_capacity = output->capacity - output->length; bytes_written = vsnprintf(output->data + output->length, remaining_capacity, format, args); @@ -72,59 +70,58 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, return bytes_written; } -static void print_tag_stack( - GumboParser* parser, const GumboParserError* error, - GumboStringBuffer* output) { - print_message(parser, output, " Currently open tags: "); +static void print_tag_stack(const GumboParserError* error, GumboStringBuffer* output) { + print_message(output, " Currently open tags: "); for (int i = 0; i < error->tag_stack.length; ++i) { if (i) { - print_message(parser, output, ", "); + print_message(output, ", "); } GumboTag tag = (GumboTag) error->tag_stack.data[i]; - print_message(parser, output, gumbo_normalized_tagname(tag)); + print_message(output, gumbo_normalized_tagname(tag)); } - gumbo_string_buffer_append_codepoint(parser, '.', output); + gumbo_string_buffer_append_codepoint('.', output); } -static void handle_parser_error(GumboParser* parser, - const GumboParserError* error, - GumboStringBuffer* output) { +static void handle_parser_error( + const GumboParserError* error, + GumboStringBuffer* output) { if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL && error->input_type != GUMBO_TOKEN_DOCTYPE) { - print_message(parser, output, + print_message(output, "The doctype must be the first token in the document"); return; } switch (error->input_type) 
{ case GUMBO_TOKEN_DOCTYPE: - print_message(parser, output, "This is not a legal doctype"); + print_message(output, "This is not a legal doctype"); return; case GUMBO_TOKEN_COMMENT: // Should never happen; comments are always legal. assert(0); // But just in case... - print_message(parser, output, "Comments aren't legal here"); + print_message(output, "Comments aren't legal here"); return; + case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_WHITESPACE: case GUMBO_TOKEN_CHARACTER: - print_message(parser, output, "Character tokens aren't legal here"); + print_message(output, "Character tokens aren't legal here"); return; case GUMBO_TOKEN_NULL: - print_message(parser, output, "Null bytes are not allowed in HTML5"); + print_message(output, "Null bytes are not allowed in HTML5"); return; case GUMBO_TOKEN_EOF: if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) { - print_message(parser, output, "You must provide a doctype"); + print_message(output, "You must provide a doctype"); } else { - print_message(parser, output, "Premature end of file"); - print_tag_stack(parser, error, output); + print_message(output, "Premature end of file"); + print_tag_stack(error, output); } return; case GUMBO_TOKEN_START_TAG: case GUMBO_TOKEN_END_TAG: - print_message(parser, output, "That tag isn't allowed here"); - print_tag_stack(parser, error, output); + print_message(output, "That tag isn't allowed here"); + print_tag_stack(error, output); // TODO(jdtang): Give more specific messaging. 
return; } @@ -159,53 +156,53 @@ GumboError* gumbo_add_error(GumboParser* parser) { if (max_errors >= 0 && parser->_output->errors.length >= max_errors) { return NULL; } - GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError)); - gumbo_vector_add(parser, error, &parser->_output->errors); + GumboError* error = gumbo_malloc(sizeof(GumboError)); + gumbo_vector_add(error, &parser->_output->errors); return error; } void gumbo_error_to_string( - GumboParser* parser, const GumboError* error, GumboStringBuffer* output) { - print_message(parser, output, "@%d:%d: ", + const GumboError* error, GumboStringBuffer* output) { + print_message(output, "@%d:%d: ", error->position.line, error->position.column); switch (error->type) { case GUMBO_ERR_UTF8_INVALID: - print_message(parser, output, "Invalid UTF8 character 0x%x", + print_message(output, "Invalid UTF8 character 0x%x", error->v.codepoint); break; case GUMBO_ERR_UTF8_TRUNCATED: - print_message(parser, output, + print_message(output, "Input stream ends with a truncated UTF8 character 0x%x", error->v.codepoint); break; case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS: - print_message(parser, output, + print_message(output, "No digits after &# in numeric character reference"); break; case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON: - print_message(parser, output, + print_message(output, "The numeric character reference &#%d should be followed " "by a semicolon", error->v.codepoint); break; case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID: - print_message(parser, output, + print_message(output, "The numeric character reference &#%d; encodes an invalid " "unicode codepoint", error->v.codepoint); break; case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON: // The textual data came from one of the literal strings in the table, and // so it'll be null-terminated. 
- print_message(parser, output, + print_message(output, "The named character reference &%.*s should be followed by a " "semicolon", (int) error->v.text.length, error->v.text.data); break; case GUMBO_ERR_NAMED_CHAR_REF_INVALID: - print_message(parser, output, + print_message(output, "The named character reference &%.*s; is not a valid entity name", (int) error->v.text.length, error->v.text.data); break; case GUMBO_ERR_DUPLICATE_ATTR: - print_message(parser, output, + print_message(output, "Attribute %s occurs multiple times, at positions %d and %d", error->v.duplicate_attr.name, error->v.duplicate_attr.original_index, @@ -213,20 +210,19 @@ void gumbo_error_to_string( break; case GUMBO_ERR_PARSER: case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG: - handle_parser_error(parser, &error->v.parser, output); + handle_parser_error(&error->v.parser, output); break; default: - print_message(parser, output, + print_message(output, "Tokenizer error with an unimplemented error message"); break; } - gumbo_string_buffer_append_codepoint(parser, '.', output); + gumbo_string_buffer_append_codepoint('.', output); } -void gumbo_caret_diagnostic_to_string( - GumboParser* parser, const GumboError* error, +void gumbo_caret_diagnostic_to_string(const GumboError* error, const char* source_text, GumboStringBuffer* output) { - gumbo_error_to_string(parser, error, output); + gumbo_error_to_string(error, output); const char* line_start = find_last_newline(source_text, error->original_text); @@ -236,44 +232,44 @@ void gumbo_caret_diagnostic_to_string( original_line.data = line_start; original_line.length = line_end - line_start; - gumbo_string_buffer_append_codepoint(parser, '\n', output); - gumbo_string_buffer_append_string(parser, &original_line, output); - gumbo_string_buffer_append_codepoint(parser, '\n', output); + gumbo_string_buffer_append_codepoint('\n', output); + gumbo_string_buffer_append_string(&original_line, output); + gumbo_string_buffer_append_codepoint('\n', output); 
gumbo_string_buffer_reserve( - parser, output->length + error->position.column, output); + output->length + error->position.column, output); int num_spaces = error->position.column - 1; memset(output->data + output->length, ' ', num_spaces); output->length += num_spaces; - gumbo_string_buffer_append_codepoint(parser, '^', output); - gumbo_string_buffer_append_codepoint(parser, '\n', output); + gumbo_string_buffer_append_codepoint('^', output); + gumbo_string_buffer_append_codepoint('\n', output); } void gumbo_print_caret_diagnostic( - GumboParser* parser, const GumboError* error, const char* source_text) { + const GumboError* error, const char* source_text) { GumboStringBuffer text; - gumbo_string_buffer_init(parser, &text); - gumbo_caret_diagnostic_to_string(parser, error, source_text, &text); + gumbo_string_buffer_init(&text); + gumbo_caret_diagnostic_to_string(error, source_text, &text); printf("%.*s", (int) text.length, text.data); - gumbo_string_buffer_destroy(parser, &text); + gumbo_string_buffer_destroy(&text); } -void gumbo_error_destroy(GumboParser* parser, GumboError* error) { +void gumbo_error_destroy(GumboError* error) { if (error->type == GUMBO_ERR_PARSER || error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) { - gumbo_vector_destroy(parser, &error->v.parser.tag_stack); + gumbo_vector_destroy(&error->v.parser.tag_stack); } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) { - gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name); + gumbo_free((void*) error->v.duplicate_attr.name); } - gumbo_parser_deallocate(parser, error); + gumbo_free(error); } void gumbo_init_errors(GumboParser* parser) { - gumbo_vector_init(parser, 5, &parser->_output->errors); + gumbo_vector_init(5, &parser->_output->errors); } void gumbo_destroy_errors(GumboParser* parser) { for (int i = 0; i < parser->_output->errors.length; ++i) { - gumbo_error_destroy(parser, parser->_output->errors.data[i]); + gumbo_error_destroy(parser->_output->errors.data[i]); } - 
gumbo_vector_destroy(parser, &parser->_output->errors); + gumbo_vector_destroy(&parser->_output->errors); } diff --git a/src/error.h b/src/error.h index c22006ac..1c685a9d 100644 --- a/src/error.h +++ b/src/error.h @@ -194,31 +194,27 @@ void gumbo_init_errors(struct GumboInternalParser* errors); void gumbo_destroy_errors(struct GumboInternalParser* errors); // Frees the memory used for a single GumboError. -void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error); +void gumbo_error_destroy(GumboError* error); // Prints an error to a string. This fills an empty GumboStringBuffer with a // freshly-allocated buffer containing the error message text. The caller is // responsible for deleting the buffer. (Note that the buffer is allocated with // the allocator specified in the GumboParser config and hence should be freed -// by gumbo_parser_deallocate().) -void gumbo_error_to_string( - struct GumboInternalParser* parser, const GumboError* error, - GumboStringBuffer* output); +// by gumbo_free().) +void gumbo_error_to_string(const GumboError* error, GumboStringBuffer* output); // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer // with a freshly-allocated buffer containing the error message text. The // caller is responsible for deleting the buffer. (Note that the buffer is // allocated with the allocator specified in the GumboParser config and hence // should be freed by gumbo_parser_deallocate().) -void gumbo_caret_diagnostic_to_string( - struct GumboInternalParser* parser, const GumboError* error, +void gumbo_caret_diagnostic_to_string(const GumboError* error, const char* source_text, GumboStringBuffer* output); // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead // of writing to a string. 
void gumbo_print_caret_diagnostic( - struct GumboInternalParser* parser, const GumboError* error, - const char* source_text); + const GumboError* error, const char* source_text); #ifdef __cplusplus } diff --git a/src/gumbo.h b/src/gumbo.h index a1b9a036..eeb46c7a 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -141,7 +141,7 @@ extern const GumboVector kGumboEmptyVector; * Returns the first index at which an element appears in this vector (testing * by pointer equality), or -1 if it never does. */ -int gumbo_vector_index_of(GumboVector* vector, void* element); +int gumbo_vector_index_of(GumboVector* vector, const void* element); /** @@ -157,171 +157,8 @@ int gumbo_vector_index_of(GumboVector* vector, void* element); * strings. */ typedef enum { - // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element - GUMBO_TAG_HTML, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata - GUMBO_TAG_HEAD, - GUMBO_TAG_TITLE, - GUMBO_TAG_BASE, - GUMBO_TAG_LINK, - GUMBO_TAG_META, - GUMBO_TAG_STYLE, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1 - GUMBO_TAG_SCRIPT, - GUMBO_TAG_NOSCRIPT, - GUMBO_TAG_TEMPLATE, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections - GUMBO_TAG_BODY, - GUMBO_TAG_ARTICLE, - GUMBO_TAG_SECTION, - GUMBO_TAG_NAV, - GUMBO_TAG_ASIDE, - GUMBO_TAG_H1, - GUMBO_TAG_H2, - GUMBO_TAG_H3, - GUMBO_TAG_H4, - GUMBO_TAG_H5, - GUMBO_TAG_H6, - GUMBO_TAG_HGROUP, - GUMBO_TAG_HEADER, - GUMBO_TAG_FOOTER, - GUMBO_TAG_ADDRESS, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content - GUMBO_TAG_P, - GUMBO_TAG_HR, - GUMBO_TAG_PRE, - GUMBO_TAG_BLOCKQUOTE, - GUMBO_TAG_OL, - GUMBO_TAG_UL, - GUMBO_TAG_LI, - GUMBO_TAG_DL, - GUMBO_TAG_DT, - GUMBO_TAG_DD, - GUMBO_TAG_FIGURE, - GUMBO_TAG_FIGCAPTION, - GUMBO_TAG_MAIN, - GUMBO_TAG_DIV, - // 
http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics - GUMBO_TAG_A, - GUMBO_TAG_EM, - GUMBO_TAG_STRONG, - GUMBO_TAG_SMALL, - GUMBO_TAG_S, - GUMBO_TAG_CITE, - GUMBO_TAG_Q, - GUMBO_TAG_DFN, - GUMBO_TAG_ABBR, - GUMBO_TAG_DATA, - GUMBO_TAG_TIME, - GUMBO_TAG_CODE, - GUMBO_TAG_VAR, - GUMBO_TAG_SAMP, - GUMBO_TAG_KBD, - GUMBO_TAG_SUB, - GUMBO_TAG_SUP, - GUMBO_TAG_I, - GUMBO_TAG_B, - GUMBO_TAG_U, - GUMBO_TAG_MARK, - GUMBO_TAG_RUBY, - GUMBO_TAG_RT, - GUMBO_TAG_RP, - GUMBO_TAG_BDI, - GUMBO_TAG_BDO, - GUMBO_TAG_SPAN, - GUMBO_TAG_BR, - GUMBO_TAG_WBR, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits - GUMBO_TAG_INS, - GUMBO_TAG_DEL, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1 - GUMBO_TAG_IMAGE, - GUMBO_TAG_IMG, - GUMBO_TAG_IFRAME, - GUMBO_TAG_EMBED, - GUMBO_TAG_OBJECT, - GUMBO_TAG_PARAM, - GUMBO_TAG_VIDEO, - GUMBO_TAG_AUDIO, - GUMBO_TAG_SOURCE, - GUMBO_TAG_TRACK, - GUMBO_TAG_CANVAS, - GUMBO_TAG_MAP, - GUMBO_TAG_AREA, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml - GUMBO_TAG_MATH, - GUMBO_TAG_MI, - GUMBO_TAG_MO, - GUMBO_TAG_MN, - GUMBO_TAG_MS, - GUMBO_TAG_MTEXT, - GUMBO_TAG_MGLYPH, - GUMBO_TAG_MALIGNMARK, - GUMBO_TAG_ANNOTATION_XML, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0 - GUMBO_TAG_SVG, - GUMBO_TAG_FOREIGNOBJECT, - GUMBO_TAG_DESC, - // SVG title tags will have GUMBO_TAG_TITLE as with HTML. 
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data - GUMBO_TAG_TABLE, - GUMBO_TAG_CAPTION, - GUMBO_TAG_COLGROUP, - GUMBO_TAG_COL, - GUMBO_TAG_TBODY, - GUMBO_TAG_THEAD, - GUMBO_TAG_TFOOT, - GUMBO_TAG_TR, - GUMBO_TAG_TD, - GUMBO_TAG_TH, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms - GUMBO_TAG_FORM, - GUMBO_TAG_FIELDSET, - GUMBO_TAG_LEGEND, - GUMBO_TAG_LABEL, - GUMBO_TAG_INPUT, - GUMBO_TAG_BUTTON, - GUMBO_TAG_SELECT, - GUMBO_TAG_DATALIST, - GUMBO_TAG_OPTGROUP, - GUMBO_TAG_OPTION, - GUMBO_TAG_TEXTAREA, - GUMBO_TAG_KEYGEN, - GUMBO_TAG_OUTPUT, - GUMBO_TAG_PROGRESS, - GUMBO_TAG_METER, - // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements - GUMBO_TAG_DETAILS, - GUMBO_TAG_SUMMARY, - GUMBO_TAG_MENU, - GUMBO_TAG_MENUITEM, - // Non-conforming elements that nonetheless appear in the HTML5 spec. - // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features - GUMBO_TAG_APPLET, - GUMBO_TAG_ACRONYM, - GUMBO_TAG_BGSOUND, - GUMBO_TAG_DIR, - GUMBO_TAG_FRAME, - GUMBO_TAG_FRAMESET, - GUMBO_TAG_NOFRAMES, - GUMBO_TAG_ISINDEX, - GUMBO_TAG_LISTING, - GUMBO_TAG_XMP, - GUMBO_TAG_NEXTID, - GUMBO_TAG_NOEMBED, - GUMBO_TAG_PLAINTEXT, - GUMBO_TAG_RB, - GUMBO_TAG_STRIKE, - GUMBO_TAG_BASEFONT, - GUMBO_TAG_BIG, - GUMBO_TAG_BLINK, - GUMBO_TAG_CENTER, - GUMBO_TAG_FONT, - GUMBO_TAG_MARQUEE, - GUMBO_TAG_MULTICOL, - GUMBO_TAG_NOBR, - GUMBO_TAG_SPACER, - GUMBO_TAG_TT, + // Load all the tags from an external source +# include "tag_enum.h" // Used for all tags that don't have special handling in HTML. GUMBO_TAG_UNKNOWN, // A marker value to indicate the end of the enum, for iterating over it. @@ -364,9 +201,10 @@ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname); /** * Converts a tag name string (which may be in upper or mixed case) to a tag - * enum. + * enum. 
The `tag` version expects `tagname` to be NULL-terminated */ GumboTag gumbo_tag_enum(const char* tagname); +GumboTag gumbo_tagn_enum(const char* tagname, int length); /** * Attribute namespaces. @@ -461,10 +299,16 @@ typedef enum { GUMBO_NODE_TEXT, /** CDATA node. v will be a GumboText. */ GUMBO_NODE_CDATA, - /** Comment node. v. will be a GumboText, excluding comment delimiters. */ + /** Comment node. v will be a GumboText, excluding comment delimiters. */ GUMBO_NODE_COMMENT, /** Text node, where all contents is whitespace. v will be a GumboText. */ - GUMBO_NODE_WHITESPACE + GUMBO_NODE_WHITESPACE, + /** Template node. This is separate from GUMBO_NODE_ELEMENT because many + * client libraries will want to ignore the contents of template nodes, as + * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing + * here, while clients that want to include template contents should also + * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ + GUMBO_NODE_TEMPLATE } GumboNodeType; /** @@ -718,18 +562,6 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); * Use kGumboDefaultOptions for sensible defaults, and only set what you need. */ typedef struct GumboInternalOptions { - /** A memory allocator function. Default: malloc. */ - GumboAllocatorFunction allocator; - - /** A memory deallocator function. Default: free. */ - GumboDeallocatorFunction deallocator; - - /** - * An opaque object that's passed in as the first argument to all callbacks - * used by this library. Default: NULL. - */ - void* userdata; - /** * The tab-stop size, for computing positions in source code that uses tabs. * Default: 8. @@ -795,10 +627,34 @@ GumboOutput* gumbo_parse(const char* buffer); GumboOutput* gumbo_parse_with_options( const GumboOptions* options, const char* buffer, size_t buffer_length); +/** + * Parse a chunk of HTML with the given fragment context. If `fragment_ctx` + * is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document. 
+ */ +GumboOutput* gumbo_parse_fragment( + const GumboOptions* options, const char* buffer, size_t length, + const GumboTag fragment_ctx); + /** Release the memory used for the parse tree & parse errors. */ -void gumbo_destroy_output( - const GumboOptions* options, GumboOutput* output); +void gumbo_destroy_output(GumboOutput* output); +/** Create a new node object, unattached to any document */ +GumboNode* gumbo_create_node(GumboNodeType type); + +/** Release the memory used by a single node */ +void gumbo_destroy_node(GumboNode* node); + +/** + * Set the memory allocator to be used by the library. + * allocator_p needs to be a `realloc`-compatible API + */ +void gumbo_memory_set_allocator(void *(*allocator_p)(void *, size_t)); + +/** + * Set the memory free function to be used by the library. + * free_p needs to be a `free`-compatible API + */ +void gumbo_memory_set_free(void (*free_p)(void *)); #ifdef __cplusplus } diff --git a/src/parser.c b/src/parser.c index 004639dc..b507c997 100644 --- a/src/parser.c +++ b/src/parser.c @@ -46,19 +46,13 @@ typedef char gumbo_tagset[GUMBO_TAG_LAST]; (tag < GUMBO_TAG_LAST && \ tagset[(int)tag] == (1 << (int)namespace)) - -static void* malloc_wrapper(void* unused, size_t size) { - return malloc(size); -} - -static void free_wrapper(void* unused, void* ptr) { - free(ptr); -} +// selected forward declarations as it is getting hard to find +// an appropriate order +static bool node_html_tag_is(const GumboNode*, GumboTag); +static GumboInsertionMode get_current_template_insertion_mode(const GumboParser*); +static bool handle_in_template(GumboParser*, GumboToken*); const GumboOptions kGumboDefaultOptions = { - &malloc_wrapper, - &free_wrapper, - NULL, 8, false, -1, @@ -190,7 +184,7 @@ typedef struct _ReplacementEntry { { GUMBO_STRING(from), GUMBO_STRING(to) } // Static data for SVG attribute replacements. 
-// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes +// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes static const ReplacementEntry kSvgAttributeReplacements[] = { REPLACEMENT_ENTRY("attributename", "attributeName"), REPLACEMENT_ENTRY("attributetype", "attributeType"), @@ -198,12 +192,14 @@ static const ReplacementEntry kSvgAttributeReplacements[] = { REPLACEMENT_ENTRY("baseprofile", "baseProfile"), REPLACEMENT_ENTRY("calcmode", "calcMode"), REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"), +#ifndef GUMBO_HTML5_TIP REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"), REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"), - REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"), - REPLACEMENT_ENTRY("edgemode", "edgeMode"), REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"), REPLACEMENT_ENTRY("filterres", "filterRes"), +#endif + REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"), + REPLACEMENT_ENTRY("edgemode", "edgeMode"), REPLACEMENT_ENTRY("filterunits", "filterUnits"), REPLACEMENT_ENTRY("glyphref", "glyphRef"), REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"), @@ -345,7 +341,7 @@ typedef struct _TextNodeBufferState { // The source position of the start of this text node. GumboSourcePosition _start_position; - // The type of node that will be inserted (TEXT or WHITESPACE). + // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). GumboNodeType _type; } TextNodeBufferState; @@ -371,6 +367,9 @@ typedef struct GumboInternalParserState { GumboNode* _head_element; GumboNode* _form_element; + // The element used as fragment context when parsing in fragment mode + GumboNode* _fragment_ctx; + // The flag for when the spec says "Reprocess the current token in..." 
bool _reprocess_current_token; @@ -450,8 +449,8 @@ static void set_frameset_not_ok(GumboParser* parser) { parser->_parser_state->_frameset_ok = false; } -static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { - GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode)); +GumboNode* gumbo_create_node(GumboNodeType type) { + GumboNode* node = gumbo_malloc(sizeof(GumboNode)); node->parent = NULL; node->index_within_parent = -1; node->type = type; @@ -459,11 +458,10 @@ static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { return node; } -static GumboNode* new_document_node(GumboParser* parser) { - GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT); +static GumboNode* new_document_node(void) { + GumboNode* document_node = gumbo_create_node(GUMBO_NODE_DOCUMENT); document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; - gumbo_vector_init( - parser, 1, &document_node->v.document.children); + gumbo_vector_init(1, &document_node->v.document.children); // Must be initialized explicitly, as there's no guarantee that we'll see a // doc type token. 
@@ -476,28 +474,29 @@ static GumboNode* new_document_node(GumboParser* parser) { } static void output_init(GumboParser* parser) { - GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); + GumboOutput* output = gumbo_malloc(sizeof(GumboOutput)); output->root = NULL; - output->document = new_document_node(parser); + output->document = new_document_node(); parser->_output = output; gumbo_init_errors(parser); } static void parser_state_init(GumboParser* parser) { GumboParserState* parser_state = - gumbo_parser_allocate(parser, sizeof(GumboParserState)); + gumbo_malloc(sizeof(GumboParserState)); parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; parser_state->_reprocess_current_token = false; parser_state->_frameset_ok = true; parser_state->_ignore_next_linefeed = false; parser_state->_foster_parent_insertions = false; parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; - gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer); - gumbo_vector_init(parser, 10, &parser_state->_open_elements); - gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements); - gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes); + gumbo_string_buffer_init(&parser_state->_text_node._buffer); + gumbo_vector_init(10, &parser_state->_open_elements); + gumbo_vector_init(5, &parser_state->_active_formatting_elements); + gumbo_vector_init(5, &parser_state->_template_insertion_modes); parser_state->_head_element = NULL; parser_state->_form_element = NULL; + parser_state->_fragment_ctx = NULL; parser_state->_current_token = NULL; parser_state->_closed_body_tag = false; parser_state->_closed_html_tag = false; @@ -506,17 +505,23 @@ static void parser_state_init(GumboParser* parser) { static void parser_state_destroy(GumboParser* parser) { GumboParserState* state = parser->_parser_state; - gumbo_vector_destroy(parser, &state->_active_formatting_elements); - gumbo_vector_destroy(parser, &state->_open_elements); - 
gumbo_vector_destroy(parser, &state->_template_insertion_modes); - gumbo_string_buffer_destroy(parser, &state->_text_node._buffer); - gumbo_parser_deallocate(parser, state); + if (state->_fragment_ctx) + gumbo_destroy_node(state->_fragment_ctx); + gumbo_vector_destroy(&state->_active_formatting_elements); + gumbo_vector_destroy(&state->_open_elements); + gumbo_vector_destroy(&state->_template_insertion_modes); + gumbo_string_buffer_destroy(&state->_text_node._buffer); + gumbo_free(state); } static GumboNode* get_document_node(GumboParser* parser) { return parser->_output->document; } +static bool is_fragment_parser(const GumboParser *parser) { + return !!parser->_parser_state->_fragment_ctx; +} + // Returns the node at the bottom of the stack of open elements, or NULL if no // elements have been added yet. static GumboNode* get_current_node(GumboParser* parser) { @@ -530,6 +535,13 @@ static GumboNode* get_current_node(GumboParser* parser) { return open_elements->data[open_elements->length - 1]; } +static GumboNode* get_adjusted_current_node(GumboParser* parser) { + GumboParserState *state = parser->_parser_state; + if (state->_open_elements.length == 1 && state->_fragment_ctx) + return state->_fragment_ctx; + return get_current_node(parser); +} + // Returns true if the given needle is in the given array of literal // GumboStringPieces. If exact_match is true, this requires that they match // exactly; otherwise, this performs a prefix match to check if any of the @@ -550,55 +562,79 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { parser->_parser_state->_insertion_mode = mode; } + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately // This is a helper function that returns the appropriate insertion mode instead // of setting it. 
Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to // indicate that there is no appropriate insertion mode, and the loop should // continue. -static GumboInsertionMode get_appropriate_insertion_mode( - const GumboNode* node, bool is_last) { - assert(node->type == GUMBO_NODE_ELEMENT); - - if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) { - switch (node->v.element.tag) { - case GUMBO_TAG_SELECT: - return GUMBO_INSERTION_MODE_IN_SELECT; - case GUMBO_TAG_TD: - case GUMBO_TAG_TH: - return is_last ? - GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL; - case GUMBO_TAG_TR: - return GUMBO_INSERTION_MODE_IN_ROW; - case GUMBO_TAG_TBODY: - case GUMBO_TAG_THEAD: - case GUMBO_TAG_TFOOT: - return GUMBO_INSERTION_MODE_IN_TABLE_BODY; - case GUMBO_TAG_CAPTION: - return GUMBO_INSERTION_MODE_IN_CAPTION; - case GUMBO_TAG_COLGROUP: - return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; - case GUMBO_TAG_TABLE: - return GUMBO_INSERTION_MODE_IN_TABLE; - case GUMBO_TAG_HEAD: - case GUMBO_TAG_BODY: - return GUMBO_INSERTION_MODE_IN_BODY; - case GUMBO_TAG_FRAMESET: - return GUMBO_INSERTION_MODE_IN_FRAMESET; - case GUMBO_TAG_HTML: - return GUMBO_INSERTION_MODE_BEFORE_HEAD; - default: - break; - } - } - return is_last ? 
GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; +static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* parser, int index) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; + const GumboNode* node = open_elements->data[index]; + const bool is_last = index == 0; + + if (is_last && is_fragment_parser(parser)) + node = parser->_parser_state->_fragment_ctx; + + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + switch (node->v.element.tag) { + case GUMBO_TAG_SELECT: { + if (is_last) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + for (int i = index; i > 0; --i) { + const GumboNode* ancestor = open_elements->data[i]; + if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { + return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; + } + } + return GUMBO_INSERTION_MODE_IN_SELECT; + } + case GUMBO_TAG_TD: + case GUMBO_TAG_TH: + if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; + break; + case GUMBO_TAG_TR: + return GUMBO_INSERTION_MODE_IN_ROW; + case GUMBO_TAG_TBODY: + case GUMBO_TAG_THEAD: + case GUMBO_TAG_TFOOT: + return GUMBO_INSERTION_MODE_IN_TABLE_BODY; + case GUMBO_TAG_CAPTION: + return GUMBO_INSERTION_MODE_IN_CAPTION; + case GUMBO_TAG_COLGROUP: + return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; + case GUMBO_TAG_TABLE: + return GUMBO_INSERTION_MODE_IN_TABLE; + case GUMBO_TAG_TEMPLATE: + return get_current_template_insertion_mode(parser); + case GUMBO_TAG_HEAD: + if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; + break; + case GUMBO_TAG_BODY: + return GUMBO_INSERTION_MODE_IN_BODY; + case GUMBO_TAG_FRAMESET: + return GUMBO_INSERTION_MODE_IN_FRAMESET; + case GUMBO_TAG_HTML: + return parser->_parser_state->_head_element ? + GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD; + default: + break; + } + return is_last ? 
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; } + // This performs the actual "reset the insertion mode" loop. static void reset_insertion_mode_appropriately(GumboParser* parser) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (int i = open_elements->length; --i >= 0; ) { GumboInsertionMode mode = - get_appropriate_insertion_mode(open_elements->data[i], i == 0); + get_appropriate_insertion_mode(parser, i); if (mode != GUMBO_INSERTION_MODE_INITIAL) { set_insertion_mode(parser, mode); return; @@ -628,12 +664,12 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken* } GumboParserState* state = parser->_parser_state; extra_data->parser_state = state->_insertion_mode; - gumbo_vector_init(parser, state->_open_elements.length, + gumbo_vector_init(state->_open_elements.length, &extra_data->tag_stack); for (int i = 0; i < state->_open_elements.length; ++i) { const GumboNode* node = state->_open_elements.data[i]; - assert(node->type == GUMBO_NODE_ELEMENT); - gumbo_vector_add(parser, (void*) node->v.element.tag, + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + gumbo_vector_add((void*) node->v.element.tag, &extra_data->tag_stack); } return error; @@ -669,7 +705,7 @@ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { // Like tag_in, but checks for the tag of a node, rather than a token. static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { assert(node != NULL); - if (node->type != GUMBO_NODE_ELEMENT) { + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { return false; } return TAGSET_INCLUDES(tags, node->v.element.tag_namespace, node->v.element.tag); @@ -678,7 +714,7 @@ static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { // Like node_tag_in, but for the single-tag case. 
static bool node_qualified_tag_is(const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) { - return node->type == GUMBO_NODE_ELEMENT && + return (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) && node->v.element.tag == tag && node->v.element.tag_namespace == ns; } @@ -689,6 +725,23 @@ static bool node_html_tag_is(const GumboNode* node, GumboTag tag) return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); } +static void push_template_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { + gumbo_vector_add((void*) mode, &parser->_parser_state->_template_insertion_modes); +} + +static void pop_template_insertion_mode(GumboParser* parser) { + gumbo_vector_pop(&parser->_parser_state->_template_insertion_modes); +} + +// Returns the current template insertion mode. If the stack of template +// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. +static GumboInsertionMode get_current_template_insertion_mode(const GumboParser* parser) { + GumboVector* template_insertion_modes = &parser->_parser_state->_template_insertion_modes; + if (template_insertion_modes->length == 0) { + return GUMBO_INSERTION_MODE_INITIAL; + } + return (GumboInsertionMode) template_insertion_modes->data[(template_insertion_modes->length - 1)]; +} // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point static bool is_mathml_integration_point(const GumboNode* node) { @@ -706,14 +759,70 @@ static bool is_html_integration_point(const GumboNode* node) { "encoding", "application/xhtml+xml"))); } + +// This represents a place to insert a node, consisting of a target parent and a +// child index within that parent. If the node should be inserted at the end of +// the parent's child, index will be -1. 
+typedef struct { + GumboNode* target; + int index; +} InsertionLocation; + +InsertionLocation get_appropriate_insertion_location(GumboParser* parser, GumboNode* override_target) { + InsertionLocation retval = { override_target, -1 }; + if (retval.target == NULL) { + // No override target; default to the current node, but special-case the + // root node since get_current_node() assumes the stack of open elements is + // non-empty. + retval.target = parser->_output->root != NULL ? + get_current_node(parser) : get_document_node(parser); + } + if (!parser->_parser_state->_foster_parent_insertions || + !node_tag_in_set(retval.target, (gumbo_tagset) { TAG(TABLE), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TR) })) { + return retval; + } + + // Foster-parenting case. + int last_template_index = -1; + int last_table_index = -1; + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (int i = 0; i < open_elements->length; ++i) { + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { + last_template_index = i; + } + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { + last_table_index = i; + } + } + if (last_template_index != -1 && + (last_table_index == -1 || last_template_index > last_table_index)) { + retval.target = open_elements->data[last_template_index]; + return retval; + } + if (last_table_index == -1) { + retval.target = open_elements->data[0]; + return retval; + } + GumboNode* last_table = open_elements->data[last_table_index]; + if (last_table->parent != NULL) { + retval.target = last_table->parent; + retval.index = last_table->index_within_parent; + return retval; + } + + retval.target = open_elements->data[last_table_index - 1]; + return retval; +} + + // Appends a node to the end of its parent, setting the "parent" and // "index_within_parent" fields appropriately. 
-static void append_node( - GumboParser* parser, GumboNode* parent, GumboNode* node) { +static void append_node(GumboNode* parent, GumboNode* node) { assert(node->parent == NULL); assert(node->index_within_parent == -1); GumboVector* children; - if (parent->type == GUMBO_NODE_ELEMENT) { + if (parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE) { children = &parent->v.element.children; } else { assert(parent->type == GUMBO_NODE_DOCUMENT); @@ -721,70 +830,47 @@ static void append_node( } node->parent = parent; node->index_within_parent = children->length; - gumbo_vector_add(parser, (void*) node, children); + gumbo_vector_add((void*) node, children); assert(node->index_within_parent < children->length); } -// Inserts a node at the specified index within its parent, updating the +// Inserts a node at the specified InsertionLocation, updating the // "parent" and "index_within_parent" fields of it and all its siblings. -static void insert_node( - GumboParser* parser, GumboNode* parent, int index, GumboNode* node) { +// If the index of the location is -1, this calls append_node. 
+static void insert_node(GumboNode* node, InsertionLocation location) { assert(node->parent == NULL); assert(node->index_within_parent == -1); - assert(parent->type == GUMBO_NODE_ELEMENT); - GumboVector* children = &parent->v.element.children; - assert(index >= 0); - assert(index < children->length); - node->parent = parent; - node->index_within_parent = index; - gumbo_vector_insert_at(parser, (void*) node, index, children); - assert(node->index_within_parent < children->length); - for (int i = index + 1; i < children->length; ++i) { - GumboNode* sibling = children->data[i]; - sibling->index_within_parent = i; - assert(sibling->index_within_parent < children->length); - } -} + GumboNode* parent = location.target; + int index = location.index; + if (index != -1) { + GumboVector* children = NULL; + if (parent->type == GUMBO_NODE_ELEMENT || + parent->type == GUMBO_NODE_TEMPLATE) { + children = &parent->v.element.children; + } else if (parent->type == GUMBO_NODE_DOCUMENT) { + children = &parent->v.document.children; + assert(children->length == 0); + } else { + assert(0); + } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting -static void foster_parent_element(GumboParser* parser, GumboNode* node) { - GumboVector* open_elements = &parser->_parser_state->_open_elements; - assert(open_elements->length > 2); - - node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED; - GumboNode* foster_parent_element = open_elements->data[0]; - assert(foster_parent_element->type == GUMBO_NODE_ELEMENT); - assert(node_html_tag_is(foster_parent_element, GUMBO_TAG_HTML)); - for (int i = open_elements->length; --i > 1; ) { - GumboNode* table_element = open_elements->data[i]; - if (node_html_tag_is(table_element, GUMBO_TAG_TABLE)) { - foster_parent_element = table_element->parent; - if (!foster_parent_element || - foster_parent_element->type != GUMBO_NODE_ELEMENT) { - // Table has no parent; spec says it's possible if a script manipulated - // the 
DOM, although I don't think we have to worry about this case. - gumbo_debug("Table has no parent.\n"); - foster_parent_element = open_elements->data[i - 1]; - break; - } - assert(foster_parent_element->type == GUMBO_NODE_ELEMENT); - gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n", - table_element, i, gumbo_normalized_tagname( - foster_parent_element->v.element.tag), - table_element->index_within_parent); - assert(foster_parent_element->v.element.children.data[ - table_element->index_within_parent] == table_element); - insert_node(parser, foster_parent_element, - table_element->index_within_parent, node); - return; + assert(index >= 0); + assert(index < children->length); + node->parent = parent; + node->index_within_parent = index; + gumbo_vector_insert_at((void*) node, index, children); + assert(node->index_within_parent < children->length); + for (int i = index + 1; i < children->length; ++i) { + GumboNode* sibling = children->data[i]; + sibling->index_within_parent = i; + assert(sibling->index_within_parent < children->length); } + } else { + append_node(parent, node); } - if (node->type == GUMBO_NODE_ELEMENT) { - gumbo_vector_add(parser, (void*) node, open_elements); - } - append_node(parser, foster_parent_element, node); } + static void maybe_flush_text_node_buffer(GumboParser* parser) { GumboParserState* state = parser->_parser_state; TextNodeBufferState* buffer_state = &state->_text_node; @@ -793,30 +879,31 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { } assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || - buffer_state->_type == GUMBO_NODE_TEXT); - GumboNode* text_node = create_node(parser, buffer_state->_type); + buffer_state->_type == GUMBO_NODE_TEXT || + buffer_state->_type == GUMBO_NODE_CDATA); + GumboNode* text_node = gumbo_create_node(buffer_state->_type); GumboText* text_node_data = &text_node->v.text; - text_node_data->text = gumbo_string_buffer_to_string( - parser, &buffer_state->_buffer); + 
text_node_data->text = gumbo_string_buffer_to_string(&buffer_state->_buffer); text_node_data->original_text.data = buffer_state->_start_original_text; text_node_data->original_text.length = state->_current_token->original_text.data - buffer_state->_start_original_text; text_node_data->start_pos = buffer_state->_start_position; - if (state->_foster_parent_insertions && - node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(TABLE), TAG(TBODY), TAG(TFOOT), - TAG(THEAD), TAG(TR) })) { - foster_parent_element(parser, text_node); - } else { - append_node( - parser, parser->_output->root ? - get_current_node(parser) : parser->_output->document, text_node); - } + gumbo_debug("Flushing text node buffer of %.*s.\n", (int) buffer_state->_buffer.length, buffer_state->_buffer.data); - gumbo_string_buffer_destroy(parser, &buffer_state->_buffer); - gumbo_string_buffer_init(parser, &buffer_state->_buffer); + InsertionLocation location = get_appropriate_insertion_location(parser, NULL); + if (location.target->type == GUMBO_NODE_DOCUMENT) { + // The DOM does not allow Document nodes to have Text children, so per the + // spec, they are dropped on the floor. 
+ gumbo_destroy_node(text_node); + } else { + insert_node(text_node, location); + } + + gumbo_string_buffer_destroy(&buffer_state->_buffer); + gumbo_string_buffer_init(&buffer_state->_buffer); buffer_state->_type = GUMBO_NODE_WHITESPACE; assert(buffer_state->_buffer.length == 0); } @@ -838,12 +925,12 @@ static GumboNode* pop_current_node(GumboParser* parser) { "Popping %s node.\n", gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); } - GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements); + GumboNode* current_node = gumbo_vector_pop(&state->_open_elements); if (!current_node) { assert(state->_open_elements.length == 0); return NULL; } - assert(current_node->type == GUMBO_NODE_ELEMENT); + assert(current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE); bool is_closed_body_or_html_tag = (node_html_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) || (node_html_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag); @@ -861,25 +948,25 @@ static GumboNode* pop_current_node(GumboParser* parser) { static void append_comment_node( GumboParser* parser, GumboNode* node, const GumboToken* token) { maybe_flush_text_node_buffer(parser); - GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT); + GumboNode* comment = gumbo_create_node(GUMBO_NODE_COMMENT); comment->type = GUMBO_NODE_COMMENT; comment->parse_flags = GUMBO_INSERTION_NORMAL; comment->v.text.text = token->v.text; comment->v.text.original_text = token->original_text; comment->v.text.start_pos = token->position; - append_node(parser, node, comment); + append_node(node, comment); } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context static void clear_stack_to_table_row_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TR) })) { + while (!node_tag_in_set(get_current_node(parser), 
(gumbo_tagset) { TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { pop_current_node(parser); } } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context static void clear_stack_to_table_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TABLE) } )) { + while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TABLE), TAG(TEMPLATE) } )) { pop_current_node(parser); } } @@ -887,35 +974,41 @@ static void clear_stack_to_table_context(GumboParser* parser) { // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context void clear_stack_to_table_body_context(GumboParser* parser) { while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TBODY), - TAG(TFOOT), TAG(THEAD) })) { + TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) })) { pop_current_node(parser); } } // Creates a parser-inserted element in the HTML namespace and returns it. static GumboNode* create_element(GumboParser* parser, GumboTag tag) { - GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); + GumboNode* node = gumbo_create_node(GUMBO_NODE_ELEMENT); GumboElement* element = &node->v.element; - gumbo_vector_init(parser, 1, &element->children); - gumbo_vector_init(parser, 0, &element->attributes); + gumbo_vector_init(1, &element->children); + gumbo_vector_init(0, &element->attributes); element->tag = tag; element->tag_namespace = GUMBO_NAMESPACE_HTML; element->original_tag = kGumboEmptyString; element->original_end_tag = kGumboEmptyString; - element->start_pos = parser->_parser_state->_current_token->position; + element->start_pos = (parser->_parser_state->_current_token) ? + parser->_parser_state->_current_token->position : kGumboEmptySourcePosition; element->end_pos = kGumboEmptySourcePosition; return node; } // Constructs an element from the given start tag token. 
static GumboNode* create_element_from_token( - GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { + GumboToken* token, GumboNamespaceEnum tag_namespace) { assert(token->type == GUMBO_TOKEN_START_TAG); GumboTokenStartTag* start_tag = &token->v.start_tag; - GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); + GumboNodeType type = ( + tag_namespace == GUMBO_NAMESPACE_HTML && + start_tag->tag == GUMBO_TAG_TEMPLATE) + ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT; + + GumboNode* node = gumbo_create_node(type); GumboElement* element = &node->v.element; - gumbo_vector_init(parser, 1, &element->children); + gumbo_vector_init(1, &element->children); element->attributes = start_tag->attributes; element->tag = start_tag->tag; element->tag_namespace = tag_namespace; @@ -950,21 +1043,10 @@ static void insert_element(GumboParser* parser, GumboNode* node, if (!is_reconstructing_formatting_elements) { maybe_flush_text_node_buffer(parser); } - if (state->_foster_parent_insertions && - node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(TABLE), TAG(TBODY), TAG(TFOOT), - TAG(THEAD), TAG(TR) } )) { - foster_parent_element(parser, node); - gumbo_vector_add(parser, (void*) node, &state->_open_elements); - return; - } - - // This is called to insert the root HTML element, but get_current_node - // assumes the stack of open elements is non-empty, so we need special - // handling for this case. - append_node( - parser, parser->_output->root ? 
- get_current_node(parser) : parser->_output->document, node); - gumbo_vector_add(parser, (void*) node, &state->_open_elements); + InsertionLocation location = + get_appropriate_insertion_location(parser, NULL); + insert_node(node, location); + gumbo_vector_add((void*) node, &state->_open_elements); } // Convenience method that combines create_element_from_token and @@ -973,7 +1055,7 @@ static void insert_element(GumboParser* parser, GumboNode* node, static GumboNode* insert_element_from_token( GumboParser* parser, GumboToken* token) { GumboNode* element = - create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML); + create_element_from_token(token, GUMBO_NAMESPACE_HTML); insert_element(parser, element, false); gumbo_debug("Inserting <%s> element (@%x) from token.\n", gumbo_normalized_tagname(element->v.element.tag), element); @@ -998,7 +1080,7 @@ static GumboNode* insert_element_of_tag_type( static GumboNode* insert_foreign_element( GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { assert(token->type == GUMBO_TOKEN_START_TAG); - GumboNode* element = create_element_from_token(parser, token, tag_namespace); + GumboNode* element = create_element_from_token(token, tag_namespace); insert_element(parser, element, false); if (token_has_attribute(token, "xmlns") && !attribute_matches_case_sensitive( @@ -1019,7 +1101,9 @@ static GumboNode* insert_foreign_element( static void insert_text_token(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_CHARACTER); + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_NULL || + token->type == GUMBO_TOKEN_CDATA); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { // Initialize position fields. 
@@ -1027,9 +1111,11 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) { buffer_state->_start_position = token->position; } gumbo_string_buffer_append_codepoint( - parser, token->v.character, &buffer_state->_buffer); + token->v.character, &buffer_state->_buffer); if (token->type == GUMBO_TOKEN_CHARACTER) { buffer_state->_type = GUMBO_NODE_TEXT; + } else if (token->type == GUMBO_TOKEN_CDATA) { + buffer_state->_type = GUMBO_NODE_CDATA; } gumbo_debug("Inserting text token '%c'.\n", token->v.character); } @@ -1111,10 +1197,10 @@ static void add_formatting_element(GumboParser* parser, const GumboNode* node) { if (num_identical_elements >= 3) { gumbo_debug("Noah's ark clause: removing element at %d.\n", earliest_identical_element); - gumbo_vector_remove_at(parser, earliest_identical_element, elements); + gumbo_vector_remove_at(earliest_identical_element, elements); } - gumbo_vector_add(parser, (void*) node, elements); + gumbo_vector_add((void*) node, elements); } static bool is_open_element(GumboParser* parser, const GumboNode* node) { @@ -1130,10 +1216,9 @@ static bool is_open_element(GumboParser* parser, const GumboNode* node) { // Clones attributes, tags, etc. of a node, but does not copy the content. The // clone shares no structure with the original node: all owned strings and // values are fresh copies. 
-GumboNode* clone_node( - GumboParser* parser, const GumboNode* node, GumboParseFlags reason) { - assert(node->type == GUMBO_NODE_ELEMENT); - GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode)); +GumboNode* clone_node(const GumboNode* node, GumboParseFlags reason) { + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + GumboNode* new_node = gumbo_malloc(sizeof(GumboNode)); *new_node = *node; new_node->parent = NULL; new_node->index_within_parent = -1; @@ -1142,18 +1227,17 @@ GumboNode* clone_node( new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG; new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER; GumboElement* element = &new_node->v.element; - gumbo_vector_init(parser, 1, &element->children); + gumbo_vector_init(1, &element->children); const GumboVector* old_attributes = &node->v.element.attributes; - gumbo_vector_init(parser, old_attributes->length, &element->attributes); + gumbo_vector_init(old_attributes->length, &element->attributes); for (int i = 0; i < old_attributes->length; ++i) { const GumboAttribute* old_attr = old_attributes->data[i]; - GumboAttribute* attr = - gumbo_parser_allocate(parser, sizeof(GumboAttribute)); + GumboAttribute* attr = gumbo_malloc(sizeof(GumboAttribute)); *attr = *old_attr; - attr->name = gumbo_copy_stringz(parser, old_attr->name); - attr->value = gumbo_copy_stringz(parser, old_attr->value); - gumbo_vector_add(parser, attr, &element->attributes); + attr->name = gumbo_strdup(old_attr->name); + attr->value = gumbo_strdup(old_attr->value); + gumbo_vector_add(attr, &element->attributes); } return new_node; } @@ -1199,10 +1283,12 @@ static void reconstruct_active_formatting_elements(GumboParser* parser) { assert(i < elements->length); element = elements->data[i]; assert(element != &kActiveFormattingScopeMarker); - GumboNode* clone = clone_node( - parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT); + GumboNode* clone = clone_node(element, 
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT); // Step 9. - insert_element(parser, clone, true); + InsertionLocation location = get_appropriate_insertion_location(parser, NULL); + insert_node(clone, location); + gumbo_vector_add((void*) clone, &parser->_parser_state->_open_elements); + // Step 10. elements->data[i] = clone; gumbo_debug("Reconstructed %s element at %d.\n", @@ -1215,7 +1301,7 @@ static void clear_active_formatting_elements(GumboParser* parser) { int num_elements_cleared = 0; const GumboNode* node; do { - node = gumbo_vector_pop(parser, elements); + node = gumbo_vector_pop(elements); ++num_elements_cleared; } while(node && node != &kActiveFormattingScopeMarker); gumbo_debug("Cleared %d elements from active formatting list.\n", @@ -1256,37 +1342,40 @@ static GumboQuirksModeEnum compute_quirks_mode( // names. For example, "has an element in list scope" looks for an element of // the given qualified name within the nearest enclosing
    or