From a74d295dfb5ef18580c8dc5e893005bb7160aa25 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Mon, 16 Feb 2015 19:53:21 +0100 Subject: [PATCH] parser: Implement fragment parsing The HTML5 fragment parsing algorithm has been implemented using a new API, `gumbo_parse_fragment`. The old APIs are maintained for backwards compatibility; passing `GUMBO_TAG_LAST` as the fragment context (`inner_html` in the Python binding) to `gumbo_parse_fragment` will cause it to parse the buffer as a full document (same functionality as `gumbo_parse_with_options`). The HTML5lib adapter code has been modified to support fragment parsing tests (the tests are passing 100%). --- python/gumbo/gumboc.py | 23 ++-- python/gumbo/html5lib_adapter.py | 22 +++- python/gumbo/html5lib_adapter_test.py | 12 +-- src/gumbo.h | 8 ++ src/parser.c | 150 +++++++++++++++++++++----- 5 files changed, 172 insertions(+), 43 deletions(-) diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index a16afe9f..9ab3e12e 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -246,6 +246,11 @@ def to_url(self): class Tag(Enum): + @staticmethod + def from_str(tagname): + text_ptr = ctypes.c_char_p(tagname.encode('utf-8')) + return _tag_enum(text_ptr) + _values_ = [ 'HTML', 'HEAD', @@ -398,6 +403,7 @@ class Tag(Enum): 'SPACER', 'TT', 'UNKNOWN', + 'LAST' ] @@ -498,11 +504,6 @@ def __repr__(self): class Options(ctypes.Structure): _fields_ = [ - # TODO(jdtang): Allow the Python API to set the allocator/deallocator - # function. Right now these are treated as opaque void pointers. 
- ('allocator', ctypes.c_void_p), - ('deallocator', ctypes.c_void_p), - ('userdata', ctypes.c_void_p), ('tab_stop', ctypes.c_int), ('stop_on_first_error', ctypes.c_bool), ('max_errors', ctypes.c_int), @@ -517,10 +518,10 @@ class Output(ctypes.Structure): ('errors', Vector), ] - @contextlib.contextmanager def parse(text, **kwargs): options = Options() + container = kwargs.get("inner_html", Tag.LAST) for field_name, _ in Options._fields_: try: setattr(options, field_name, kwargs[field_name]) @@ -531,7 +532,7 @@ def parse(text, **kwargs): # call, it creates a temporary buffer which is destroyed when the call # completes, and then the original_text pointers point into invalid memory. text_ptr = ctypes.c_char_p(text.encode('utf-8')) - output = _parse_with_options(ctypes.byref(options), text_ptr, len(text)) + output = _parse_fragment(ctypes.byref(options), text_ptr, len(text), container) try: yield output finally: @@ -543,6 +544,10 @@ def parse(text, **kwargs): _parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t] _parse_with_options.restype = _Ptr(Output) +_parse_fragment = _dll.gumbo_parse_fragment +_parse_fragment.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag] +_parse_fragment.restype = _Ptr(Output) + _tag_from_original_text = _dll.gumbo_tag_from_original_text _tag_from_original_text.argtypes = [_Ptr(StringPiece)] _tag_from_original_text.restype = None @@ -559,6 +564,10 @@ def parse(text, **kwargs): _tagname.argtypes = [Tag] _tagname.restype = ctypes.c_char_p +_tag_enum = _dll.gumbo_tag_enum +_tag_enum.argtypes = [ctypes.c_char_p] +_tag_enum.restype = Tag + __all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute', 'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document', 'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node', diff --git a/python/gumbo/html5lib_adapter.py b/python/gumbo/html5lib_adapter.py index 7615814a..54d4fc17 100644 --- a/python/gumbo/html5lib_adapter.py +++ 
b/python/gumbo/html5lib_adapter.py @@ -70,12 +70,12 @@ def _convert_element(source_node): } -def _insert_root(treebuilder, source_node): +def _insert_root(treebuilder, source_node, pop_element = True): treebuilder.insertRoot(_convert_element(source_node)) for child_node in source_node.children: _insert_node(treebuilder, child_node) - treebuilder.openElements.pop() - + if pop_element: + treebuilder.openElements.pop() def _insert_node(treebuilder, source_node): assert source_node.type != gumboc.NodeType.DOCUMENT @@ -115,3 +115,19 @@ def parse(self, text_or_file, **kwargs): else: assert 'Only comments and nodes allowed at the root' return self.tree.getDocument() + + def parseFragment(self, text_or_file, inner_html, **kwargs): + try: + text = text_or_file.read() + except AttributeError: + # Assume a string. + text = text_or_file + inner_html = gumboc.Tag.from_str(inner_html) + + with gumboc.parse(text, inner_html=inner_html, **kwargs) as output: + for node in output.contents.document.contents.children: + if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE): + _insert_root(self.tree, output.contents.root.contents, False) + else: + assert 'Malformed fragment parse (??)' + return self.tree.getFragment() diff --git a/python/gumbo/html5lib_adapter_test.py b/python/gumbo/html5lib_adapter_test.py index b1d8bc81..16908f92 100644 --- a/python/gumbo/html5lib_adapter_test.py +++ b/python/gumbo/html5lib_adapter_test.py @@ -123,11 +123,10 @@ def impl(self, inner_html, input, expected, errors): p = html5lib_adapter.HTMLParser( tree=TREEBUILDER(namespaceHTMLElements=True)) - if not inner_html: - # TODO(jdtang): Need to implement fragment parsing. - document = p.parse(StringIO.StringIO(input)) + if inner_html: + document = p.parseFragment(StringIO.StringIO(input), inner_html) else: - return + document = p.parse(StringIO.StringIO(input)) with warnings.catch_warnings(): # Etree serializer in html5lib uses a deprecated getchildren() API. 
@@ -137,11 +136,6 @@ def impl(self, inner_html, input, expected, errors): expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub( r'\1', convertExpected(expected, 2)) - # html5lib doesn't yet support the template tag, but it appears in the - # tests with the expectation that the template contents will be under the - # word 'contents', so we need to reformat that string a bit. - expected = reformatTemplateContents(expected) - error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected, '\nReceived:', output]) self.assertEquals(expected, output, diff --git a/src/gumbo.h b/src/gumbo.h index e4ce1995..d5c5acc7 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -791,6 +791,14 @@ GumboOutput* gumbo_parse(const char* buffer); GumboOutput* gumbo_parse_with_options( const GumboOptions* options, const char* buffer, size_t buffer_length); +/** + * Parse a chunk of HTML with the given fragment context. If `fragment_ctx` + * is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document. + */ +GumboOutput* gumbo_parse_fragment( + const GumboOptions* options, const char* buffer, size_t length, + const GumboTag fragment_ctx); + /** Release the memory used for the parse tree & parse errors. */ void gumbo_destroy_output(GumboOutput* output); diff --git a/src/parser.c b/src/parser.c index c2eae936..e9d2fcba 100644 --- a/src/parser.c +++ b/src/parser.c @@ -365,6 +365,9 @@ typedef struct GumboInternalParserState { GumboNode* _head_element; GumboNode* _form_element; + // The element used as fragment context when parsing in fragment mode + GumboNode* _fragment_ctx; + // The flag for when the spec says "Reprocess the current token in..." 
bool _reprocess_current_token; @@ -491,6 +494,7 @@ static void parser_state_init(GumboParser* parser) { gumbo_vector_init(5, &parser_state->_template_insertion_modes); parser_state->_head_element = NULL; parser_state->_form_element = NULL; + parser_state->_fragment_ctx = NULL; parser_state->_current_token = NULL; parser_state->_closed_body_tag = false; parser_state->_closed_html_tag = false; @@ -499,6 +503,8 @@ static void parser_state_init(GumboParser* parser) { static void parser_state_destroy(GumboParser* parser) { GumboParserState* state = parser->_parser_state; + if (state->_fragment_ctx) + gumbo_destroy_node(state->_fragment_ctx); gumbo_vector_destroy(&state->_active_formatting_elements); gumbo_vector_destroy(&state->_open_elements); gumbo_vector_destroy(&state->_template_insertion_modes); @@ -510,6 +516,10 @@ static GumboNode* get_document_node(GumboParser* parser) { return parser->_output->document; } +static bool is_fragment_parser(const GumboParser *parser) { + return !!parser->_parser_state->_fragment_ctx; +} + // Returns the node at the bottom of the stack of open elements, or NULL if no // elements have been added yet. static GumboNode* get_current_node(GumboParser* parser) { @@ -523,6 +533,13 @@ static GumboNode* get_current_node(GumboParser* parser) { return open_elements->data[open_elements->length - 1]; } +static GumboNode* get_adjusted_current_node(GumboParser* parser) { + GumboParserState *state = parser->_parser_state; + if (state->_open_elements.length == 1 && state->_fragment_ctx) + return state->_fragment_ctx; + return get_current_node(parser); +} + // Returns true if the given needle is in the given array of literal // GumboStringPieces. 
If exact_match is true, this requires that they match // exactly; otherwise, this performs a prefix match to check if any of the @@ -552,7 +569,11 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* parser, int index) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; const GumboNode* node = open_elements->data[index]; - bool is_last = index == 0; + const bool is_last = index == 0; + + if (is_last && is_fragment_parser(parser)) + node = parser->_parser_state->_fragment_ctx; + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); switch (node->v.element.tag) { case GUMBO_TAG_SELECT: { @@ -572,8 +593,8 @@ static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* pars } case GUMBO_TAG_TD: case GUMBO_TAG_TH: - return is_last ? - GUMBO_INSERTION_MODE_INITIAL : GUMBO_INSERTION_MODE_IN_CELL; + if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; + break; case GUMBO_TAG_TR: return GUMBO_INSERTION_MODE_IN_ROW; case GUMBO_TAG_TBODY: @@ -589,9 +610,8 @@ static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* pars case GUMBO_TAG_TEMPLATE: return get_current_template_insertion_mode(parser); case GUMBO_TAG_HEAD: - // return is_last ? GUMBO_INSERTION_MODE_INITIAL : GUMBO_INSERTION_MODE_IN_HEAD; - return is_last ? - GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_HEAD; + if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; + break; case GUMBO_TAG_BODY: return GUMBO_INSERTION_MODE_IN_BODY; case GUMBO_TAG_FRAMESET: @@ -600,9 +620,10 @@ static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* pars return parser->_parser_state->_head_element ? GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD; default: - return is_last ? - GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; + break; } + return is_last ? 
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; } @@ -965,7 +986,8 @@ static GumboNode* create_element(GumboParser* parser, GumboTag tag) { element->tag_namespace = GUMBO_NAMESPACE_HTML; element->original_tag = kGumboEmptyString; element->original_end_tag = kGumboEmptyString; - element->start_pos = parser->_parser_state->_current_token->position; + element->start_pos = (parser->_parser_state->_current_token) ? + parser->_parser_state->_current_token->position : kGumboEmptySourcePosition; element->end_pos = kGumboEmptySourcePosition; return node; } @@ -3527,7 +3549,12 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) { ignore_token(parser); return false; } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { - // TODO(jdtang): Handle fragment parsing algorithm case. + /* fragment case: ignore the closing HTML token */ + if (is_fragment_parser(parser)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY); GumboNode* html = parser->_parser_state->_open_elements.data[0]; assert(node_html_tag_is(html, GUMBO_TAG_HTML)); @@ -3568,9 +3595,8 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) { return false; } pop_current_node(parser); - // TODO(jdtang): Add a condition to ignore this for the fragment parsing - // algorithm. 
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) { + if (!is_fragment_parser(parser) && + !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET); } return true; @@ -3744,18 +3770,32 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { token_has_attribute(token, "color") || token_has_attribute(token, "face") || token_has_attribute(token, "size")))) { + + /* Parse error */ parser_add_parse_error(parser, token); - do { - pop_current_node(parser); - } while(!(is_mathml_integration_point(get_current_node(parser)) || - is_html_integration_point(get_current_node(parser)) || - get_current_node(parser)->v.element.tag_namespace == - GUMBO_NAMESPACE_HTML)); - parser->_parser_state->_reprocess_current_token = true; - return false; - } else if (token->type == GUMBO_TOKEN_START_TAG) { + + /* + * Fragment case: If the parser was originally created for the HTML + * fragment parsing algorithm, then act as described in the "any other + * start tag" entry below. 
+ */ + if (!is_fragment_parser(parser)) { + do { + pop_current_node(parser); + } while(!(is_mathml_integration_point(get_current_node(parser)) || + is_html_integration_point(get_current_node(parser)) || + get_current_node(parser)->v.element.tag_namespace == + GUMBO_NAMESPACE_HTML)); + parser->_parser_state->_reprocess_current_token = true; + return false; + } + + assert(token->type == GUMBO_TOKEN_START_TAG); + } + + if (token->type == GUMBO_TOKEN_START_TAG) { const GumboNamespaceEnum current_namespace = - get_current_node(parser)->v.element.tag_namespace; + get_adjusted_current_node(parser)->v.element.tag_namespace; if (current_namespace == GUMBO_NAMESPACE_MATHML) { adjust_mathml_attributes(token); } @@ -3844,7 +3884,7 @@ static bool handle_token(GumboParser* parser, GumboToken* token) { parser->_parser_state->_closed_html_tag = true; } - const GumboNode* current_node = get_current_node(parser); + const GumboNode* current_node = get_adjusted_current_node(parser); assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE); @@ -3875,6 +3915,59 @@ static bool handle_token(GumboParser* parser, GumboToken* token) { } } +static void fragment_parser_init(GumboParser *parser, GumboTag fragment_ctx) { + GumboNode *root; + + assert(fragment_ctx != GUMBO_TAG_LAST); + + // 3 + parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx); + + // 4 + switch (fragment_ctx) { + case GUMBO_TAG_TITLE: + case GUMBO_TAG_TEXTAREA: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); + break; + + case GUMBO_TAG_STYLE: + case GUMBO_TAG_XMP: + case GUMBO_TAG_IFRAME: + case GUMBO_TAG_NOEMBED: + case GUMBO_TAG_NOFRAMES: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); + break; + + case GUMBO_TAG_SCRIPT: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); + break; + + case GUMBO_TAG_NOSCRIPT: + /* scripting is disabled in Gumbo, so leave the tokenizer + * in the default data state */ + break; + + case 
GUMBO_TAG_PLAINTEXT: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); + break; + + default: + /* default data state */ + break; + } + + // 5. 6. 7. + root = insert_element_of_tag_type(parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); + parser->_output->root = root; + + // 8. + if (fragment_ctx == GUMBO_TAG_TEMPLATE) + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); + + // 10. + reset_insertion_mode_appropriately(parser); +} + GumboOutput* gumbo_parse(const char* buffer) { return gumbo_parse_with_options( &kGumboDefaultOptions, buffer, strlen(buffer)); @@ -3882,12 +3975,21 @@ GumboOutput* gumbo_parse(const char* buffer) { GumboOutput* gumbo_parse_with_options( const GumboOptions* options, const char* buffer, size_t length) { + return gumbo_parse_fragment(options, buffer, length, GUMBO_TAG_LAST); +} + +GumboOutput* gumbo_parse_fragment( + const GumboOptions* options, const char* buffer, size_t length, + const GumboTag fragment_ctx) { GumboParser parser; parser._options = options; output_init(&parser); gumbo_tokenizer_state_init(&parser, buffer, length); parser_state_init(&parser); + if (fragment_ctx != GUMBO_TAG_LAST) + fragment_parser_init(&parser, fragment_ctx); + GumboParserState* state = parser._parser_state; gumbo_debug("Parsing %.*s.\n", length, buffer);