Skip to content
This repository has been archived by the owner on Feb 15, 2023. It is now read-only.

gumbo-next #295

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
0340cad
Add a token type for CDATA.
nostrademons Nov 9, 2014
f9a515f
Add a state flag for whether the tokenizer is in a cdata section, and…
nostrademons Nov 9, 2014
58d5fad
Add CDATA handling to parser, including a test for it.
nostrademons Nov 10, 2014
fa3a71d
Add test for CDATA sections not in foreign content.
nostrademons Nov 10, 2014
2b804fa
Fix a couple comment issues (line-wrapping, unfinished comments) in u…
nostrademons Nov 10, 2014
8b867b4
Print the decimal value of the current character in the debug output …
nostrademons Nov 10, 2014
3f6012a
Add test for unsafe cdata.
nostrademons Dec 15, 2014
fe28c18
Fix missing case statement for GUMBO_TOKEN_CDATA in handle_parser_err…
nostrademons Feb 11, 2015
b6c9617
Additional debugging instructions.
nostrademons Feb 11, 2015
adc4c76
Add a test for utf8iterator_maybe_consume_match followed by a null.
nostrademons Feb 17, 2015
29f48f2
Update parser and tokenizer tests with testcases for null CDATA, and …
nostrademons Feb 17, 2015
7fea4b5
Fix handling of nulls in CDATA sections.
nostrademons Feb 17, 2015
4383a40
First pass at getting template changes on top of new master
kevinhendricks Feb 14, 2015
d8f369d
Update python interface for template changes
kevinhendricks Feb 14, 2015
975cfcf
Add in template parser.cc tests and fixes for parser.c
kevinhendricks Feb 14, 2015
ac84d02
Recognize templates in serialize and prettyprint
kevinhendricks Feb 14, 2015
ed9c9e5
Add in - Fix additional html5lib tests #291
kevinhendricks Feb 14, 2015
4d1efca
Fix template not handled to spec in handle_in_table
kevinhendricks Feb 14, 2015
61fc188
Fix bug in handle_in_template to meet spec
kevinhendricks Feb 14, 2015
f236a8c
Add in require rtc tag
kevinhendricks Feb 15, 2015
7d433e0
Fix bug in reset appropriate insertion mode for select
kevinhendricks Feb 15, 2015
befeb12
Merge in implementation of get_appropriate_insertion_location
kevinhendricks Feb 15, 2015
a2f9e41
Add get_appropriate_insertion_location to reconstruct active formatti…
kevinhendricks Feb 15, 2015
723a5f7
In body properly handle html tag when template exists
kevinhendricks Feb 15, 2015
57bce0f
Spec Fixes handle_in_column_group
kevinhendricks Feb 15, 2015
328c9e1
Fix handling of EOF token in handle_in_table to be spec
kevinhendricks Feb 15, 2015
49a5194
Fix EOF token handling to meet spec in handle_in_select
kevinhendricks Feb 16, 2015
d24c9d4
memory: Simplify the memory allocator implementation
vmg Feb 16, 2015
c34e2d9
tags: Use a perfect hash for lookups
vmg Feb 16, 2015
4d8ae0b
parser: Simplify the `element_in_specific_scope` calls
vmg Feb 16, 2015
72a2be1
parser: Implement fragment parsing
vmg Feb 16, 2015
d59e569
parser: Enable these SVG attribute replacements
vmg Feb 16, 2015
2df0efc
travis: Use GTest 1.7.0
vmg Feb 16, 2015
ee05f9f
Fix compilation in Mac OS X
vmg Feb 17, 2015
a87add3
tags: Automatically generate tag data
vmg Feb 17, 2015
62fd3e2
tokenizer: Refactor ASCII-only helpers
vmg Feb 17, 2015
b6dcb36
parser: Export create_node
vmg Feb 17, 2015
37479c5
attribute: Export the attribute helpers
vmg Feb 17, 2015
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ os:
- osx

install:
- wget 'https://googletest.googlecode.com/files/gtest-1.6.0.zip'
- unzip gtest-1.6.0.zip
- ln -s gtest-1.6.0 gtest
- wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip'
- unzip gtest-1.7.0.zip
- ln -s gtest-1.7.0 gtest
- sudo pip install BeautifulSoup
- sudo pip install html5lib==0.95
- ln -s `python -c 'import html5lib, os; print os.path.dirname(html5lib.__file__)'`/tests/testdata .
Expand Down
3 changes: 3 additions & 0 deletions DEBUGGING.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core

The same goes for core dumps in other example binaries.

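To run only a single unit test, pass the --gtest_filter='TestCaseName.TestName'
flag to the lt-gumbo_test binary. The filter is matched against the full test
name and accepts '*' wildcards (for example, --gtest_filter='*.TestName').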

Assertions
==========

Expand Down
11 changes: 11 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ clean-local:

endif !HAVE_SHARED_LIBGTEST

src/tag_strings.h: src/tag.in
@sed 's/\(.*\)/"\1",/g' <$< >$@

src/tag_enum.h: src/tag.in
@sed 's/\(.*\)/GUMBO_TAG_\U\1,/g;s/-/_/g' <$< >$@

python/gumbo/gumboc_tags.py: src/tag.in
@sed -e '1i TagNames = [' -e 's/\(.*\)/\t"\U\1",/g' -e 's/-/_/g' -e "\$$a]" <$< >$@

lib_LTLIBRARIES = libgumbo.la
libgumbo_la_CFLAGS = -Wall
libgumbo_la_LDFLAGS = -version-info 1:0:0 -no-undefined
Expand All @@ -55,6 +64,8 @@ libgumbo_la_SOURCES = \
src/string_piece.c \
src/string_piece.h \
src/tag.c \
src/tag_enum.h \
src/tag_strings.h \
src/token_type.h \
src/tokenizer.c \
src/tokenizer.h \
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ int main(int argc, char** argv) {
clock_t start_time = clock();
for (int i = 0; i < kNumReps; ++i) {
GumboOutput* output = gumbo_parse(contents.c_str());
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
}
clock_t end_time = clock();
std::cout << filename << ": "
Expand Down
2 changes: 1 addition & 1 deletion examples/clean_text.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,5 @@ int main(int argc, char** argv) {

GumboOutput* output = gumbo_parse(contents.c_str());
std::cout << cleantext(output->root) << std::endl;
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
}
2 changes: 1 addition & 1 deletion examples/find_links.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,5 @@ int main(int argc, char** argv) {

GumboOutput* output = gumbo_parse(contents.c_str());
search_for_links(output->root);
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
}
2 changes: 1 addition & 1 deletion examples/get_title.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,6 @@ int main(int argc, const char** argv) {
&kGumboDefaultOptions, input, input_length);
const char* title = find_title(output->root);
printf("%s\n", title);
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
free(input);
}
2 changes: 1 addition & 1 deletion examples/positions_of_class.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ int main(int argc, char** argv) {
GumboOutput* output = gumbo_parse_with_options(
&kGumboDefaultOptions, contents.data(), contents.length());
search_for_class(output->root, contents, cls);
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
}
4 changes: 2 additions & 2 deletions examples/prettyprint.cc
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ static std::string prettyprint_contents(GumboNode* node, int lvl, const std::str
contents.append(val);


} else if (child->type == GUMBO_NODE_ELEMENT) {
} else if ((child->type == GUMBO_NODE_ELEMENT) || (child->type == GUMBO_NODE_TEMPLATE)) {

std::string val = prettyprint(child, lvl, indent_chars);

Expand Down Expand Up @@ -351,5 +351,5 @@ int main(int argc, char** argv) {
GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length());
std::string indent_chars = " ";
std::cout << prettyprint(output->document, 0, indent_chars) << std::endl;
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
}
4 changes: 2 additions & 2 deletions examples/serialize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ static std::string serialize_contents(GumboNode* node) {
contents.append(substitute_xml_entities_into_text(std::string(child->v.text.text)));
}

} else if (child->type == GUMBO_NODE_ELEMENT) {
} else if (child->type == GUMBO_NODE_ELEMENT || child->type == GUMBO_NODE_TEMPLATE) {
contents.append(serialize(child));

} else if (child->type == GUMBO_NODE_WHITESPACE) {
Expand Down Expand Up @@ -283,5 +283,5 @@ int main(int argc, char** argv) {

GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length());
std::cout << serialize(output->document) << std::endl;
gumbo_destroy_output(&kGumboDefaultOptions, output);
gumbo_destroy_output(output);
}
185 changes: 22 additions & 163 deletions python/gumbo/gumboc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import contextlib
import ctypes
import os.path
import gumboc_tags

_name_of_lib = 'libgumbo.so'
if sys.platform.startswith('darwin'):
Expand Down Expand Up @@ -246,158 +247,12 @@ def to_url(self):


class Tag(Enum):
_values_ = [
'HTML',
'HEAD',
'TITLE',
'BASE',
'LINK',
'META',
'STYLE',
'SCRIPT',
'NOSCRIPT',
'TEMPLATE',
'BODY',
'ARTICLE',
'SECTION',
'NAV',
'ASIDE',
'H1',
'H2',
'H3',
'H4',
'H5',
'H6',
'HGROUP',
'HEADER',
'FOOTER',
'ADDRESS',
'P',
'HR',
'PRE',
'BLOCKQUOTE',
'OL',
'UL',
'LI',
'DL',
'DT',
'DD',
'FIGURE',
'FIGCAPTION',
'MAIN',
'DIV',
'A',
'EM',
'STRONG',
'SMALL',
'S',
'CITE',
'Q',
'DFN',
'ABBR',
'DATA',
'TIME',
'CODE',
'VAR',
'SAMP',
'KBD',
'SUB',
'SUP',
'I',
'B',
'U',
'MARK',
'RUBY',
'RT',
'RP',
'BDI',
'BDO',
'SPAN',
'BR',
'WBR',
'INS',
'DEL',
'IMAGE',
'IMG',
'IFRAME',
'EMBED',
'OBJECT',
'PARAM',
'VIDEO',
'AUDIO',
'SOURCE',
'TRACK',
'CANVAS',
'MAP',
'AREA',
'MATH',
'MI',
'MO',
'MN',
'MS',
'MTEXT',
'MGLYPH',
'MALIGNMARK',
'ANNOTATION_XML',
'SVG',
'FOREIGNOBJECT',
'DESC',
'TABLE',
'CAPTION',
'COLGROUP',
'COL',
'TBODY',
'THEAD',
'TFOOT',
'TR',
'TD',
'TH',
'FORM',
'FIELDSET',
'LEGEND',
'LABEL',
'INPUT',
'BUTTON',
'SELECT',
'DATALIST',
'OPTGROUP',
'OPTION',
'TEXTAREA',
'KEYGEN',
'OUTPUT',
'PROGRESS',
'METER',
'DETAILS',
'SUMMARY',
'MENU',
'MENUITEM',
'APPLET',
'ACRONYM',
'BGSOUND',
'DIR',
'FRAME',
'FRAMESET',
'NOFRAMES',
'ISINDEX',
'LISTING',
'XMP',
'NEXTID',
'NOEMBED',
'PLAINTEXT',
'RB',
'STRIKE',
'BASEFONT',
'BIG',
'BLINK',
'CENTER',
'FONT',
'MARQUEE',
'MULTICOL',
'NOBR',
'SPACER',
'TT',
'UNKNOWN',
]
@staticmethod
def from_str(tagname):
text_ptr = ctypes.c_char_p(tagname.encode('utf-8'))
return _tag_enum(text_ptr)

_values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']


class Element(ctypes.Structure):
Expand Down Expand Up @@ -444,7 +299,8 @@ def __repr__(self):


class NodeType(Enum):
_values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA', 'COMMENT', 'WHITESPACE']
_values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA',
'COMMENT', 'WHITESPACE', 'TEMPLATE']


class NodeUnion(ctypes.Union):
Expand All @@ -463,7 +319,7 @@ def _contents(self):
# __getattr__, so we factor it out to a helper.
if self.type == NodeType.DOCUMENT:
return self.v.document
elif self.type == NodeType.ELEMENT:
elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE):
return self.v.element
else:
return self.v.text
Expand Down Expand Up @@ -496,11 +352,6 @@ def __repr__(self):

class Options(ctypes.Structure):
_fields_ = [
# TODO(jdtang): Allow the Python API to set the allocator/deallocator
# function. Right now these are treated as opaque void pointers.
('allocator', ctypes.c_void_p),
('deallocator', ctypes.c_void_p),
('userdata', ctypes.c_void_p),
('tab_stop', ctypes.c_int),
('stop_on_first_error', ctypes.c_bool),
('max_errors', ctypes.c_int),
Expand All @@ -515,10 +366,10 @@ class Output(ctypes.Structure):
('errors', Vector),
]


@contextlib.contextmanager
def parse(text, **kwargs):
options = Options()
container = kwargs.get("inner_html", Tag.LAST)
for field_name, _ in Options._fields_:
try:
setattr(options, field_name, kwargs[field_name])
Expand All @@ -529,18 +380,22 @@ def parse(text, **kwargs):
# call, it creates a temporary buffer which is destroyed when the call
# completes, and then the original_text pointers point into invalid memory.
text_ptr = ctypes.c_char_p(text.encode('utf-8'))
output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
output = _parse_fragment(ctypes.byref(options), text_ptr, len(text), container)
try:
yield output
finally:
_destroy_output(ctypes.byref(options), output)
_destroy_output(output)

_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')

_parse_with_options = _dll.gumbo_parse_with_options
_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
_parse_with_options.restype = _Ptr(Output)

_parse_fragment = _dll.gumbo_parse_fragment
_parse_fragment.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag]
_parse_fragment.restype = _Ptr(Output)

_tag_from_original_text = _dll.gumbo_tag_from_original_text
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
_tag_from_original_text.restype = None
Expand All @@ -550,13 +405,17 @@ def parse(text, **kwargs):
_normalize_svg_tagname.restype = ctypes.c_char_p

_destroy_output = _dll.gumbo_destroy_output
_destroy_output.argtypes = [_Ptr(Options), _Ptr(Output)]
_destroy_output.argtypes = [_Ptr(Output)]
_destroy_output.restype = None

_tagname = _dll.gumbo_normalized_tagname
_tagname.argtypes = [Tag]
_tagname.restype = ctypes.c_char_p

_tag_enum = _dll.gumbo_tag_enum
_tag_enum.argtypes = [ctypes.c_char_p]
_tag_enum.restype = Tag

__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
Expand Down
Loading