Skip to content

Commit

Permalink
Merge branch 'master' into feat/coma-separated-version
Browse files Browse the repository at this point in the history
  • Loading branch information
Sylvain Pace authored Sep 3, 2020
2 parents d175070 + bf87105 commit fd99732
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 139 deletions.
58 changes: 20 additions & 38 deletions scraper/src/tests/default_strategy/get_records_from_dom_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,14 +133,7 @@ def test_selector_contains_elements(self):

def test_text_with_only_three_levels(self):
# Given
strategy = get_strategy({
'selectors': {
'lvl0': 'h1',
'lvl1': 'h2',
'lvl2': 'h3',
'text': 'p'
}
})
strategy = get_strategy()

strategy.dom = lxml.html.fromstring("""
<html><body>
Expand All @@ -164,12 +157,6 @@ def test_text_with_only_three_levels(self):
def test_backward_compatibility_selectors(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"text": "p"
},
'strip_chars': ',.'
})

Expand Down Expand Up @@ -371,12 +358,6 @@ def test_keep_tags(self):
def test_stop_content(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
},
'start_urls': [
'http://test.com/docs/guides'
],
Expand All @@ -403,12 +384,6 @@ def test_stop_content(self):
def test_selectors_exclude_tail(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"text": "p"
},
'selectors_exclude': ['.test'],
'start_urls': [
'http://test.com/docs/guides'
Expand Down Expand Up @@ -436,12 +411,6 @@ def test_selectors_exclude_tail(self):
def test_selectors_exclude_tail2(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"text": "p"
},
'selectors_exclude': ['.test'],
'start_urls': [
'http://test.com/docs/guides'
Expand Down Expand Up @@ -496,12 +465,6 @@ def test_objectID(self):
def test_current_level(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
},
'start_urls': [
'http://test.com/docs/guides'
]
Expand All @@ -523,3 +486,22 @@ def test_current_level(self):

# Then
assert len(actual) == 3

def test_text_with_empty_content(self):
# Purpose: build a DOM with one h1, one h2, one h3 and a single empty <p>,
# run get_records_from_dom() on it, and check how many records come back.
# NOTE(review): the three records presumably correspond to the three
# headings, with the empty paragraph adding none — confirm against the
# default selectors used by get_strategy().
# Given
# No config overrides are passed, so the strategy uses get_strategy()'s defaults.
strategy = get_strategy()

strategy.dom = lxml.html.fromstring("""
<html><body>
<h1>Foo</h1>
<h2>Bar</h2>
<h3>Baz</h3>
<p></p>
</body></html>
""")

# When
actual = strategy.get_records_from_dom()

# Then
# Exactly three records are expected for this DOM.
assert len(actual) == 3
45 changes: 5 additions & 40 deletions scraper/src/tests/default_strategy/meta_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,7 @@
class TestMeta:
def test_meta_number(self):
# Given
strategy = get_strategy({
'selectors': {
'lvl0': "h1",
'lvl1': 'h2',
'lvl2': 'h3',
'content': 'p'
}
})
strategy = get_strategy()
strategy.dom = lxml.html.fromstring("""
<html>
<header>
Expand Down Expand Up @@ -41,14 +34,7 @@ def test_meta_number(self):

def test_meta_json_without_content(self):
# Given
strategy = get_strategy({
'selectors': {
'lvl0': "h1",
'lvl1': 'h2',
'lvl2': 'h3',
'content': 'p'
}
})
strategy = get_strategy()
strategy.dom = lxml.html.fromstring("""
<html>
<header>
Expand Down Expand Up @@ -76,14 +62,7 @@ def test_meta_json_without_content(self):

def test_meta_json(self):
# Given
strategy = get_strategy({
'selectors': {
'lvl0': "h1",
'lvl1': 'h2',
'lvl2': 'h3',
'content': 'p'
}
})
strategy = get_strategy()
strategy.dom = lxml.html.fromstring("""
<html>
<header>
Expand Down Expand Up @@ -123,14 +102,7 @@ def test_meta_json(self):

def test_meta_version(self):
# Given
strategy = get_strategy({
'selectors': {
'lvl0': "h1",
'lvl1': 'h2',
'lvl2': 'h3',
'content': 'p'
}
})
strategy = get_strategy()
strategy.dom = lxml.html.fromstring("""
<html>
<header>
Expand Down Expand Up @@ -221,14 +193,7 @@ def test_meta_decimal_version(self):

def test_meta_escaped_string(self):
# Given
strategy = get_strategy({
'selectors': {
'lvl0': "h1",
'lvl1': 'h2',
'lvl2': 'h3',
'content': 'p'
}
})
strategy = get_strategy()
strategy.dom = lxml.html.fromstring("""
<html>
<header>
Expand Down
6 changes: 0 additions & 6 deletions scraper/src/tests/default_strategy/min_indexed_level_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@ def test_test_default_value_with_global(self):
""" Should be able to not index the n first levels """
# Given
strategy = get_strategy({
'selectors': {
'lvl0': 'h1',
'lvl1': 'h2',
'lvl2': 'h3',
'content': 'p',
},
'min_indexed_level': 2
})

Expand Down
33 changes: 4 additions & 29 deletions scraper/src/tests/default_strategy/page_rank_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,7 @@
class TestPageRank:
def test_default_page_rank_should_be_zero(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
}
})
strategy = get_strategy()

strategy.dom = lxml.html.fromstring("""
<html><body>
Expand All @@ -33,13 +26,7 @@ def test_positive_page_rank(self):
'start_urls': [{
'url': 'http://foo.bar/api',
'page_rank': 1
}],
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
}
}]
})

strategy.dom = lxml.html.fromstring("""
Expand All @@ -60,13 +47,7 @@ def test_positive_sub_page_page_rank(self):
'start_urls': [{
'url': 'http://foo.bar/api',
'page_rank': 1
}],
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
}
}]
})

strategy.dom = lxml.html.fromstring("""
Expand All @@ -87,13 +68,7 @@ def test_negative_page_rank(self):
'start_urls': [{
'url': 'http://foo.bar/api',
'page_rank': -1
}],
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
}
}]
})

strategy.dom = lxml.html.fromstring("""
Expand Down
6 changes: 0 additions & 6 deletions scraper/src/tests/default_strategy/strip_chars_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,6 @@ class TestGetRecordsFromDomWithStripChars:
def test_strip_chars(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
},
'strip_chars': ',.'
})
strategy.dom = lxml.html.fromstring("""
Expand Down
22 changes: 2 additions & 20 deletions scraper/src/tests/default_strategy/tags_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,7 @@ def test_adding_tags_for_page(self):
'start_urls': [{
'url': 'http://foo.bar/api',
'tags': ["test"]
}],
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
}
}]
})

strategy.dom = lxml.html.fromstring("""
Expand All @@ -34,12 +28,6 @@ def test_adding_tags_for_page(self):
def test_adding_tags_for_subpage(self):
# Given
strategy = get_strategy({
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
},
'start_urls': [{
'url': 'http://foo.bar/api',
'tags': ["test"]
Expand Down Expand Up @@ -67,13 +55,7 @@ def test_regex_start_urls(self):
'url': 'http://foo.bar/.*',
'tags': ["test"]
}
],
'selectors': {
"lvl0": "h1",
"lvl1": "h2",
"lvl2": "h3",
"content": "p"
}
]
})

strategy.dom = lxml.html.fromstring("""
Expand Down

0 comments on commit fd99732

Please sign in to comment.