Merge branch 'master' into feat/coma-separated-version

algolia · Sep 3, 2020 · fd99732 · fd99732
2 parents d175070 + bf87105
commit fd99732
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 139 deletions.
diff --git a/scraper/src/tests/default_strategy/get_records_from_dom_test.py b/scraper/src/tests/default_strategy/get_records_from_dom_test.py
@@ -133,14 +133,7 @@ def test_selector_contains_elements(self):
 
     def test_text_with_only_three_levels(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                'lvl0': 'h1',
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'text': 'p'
-            }
-        })
+        strategy = get_strategy()
 
         strategy.dom = lxml.html.fromstring("""
         <html><body>
@@ -164,12 +157,6 @@ def test_text_with_only_three_levels(self):
     def test_backward_compatibility_selectors(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "text": "p"
-            },
             'strip_chars': ',.'
         })
 
@@ -371,12 +358,6 @@ def test_keep_tags(self):
     def test_stop_content(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            },
             'start_urls': [
                 'http://test.com/docs/guides'
             ],
@@ -403,12 +384,6 @@ def test_stop_content(self):
     def test_selectors_exclude_tail(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "text": "p"
-            },
             'selectors_exclude': ['.test'],
             'start_urls': [
                 'http://test.com/docs/guides'
@@ -436,12 +411,6 @@ def test_selectors_exclude_tail(self):
     def test_selectors_exclude_tail2(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "text": "p"
-            },
             'selectors_exclude': ['.test'],
             'start_urls': [
                 'http://test.com/docs/guides'
@@ -496,12 +465,6 @@ def test_objectID(self):
     def test_current_level(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            },
             'start_urls': [
                 'http://test.com/docs/guides'
             ]
@@ -523,3 +486,22 @@ def test_current_level(self):
 
         # Then
         assert len(actual) == 3
+
+    def test_text_with_empty_content(self):
+        # Given
+        strategy = get_strategy()
+
+        strategy.dom = lxml.html.fromstring("""
+            <html><body>
+                <h1>Foo</h1>
+                <h2>Bar</h2>
+                <h3>Baz</h3>
+                <p></p>
+            </body></html>
+            """)
+
+        # When
+        actual = strategy.get_records_from_dom()
+
+        # Then
+        assert len(actual) == 3
diff --git a/scraper/src/tests/default_strategy/meta_test.py b/scraper/src/tests/default_strategy/meta_test.py
@@ -6,14 +6,7 @@
 class TestMeta:
     def test_meta_number(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                'lvl0': "h1",
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'content': 'p'
-            }
-        })
+        strategy = get_strategy()
         strategy.dom = lxml.html.fromstring("""
         <html>
             <header>
@@ -41,14 +34,7 @@ def test_meta_number(self):
 
     def test_meta_json_without_content(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                'lvl0': "h1",
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'content': 'p'
-            }
-        })
+        strategy = get_strategy()
         strategy.dom = lxml.html.fromstring("""
            <html>
                <header>
@@ -76,14 +62,7 @@ def test_meta_json_without_content(self):
 
     def test_meta_json(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                'lvl0': "h1",
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'content': 'p'
-            }
-        })
+        strategy = get_strategy()
         strategy.dom = lxml.html.fromstring("""
            <html>
                <header>
@@ -123,14 +102,7 @@ def test_meta_json(self):
 
     def test_meta_version(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                'lvl0': "h1",
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'content': 'p'
-            }
-        })
+        strategy = get_strategy()
         strategy.dom = lxml.html.fromstring("""
         <html>
             <header>
@@ -221,14 +193,7 @@ def test_meta_decimal_version(self):
 
     def test_meta_escaped_string(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                'lvl0': "h1",
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'content': 'p'
-            }
-        })
+        strategy = get_strategy()
         strategy.dom = lxml.html.fromstring("""
         <html>
             <header>

diff --git a/scraper/src/tests/default_strategy/min_indexed_level_test.py b/scraper/src/tests/default_strategy/min_indexed_level_test.py
@@ -8,12 +8,6 @@ def test_test_default_value_with_global(self):
         """ Should be able to not index the n first levels """
         # Given
         strategy = get_strategy({
-            'selectors': {
-                'lvl0': 'h1',
-                'lvl1': 'h2',
-                'lvl2': 'h3',
-                'content': 'p',
-            },
             'min_indexed_level': 2
         })
 

diff --git a/scraper/src/tests/default_strategy/page_rank_test.py b/scraper/src/tests/default_strategy/page_rank_test.py
@@ -6,14 +6,7 @@
 class TestPageRank:
     def test_default_page_rank_should_be_zero(self):
         # Given
-        strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            }
-        })
+        strategy = get_strategy()
 
         strategy.dom = lxml.html.fromstring("""
         <html><body>
@@ -33,13 +26,7 @@ def test_positive_page_rank(self):
             'start_urls': [{
                 'url': 'http://foo.bar/api',
                 'page_rank': 1
-            }],
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            }
+            }]
         })
 
         strategy.dom = lxml.html.fromstring("""
@@ -60,13 +47,7 @@ def test_positive_sub_page_page_rank(self):
             'start_urls': [{
                 'url': 'http://foo.bar/api',
                 'page_rank': 1
-            }],
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            }
+            }]
         })
 
         strategy.dom = lxml.html.fromstring("""
@@ -87,13 +68,7 @@ def test_negative_page_rank(self):
             'start_urls': [{
                 'url': 'http://foo.bar/api',
                 'page_rank': -1
-            }],
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            }
+            }]
         })
 
         strategy.dom = lxml.html.fromstring("""

diff --git a/scraper/src/tests/default_strategy/strip_chars_test.py b/scraper/src/tests/default_strategy/strip_chars_test.py
@@ -7,12 +7,6 @@ class TestGetRecordsFromDomWithStripChars:
     def test_strip_chars(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            },
             'strip_chars': ',.'
         })
         strategy.dom = lxml.html.fromstring("""

diff --git a/scraper/src/tests/default_strategy/tags_test.py b/scraper/src/tests/default_strategy/tags_test.py
@@ -10,13 +10,7 @@ def test_adding_tags_for_page(self):
             'start_urls': [{
                 'url': 'http://foo.bar/api',
                 'tags': ["test"]
-            }],
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            }
+            }]
         })
 
         strategy.dom = lxml.html.fromstring("""
@@ -34,12 +28,6 @@ def test_adding_tags_for_page(self):
     def test_adding_tags_for_subpage(self):
         # Given
         strategy = get_strategy({
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            },
             'start_urls': [{
                 'url': 'http://foo.bar/api',
                 'tags': ["test"]
@@ -67,13 +55,7 @@ def test_regex_start_urls(self):
                     'url': 'http://foo.bar/.*',
                     'tags': ["test"]
                 }
-            ],
-            'selectors': {
-                "lvl0": "h1",
-                "lvl1": "h2",
-                "lvl2": "h3",
-                "content": "p"
-            }
+            ]
         })
 
         strategy.dom = lxml.html.fromstring("""