Commit: 2023.05.19 / post-review (#37)
* [WIP] 2023.05.19 (#35)

* chore: save html when running test_search script

* fix: snippet text and knowledge panel content

* chore: rename scripts to demo_ to avoid being detected by pytest

* chore: bump version

* chore: first pass at screenshot-style testing

* chore: bump requirements

* test: use syrupy

* chore: configure boulder, co snapshot

* chore: update top stories

* chore: add 'complementary results' to knowledge component

* chore: update perspectives result classifier

* docs: update README with testing info

* chore: update video classifying and parsing

* test: add new html files to test against

* chore: add another snapshot test to the tests file

* chore: configure snapshot testing for each html page

* test: commit issues in snapshots to git

* chore: update subelement parsing

* fix: double-counted element

* chore: lint

* docs: move testing section, add info for testing one file

* chore: clean up saving to html

* chore: nit, update readme

* chore: clean up 2d div flattening

* ci: add pytest github action

* chore: punt on triple news component

* chore: stop tracking html and json files

see #38
schwartzadev authored Jul 7, 2023
1 parent 3c8955e commit e22eeca
Showing 18 changed files with 186 additions and 24 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/CI.yml
@@ -0,0 +1,23 @@
name: Snapshot Testing

on: [push, pull_request]

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v3
with:
python-version: "3.11"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Install WebSearcher
run: |
pip install -e .
- name: Run pytest
run: |
pytest --snapshot-warn-unused
9 changes: 9 additions & 0 deletions .gitignore
@@ -8,3 +8,12 @@ build
dist
*.egg-info
*__pycache__

# ignores testing html and json files, see pr #38
tests/html_pages/*
tests/__snapshots__/*

# generated script files
test_response_save.html
test_results_save.json
test_serp_save.json
21 changes: 21 additions & 0 deletions README.md
@@ -33,6 +33,7 @@ optimized on mid-to-late 2020 data, and is also available as version `2020.0.0`.
- [Contributing](#contributing)
- [Repair or Enhance a Parser](#repair-or-enhance-a-parser)
- [Add a Parser](#add-a-parser)
- [Testing](#testing)
- [Recent Changes](#recent-changes)
- [Similar Packages](#similar-packages)
- [License](#license)
@@ -252,6 +253,26 @@ Coming next:
2. Add parser file in `/component_parsers` as `cmpt_name.py`, with function `parse_cmpt_name`.
3. Add import for `parse_cmpt_name` in `/component_parsers/__init__.py`

### Testing
Run tests:
```
pytest
```

Update snapshots:
```
pytest --snapshot-update
```

Running pytest with the `-vv` flag will show a diff of the snapshots that have changed:
```
pytest -vv
```

With the `-k` flag you can run a test for a specific html file:
```
pytest -k "1684837514.html"
```

---
## Recent Changes
2 changes: 1 addition & 1 deletion WebSearcher/__init__.py
@@ -1,4 +1,4 @@
__version__ = '2022.07.08-a'
__version__ = "2023.05.19"
from .searchers import SearchEngine
from .parsers import parse_serp, extract_components
from .locations import download_locations
8 changes: 6 additions & 2 deletions WebSearcher/component_classifier.py
@@ -44,9 +44,10 @@ def classify_type(cmpt):
cmpt_type = "general"

if cmpt.find("block-component"):
# this can also be a "related results box"
# Check for image card block
cmpt_type = "img_cards"

# Twitter subtype
if twitter or cmpt_type == "twitter":
cmpt_type = "twitter_cards" if carousel else "twitter_result"
@@ -114,6 +115,7 @@ def classify_header(cmpt, level):
'Map Results': 'map_results',
'People also ask': 'people_also_ask',
'Perspectives & opinions': 'perspectives',
'Perspectives': 'perspectives',
'Related searches': 'searches_related',
'Resultado de traducción': 'knowledge',
'Resultados de la Web': 'general',
@@ -124,7 +126,9 @@
'Unit Converter': 'knowledge',
'Weather Result': 'knowledge',
'Web Result with Site Links': 'general',
'Web results': 'general'
'Web results': 'general',
'Complementary Results': 'knowledge',
'Videos': 'videos',
}
elif level == 3:
header_dict = {
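The header dictionaries above map visible SERP section headings (including the newly added 'Complementary Results', 'Perspectives', and 'Videos' entries) to WebSearcher's internal component types. A minimal sketch of that kind of lookup; the `HEADER_TYPES` dict and `classify_by_header` helper below are illustrative names, not part of the package's API:

```python
# Illustrative only: mirrors the level-2 heading-to-type mapping shown above.
HEADER_TYPES = {
    'People also ask': 'people_also_ask',
    'Perspectives': 'perspectives',
    'Complementary Results': 'knowledge',
    'Videos': 'videos',
    'Web results': 'general',
}

def classify_by_header(header_text: str, default: str = 'unknown') -> str:
    """Return the component type for a visible heading, or a fallback label."""
    return HEADER_TYPES.get(header_text.strip(), default)

print(classify_by_header('Complementary Results'))  # knowledge
print(classify_by_header('Shopping results'))       # unknown
```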
28 changes: 25 additions & 3 deletions WebSearcher/component_parsers/general.py
@@ -17,8 +17,25 @@ def parse_general_results(cmpt):

# Legacy compatibility
subs = cmpt.find_all('div', {'class':'g'})
subs = subs if subs else [cmpt]


# 2023.05.09 - finds subs
if cmpt.find_all('div', {'class': 'd4rhi'}):
# a sub-element with class d4rhi exists;
# the first div child of the div.g is the first sub-element
first = cmpt.find('div')
additional = cmpt.find_all('div', {'class': 'd4rhi'})
subs = [first] + additional

# 2023.05.09 - handles duplicate .g tags within one component
if cmpt.find('div', {'class':'g'}):
parent_g = cmpt.find('div', {'class':'g'})
if parent_g.find_all('div', {'class':'g'}):
# this means that there is a .g element inside of another .g element,
# and it would otherwise get double-counted
# we just want to keep the parent .g element in this case
subs = [parent_g]
subs = subs if subs else [cmpt]

return [parse_general_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]

def parse_general_result(sub, sub_rank=0):
@@ -67,13 +84,18 @@ def parse_general_result(sub, sub_rank=0):
parsed['url'] = title_div.find('a')['href']

# Get snippet text
body = sub.find('span', {'class':'st'})
body = sub.find('span', {'class':'st'}) or sub.find('div', {'class': 'VwiC3b'})
if body:
if ' - ' in body.text[:20]:
split_body = body.text.split(' - ')
timestamp = split_body[0]
parsed['text'] = ' - '.join(split_body[1:])
parsed['timestamp'] = timestamp
elif ' \u2014 ' in body.text[:23]:
split_body = body.text.split(' \u2014 ')
timestamp = split_body[0]
parsed['text'] = ' \u2014 '.join(split_body[1:])
parsed['timestamp'] = timestamp
else:
parsed['text'] = body.text
parsed['timestamp'] = None
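The snippet change above splits a leading timestamp off the snippet text when Google joins them with either a hyphen or an em dash (`\u2014`). A standalone sketch of that splitting logic, using made-up snippet strings:

```python
# Standalone sketch of the timestamp/text split used for general result snippets.
# The sample strings are made up; real snippets come from the parsed HTML.
def split_snippet(text: str) -> dict:
    parsed = {'text': text, 'timestamp': None}
    for sep in (' - ', ' \u2014 '):  # hyphen first, then em dash
        # Only treat the separator as a date delimiter if it appears early,
        # mirroring the [:20] / [:23] prefix checks in parse_general_result.
        if sep in text[:23]:
            timestamp, *rest = text.split(sep)
            parsed['timestamp'] = timestamp
            parsed['text'] = sep.join(rest)
            break
    return parsed

print(split_snippet('May 19, 2023 \u2014 Example snippet text.'))
# {'text': 'Example snippet text.', 'timestamp': 'May 19, 2023'}
print(split_snippet('A snippet with no leading date.'))
# {'text': 'A snippet with no leading date.', 'timestamp': None}
```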
6 changes: 6 additions & 0 deletions WebSearcher/component_parsers/knowledge_rhs.py
@@ -52,6 +52,12 @@ def parse_knowledge_rhs_main(cmpt, sub_rank=0):
if description.parent.find('a') and 'href' in description.parent.find('a').attrs:
parsed['url'] = description.parent.find('a')['href']

description = cmpt.find('div', {'class':'kno-rdesc'})
if description:
parsed['text'] = description.find('span').text
if description.find('a') and 'href' in description.find('a').attrs:
parsed['url'] = description.find('a')['href']

# submenu
if description and description.parent:
alinks = description.parent.find_all('a')
24 changes: 14 additions & 10 deletions WebSearcher/component_parsers/top_stories.py
@@ -15,19 +15,23 @@ def parse_top_stories(cmpt, ctype='top_stories'):
Returns:
list : list of parsed subcomponent dictionaries
"""
# Known div structures
# Known div structures; builds a 2-d list of candidate divs
div_list = [
find_all_divs(cmpt, 'g-inner-card'),
find_children(cmpt, 'div', {'class':'qmv19b'}),
find_all_divs(cmpt, 'div', {'class':'MkXWrd'}), # quad
find_all_divs(cmpt, 'div', {'class':'JJZKK'}), # perspectives
[c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
find_children(cmpt, 'div', {'class': 'qmv19b'}),
# TODO: choose one of these strategies
# cmpt.select('div.Dnzdlc > div'), # triple
# [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
find_all_divs(cmpt, 'div', {'class': 'MkXWrd'}), # quad
find_all_divs(cmpt, 'div', {'class': 'JJZKK'}), # perspectives
]

# If any known div structures exist, parse subcomponents
for divs in filter(None, div_list):
return [parse_top_story(div, ctype, i) for i, div in enumerate(divs)]

# flatten 2d div list
subcomponent_divs = [div for divs in div_list for div in divs]

if subcomponent_divs:
return [parse_top_story(div, ctype, i) for i, div in enumerate(subcomponent_divs)]

return [{'type': ctype, 'sub_rank': 0, 'error': 'No subcomponents found'}]


@@ -40,7 +44,7 @@ def parse_top_story(sub, ctype, sub_rank=0):
Returns:
dict: A parsed subresult
"""
parsed = {'type':ctype, 'sub_rank':sub_rank}
parsed = {'type': ctype, 'sub_rank': sub_rank}

parsed['title'] = get_text(sub, 'a', separator=' | ')
parsed['url'] = get_link(sub, key='href')
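The rewritten `parse_top_stories` collects candidate divs from every known layout into a 2-d list and then flattens it before ranking subcomponents. A minimal sketch of that flatten-and-enumerate idiom, independent of the real selectors (the string placeholders stand in for parsed divs):

```python
# Sketch of the 2-d div flattening used in parse_top_stories.
# Each inner list stands in for one layout-specific selector's matches;
# empty lists (selectors that found nothing) simply contribute nothing.
div_list = [
    [],                      # e.g. find_all_divs(cmpt, 'g-inner-card')
    ['story_a', 'story_b'],  # e.g. the 'MkXWrd' quad layout
    ['story_c'],             # e.g. the 'JJZKK' perspectives layout
]

subcomponent_divs = [div for divs in div_list for div in divs]

parsed = [{'sub_rank': i, 'div': div} for i, div in enumerate(subcomponent_divs)]
print(parsed)
# [{'sub_rank': 0, 'div': 'story_a'}, {'sub_rank': 1, 'div': 'story_b'},
#  {'sub_rank': 2, 'div': 'story_c'}]
```

Returning empty collections from the selector helpers (see the `webutils` changes further down) is what makes this flattening safe without `None` checks.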
14 changes: 11 additions & 3 deletions WebSearcher/component_parsers/videos.py
@@ -19,6 +19,7 @@ def parse_videos(cmpt):
"""
subs = cmpt.find_all('g-inner-card')
subs = cmpt.find_all('div', {'class':'VibNM'}) if not subs else subs
subs = cmpt.find_all('div', {'class':'sI5x9c'}) if not subs else subs
return [parse_video(sub, sub_rank) for sub_rank, sub in enumerate(subs)]

def parse_video(sub, sub_rank=0):
@@ -31,10 +32,15 @@ def parse_video(sub, sub_rank=0):
dict : parsed subresult
"""
parsed = {'type':'videos', 'sub_rank':sub_rank}
parsed['url'] = sub.find('a')['href']

all_urls = sub.find_all('a')
# remove urls if they start with '#'
non_hash_urls = [url for url in all_urls if not url['href'].startswith('#')]
parsed['url'] = non_hash_urls[0]['href'] if non_hash_urls else None

parsed['title'] = sub.find('div', {'role':'heading'}).text

details = sub.find_all('div',{'class':'MjS0Lc'})
details = sub.find_all('div', {'class':'MjS0Lc'})
if details:
text_div, citetime_div = details
parsed['text'] = text_div.text if text_div else None
@@ -50,7 +56,9 @@
else:
parsed['cite'] = citetime[0].text
else:
parsed['cite'] = sub.find('span', {'class':'ocUPSd'}).text
cite_span = sub.find('span', {'class':'ocUPSd'})
parsed['cite'] = cite_span.text if cite_span else None

parsed['timestamp'] = get_div_text(sub, {'class':'rjmdhd'})

parsed['details'] = {}
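The video parser above now skips in-page anchor links (hrefs beginning with `#`) before choosing a result URL. A standalone sketch of that filter; the HTML fragment is invented for illustration:

```python
# Sketch of the non-hash URL filter added to parse_video.
from bs4 import BeautifulSoup

html = '''
<div>
  <a href="#fpstate">expand</a>
  <a href="https://www.youtube.com/watch?v=example">Example video</a>
</div>
'''
sub = BeautifulSoup(html, 'html.parser')

all_urls = sub.find_all('a')
non_hash_urls = [a for a in all_urls if not a['href'].startswith('#')]
url = non_hash_urls[0]['href'] if non_hash_urls else None
print(url)  # https://www.youtube.com/watch?v=example
```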
33 changes: 33 additions & 0 deletions WebSearcher/searchers.py
@@ -174,6 +174,21 @@ def search(self, qry, location='', serp_id=''):
self.snapshot()
self.handle_response()

def mock_search(self, html, qry='testing_query', location='', serp_id=''):
"""Conducts a mock search, where we pass the html to the method instead
of fetching it. For testing.
Args:
html (str): HTML content as string
qry (str, optional): The search query. Optional because this is for testing.
location (str, optional): A location's Canonical Name.
serp_id (str, optional): A unique identifier for this SERP
"""
self.prepare_url(qry, location=location)
self.serp_id = serp_id if serp_id else hash_id(qry + location)
self.timestamp = utc_stamp()
self.html = html

def unzip_html(self):
"""Unzip brotli zipped html
@@ -250,6 +265,24 @@ def save_results(self, save_dir='.', append_to=False):
utils.write_lines(self.results, fp)
else:
self.log.info(f'No parsed results for serp_id {self.serp_id}')

def save_response_as_html(self, filename=None, save_dir='.'):
"""Save response text as html
Args:
filename (str, optional): Filename to save as; defaults to `response_{timestamp}.html`
save_dir (str, optional): Directory to save to, defaults to current directory
"""
if not filename:
filename = f'response_{datetime.now().strftime("%Y%m%d%H%M%S")}.html'

# Save response text
if self.response.text:
with open(os.path.join(save_dir, filename), 'w') as outfile:
outfile.write(self.response.text)
else:
self.log.info(f'No response text for serp_id {self.serp_id}')


def scrape_results_html(self, save_dir='.', append_to=''):
"""Scrape and save all unique, non-internal URLs parsed from the SERP
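Together, the new `mock_search` and `save_response_as_html` methods let the snapshot tests and the demo script exercise the full parse pipeline from saved HTML without any network request. A usage sketch; the file path is hypothetical:

```python
# Usage sketch for the new offline-testing flow; the path is made up.
import WebSearcher as ws

with open('tests/html_pages/example.html') as f:
    html = f.read()

se = ws.SearchEngine()
se.mock_search(html, qry='testing_query')  # sets serp_id, timestamp, and html; nothing is fetched
se.parse_results()                         # same parsing path as a live search
print(len(se.results))
```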
6 changes: 3 additions & 3 deletions WebSearcher/webutils.py
@@ -83,7 +83,7 @@ def check_dict_value(d, key, value):
def get_link(soup, kwargs=None, key='href'):
"""Utility for `soup.find('a')['href']` with null key handling"""
link = get_div(soup, 'a', kwargs)
return link.attrs[key] if link.attrs and key in link.attrs else None
return link.attrs.get(key, None) if link else None


def get_div(soup, name, attrs=None):
@@ -96,13 +96,13 @@ def find_all_divs(soup, name, attr=None, filter_empty=True):
if filter_empty:
divs = [c for c in divs if c]
divs = [c for c in divs if c.text != '']
return divs if divs else None
return divs


def find_children(soup, name, attr=None):
"""Find all children of a div with a given name and attribute"""
div = get_div(soup, name, attr)
return div.children if div else None
return div.children if div else []


def get_text(soup, name=None, kwargs=None, separator=" "):
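The `webutils` tweaks above make the helpers degrade gracefully: `find_all_divs` and `find_children` now return empty collections instead of `None`, and `get_link` reads the attribute with `attrs.get`. A small sketch of the attribute-safe behaviour, using an invented HTML fragment:

```python
# Sketch of null-safe attribute access in the spirit of get_link.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><a name="no-href-here">anchor</a></div>', 'html.parser')

link = soup.find('a')
href = link.attrs.get('href') if link else None
print(href)  # None -- the <a> exists but has no href, and no exception is raised
```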
2 changes: 2 additions & 0 deletions requirements.txt
@@ -13,3 +13,5 @@ six==1.16.0
soupsieve==2.2.1
tldextract==3.1.0
urllib3==1.26.5
pytest==7.3.1
syrupy==4.0.2
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion scripts/test_search.py → scripts/demo_search.py
@@ -30,5 +30,6 @@
try:
se.save_serp(append_to='test_serp_save.json')
se.save_results(append_to='test_results_save.json')
se.save_response_as_html()
except Exception as e:
print('Save error', e)
print('Save error', e)
File renamed without changes.
2 changes: 1 addition & 1 deletion setup.py
@@ -14,7 +14,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2):

setuptools.setup(
name='WebSearcher',
version='2022.07.08-a',
version='2023.05.19',
url='http://github.com/gitronald/WebSearcher',
author='Ronald E. Robertson',
author_email='[email protected]',
29 changes: 29 additions & 0 deletions tests/test_serp_generation.py
@@ -0,0 +1,29 @@
import pytest
import glob
import WebSearcher as ws

from syrupy.extensions.json import JSONSnapshotExtension

@pytest.fixture
def snapshot_json(snapshot):
return snapshot.use_extension(JSONSnapshotExtension)

def pytest_generate_tests(metafunc):
file_list = glob.glob('./tests/html_pages/*.html')
metafunc.parametrize("file_name", file_list)

def test_parsing(snapshot_json, file_name):
# read html
with open(file_name) as file:
html = file.read()

# Initialize crawler
se = ws.SearchEngine()

# Conduct Search
se.mock_search(html)

# Parse Results
se.parse_results()

assert se.results == snapshot_json
