From e22eeca3f8d1c8f8dfb95733d743966fc063e8ff Mon Sep 17 00:00:00 2001
From: Andrew Schwartz
Date: Fri, 7 Jul 2023 17:29:40 +0200
Subject: [PATCH] 2023.05.19 / post-review (#37)

* [WIP] 2023.05.19 (#35)

* chore: save html when running test_search script
* fix: snippet text and knowledge panel content
* chore: rename scripts to demo_ to avoid being detected by pytest
* chore: bump version
* chore: first pass at screenshot-style testing
* chore: bump requirements
* test: use syrupy
* chore: configure boulder, co snapshot
* chore: update top stories
* chore: add 'complementary results' to knowledge component
* chore: update perspectives result classifier
* docs: update README with testing info
* chore: update video classifying and parsing
* test: add new html files to test against
* chore: add another snapshot test to the tests file
* chore: configure snapshot testing for each html page
* test: commit issues in snapshots to git
* chore: update subelement parsing
* fix: double-counted element
* chore: lint
* docs: move testing section, add info for testing one file
* chore: clean up saving to html
* chore: nit, update readme
* chore: clean up 2d div flattening
* ci: add pytest github action
* chore: punt on triple news component
* chore: stop tracking html and json files

see #38
---
 .github/workflows/CI.yml                      | 23 +++++++++++++
 .gitignore                                    |  9 +++++
 README.md                                     | 21 ++++++++++++
 WebSearcher/__init__.py                       |  2 +-
 WebSearcher/component_classifier.py           |  8 +++--
 WebSearcher/component_parsers/general.py      | 28 ++++++++++++++--
 .../component_parsers/knowledge_rhs.py        |  6 ++++
 WebSearcher/component_parsers/top_stories.py  | 24 ++++++++------
 WebSearcher/component_parsers/videos.py       | 14 ++++++--
 WebSearcher/searchers.py                      | 33 +++++++++++++++++++
 WebSearcher/webutils.py                       |  6 ++--
 requirements.txt                              |  2 ++
 .../{test_locations.py => demo_locations.py}  |  0
 scripts/{test_parse.py => demo_parse.py}      |  0
 scripts/{test_search.py => demo_search.py}    |  3 +-
 .../{test_searches.py => demo_searches.py}    |  0
 setup.py                                      |  2 +-
 tests/test_serp_generation.py                 | 29 ++++++++++++++++
 18 files changed, 186 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/CI.yml
 rename scripts/{test_locations.py => demo_locations.py} (100%)
 rename scripts/{test_parse.py => demo_parse.py} (100%)
 rename scripts/{test_search.py => demo_search.py} (92%)
 rename scripts/{test_searches.py => demo_searches.py} (100%)
 create mode 100644 tests/test_serp_generation.py

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
new file mode 100644
index 0000000..b2020b6
--- /dev/null
+++ b/.github/workflows/CI.yml
@@ -0,0 +1,23 @@
+name: Snapshot Testing
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.11
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.11"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+    - name: Install WebSearcher
+      run: |
+        pip install -e .
+    - name: Run pytest
+      run: |
+        pytest --snapshot-warn-unused
diff --git a/.gitignore b/.gitignore
index 6465f5b..95602a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,12 @@ build
 dist
 *.egg-info
 *__pycache__
+
+# ignores testing html and json files, see pr #38
+tests/html_pages/*
+tests/__snapshots__/*
+
+# generated script files
+test_response_save.html
+test_results_save.json
+test_serp_save.json
diff --git a/README.md b/README.md
index ea23ed7..f9372da 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,7 @@ optimized on mid-to-late 2020 data, and is also available as version `2020.0.0`.
 - [Contributing](#contributing)
   - [Repair or Enhance a Parser](#repair-or-enhance-a-parser)
   - [Add a Parser](#add-a-parser)
+  - [Testing](#testing)
 - [Recent Changes](#recent-changes)
 - [Similar Packages](#similar-packages)
 - [License](#license)
@@ -252,6 +253,26 @@ Coming next:
 2. Add parser file in `/component_parsers` as `cmpt_name.py`, with function `parse_cmpt_name`.
 3. Add import for `parse_cmpt_name` in `/component_parsers/__init__.py`
 
+### Testing
+Run tests:
+```
+pytest
+```
+
+Update snapshots:
+```
+pytest --snapshot-update
+```
+
+Running pytest with the `-vv` flag will show a diff of the snapshots that have changed:
+```
+pytest -vv
+```
+
+With the `-k` flag you can run a test for a specific html file:
+```
+pytest -k "1684837514.html"
+```
 ---
 ## Recent Changes
diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py
index 2c51415..50b38b2 100644
--- a/WebSearcher/__init__.py
+++ b/WebSearcher/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '2022.07.08-a'
+__version__ = "2023.05.19"
 from .searchers import SearchEngine
 from .parsers import parse_serp, extract_components
 from .locations import download_locations
diff --git a/WebSearcher/component_classifier.py b/WebSearcher/component_classifier.py
index db4a5aa..f4185f2 100644
--- a/WebSearcher/component_classifier.py
+++ b/WebSearcher/component_classifier.py
@@ -44,9 +44,10 @@ def classify_type(cmpt):
     cmpt_type = "general"
 
     if cmpt.find("block-component"):
+        # this can also be a "related results box"
         # Check for image card block
         cmpt_type = "img_cards"
-    
+
     # Twitter subtype
     if twitter or cmpt_type == "twitter":
         cmpt_type = "twitter_cards" if carousel else "twitter_result"
@@ -114,6 +115,7 @@ def classify_header(cmpt, level):
             'Map Results': 'map_results',
             'People also ask': 'people_also_ask',
             'Perspectives & opinions': 'perspectives',
+            'Perspectives': 'perspectives',
             'Related searches': 'searches_related',
             'Resultado de traducción': 'knowledge',
             'Resultados de la Web': 'general',
@@ -124,7 +126,9 @@
             'Unit Converter': 'knowledge',
             'Weather Result': 'knowledge',
             'Web Result with Site Links': 'general',
-            'Web results': 'general'
+            'Web results': 'general',
+            'Complementary Results': 'knowledge',
+            'Videos': 'videos',
         }
     elif level == 3:
         header_dict = {
diff --git a/WebSearcher/component_parsers/general.py b/WebSearcher/component_parsers/general.py
index 40a91c7..b11b76f 100644
--- a/WebSearcher/component_parsers/general.py
+++ b/WebSearcher/component_parsers/general.py
@@ -17,8 +17,25 @@ def parse_general_results(cmpt):
 
     # Legacy compatibility
     subs = cmpt.find_all('div', {'class':'g'})
-    subs = subs if subs else [cmpt]
-    
+
+    # 2023.05.09 - finds subs
+    if cmpt.find_all('div', {'class': 'd4rhi'}):
+        # this means that there is a sub-element, with class d4rhi
+        # the first div child of the div.g is the first sub element
+        first = cmpt.find('div')
+        additional = cmpt.find_all('div', {'class': 'd4rhi'})
+        subs = [first] + additional
+
+    # 2023.05.09 - handles duplicate .g tags within one component
+    if cmpt.find('div', {'class':'g'}):
+        parent_g = cmpt.find('div', {'class':'g'})
+        if parent_g.find_all('div', {'class':'g'}):
+            # this means that there is a .g element inside of another .g element,
+            # and it would otherwise get double-counted
+            # we just want to keep the parent .g element in this case
+            subs = [parent_g]
+    subs = subs if subs else [cmpt]
+
     return [parse_general_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
 
 def parse_general_result(sub, sub_rank=0):
@@ -67,13 +84,18 @@ def parse_general_result(sub, sub_rank=0):
         parsed['url'] = title_div.find('a')['href']
 
     # Get snippet text
-    body = sub.find('span', {'class':'st'})
+    body = sub.find('span', {'class':'st'}) or sub.find('div', {'class': 'VwiC3b'})
     if body:
         if ' - ' in body.text[:20]:
             split_body = body.text.split(' - ')
             timestamp = split_body[0]
             parsed['text'] = ' - '.join(split_body[1:])
             parsed['timestamp'] = timestamp
+        if ' \u2014 ' in body.text[:23]:
+            split_body = body.text.split(' \u2014 ')
+            timestamp = split_body[0]
+            parsed['text'] = ' \u2014 '.join(split_body[1:])
+            parsed['timestamp'] = timestamp
         else:
             parsed['text'] = body.text
             parsed['timestamp'] = None
diff --git a/WebSearcher/component_parsers/knowledge_rhs.py b/WebSearcher/component_parsers/knowledge_rhs.py
index 3e8e4f4..572bf60 100644
--- a/WebSearcher/component_parsers/knowledge_rhs.py
+++ b/WebSearcher/component_parsers/knowledge_rhs.py
@@ -52,6 +52,12 @@ def parse_knowledge_rhs_main(cmpt, sub_rank=0):
         if description.parent.find('a') and 'href' in description.parent.find('a').attrs:
             parsed['url'] = description.parent.find('a')['href']
+    description = cmpt.find('div', {'class':'kno-rdesc'})
+    if description:
+        parsed['text'] = description.find('span').text
+        if description.find('a') and 'href' in description.find('a').attrs:
+            parsed['url'] = description.find('a')['href']
+
 
     # submenu
     if description and description.parent:
         alinks = description.parent.find_all('a')
diff --git a/WebSearcher/component_parsers/top_stories.py b/WebSearcher/component_parsers/top_stories.py
index 371c3a4..b0ae094 100644
--- a/WebSearcher/component_parsers/top_stories.py
+++ b/WebSearcher/component_parsers/top_stories.py
@@ -15,19 +15,23 @@ def parse_top_stories(cmpt, ctype='top_stories'):
     Returns:
         list : list of parsed subcomponent dictionaries
     """
-    # Known div structures
+    # Known div structures, this returns a 2d list of divs
    div_list = [
         find_all_divs(cmpt, 'g-inner-card'),
-        find_children(cmpt, 'div', {'class':'qmv19b'}),
-        find_all_divs(cmpt, 'div', {'class':'MkXWrd'}), # quad
-        find_all_divs(cmpt, 'div', {'class':'JJZKK'}), # perspectives
-        [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
+        find_children(cmpt, 'div', {'class': 'qmv19b'}),
+        # TODO: choose one of these strategies
+        # cmpt.select('div.Dnzdlc > div'), # triple
+        # [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
+        find_all_divs(cmpt, 'div', {'class': 'MkXWrd'}), # quad
+        find_all_divs(cmpt, 'div', {'class': 'JJZKK'}), # perspectives
     ]
 
-    # If any known div structures exist, parse subcomponents
-    for divs in filter(None, div_list):
-        return [parse_top_story(div, ctype, i) for i, div in enumerate(divs)]
-
+    # flatten 2d div list
+    subcomponent_divs = [div for divs in div_list for div in divs]
+
+    if subcomponent_divs:
+        return [parse_top_story(div, ctype, i) for i, div in enumerate(subcomponent_divs)]
+    return [{'type': ctype, 'sub_rank': 0, 'error': 'No subcomponents found'}]
 
 
@@ -40,7 +44,7 @@
     Returns:
         dict: A parsed subresult
     """
-    parsed = {'type':ctype, 'sub_rank':sub_rank}
+    parsed = {'type': ctype, 'sub_rank': sub_rank}
 
     parsed['title'] = get_text(sub, 'a', separator=' | ')
     parsed['url'] = get_link(sub, key='href')
diff --git a/WebSearcher/component_parsers/videos.py b/WebSearcher/component_parsers/videos.py
index f1b68ba..87905eb 100644
--- a/WebSearcher/component_parsers/videos.py
+++ b/WebSearcher/component_parsers/videos.py
@@ -19,6 +19,7 @@ def parse_videos(cmpt):
     """
     subs = cmpt.find_all('g-inner-card')
     subs = cmpt.find_all('div', {'class':'VibNM'}) if not subs else subs
+    subs = cmpt.find_all('div', {'class':'sI5x9c'}) if not subs else subs
     return [parse_video(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
 
 def parse_video(sub, sub_rank=0):
@@ -31,10 +32,15 @@ def parse_video(sub, sub_rank=0):
         dict : parsed subresult
     """
     parsed = {'type':'videos', 'sub_rank':sub_rank}
-    parsed['url'] = sub.find('a')['href']
+
+    all_urls = sub.find_all('a')
+    # remove urls if they start with '#'
+    non_hash_urls = [url for url in all_urls if not url['href'].startswith('#')]
+    parsed['url'] = non_hash_urls[0]['href'] if non_hash_urls else None
+
     parsed['title'] = sub.find('div', {'role':'heading'}).text
 
-    details = sub.find_all('div',{'class':'MjS0Lc'})
+    details = sub.find_all('div', {'class':'MjS0Lc'})
     if details:
         text_div, citetime_div = details
         parsed['text'] = text_div.text if text_div else None
@@ -50,7 +56,9 @@
         else:
             parsed['cite'] = citetime[0].text
     else:
-        parsed['cite'] = sub.find('span', {'class':'ocUPSd'}).text
+        cite_span = sub.find('span', {'class':'ocUPSd'})
+        parsed['cite'] = cite_span.text if cite_span else None
+
     parsed['timestamp'] = get_div_text(sub, {'class':'rjmdhd'})
 
     parsed['details'] = {}
diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py
index 68053bd..6d6b7e6 100644
--- a/WebSearcher/searchers.py
+++ b/WebSearcher/searchers.py
@@ -174,6 +174,21 @@ def search(self, qry, location='', serp_id=''):
             self.snapshot()
             self.handle_response()
 
+    def mock_search(self, html, qry='testing_query', location='', serp_id=''):
+        """Conducts a mock search, where we pass the html to the method instead
+        of fetching it. For testing.
+
+        Args:
+            html (str): HTML content as string
+            qry (str, optional): The search query. Optional because this is for testing.
+            location (str, optional): A location's Canonical Name.
+            serp_id (str, optional): A unique identifier for this SERP
+        """
+        self.prepare_url(qry, location=location)
+        self.serp_id = serp_id if serp_id else hash_id(qry + location)
+        self.timestamp = utc_stamp()
+        self.html = html
+
     def unzip_html(self):
         """Unzip brotli zipped html
 
@@ -250,6 +265,24 @@ def save_results(self, save_dir='.', append_to=False):
                 utils.write_lines(self.results, fp)
         else:
             self.log.info(f'No parsed results for serp_id {self.serp_id}')
+
+    def save_response_as_html(self, filename=None, save_dir='.'):
+        """Save response text as html
+
+        Args:
+            filename (str, optional): Filename to save as, defaults to `response_{datetime}.html`
+            save_dir (str, optional): Directory to save to, defaults to current directory
+        """
+        if not filename:
+            filename = f'response_{datetime.now().strftime("%Y%m%d%H%M%S")}.html'
+
+        # Save response text
+        if self.response.text:
+            with open(os.path.join(save_dir, filename), 'w') as outfile:
+                outfile.write(self.response.text)
+        else:
+            self.log.info(f'No response text for serp_id {self.serp_id}')
+
 
     def scrape_results_html(self, save_dir='.', append_to=''):
         """Scrape and save all unique, non-internal URLs parsed from the SERP
diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py
index 85f6bb2..034423f 100644
--- a/WebSearcher/webutils.py
+++ b/WebSearcher/webutils.py
@@ -83,7 +83,7 @@ def check_dict_value(d, key, value):
 def get_link(soup, kwargs=None, key='href'):
     """Utility for `soup.find('a')['href']` with null key handling"""
     link = get_div(soup, 'a', kwargs)
-    return link.attrs[key] if link.attrs and key in link.attrs else None
+    return link.attrs.get(key, None) if link else None
 
 
 def get_div(soup, name, attrs=None):
@@ -96,13 +96,13 @@ def find_all_divs(soup, name, attr=None, filter_empty=True):
     if filter_empty:
         divs = [c for c in divs if c]
         divs = [c for c in divs if c.text != '']
-    return divs if divs else None
+    return divs
 
 
 def find_children(soup, name, attr=None):
     """Find all children of a div with a given name and attribute"""
     div = get_div(soup, name, attr)
-    return div.children if div else None
+    return div.children if div else []
 
 
 def get_text(soup, name=None, kwargs=None, separator=" "):
diff --git a/requirements.txt b/requirements.txt
index f62b2ca..0c1c691 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,5 @@ six==1.16.0
 soupsieve==2.2.1
 tldextract==3.1.0
 urllib3==1.26.5
+pytest==7.3.1
+syrupy==4.0.2
diff --git a/scripts/test_locations.py b/scripts/demo_locations.py
similarity index 100%
rename from scripts/test_locations.py
rename to scripts/demo_locations.py
diff --git a/scripts/test_parse.py b/scripts/demo_parse.py
similarity index 100%
rename from scripts/test_parse.py
rename to scripts/demo_parse.py
diff --git a/scripts/test_search.py b/scripts/demo_search.py
similarity index 92%
rename from scripts/test_search.py
rename to scripts/demo_search.py
index 9167330..d717f10 100644
--- a/scripts/test_search.py
+++ b/scripts/demo_search.py
@@ -30,5 +30,6 @@
 try:
     se.save_serp(append_to='test_serp_save.json')
     se.save_results(append_to='test_results_save.json')
+    se.save_response_as_html()
 except Exception as e:
-    print('Save error', e)
\ No newline at end of file
+    print('Save error', e)
diff --git a/scripts/test_searches.py b/scripts/demo_searches.py
similarity index 100%
rename from scripts/test_searches.py
rename to scripts/demo_searches.py
diff --git a/setup.py b/setup.py
index a112138..230b784 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2):
 
 setuptools.setup(
     name='WebSearcher',
-    version='2022.07.08-a',
+    version='2023.05.19',
     url='http://github.com/gitronald/WebSearcher',
     author='Ronald E. Robertson',
     author_email='rer@ccs.neu.edu',
diff --git a/tests/test_serp_generation.py b/tests/test_serp_generation.py
new file mode 100644
index 0000000..fdadf82
--- /dev/null
+++ b/tests/test_serp_generation.py
@@ -0,0 +1,29 @@
+import pytest
+import glob
+import WebSearcher as ws
+
+from syrupy.extensions.json import JSONSnapshotExtension
+
+@pytest.fixture
+def snapshot_json(snapshot):
+    return snapshot.use_extension(JSONSnapshotExtension)
+
+def pytest_generate_tests(metafunc):
+    file_list = glob.glob('./tests/html_pages/*.html')
+    metafunc.parametrize("file_name", file_list)
+
+def test_parsing(snapshot_json, file_name):
+    # read html
+    with open(file_name) as file:
+        html = file.read()
+
+    # Initialize crawler
+    se = ws.SearchEngine()
+
+    # Conduct Search
+    se.mock_search(html)
+
+    # Parse Results
+    se.parse_results()
+
+    assert se.results == snapshot_json
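
Usage note for the testing workflow above: a new HTML fixture can be captured with the save_response_as_html method added in this patch and dropped into tests/html_pages/, the directory that tests/test_serp_generation.py globs. A minimal sketch, assuming network access and a successful request; the query and filename below are only examples, not part of this patch:

    # Sketch: capture a SERP fixture for the snapshot tests.
    import WebSearcher as ws

    se = ws.SearchEngine()
    se.search('dog adoption')              # example query; fetches a live SERP
    se.save_response_as_html(
        filename='1684837514.html',        # example fixture name, as in the README's -k example
        save_dir='tests/html_pages',       # directory globbed by tests/test_serp_generation.py
    )

    # Then record or refresh the syrupy snapshot for the new fixture:
    #   pytest --snapshot-update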
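
The snippet change in general.py splits a leading date from the snippet body on the em dash separator, written in the source as the '\u2014' escape. A standalone sketch of that split, using an invented snippet string for illustration:

    # Sketch of the em-dash timestamp split used in parse_general_result.
    # The snippet text below is invented for illustration.
    body_text = 'May 19, 2023 \u2014 WebSearcher parses Google Search results into components.'

    parsed = {}
    if ' \u2014 ' in body_text[:23]:
        split_body = body_text.split(' \u2014 ')
        parsed['timestamp'] = split_body[0]               # 'May 19, 2023'
        parsed['text'] = ' \u2014 '.join(split_body[1:])  # snippet text after the separator
    print(parsed)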