Commit: 2023.05.19 / post-review (#37)
* [WIP] 2023.05.19 (#35)

* chore: save html when running test_search script

* fix: snippet text and knowledge panel content

* chore: rename scripts to demo_ to avoid being detected by pytest

* chore: bump version

* chore: first pass at screenshot-style testing

* chore: bump requirements

* test: use syrupy

* chore: configure boulder, co snapshot

* chore: update top stories

* chore: add 'complementary results' to knowledge component

* chore: update perspectives result classifier

* docs: update README with testing info

* chore: update video classifying and parsing

* test: add new html files to test against

* chore: add another snapshot test to the tests file

* chore: configure snapshot testing for each html page

* test: commit issues in snapshots to git

* chore: update subelement parsing

* fix: double-counted element

* chore: lint

* docs: move testing section, add info for testing one file

* chore: clean up saving to html

* chore: nit, update readme

* chore: clean up 2d div flattening

* ci: add pytest github action

* chore: punt on triple news component

* chore: stop tracking html and json files

see #38
schwartzadev authored Jul 7, 2023
1 parent 3c8955e commit e22eeca
Showing 18 changed files with 186 additions and 24 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/CI.yml
@@ -0,0 +1,23 @@
name: Snapshot Testing

on: [push, pull_request]

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v3
with:
python-version: "3.11"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Install WebSearcher
run: |
pip install -e .
- name: Run pytest
run: |
pytest --snapshot-warn-unused
9 changes: 9 additions & 0 deletions .gitignore
@@ -8,3 +8,12 @@ build
dist
*.egg-info
*__pycache__

# ignores testing html and json files, see pr #38
tests/html_pages/*
tests/__snapshots__/*

# generated script files
test_response_save.html
test_results_save.json
test_serp_save.json
21 changes: 21 additions & 0 deletions README.md
@@ -33,6 +33,7 @@ optimized on mid-to-late 2020 data, and is also available as version `2020.0.0`.
- [Contributing](#contributing)
- [Repair or Enhance a Parser](#repair-or-enhance-a-parser)
- [Add a Parser](#add-a-parser)
- [Testing](#testing)
- [Recent Changes](#recent-changes)
- [Similar Packages](#similar-packages)
- [License](#license)
@@ -252,6 +253,26 @@ Coming next:
2. Add parser file in `/component_parsers` as `cmpt_name.py`, with function `parse_cmpt_name`.
3. Add import for `parse_cmpt_name` in `/component_parsers/__init__.py`

### Testing
Run tests:
```
pytest
```

Update snapshots:
```
pytest --snapshot-update
```

Running pytest with the `-vv` flag will show a diff of the snapshots that have changed:
```
pytest -vv
```

With the `-k` flag you can run a test for a specific html file:
```
pytest -k "1684837514.html"
```

---
## Recent Changes
2 changes: 1 addition & 1 deletion WebSearcher/__init__.py
@@ -1,4 +1,4 @@
__version__ = '2022.07.08-a'
__version__ = "2023.05.19"
from .searchers import SearchEngine
from .parsers import parse_serp, extract_components
from .locations import download_locations
8 changes: 6 additions & 2 deletions WebSearcher/component_classifier.py
@@ -44,9 +44,10 @@ def classify_type(cmpt):
cmpt_type = "general"

if cmpt.find("block-component"):
# this can also be a "related results box"
# Check for image card block
cmpt_type = "img_cards"

# Twitter subtype
if twitter or cmpt_type == "twitter":
cmpt_type = "twitter_cards" if carousel else "twitter_result"
@@ -114,6 +115,7 @@ def classify_header(cmpt, level):
'Map Results': 'map_results',
'People also ask': 'people_also_ask',
'Perspectives & opinions': 'perspectives',
'Perspectives': 'perspectives',
'Related searches': 'searches_related',
'Resultado de traducción': 'knowledge',
'Resultados de la Web': 'general',
@@ -124,7 +126,9 @@
'Unit Converter': 'knowledge',
'Weather Result': 'knowledge',
'Web Result with Site Links': 'general',
'Web results': 'general'
'Web results': 'general',
'Complementary Results': 'knowledge',
'Videos': 'videos',
}
elif level == 3:
header_dict = {
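The header dictionaries above map visible SERP section headings (including the newly added 'Complementary Results', 'Perspectives', and 'Videos' entries) to WebSearcher's internal component types. A minimal sketch of that kind of lookup; the `HEADER_TYPES` dict and `classify_by_header` helper below are illustrative names, not part of the package's API:

```python
# Illustrative only: mirrors the level-2 heading-to-type mapping shown above.
HEADER_TYPES = {
    'People also ask': 'people_also_ask',
    'Perspectives': 'perspectives',
    'Complementary Results': 'knowledge',
    'Videos': 'videos',
    'Web results': 'general',
}

def classify_by_header(header_text: str, default: str = 'unknown') -> str:
    """Return the component type for a visible heading, or a fallback label."""
    return HEADER_TYPES.get(header_text.strip(), default)

print(classify_by_header('Complementary Results'))  # knowledge
print(classify_by_header('Shopping results'))       # unknown
```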
28 changes: 25 additions & 3 deletions WebSearcher/component_parsers/general.py
@@ -17,8 +17,25 @@ def parse_general_results(cmpt):

# Legacy compatibility
subs = cmpt.find_all('div', {'class':'g'})
subs = subs if subs else [cmpt]


# 2023.05.09 - finds subs
if cmpt.find_all('div', {'class': 'd4rhi'}):
# a sub-element with class d4rhi exists;
# the first div child of the div.g is the first sub-element
first = cmpt.find('div')
additional = cmpt.find_all('div', {'class': 'd4rhi'})
subs = [first] + additional

# 2023.05.09 - handles duplicate .g tags within one component
if cmpt.find('div', {'class':'g'}):
parent_g = cmpt.find('div', {'class':'g'})
if parent_g.find_all('div', {'class':'g'}):
# this means that there is a .g element inside of another .g element,
# and it would otherwise get double-counted
# we just want to keep the parent .g element in this case
subs = [parent_g]
subs = subs if subs else [cmpt]

return [parse_general_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]

def parse_general_result(sub, sub_rank=0):
@@ -67,13 +84,18 @@ def parse_general_result(sub, sub_rank=0):
parsed['url'] = title_div.find('a')['href']

# Get snippet text
body = sub.find('span', {'class':'st'})
body = sub.find('span', {'class':'st'}) or sub.find('div', {'class': 'VwiC3b'})
if body:
if ' - ' in body.text[:20]:
split_body = body.text.split(' - ')
timestamp = split_body[0]
parsed['text'] = ' - '.join(split_body[1:])
parsed['timestamp'] = timestamp
elif ' \u2014 ' in body.text[:23]:
split_body = body.text.split(' \u2014 ')
timestamp = split_body[0]
parsed['text'] = ' \u2014 '.join(split_body[1:])
parsed['timestamp'] = timestamp
else:
parsed['text'] = body.text
parsed['timestamp'] = None
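The snippet change above splits a leading timestamp off the snippet text when Google joins them with either a hyphen or an em dash (`\u2014`). A standalone sketch of that splitting logic, using made-up snippet strings:

```python
# Standalone sketch of the timestamp/text split used for general result snippets.
# The sample strings are made up; real snippets come from the parsed HTML.
def split_snippet(text: str) -> dict:
    parsed = {'text': text, 'timestamp': None}
    for sep in (' - ', ' \u2014 '):  # hyphen first, then em dash
        # Only treat the separator as a date delimiter if it appears early,
        # mirroring the [:20] / [:23] prefix checks in parse_general_result.
        if sep in text[:23]:
            timestamp, *rest = text.split(sep)
            parsed['timestamp'] = timestamp
            parsed['text'] = sep.join(rest)
            break
    return parsed

print(split_snippet('May 19, 2023 \u2014 Example snippet text.'))
# {'text': 'Example snippet text.', 'timestamp': 'May 19, 2023'}
print(split_snippet('A snippet with no leading date.'))
# {'text': 'A snippet with no leading date.', 'timestamp': None}
```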
6 changes: 6 additions & 0 deletions WebSearcher/component_parsers/knowledge_rhs.py
@@ -52,6 +52,12 @@ def parse_knowledge_rhs_main(cmpt, sub_rank=0):
if description.parent.find('a') and 'href' in description.parent.find('a').attrs:
parsed['url'] = description.parent.find('a')['href']

description = cmpt.find('div', {'class':'kno-rdesc'})
if description:
parsed['text'] = description.find('span').text
if description.find('a') and 'href' in description.find('a').attrs:
parsed['url'] = description.find('a')['href']

# submenu
if description and description.parent:
alinks = description.parent.find_all('a')
24 changes: 14 additions & 10 deletions WebSearcher/component_parsers/top_stories.py
@@ -15,19 +15,23 @@ def parse_top_stories(cmpt, ctype='top_stories'):
Returns:
list : list of parsed subcomponent dictionaries
"""
# Known div structures
# Known div structures; builds a 2-d list of candidate divs
div_list = [
find_all_divs(cmpt, 'g-inner-card'),
find_children(cmpt, 'div', {'class':'qmv19b'}),
find_all_divs(cmpt, 'div', {'class':'MkXWrd'}), # quad
find_all_divs(cmpt, 'div', {'class':'JJZKK'}), # perspectives
[c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
find_children(cmpt, 'div', {'class': 'qmv19b'}),
# TODO: choose one of these strategies
# cmpt.select('div.Dnzdlc > div'), # triple
# [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
find_all_divs(cmpt, 'div', {'class': 'MkXWrd'}), # quad
find_all_divs(cmpt, 'div', {'class': 'JJZKK'}), # perspectives
]

# If any known div structures exist, parse subcomponents
for divs in filter(None, div_list):
return [parse_top_story(div, ctype, i) for i, div in enumerate(divs)]

# flatten 2d div list
subcomponent_divs = [div for divs in div_list for div in divs]

if subcomponent_divs:
return [parse_top_story(div, ctype, i) for i, div in enumerate(subcomponent_divs)]

return [{'type': ctype, 'sub_rank': 0, 'error': 'No subcomponents found'}]


@@ -40,7 +44,7 @@ def parse_top_story(sub, ctype, sub_rank=0):
Returns:
dict: A parsed subresult
"""
parsed = {'type':ctype, 'sub_rank':sub_rank}
parsed = {'type': ctype, 'sub_rank': sub_rank}

parsed['title'] = get_text(sub, 'a', separator=' | ')
parsed['url'] = get_link(sub, key='href')
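The rewritten `parse_top_stories` collects candidate divs from every known layout into a 2-d list and then flattens it before ranking subcomponents. A minimal sketch of that flatten-and-enumerate idiom, independent of the real selectors (the string placeholders stand in for parsed divs):

```python
# Sketch of the 2-d div flattening used in parse_top_stories.
# Each inner list stands in for one layout-specific selector's matches;
# empty lists (selectors that found nothing) simply contribute nothing.
div_list = [
    [],                      # e.g. find_all_divs(cmpt, 'g-inner-card')
    ['story_a', 'story_b'],  # e.g. the 'MkXWrd' quad layout
    ['story_c'],             # e.g. the 'JJZKK' perspectives layout
]

subcomponent_divs = [div for divs in div_list for div in divs]

parsed = [{'sub_rank': i, 'div': div} for i, div in enumerate(subcomponent_divs)]
print(parsed)
# [{'sub_rank': 0, 'div': 'story_a'}, {'sub_rank': 1, 'div': 'story_b'},
#  {'sub_rank': 2, 'div': 'story_c'}]
```

Returning empty collections from the selector helpers (see the `webutils` changes further down) is what makes this flattening safe without `None` checks.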
14 changes: 11 additions & 3 deletions WebSearcher/component_parsers/videos.py
@@ -19,6 +19,7 @@ def parse_videos(cmpt):
"""
subs = cmpt.find_all('g-inner-card')
subs = cmpt.find_all('div', {'class':'VibNM'}) if not subs else subs
subs = cmpt.find_all('div', {'class':'sI5x9c'}) if not subs else subs
return [parse_video(sub, sub_rank) for sub_rank, sub in enumerate(subs)]

def parse_video(sub, sub_rank=0):
@@ -31,10 +32,15 @@ def parse_video(sub, sub_rank=0):
dict : parsed subresult
"""
parsed = {'type':'videos', 'sub_rank':sub_rank}
parsed['url'] = sub.find('a')['href']

all_urls = sub.find_all('a')
# remove urls if they start with '#'
non_hash_urls = [url for url in all_urls if not url['href'].startswith('#')]
parsed['url'] = non_hash_urls[0]['href'] if non_hash_urls else None

parsed['title'] = sub.find('div', {'role':'heading'}).text

details = sub.find_all('div',{'class':'MjS0Lc'})
details = sub.find_all('div', {'class':'MjS0Lc'})
if details:
text_div, citetime_div = details
parsed['text'] = text_div.text if text_div else None
@@ -50,7 +56,9 @@
else:
parsed['cite'] = citetime[0].text
else:
parsed['cite'] = sub.find('span', {'class':'ocUPSd'}).text
cite_span = sub.find('span', {'class':'ocUPSd'})
parsed['cite'] = cite_span.text if cite_span else None

parsed['timestamp'] = get_div_text(sub, {'class':'rjmdhd'})

parsed['details'] = {}
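The video parser above now skips in-page anchor links (hrefs beginning with `#`) before choosing a result URL. A standalone sketch of that filter; the HTML fragment is invented for illustration:

```python
# Sketch of the non-hash URL filter added to parse_video.
from bs4 import BeautifulSoup

html = '''
<div>
  <a href="#fpstate">expand</a>
  <a href="https://www.youtube.com/watch?v=example">Example video</a>
</div>
'''
sub = BeautifulSoup(html, 'html.parser')

all_urls = sub.find_all('a')
non_hash_urls = [a for a in all_urls if not a['href'].startswith('#')]
url = non_hash_urls[0]['href'] if non_hash_urls else None
print(url)  # https://www.youtube.com/watch?v=example
```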
33 changes: 33 additions & 0 deletions WebSearcher/searchers.py
@@ -174,6 +174,21 @@ def search(self, qry, location='', serp_id=''):
self.snapshot()
self.handle_response()

def mock_search(self, html, qry='testing_query', location='', serp_id=''):
"""Conducts a mock search, where we pass the html to the method instead
of fetching it. For testing.
Args:
html (str): HTML content as string
qry (str, optional): The search query. Optional because this is for testing.
location (str, optional): A location's Canonical Name.
serp_id (str, optional): A unique identifier for this SERP
"""
self.prepare_url(qry, location=location)
self.serp_id = serp_id if serp_id else hash_id(qry + location)
self.timestamp = utc_stamp()
self.html = html

def unzip_html(self):
"""Unzip brotli zipped html
@@ -250,6 +265,24 @@ def save_results(self, save_dir='.', append_to=False):
utils.write_lines(self.results, fp)
else:
self.log.info(f'No parsed results for serp_id {self.serp_id}')

def save_response_as_html(self, filename=None, save_dir='.'):
"""Save response text as html
Args:
filename (str, optional): Filename to save as; defaults to `response_{timestamp}.html`
save_dir (str, optional): Directory to save to, defaults to current directory
"""
if not filename:
filename = f'response_{datetime.now().strftime("%Y%m%d%H%M%S")}.html'

# Save response text
if self.response.text:
with open(os.path.join(save_dir, filename), 'w') as outfile:
outfile.write(self.response.text)
else:
self.log.info(f'No response text for serp_id {self.serp_id}')


def scrape_results_html(self, save_dir='.', append_to=''):
"""Scrape and save all unique, non-internal URLs parsed from the SERP
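Together, the new `mock_search` and `save_response_as_html` methods let the snapshot tests and the demo script exercise the full parse pipeline from saved HTML without any network request. A usage sketch; the file path is hypothetical:

```python
# Usage sketch for the new offline-testing flow; the path is made up.
import WebSearcher as ws

with open('tests/html_pages/example.html') as f:
    html = f.read()

se = ws.SearchEngine()
se.mock_search(html, qry='testing_query')  # sets serp_id, timestamp, and html; nothing is fetched
se.parse_results()                         # same parsing path as a live search
print(len(se.results))
```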
6 changes: 3 additions & 3 deletions WebSearcher/webutils.py
@@ -83,7 +83,7 @@ def check_dict_value(d, key, value):
def get_link(soup, kwargs=None, key='href'):
"""Utility for `soup.find('a')['href']` with null key handling"""
link = get_div(soup, 'a', kwargs)
return link.attrs[key] if link.attrs and key in link.attrs else None
return link.attrs.get(key, None) if link else None


def get_div(soup, name, attrs=None):
@@ -96,13 +96,13 @@ def find_all_divs(soup, name, attr=None, filter_empty=True):
if filter_empty:
divs = [c for c in divs if c]
divs = [c for c in divs if c.text != '']
return divs if divs else None
return divs


def find_children(soup, name, attr=None):
"""Find all children of a div with a given name and attribute"""
div = get_div(soup, name, attr)
return div.children if div else None
return div.children if div else []


def get_text(soup, name=None, kwargs=None, separator=" "):
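The `webutils` tweaks above make the helpers degrade gracefully: `find_all_divs` and `find_children` now return empty collections instead of `None`, and `get_link` reads the attribute with `attrs.get`. A small sketch of the attribute-safe behaviour, using an invented HTML fragment:

```python
# Sketch of null-safe attribute access in the spirit of get_link.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><a name="no-href-here">anchor</a></div>', 'html.parser')

link = soup.find('a')
href = link.attrs.get('href') if link else None
print(href)  # None -- the <a> exists but has no href, and no exception is raised
```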
2 changes: 2 additions & 0 deletions requirements.txt
@@ -13,3 +13,5 @@ six==1.16.0
soupsieve==2.2.1
tldextract==3.1.0
urllib3==1.26.5
pytest==7.3.1
syrupy==4.0.2
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion scripts/test_search.py → scripts/demo_search.py
@@ -30,5 +30,6 @@
try:
se.save_serp(append_to='test_serp_save.json')
se.save_results(append_to='test_results_save.json')
se.save_response_as_html()
except Exception as e:
print('Save error', e)
print('Save error', e)
File renamed without changes.
2 changes: 1 addition & 1 deletion setup.py
@@ -14,7 +14,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2):

setuptools.setup(
name='WebSearcher',
version='2022.07.08-a',
version='2023.05.19',
url='http://github.com/gitronald/WebSearcher',
author='Ronald E. Robertson',
author_email='[email protected]',
29 changes: 29 additions & 0 deletions tests/test_serp_generation.py
@@ -0,0 +1,29 @@
import pytest
import glob
import WebSearcher as ws

from syrupy.extensions.json import JSONSnapshotExtension

@pytest.fixture
def snapshot_json(snapshot):
return snapshot.use_extension(JSONSnapshotExtension)

def pytest_generate_tests(metafunc):
file_list = glob.glob('./tests/html_pages/*.html')
metafunc.parametrize("file_name", file_list)

def test_parsing(snapshot_json, file_name):
# read html
with open(file_name) as file:
html = file.read()

# Initialize crawler
se = ws.SearchEngine()

# Conduct Search
se.mock_search(html)

# Parse Results
se.parse_results()

assert se.results == snapshot_json
