Skip to content

Commit

Permalink
fix: top stories subcomponent filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
gitronald committed Jul 13, 2023
1 parent e22eeca commit 7e04755
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 10 deletions.
11 changes: 4 additions & 7 deletions WebSearcher/component_parsers/top_stories.py
Original file line number Diff line number Diff line change
@@ -19,18 +19,15 @@ def parse_top_stories(cmpt, ctype='top_stories'):
  div_list = [
  find_all_divs(cmpt, 'g-inner-card'),
  find_children(cmpt, 'div', {'class': 'qmv19b'}),
- # TODO: choose one of these stragegies
+ # TODO: choose one of these strategies
  # cmpt.select('div.Dnzdlc > div'), # triple
- # [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
+ [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple
  find_all_divs(cmpt, 'div', {'class': 'MkXWrd'}), # quad
  find_all_divs(cmpt, 'div', {'class': 'JJZKK'}), # perspectives
  ]

- # flatten 2d div list
- subcomponent_divs = [div for divs in div_list for div in divs]
-
- if len(div_list) > 1:
- return [parse_top_story(div, ctype, i) for i, div in enumerate(subcomponent_divs)]
+ for divs in filter(None, div_list):
+ return [parse_top_story(div, ctype, i) for i, div in enumerate(divs)]

  return [{'type': ctype, 'sub_rank': 0, 'error': 'No subcomponents found'}]

Expand Down
4 changes: 2 additions & 2 deletions WebSearcher/parsers.py
Original file line number Diff line number Diff line change
@@ -52,10 +52,10 @@ def extract_results_column(soup):
  """
  # Check if layout contains left side bar
  left_side_bar = soup.find('div', {'class': 'OeVqAd'})
+ rso = soup.find('div', {'id':'rso'})

- if not left_side_bar:
+ if not left_side_bar and rso:
  # Extract results from single div
- rso = soup.find('div', {'id':'rso'})
  drop_tags = {'script', 'style', None}
  column = [('main', c) for c in rso.children if c.name not in drop_tags]

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2):
  version='2023.05.19',
  url='http://github.com/gitronald/WebSearcher',
  author='Ronald E. Robertson',
- author_email='rer@ccs.neu.edu',
+ author_email='rer@acm.org',
  license='BSD-3-Clause',
  classifiers=[
  'Programming Language :: Python :: 3',
Expand Down

0 comments on commit 7e04755

Please sign in to comment.