diff --git a/WebSearcher/component_parsers/top_stories.py b/WebSearcher/component_parsers/top_stories.py index b0ae094..c4569d2 100644 --- a/WebSearcher/component_parsers/top_stories.py +++ b/WebSearcher/component_parsers/top_stories.py @@ -19,18 +19,15 @@ def parse_top_stories(cmpt, ctype='top_stories'): div_list = [ find_all_divs(cmpt, 'g-inner-card'), find_children(cmpt, 'div', {'class': 'qmv19b'}), - # TODO: choose one of these stragegies + # TODO: choose one of these strategies # cmpt.select('div.Dnzdlc > div'), # triple - # [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple + [c for c in cmpt.find_all('div') if 'data-hveid' in c.attrs], # triple find_all_divs(cmpt, 'div', {'class': 'MkXWrd'}), # quad find_all_divs(cmpt, 'div', {'class': 'JJZKK'}), # perspectives ] - # flatten 2d div list - subcomponent_divs = [div for divs in div_list for div in divs] - - if len(div_list) > 1: - return [parse_top_story(div, ctype, i) for i, div in enumerate(subcomponent_divs)] + for divs in filter(None, div_list): + return [parse_top_story(div, ctype, i) for i, div in enumerate(divs)] return [{'type': ctype, 'sub_rank': 0, 'error': 'No subcomponents found'}] diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index 822387b..d5131a7 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -52,10 +52,10 @@ def extract_results_column(soup): """ # Check if layout contains left side bar left_side_bar = soup.find('div', {'class': 'OeVqAd'}) + rso = soup.find('div', {'id':'rso'}) - if not left_side_bar: + if not left_side_bar and rso: # Extract results from single div - rso = soup.find('div', {'id':'rso'}) drop_tags = {'script', 'style', None} column = [('main', c) for c in rso.children if c.name not in drop_tags] diff --git a/setup.py b/setup.py index 230b784..4d650b8 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2): version='2023.05.19', url='http://github.com/gitronald/WebSearcher', author='Ronald E. Robertson', - author_email='rer@ccs.neu.edu', + author_email='rer@acm.org', license='BSD-3-Clause', classifiers=[ 'Programming Language :: Python :: 3',