From 5255e71b9a01e9b5e45e3e82bf6c1e463049a6b4 Mon Sep 17 00:00:00 2001
From: "Ronald E. Robertson" <rer@acm.org>
Date: Mon, 20 Nov 2023 14:20:24 -0800
Subject: [PATCH] Update extractors in new file, more model usage, logging
 defaults (#45)

* update: use model, no timestamp, sub_type funcs
* add: general result with video sub type
* update: consistent args,docs in html parsing utils
* fix: optional parsed values
* fix: use no attrs safe get_link
* update: parse 'complementary results' with general
* add: standalone models file
* update: people also ask, use model
* update: use models, limit details, recent SERPs
* update: filter out hidden footer components, usually PAA
* update: use models, limit details, recent SERPs
* Bump to 0.3.3
* fix: tw card classifying, better header classifier
* update: use models, current SERPs
* update: limit details
* update: optional ID attrs at result level
* update: classifier for map and local
* update: use models, modern SERPs
* update: minimal serp attrs, ignore hidden surveys
* add: filter-notice, filter hidden footer cmpts
* update: classify knowledge components before general
* update: grab children of blank div in main column
* update: account for vertical top stories
* fix: convert filter to list for empty check
* Bump to 0.3.4
* update: turn off console log if log file provided
* update: extractors in new file
* update: keep only text ppa suggestions, need selenium for more
* update: use models in main parsers
* update: handle layout shift extraction
* add: local news wrapper for top stories
* update: additional video div class
* update: include parse lang in webutils
* update: switch main init to .extractors
* Bump to 0.3.5
---
 WebSearcher/__init__.py                     |   5 +-
 WebSearcher/component_classifier.py         |   3 +-
 WebSearcher/component_parsers/__init__.py   |   2 +
 WebSearcher/component_parsers/local_news.py |  14 ++
 .../component_parsers/people_also_ask.py    |  36 +---
 WebSearcher/component_parsers/videos.py     |   1 +
 WebSearcher/extractors.py                   | 135 +++++++++++++
 WebSearcher/parsers.py                      | 182 ++----------------
 WebSearcher/searchers.py                    |   3 +-
 WebSearcher/webutils.py                     |  11 ++
 setup.py                                    |   2 +-
 11 files changed, 191 insertions(+), 203 deletions(-)
 create mode 100644 WebSearcher/component_parsers/local_news.py
 create mode 100644 WebSearcher/extractors.py

diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py
index 45951d0..a1593e5 100644
--- a/WebSearcher/__init__.py
+++ b/WebSearcher/__init__.py
@@ -1,6 +1,7 @@
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 from .searchers import SearchEngine
-from .parsers import parse_serp, extract_components
+from .parsers import parse_serp
+from .extractors import extract_components
 from .locations import download_locations
 from .component_classifier import classify_type
 from .webutils import load_html, make_soup, load_soup
diff --git a/WebSearcher/component_classifier.py b/WebSearcher/component_classifier.py
index 9050575..66108f5 100644
--- a/WebSearcher/component_classifier.py
+++ b/WebSearcher/component_classifier.py
@@ -79,6 +79,7 @@ def classify_header(cmpt: bs4.element.Tag, level):
         'Resultados de la Web': 'general',
         'Sports Results': 'knowledge',
         'Top stories': 'top_stories',
+        'Local news': 'local_news',
         'Translation Result': 'knowledge',
         'Twitter Results': 'twitter',
         'Unit Converter': 'knowledge',
@@ -111,7 +112,7 @@
     # Check for string matches in header text e.g. `h2.text`
     for header in filter(None, header_list):
         for text, label in header_dict.items():
-            if header.text.startswith(text):
+            if header.text.strip().startswith(text):
                 return label
 
     # Return unknown if no matches
diff --git a/WebSearcher/component_parsers/__init__.py b/WebSearcher/component_parsers/__init__.py
index f4bea68..a35de7c 100644
--- a/WebSearcher/component_parsers/__init__.py
+++ b/WebSearcher/component_parsers/__init__.py
@@ -20,6 +20,7 @@
 from .knowledge_rhs import parse_knowledge_rhs
 from .shopping_ads import parse_shopping_ads
 from .perspectives import parse_perspectives
+from .local_news import parse_local_news
 from .banner import parse_banner
 
 # Component details dataframe
@@ -34,6 +35,7 @@
     ('general_subresult', parse_general_results, 'General Subresult'),
     ('available_on', parse_available_on, 'Available On'),
     ('top_stories', parse_top_stories, 'Top Stories'),
+    ('local_news', parse_local_news, 'Local News'),
     ('latest_from', parse_latest_from, 'Latest From'),
     ('view_more_news', parse_view_more_news, 'View More News'),
     ('news_quotes', parse_news_quotes, 'News Quotes'),
diff --git a/WebSearcher/component_parsers/local_news.py b/WebSearcher/component_parsers/local_news.py
new file mode 100644
index 0000000..ed879da
--- /dev/null
+++ b/WebSearcher/component_parsers/local_news.py
@@ -0,0 +1,14 @@
+from . import parse_top_stories
+
+def parse_local_news(cmpt):
+    """Parse a "Local news" component
+
+    These components are the same as Top Stories, but have a different heading.
+
+    Args:
+        cmpt (bs4 object): A local news component
+
+    Returns:
+        list : list of parsed results
+    """
+    return parse_top_stories(cmpt, ctype='local_news')
diff --git a/WebSearcher/component_parsers/people_also_ask.py b/WebSearcher/component_parsers/people_also_ask.py
index 7fe99f7..5fab95b 100644
--- a/WebSearcher/component_parsers/people_also_ask.py
+++ b/WebSearcher/component_parsers/people_also_ask.py
@@ -15,26 +15,23 @@ def parse_people_also_ask(cmpt, sub_rank=0):
     Returns:
         list : list of parsed subcomponent dictionaries
     """
+
+    # questions = cmpt.find_all('g-accordion-expander')
+    # questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
+    questions = cmpt.find_all("div", {"class":"related-question-pair"})
+    details = [parse_question(q) for q in questions] if questions else None
+
     parsed = BaseResult(
         type='people_also_ask',
         sub_rank=sub_rank,
+        details=details,
     )
-    # questions = cmpt.find_all('g-accordion-expander')
-    # questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
-    questions = cmpt.find_all("div", {"class":"related-question-pair"})
-    parsed.details = [parse_question(q) for q in questions] if questions else None
     return [parsed.model_dump()]
 
 
 def parse_question(question):
     """Parse an individual question in a "People Also Ask" component"""
-
-    # Get query and URL fragments
-    parsed = {
-        'title': None,
-        'url': None,
-    }
 
     # Get title
     title_divs = [
         question.find('div', {'class': 'wQiwMc'}),  # 2023-11-16
         question.find('div', {'class': 'JlqpRe'}),  # 2023-11-16
     ]
     for title_div in filter(None, title_divs):
-        parsed['title'] = webutils.get_text(title_div)
-        parsed['url'] = webutils.get_link(title_div)
-
-    # Get citation
-    parsed['cite'] = webutils.get_text(question, 'cite')
-
-    # Get text
-    replace = ['qry', 'title', 'cite']
-    text = question.text.replace('Search for: ', '')
-    for key in replace:
-        if key in parsed.keys() and parsed[key]:
-            text = text.replace(parsed[key], '')
-    parsed['text'] = text if text else None
-
-    return parsed
-
-
+        text = webutils.get_text(title_div)
+        return text
diff --git a/WebSearcher/component_parsers/videos.py b/WebSearcher/component_parsers/videos.py
index 8a295e0..6578dbb 100644
--- a/WebSearcher/component_parsers/videos.py
+++ b/WebSearcher/component_parsers/videos.py
@@ -26,6 +26,7 @@ def parse_videos(cmpt):
     divs.extend(webutils.find_all_divs(cmpt, 'g-inner-card'))
     divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'VibNM'}))
     divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'mLmaBd'}))
+    divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'RzdJxc'}))
     # divs.extend(cmpt.find_all('div', {'class':'sI5x9c'}))  # Selects a level too low, missing links.
     divs = list(filter(None, divs))
 
diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py
new file mode 100644
index 0000000..bd8f398
--- /dev/null
+++ b/WebSearcher/extractors.py
@@ -0,0 +1,135 @@
+from .component_parsers.footer import extract_footer, extract_footer_components
+
+def extract_results_column(soup):
+    """Extract SERP components
+
+    Args:
+        soup (bs4): BeautifulSoup SERP
+
+    Returns:
+        list: a list of HTML result components
+    """
+
+    # Drop tags
+    drop_tags = {'script', 'style', None}
+
+    # Check if layout contains left side bar
+    layout_shift = [
+        soup.find('div', {'class': 'OeVqAd'}),  # left side bar
+        soup.find('div', {'class': 'M8OgIe'}),  # top bar
+    ]
+    rso = soup.find('div', {'id':'rso'})
+    column = []
+
+    if not any(layout_shift) and rso:
+        for child in rso.children:
+            if child.name in drop_tags:
+                continue
+            if not child.attrs:
+                column.extend(child.contents)
+            else:
+                column.append(child)
+    elif rso:
+        # Extract results from two div sections
+
+        # Find section 1 results and append to column list
+        column = rso.find_all('div', {'class':'sATSHe'})
+        column = [c for c in column if c.name not in drop_tags]
+
+    else:
+        section1 = soup.find_all('div', {'class':'UDZeY OTFaAf'})
+        for div in section1:
+
+            # Conditional handling for Twitter result
+            if div.find('h2') and div.find('h2').text == "Twitter Results":
+                column.append(div.find('div').parent)
+
+            # Conditional handling for g-section with header
+            elif div.find('g-section-with-header'):
+                column.append(div.find('g-section-with-header').parent)
+
+            # Include divs with a "View more" type of button
+            elif div.find('g-more-link'):
+                column.append(div)
+
+            # Include footer components that appear in the main column
+            elif div.find('div', {'class':'oIk2Cb'}):
+                column.append(div)
+
+            else:
+                # Handle general results
+                for child in div.find_all('div', {'class':'g'}):
+                    column.append(child)
+
+        # Find section 2 results and append to column list
+        section2 = soup.find('div', {'class':'WvKfwe a3spGf'})
+        if section2:
+            for child in section2.children:
+                column.append(child)
+        column = [c for c in column if c.name not in drop_tags]
+
+    # Drop empty components
+    drop_text = {
+        "Main results",  # Remove empty rso component; hidden <h2> header
+        "Twitter Results",  # Remove empty Twitter component
+        "",  # Remove empty divs
+    }
+    column = [c for c in column if c.text not in drop_text]
+    column = list(zip(['main']*len(column), column))
+
+
+    return column
+
+
+def extract_components(soup):
+    """Extract SERP components
+
+    Args:
+        soup (bs4): BeautifulSoup SERP
+
+    Returns:
+        list: a rank ordered top-to-bottom and left-to-right list of
+        (component location, component soup) tuples
+    """
+
+    cmpts = []
+
+    # Top Image Carousel
+    top_bar = soup.find('div', {'id':'appbar'})
+    if top_bar:
+        has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src'))
+        if top_bar.find('g-scrolling-carousel') and has_img:
+            cmpts.append(('top_image_carousel', top_bar))
+
+    # Shopping Ads
+    shopping_ads = soup.find('div', {'class': 'commercial-unit-desktop-top'})
+    if shopping_ads:
+        cmpts.append(('shopping_ad', shopping_ads))
+
+    # Top Ads
+    ads = soup.find('div', {'id':'tads'})
+    if ads:
+        cmpts.append(('ad', ads))
+
+    column = extract_results_column(soup)
+    cmpts.extend(column)
+
+    # Bottom Ads
+    ads = soup.find('div', {'id':'tadsb'})
+    if ads:
+        cmpts.append(('ad', ads))
+
+    # Footer results
+    footer = extract_footer(soup)
+    if footer and extract_footer_components(footer):
+        cmpts.append(('footer', footer))
+
+    # RHS Knowledge Panel
+    rhs = soup.find('div', {'id': 'rhs'})
+    if rhs:
+        rhs_kp = rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel']})
+        if rhs_kp:
+            # reading from top-to-bottom, left-to-right
+            cmpts.append(('knowledge_rhs', rhs_kp))
+
+    return cmpts
diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py
index 1659824..8ee6d8e 100644
--- a/WebSearcher/parsers.py
+++ b/WebSearcher/parsers.py
@@ -1,30 +1,14 @@
 from . import webutils
 from .component_classifier import classify_type
 from .component_parsers import type_functions
-from .component_parsers.footer import extract_footer, extract_footer_components
-from . import logger
-log = logger.Logger().start(__name__)
+from .extractors import extract_components
+from .models import BaseResult
+from .logger import Logger
+log = Logger().start(__name__)
 
 import traceback
 from bs4 import BeautifulSoup
 
-UNKNOWN_COMPONENT = {
-    'sub_rank':0,
-    'type': 'unknown'
-}
-
-def parse_query(soup):
-    """Parse query from title of html soup"""
-    title = str(soup.html.find('title'))
-    return webutils.strip_html_tags(title).split(" - ")[0]
-
-def parse_lang(soup):
-    """Parse language from html tags"""
-    try:
-        return soup.find('html').attrs['lang']
-    except Exception as e:
-        log.exception('Error while parsing language')
-        return None
 
 def get_component_parser(cmpt_type, cmpt_funcs=type_functions):
     """Returns the parser for a given component type"""
@@ -33,150 +17,14 @@
     except KeyError as e:
         return not_implemented
 
+
 def not_implemented(cmpt):
     """Placeholder function for component parsers that are not implemented"""
-    parsed = UNKNOWN_COMPONENT.copy()
-    parsed['type'] = classify_type(cmpt)
+    parsed = BaseResult(type=classify_type(cmpt), sub_rank=0).model_dump()
     parsed['error'] = 'not implemented'
     return [parsed]
 
-def extract_results_column(soup):
-    """Extract SERP components
-
-    Args:
-        soup (bs4): BeautifulSoup SERP
-
-    Returns:
-        list: a list of HTML result components
-    """
-    # Check if layout contains left side bar
-    left_side_bar = soup.find('div', {'class': 'OeVqAd'})
-    rso = soup.find('div', {'id':'rso'})
-
-    if not left_side_bar and rso:
-        # Extract results from single div
-        drop_tags = {'script', 'style', None}
-        column = []
-        for child in rso.children:
-            if child.name in drop_tags:
-                continue
-            if not child.attrs:
-                column.extend(child.contents)
-            else:
-                column.append(child)
-        column = list(zip(['main']*len(column), column))
-
-    else:
-        # Extract results from two div sections
-        rso = []
-        # rso = soup.find('div', {'id':'rso'})
-
-        # Find section 1 results and append to rso list
-        section1 = soup.find_all('div', {'class':'sATSHe'})
-        # section1 = soup.find_all('div', {'class':'UDZeY OTFaAf'})
-        for div in section1:
-
-            # Conditional handling for Twitter result
-            if div.find('h2') and div.find('h2').text == "Twitter Results":
-                rso.append(div.find('div').parent)
-
-            # Conditional handling for g-section with header
-            elif div.find('g-section-with-header'):
-                rso.append(div.find('g-section-with-header').parent)
-
-            # Include divs with a "View more" type of button
-            elif div.find('g-more-link'):
-                rso.append(div)
-
-            # Include footer components that appear in the main column
-            elif div.find('div', {'class':'oIk2Cb'}):
-                rso.append(div)
-
-            else:
-                # Handle general results
-                for child in div.find_all('div', {'class':'g'}):
-                    rso.append(child)
-
-        # Find section 2 results and append to rso list
-        section2 = soup.find('div', {'class':'WvKfwe a3spGf'})
-        if section2:
-            for child in section2.children:
-                rso.append(child)
-
-        drop_tags = {'script', 'style'}
-        column = [('main', c) for c in rso if c.name not in drop_tags]
-
-    # Legacy parsing
-    # div_class = {'class':['g','bkWMgd']}
-    # column = [('main', r) for r in soup.find_all('div', div_class)]
-
-    # Remove empty rso component; hidden <h2> header
-    drop_text = {"Main results"}
-    column = [(cloc, c) for (cloc, c) in column if c.text not in drop_text]
-
-    # Hacky fix removing named Twitter component without content, possible G error
-    # Another fix for empty components, e.g. - <div></div>
-    drop_text = {'Twitter Results', ''}
-    column = [(cloc, c) for (cloc, c) in column if c.text not in drop_text]
-    return column
-
-
-
-def extract_components(soup):
-    """Extract SERP components
-
-    Args:
-        soup (bs4): BeautifulSoup SERP
-
-    Returns:
-        list: a rank ordered top-to-bottom and left-to-right list of
-        (component location, component soup) tuples
-    """
-
-    cmpts = []
-
-    # Top Image Carousel
-    top_bar = soup.find('div', {'id':'appbar'})
-    if top_bar:
-        has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src'))
-        if top_bar.find('g-scrolling-carousel') and has_img:
-            cmpts.append(('top_image_carousel', top_bar))
-
-    # Shopping Ads
-    shopping_ads = soup.find('div', {'class': 'commercial-unit-desktop-top'})
-    if shopping_ads:
-        cmpts.append(('shopping_ad', shopping_ads))
-
-    # Top Ads
-    ads = soup.find('div', {'id':'tads'})
-    if ads:
-        cmpts.append(('ad', ads))
-
-    column = extract_results_column(soup)
-    cmpts.extend(column)
-
-    # Bottom Ads
-    ads = soup.find('div', {'id':'tadsb'})
-    if ads:
-        cmpts.append(('ad', ads))
-
-    # Footer results
-    footer = extract_footer(soup)
-    if footer and extract_footer_components(footer):
-        cmpts.append(('footer', footer))
-
-    # RHS Knowledge Panel
-    rhs = soup.find('div', {'id': 'rhs'})
-    if rhs:
-        rhs_kp = rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel']})
-        if rhs_kp:
-            # reading from top-to-bottom, left-to-right
-            cmpts.append(('knowledge_rhs', rhs_kp))
-
-    return cmpts
-
-
 def parse_component(cmpt, cmpt_type='', cmpt_rank=0):
     """Parse a SERP component
 
@@ -188,18 +36,16 @@ def parse_component(cmpt, cmpt_type='', cmpt_rank=0):
     Args:
         cmpt (bs4 object): html element
         cmpt_type (str, optional): type of component
        cmpt_rank (int, optional): rank of the component
 
     Returns:
         dict: The parsed results and/or subresults
     """
+
     # Classify Component
     cmpt_type = cmpt_type if cmpt_type else classify_type(cmpt)
     assert cmpt_type, 'Null component type'
 
-    # if cmpt_type == 'directions':
-    #     print(cmpt)
-
     # Return unknown components
     if cmpt_type == 'unknown':
-        unknown_component = UNKNOWN_COMPONENT.copy()
-        unknown_component['cmpt_rank'] = 0
-        return [unknown_component]
+        parsed = BaseResult(type='unknown', sub_rank=0).model_dump()
+        parsed['cmpt_rank'] = cmpt_rank
+        return [parsed]
 
     # Parse component
     try:
@@ -220,6 +66,7 @@
 
     return parsed_cmpt
 
+
 def parse_serp(serp, serp_id=None, crawl_id=None, verbose=False, make_soup=False):
     """Parse a Search Engine Result Page (SERP)
 
@@ -267,13 +114,6 @@
     if crawl_id:
         serp_attrs['crawl_id'] = crawl_id
 
-    # Deprecated: Unused; can quickly get via regex post-parse
-    # serp_attrs.update({
-    #     'qry': parse_query(soup),
-    #     'lang': parse_lang(soup),
-    #     'lhs_bar': soup.find('div', {'class': 'OeVqAd'}) is not None,
-    # })
-
     for serp_rank, p in enumerate(parsed):
         p['serp_rank'] = serp_rank
         p.update(serp_attrs)
diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py
index f689ab8..420a68b 100644
--- a/WebSearcher/searchers.py
+++ b/WebSearcher/searchers.py
@@ -59,7 +59,8 @@
         # Set a log file, prints to console by default
         self.log = logger.Logger(
             file_name=log_fp,
-            file_mode=log_mode
+            file_mode=log_mode,
+            console=True if not log_fp else False,
         ).start(__name__)
 
         # Set an SSH tunnel - conducting the search from somewhere else
diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py
index 72b9f7b..7f2c29e 100644
--- a/WebSearcher/webutils.py
+++ b/WebSearcher/webutils.py
@@ -81,10 +81,21 @@ def parse_hashtags(text):
     hashtags = [re.sub(r"(\W+)$", "", h, flags=re.UNICODE) for h in hashtags]
     return list(set(hashtags))
 
+
+def parse_lang(soup):
+    """Parse language from html tags"""
+    try:
+        return soup.find('html').attrs['lang']
+    except Exception as e:
+        log.exception('Error while parsing language')
+        return None
+
+
 # Deprecated: text processing should be done after parsing not during
 # def parse_emojis(text):
 #     return [emoji.demojize(e['emoji']) for e in emoji.emoji_lis(text)]
 
+
 # Get divs, links, and text ----------------------------------------------------
diff --git a/setup.py b/setup.py
index ba02ac9..6130aac 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2):
 
 setuptools.setup(
     name='WebSearcher',
-    version='0.3.4',
+    version='0.3.5',
     url='http://github.com/gitronald/WebSearcher',
     author='Ronald E. Robertson',
     author_email='rer@acm.org',
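
Usage note: with this patch, extract_components moves from WebSearcher.parsers to the new
WebSearcher/extractors.py, but it is still re-exported at the package top level (see the
__init__.py hunk above), so only direct `from WebSearcher.parsers import extract_components`
imports need to change. A minimal sketch of the 0.3.5 parse flow follows; the serp.html path
is illustrative, and it assumes parse_serp's make_soup flag converts a raw HTML string to
soup, as its name suggests:

import WebSearcher as ws

# End-to-end: load a saved SERP and parse it into a list of result dicts
html = ws.load_html('serp.html')               # illustrative path
results = ws.parse_serp(html, make_soup=True)  # assumes make_soup converts raw HTML

# Or step through extraction and classification directly
soup = ws.make_soup(html)
for location, cmpt in ws.extract_components(soup):
    print(location, ws.classify_type(cmpt))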