From 5255e71b9a01e9b5e45e3e82bf6c1e463049a6b4 Mon Sep 17 00:00:00 2001
From: "Ronald E. Robertson" <rer@acm.org>
Date: Mon, 20 Nov 2023 14:20:24 -0800
Subject: [PATCH] Update extractors in new file, more model usage, logging
 defaults (#45)

* update: use model, no timestamp, sub_type funcs
* add: general result with video sub type
* update: consistent args,docs in html parsing utils
* fix: optional parsed values
* fix: use no attrs safe get_link
* update: parse 'complementary results' with general
* add: standalone models file
* update: people also ask, use model
* update: use models, limit details, recent SERPs
* update: filter out hidden footer components, usually PAA
* update: use models, limit details, recent SERPs
* Bump to 0.3.3
* fix: tw card classifying, better header classifier
* update: use models, current SERPs
* update: limit details
* update: optional ID attrs at result level
* update: classifier for map and local
* update: use models, modern SERPs
* update: minimal serp attrs, ignore hidden surveys
* add: filter-notice, filter hidden footer cmpts
* update: classify knowledge components before general
* update: grab children of blank div in main column
* update: account for vertical top stories
* fix: convert filter to list for empty check
* Bump to 0.3.4
* update: turn off console log if log file provided
* update: extractors in new file
* update: keep only text ppa suggestions, need selenium for more
* update: use models in main parsers
* update: handle layout shift extraction
* add: local news wrapper for top stories
* update: additional video div class
* update: include parse lang in webutils
* update: switch main init to .extractors
* Bump to 0.3.5
---
 WebSearcher/__init__.py                     |   5 +-
 WebSearcher/component_classifier.py         |   3 +-
 WebSearcher/component_parsers/__init__.py   |   2 +
 WebSearcher/component_parsers/local_news.py |  14 ++
 .../component_parsers/people_also_ask.py    |  36 +---
 WebSearcher/component_parsers/videos.py     |   1 +
 WebSearcher/extractors.py                   | 135 +++++++++++++
 WebSearcher/parsers.py                      | 182 ++----------------
 WebSearcher/searchers.py                    |   3 +-
 WebSearcher/webutils.py                     |  11 ++
 setup.py                                    |   2 +-
 11 files changed, 191 insertions(+), 203 deletions(-)
 create mode 100644 WebSearcher/component_parsers/local_news.py
 create mode 100644 WebSearcher/extractors.py

diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py
index 45951d0..a1593e5 100644
--- a/WebSearcher/__init__.py
+++ b/WebSearcher/__init__.py
@@ -1,6 +1,7 @@
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 from .searchers import SearchEngine
-from .parsers import parse_serp, extract_components
+from .parsers import parse_serp
+from .extractors import extract_components
 from .locations import download_locations
 from .component_classifier import classify_type
 from .webutils import load_html, make_soup, load_soup
diff --git a/WebSearcher/component_classifier.py b/WebSearcher/component_classifier.py
index 9050575..66108f5 100644
--- a/WebSearcher/component_classifier.py
+++ b/WebSearcher/component_classifier.py
@@ -79,6 +79,7 @@ def classify_header(cmpt: bs4.element.Tag, level):
         'Resultados de la Web': 'general',
         'Sports Results': 'knowledge',
         'Top stories': 'top_stories',
+        'Local news': 'local_news',
         'Translation Result': 'knowledge',
         'Twitter Results': 'twitter',
         'Unit Converter': 'knowledge',
@@ -111,7 +112,7 @@
     # Check for string matches in header text e.g. `h2.text`
     for header in filter(None, header_list):
         for text, label in header_dict.items():
-            if header.text.startswith(text):
+            if header.text.strip().startswith(text):
                 return label
 
     # Return unknown if no matches
diff --git a/WebSearcher/component_parsers/__init__.py b/WebSearcher/component_parsers/__init__.py
index f4bea68..a35de7c 100644
--- a/WebSearcher/component_parsers/__init__.py
+++ b/WebSearcher/component_parsers/__init__.py
@@ -20,6 +20,7 @@
 from .knowledge_rhs import parse_knowledge_rhs
 from .shopping_ads import parse_shopping_ads
 from .perspectives import parse_perspectives
+from .local_news import parse_local_news
 from .banner import parse_banner
 
 # Component details dataframe
@@ -34,6 +35,7 @@
     ('general_subresult', parse_general_results, 'General Subresult'),
     ('available_on', parse_available_on, 'Available On'),
     ('top_stories', parse_top_stories, 'Top Stories'),
+    ('local_news', parse_local_news, 'Local News'),
     ('latest_from', parse_latest_from, 'Latest From'),
     ('view_more_news', parse_view_more_news, 'View More News'),
     ('news_quotes', parse_news_quotes, 'News Quotes'),
diff --git a/WebSearcher/component_parsers/local_news.py b/WebSearcher/component_parsers/local_news.py
new file mode 100644
index 0000000..ed879da
--- /dev/null
+++ b/WebSearcher/component_parsers/local_news.py
@@ -0,0 +1,14 @@
+from . import parse_top_stories
+
+def parse_local_news(cmpt):
+    """Parse a "Local news" component
+
+    These components are the same as Top Stories, but have a different heading.
+
+    Args:
+        cmpt (bs4 object): A local news component
+
+    Returns:
+        list : list of parsed results
+    """
+    return parse_top_stories(cmpt, ctype='local_news')
diff --git a/WebSearcher/component_parsers/people_also_ask.py b/WebSearcher/component_parsers/people_also_ask.py
index 7fe99f7..5fab95b 100644
--- a/WebSearcher/component_parsers/people_also_ask.py
+++ b/WebSearcher/component_parsers/people_also_ask.py
@@ -15,26 +15,23 @@ def parse_people_also_ask(cmpt, sub_rank=0):
     Returns:
         list : list of parsed subcomponent dictionaries
     """
+
+    # questions = cmpt.find_all('g-accordion-expander')
+    # questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
+    questions = cmpt.find_all("div", {"class":"related-question-pair"})
+    details = [parse_question(q) for q in questions] if questions else None
+
     parsed = BaseResult(
         type='people_also_ask',
         sub_rank=sub_rank,
+        details=details,
     )
-    # questions = cmpt.find_all('g-accordion-expander')
-    # questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
-    questions = cmpt.find_all("div", {"class":"related-question-pair"})
-    parsed.details = [parse_question(q) for q in questions] if questions else None
     return [parsed.model_dump()]
 
 
 def parse_question(question):
     """Parse an individual question in a "People Also Ask" component"""
-
-    # Get query and URL fragments
-    parsed = {
-        'title': None,
-        'url': None,
-    }
 
     # Get title
     title_divs = [
         question.find('div', {'class': 'wQiwMc'}),  # 2023-11-16
         question.find('div', {'class': 'JlqpRe'}),  # 2023-11-16
     ]
     for title_div in filter(None, title_divs):
-        parsed['title'] = webutils.get_text(title_div)
-        parsed['url'] = webutils.get_link(title_div)
-
-    # Get citation
-    parsed['cite'] = webutils.get_text(question, 'cite')
-
-    # Get text
-    replace = ['qry', 'title', 'cite']
-    text = question.text.replace('Search for: ', '')
-    for key in replace:
-        if key in parsed.keys() and parsed[key]:
-            text = text.replace(parsed[key], '')
-    parsed['text'] = text if text else None
-
-    return parsed
-
-
+        text = webutils.get_text(title_div)
+        return text
diff --git a/WebSearcher/component_parsers/videos.py b/WebSearcher/component_parsers/videos.py
index 8a295e0..6578dbb 100644
--- a/WebSearcher/component_parsers/videos.py
+++ b/WebSearcher/component_parsers/videos.py
@@ -26,6 +26,7 @@ def parse_videos(cmpt):
     divs.extend(webutils.find_all_divs(cmpt, 'g-inner-card'))
     divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'VibNM'}))
     divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'mLmaBd'}))
+    divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'RzdJxc'}))
     # divs.extend(cmpt.find_all('div', {'class':'sI5x9c'}))  # Selects a level too low, missing links.
     divs = list(filter(None, divs))
 
diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py
new file mode 100644
index 0000000..bd8f398
--- /dev/null
+++ b/WebSearcher/extractors.py
@@ -0,0 +1,135 @@
+from .component_parsers.footer import extract_footer, extract_footer_components
+
+def extract_results_column(soup):
+    """Extract SERP components
+
+    Args:
+        soup (bs4): BeautifulSoup SERP
+
+    Returns:
+        list: a list of HTML result components
+    """
+
+    # Drop tags
+    drop_tags = {'script', 'style', None}
+
+    # Check if layout contains left side bar
+    layout_shift = [
+        soup.find('div', {'class': 'OeVqAd'}),  # left side bar
+        soup.find('div', {'class': 'M8OgIe'}),  # top bar
+    ]
+    rso = soup.find('div', {'id':'rso'})
+    column = []
+
+    if not any(layout_shift) and rso:
+        for child in rso.children:
+            if child.name in drop_tags:
+                continue
+            if not child.attrs:
+                column.extend(child.contents)
+            else:
+                column.append(child)
+    elif rso:
+        # Extract results from two div sections
+
+        # Find section 1 results and append to column list
+        column = rso.find_all('div', {'class':'sATSHe'})
+        column = [c for c in column if c.name not in drop_tags]
+
+    else:
+        section1 = soup.find_all('div', {'class':'UDZeY OTFaAf'})
+        for div in section1:
+
+            # Conditional handling for Twitter result
+            if div.find('h2') and div.find('h2').text == "Twitter Results":
+                column.append(div.find('div').parent)
+
+            # Conditional handling for g-section with header
+            elif div.find('g-section-with-header'):
+                column.append(div.find('g-section-with-header').parent)
+
+            # Include divs with a "View more" type of button
+            elif div.find('g-more-link'):
+                column.append(div)
+
+            # Include footer components that appear in the main column
+            elif div.find('div', {'class':'oIk2Cb'}):
+                column.append(div)
+
+            else:
+                # Handle general results
+                for child in div.find_all('div', {'class':'g'}):
+                    column.append(child)
+
+        # Find section 2 results and append to column list
+        section2 = soup.find('div', {'class':'WvKfwe a3spGf'})
+        if section2:
+            for child in section2.children:
+                column.append(child)
+        column = [c for c in column if c.name not in drop_tags]
+
+    # Drop empty components
+    drop_text = {
+        "Main results",  # Remove empty rso component; hidden <h2> header
+        "Twitter Results",  # Remove empty Twitter component
+        "",  # Remove empty divs
+    }
+    column = [c for c in column if c.text not in drop_text]
+    column = list(zip(['main']*len(column), column))
+
+
+    return column
+
+
+def extract_components(soup):
+    """Extract SERP components
+
+    Args:
+        soup (bs4): BeautifulSoup SERP
+
+    Returns:
+        list: a rank ordered top-to-bottom and left-to-right list of
+        (component location, component soup) tuples
+    """
+
+    cmpts = []
+
+    # Top Image Carousel
+    top_bar = soup.find('div', {'id':'appbar'})
+    if top_bar:
+        has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src'))
+        if top_bar.find('g-scrolling-carousel') and has_img:
+            cmpts.append(('top_image_carousel', top_bar))
+
+    # Shopping Ads
+    shopping_ads = soup.find('div', {'class': 'commercial-unit-desktop-top'})
+    if shopping_ads:
+        cmpts.append(('shopping_ad', shopping_ads))
+
+    # Top Ads
+    ads = soup.find('div', {'id':'tads'})
+    if ads:
+        cmpts.append(('ad', ads))
+
+    column = extract_results_column(soup)
+    cmpts.extend(column)
+
+    # Bottom Ads
+    ads = soup.find('div', {'id':'tadsb'})
+    if ads:
+        cmpts.append(('ad', ads))
+
+    # Footer results
+    footer = extract_footer(soup)
+    if footer and extract_footer_components(footer):
+        cmpts.append(('footer', footer))
+
+    # RHS Knowledge Panel
+    rhs = soup.find('div', {'id': 'rhs'})
+    if rhs:
+        rhs_kp = rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel']})
+        if rhs_kp:
+            # reading from top-to-bottom, left-to-right
+            cmpts.append(('knowledge_rhs', rhs_kp))
+
+    return cmpts
diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py
index 1659824..8ee6d8e 100644
--- a/WebSearcher/parsers.py
+++ b/WebSearcher/parsers.py
@@ -1,30 +1,14 @@
 from . import webutils
 from .component_classifier import classify_type
 from .component_parsers import type_functions
-from .component_parsers.footer import extract_footer, extract_footer_components
-from . import logger
-log = logger.Logger().start(__name__)
+from .extractors import extract_components
+from .models import BaseResult
+from .logger import Logger
+log = Logger().start(__name__)
 
 import traceback
 from bs4 import BeautifulSoup
 
-UNKNOWN_COMPONENT = {
-    'sub_rank':0,
-    'type': 'unknown'
-}
-
-def parse_query(soup):
-    """Parse query from title of html soup"""
-    title = str(soup.html.find('title'))
-    return webutils.strip_html_tags(title).split(" - ")[0]
-
-def parse_lang(soup):
-    """Parse language from html tags"""
-    try:
-        return soup.find('html').attrs['lang']
-    except Exception as e:
-        log.exception('Error while parsing language')
-        return None
 
 def get_component_parser(cmpt_type, cmpt_funcs=type_functions):
     """Returns the parser for a given component type"""
@@ -33,150 +17,14 @@
     except KeyError as e:
         return not_implemented
 
+
 def not_implemented(cmpt):
     """Placeholder function for component parsers that are not implemented"""
-    parsed = UNKNOWN_COMPONENT.copy()
-    parsed['type'] = classify_type(cmpt)
+    parsed = BaseResult(type=classify_type(cmpt), sub_rank=0).model_dump()
     parsed['error'] = 'not implemented'
     return [parsed]
 
-def extract_results_column(soup):
-    """Extract SERP components
-
-    Args:
-        soup (bs4): BeautifulSoup SERP
-
-    Returns:
-        list: a list of HTML result components
-    """
-    # Check if layout contains left side bar
-    left_side_bar = soup.find('div', {'class': 'OeVqAd'})
-    rso = soup.find('div', {'id':'rso'})
-
-    if not left_side_bar and rso:
-        # Extract results from single div
-        drop_tags = {'script', 'style', None}
-        column = []
-        for child in rso.children:
-            if child.name in drop_tags:
-                continue
-            if not child.attrs:
-                column.extend(child.contents)
-            else:
-                column.append(child)
-        column = list(zip(['main']*len(column), column))
-
-    else:
-        # Extract results from two div sections
-        rso = []
-        # rso = soup.find('div', {'id':'rso'})
-
-        # Find section 1 results and append to rso list
-        section1 = soup.find_all('div', {'class':'sATSHe'})
-        # section1 = soup.find_all('div', {'class':'UDZeY OTFaAf'})
-        for div in section1:
-
-            # Conditional handling for Twitter result
-            if div.find('h2') and div.find('h2').text == "Twitter Results":
-                rso.append(div.find('div').parent)
-
-            # Conditional handling for g-section with header
-            elif div.find('g-section-with-header'):
-                rso.append(div.find('g-section-with-header').parent)
-
-            # Include divs with a "View more" type of button
-            elif div.find('g-more-link'):
-                rso.append(div)
-
-            # Include footer components that appear in the main column
-            elif div.find('div', {'class':'oIk2Cb'}):
-                rso.append(div)
-
-            else:
-                # Handle general results
-                for child in div.find_all('div', {'class':'g'}):
-                    rso.append(child)
-
-        # Find section 2 results and append to rso list
-        section2 = soup.find('div', {'class':'WvKfwe a3spGf'})
-        if section2:
-            for child in section2.children:
-                rso.append(child)
-
-        drop_tags = {'script', 'style'}
-        column = [('main', c) for c in rso if c.name not in drop_tags]
-
-    # Legacy parsing
-    # div_class = {'class':['g','bkWMgd']}
-    # column = [('main', r) for r in soup.find_all('div', div_class)]
-
-    # Remove empty rso component; hidden <h2> header
-    drop_text = {"Main results"}
-    column = [(cloc, c) for (cloc, c) in column if c.text not in drop_text]
-
-    # Hacky fix removing named Twitter component without content, possible G error
-    # Another fix for empty components, e.g. - <div></div>
-    drop_text = {'Twitter Results', ''}
-    column = [(cloc, c) for (cloc, c) in column if c.text not in drop_text]
-    return column
-
-
-
-def extract_components(soup):
-    """Extract SERP components
-
-    Args:
-        soup (bs4): BeautifulSoup SERP
-
-    Returns:
-        list: a rank ordered top-to-bottom and left-to-right list of
-        (component location, component soup) tuples
-    """
-
-    cmpts = []
-
-    # Top Image Carousel
-    top_bar = soup.find('div', {'id':'appbar'})
-    if top_bar:
-        has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src'))
-        if top_bar.find('g-scrolling-carousel') and has_img:
-            cmpts.append(('top_image_carousel', top_bar))
-
-    # Shopping Ads
-    shopping_ads = soup.find('div', {'class': 'commercial-unit-desktop-top'})
-    if shopping_ads:
-        cmpts.append(('shopping_ad', shopping_ads))
-
-    # Top Ads
-    ads = soup.find('div', {'id':'tads'})
-    if ads:
-        cmpts.append(('ad', ads))
-
-    column = extract_results_column(soup)
-    cmpts.extend(column)
-
-    # Bottom Ads
-    ads = soup.find('div', {'id':'tadsb'})
-    if ads:
-        cmpts.append(('ad', ads))
-
-    # Footer results
-    footer = extract_footer(soup)
-    if footer and extract_footer_components(footer):
-        cmpts.append(('footer', footer))
-
-    # RHS Knowledge Panel
-    rhs = soup.find('div', {'id': 'rhs'})
-    if rhs:
-        rhs_kp = rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel']})
-        if rhs_kp:
-            # reading from top-to-bottom, left-to-right
-            cmpts.append(('knowledge_rhs', rhs_kp))
-
-    return cmpts
-
-
 def parse_component(cmpt, cmpt_type='', cmpt_rank=0):
     """Parse a SERP component
 
@@ -188,18 +36,16 @@ def parse_component(cmpt, cmpt_type='', cmpt_rank=0):
     Args:
         cmpt (bs4 object): html element
         cmpt_type (str, optional): type of component
        cmpt_rank (int, optional): rank of the component
 
     Returns:
         dict: The parsed results and/or subresults
     """
+
     # Classify Component
     cmpt_type = cmpt_type if cmpt_type else classify_type(cmpt)
     assert cmpt_type, 'Null component type'
 
-    # if cmpt_type == 'directions':
-    #     print(cmpt)
-
     # Return unknown components
     if cmpt_type == 'unknown':
-        unknown_component = UNKNOWN_COMPONENT.copy()
-        unknown_component['cmpt_rank'] = 0
-        return [unknown_component]
+        parsed = BaseResult(type='unknown', sub_rank=0).model_dump()
+        parsed['cmpt_rank'] = cmpt_rank
+        return [parsed]
 
     # Parse component
     try:
@@ -220,6 +66,7 @@
 
     return parsed_cmpt
 
+
 def parse_serp(serp, serp_id=None, crawl_id=None, verbose=False, make_soup=False):
     """Parse a Search Engine Result Page (SERP)
 
@@ -267,13 +114,6 @@
     if crawl_id:
         serp_attrs['crawl_id'] = crawl_id
 
-    # Deprecated: Unused; can quickly get via regex post-parse
-    # serp_attrs.update({
-    #     'qry': parse_query(soup),
-    #     'lang': parse_lang(soup),
-    #     'lhs_bar': soup.find('div', {'class': 'OeVqAd'}) is not None,
-    # })
-
     for serp_rank, p in enumerate(parsed):
         p['serp_rank'] = serp_rank
         p.update(serp_attrs)
diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py
index f689ab8..420a68b 100644
--- a/WebSearcher/searchers.py
+++ b/WebSearcher/searchers.py
@@ -59,7 +59,8 @@
         # Set a log file, prints to console by default
         self.log = logger.Logger(
             file_name=log_fp,
-            file_mode=log_mode
+            file_mode=log_mode,
+            console=True if not log_fp else False,
         ).start(__name__)
 
         # Set an SSH tunnel - conducting the search from somewhere else
diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py
index 72b9f7b..7f2c29e 100644
--- a/WebSearcher/webutils.py
+++ b/WebSearcher/webutils.py
@@ -81,10 +81,21 @@ def parse_hashtags(text):
     hashtags = [re.sub(r"(\W+)$", "", h, flags=re.UNICODE) for h in hashtags]
     return list(set(hashtags))
 
+
+def parse_lang(soup):
+    """Parse language from html tags"""
+    try:
+        return soup.find('html').attrs['lang']
+    except Exception as e:
+        log.exception('Error while parsing language')
+        return None
+
+
 # Deprecated: text processing should be done after parsing not during
 # def parse_emojis(text):
 #     return [emoji.demojize(e['emoji']) for e in emoji.emoji_lis(text)]
 
+
 # Get divs, links, and text ----------------------------------------------------
diff --git a/setup.py b/setup.py
index ba02ac9..6130aac 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ def get_readme_descriptions(fp='README.md', s='#', stop_at=2):
 
 setuptools.setup(
     name='WebSearcher',
-    version='0.3.4',
+    version='0.3.5',
     url='http://github.com/gitronald/WebSearcher',
     author='Ronald E. Robertson',
     author_email='rer@acm.org',
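
Usage note: with this patch, extract_components moves from WebSearcher.parsers to the new
WebSearcher/extractors.py, but it is still re-exported at the package top level (see the
__init__.py hunk above), so only direct `from WebSearcher.parsers import extract_components`
imports need to change. A minimal sketch of the 0.3.5 parse flow follows; the serp.html path
is illustrative, and it assumes parse_serp's make_soup flag converts a raw HTML string to
soup, as its name suggests:

import WebSearcher as ws

# End-to-end: load a saved SERP and parse it into a list of result dicts
html = ws.load_html('serp.html')               # illustrative path
results = ws.parse_serp(html, make_soup=True)  # assumes make_soup converts raw HTML

# Or step through extraction and classification directly
soup = ws.make_soup(html)
for location, cmpt in ws.extract_components(soup):
    print(location, ws.classify_type(cmpt))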