Skip to content

Commit

Permalink
Update extractors in new file, more model usage, logging defaults (#45)
Browse files Browse the repository at this point in the history
* update: use model, no timestamp, sub_type funcs

* add: general result with video sub type

* update: consistent args,docs in html parsing utils

* fix: optional parsed values

* fix: use no attrs safe get_link

* update: parse 'complementary results' with general

* add: standalone models file

* update: people also ask, use model

* update: use models, limit details, recent SERPs

* update: filter out hidden footer components, usually PAA

* update: use models, limit details, recent SERPs

* Bump to 0.3.3

* fix: tw card classifying, better header classifier

* update: use models, current SERPs

* update: limit details

* update: optional ID attrs at result level

* update: classifier for map and local

* update: use models, modern SERPs

* update: minimal serp attrs, ignore hidden surveys

* add: filter-notice, filter hidden footer cmpts

* update: classify knowledge components before general

* update: grab children of blank div in main column

* update: account for vertical top stories

* fix: convert filter to list for empty check

* Bump to 0.3.4

* update: turn off console log if log file provided

* update: extractors in new file

* update: keep only text ppa suggestions, need selenium for more

* update: use models in main parsers

* update: handle layout shift extraction

* add: local news wrapper for top stories

* update: additional video div class

* update: include parse lang in webutils

* update: switch main init to .extractors

* Bump to 0.3.5
  • Loading branch information
gitronald authored Nov 20, 2023
1 parent 491d605 commit 5255e71
Show file tree
Hide file tree
Showing 11 changed files with 191 additions and 203 deletions.
5 changes: 3 additions & 2 deletions WebSearcher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__version__ = "0.3.4"
__version__ = "0.3.5"
from .searchers import SearchEngine
from .parsers import parse_serp, extract_components
from .parsers import parse_serp
from .extractors import extract_components
from .locations import download_locations
from .component_classifier import classify_type
from .webutils import load_html, make_soup, load_soup
3 changes: 2 additions & 1 deletion WebSearcher/component_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def classify_header(cmpt: bs4.element.Tag, level):
'Resultados de la Web': 'general',
'Sports Results': 'knowledge',
'Top stories': 'top_stories',
'Local news': 'local_news',
'Translation Result': 'knowledge',
'Twitter Results': 'twitter',
'Unit Converter': 'knowledge',
Expand Down Expand Up @@ -111,7 +112,7 @@ def classify_header(cmpt: bs4.element.Tag, level):
# Check for string matches in header text e.g. `h2.text`
for header in filter(None, header_list):
for text, label in header_dict.items():
if header.text.startswith(text):
if header.text.strip().startswith(text):
return label

# Return unknown if no matches
Expand Down
2 changes: 2 additions & 0 deletions WebSearcher/component_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .knowledge_rhs import parse_knowledge_rhs
from .shopping_ads import parse_shopping_ads
from .perspectives import parse_perspectives
from .local_news import parse_local_news
from .banner import parse_banner

# Component details dataframe
Expand All @@ -34,6 +35,7 @@
('general_subresult', parse_general_results, 'General Subresult'),
('available_on', parse_available_on, 'Available On'),
('top_stories', parse_top_stories, 'Top Stories'),
('local_news', parse_local_news, 'Local News'),
('latest_from', parse_latest_from, 'Latest From'),
('view_more_news', parse_view_more_news, 'View More News'),
('news_quotes', parse_news_quotes, 'News Quotes'),
Expand Down
14 changes: 14 additions & 0 deletions WebSearcher/component_parsers/local_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from . import parse_top_stories

def parse_local_news(cmpt):
    """Parse a "Local news" component

    These components are the same as Top Stories, but have a different heading,
    so parsing is delegated to parse_top_stories with a 'local_news' type label.

    Args:
        cmpt (bs4 object): A local news component
    Returns:
        dict : parsed result
    """
    return parse_top_stories(cmpt, ctype='local_news')
36 changes: 9 additions & 27 deletions WebSearcher/component_parsers/people_also_ask.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,23 @@ def parse_people_also_ask(cmpt, sub_rank=0):
Returns:
list : list of parsed subcomponent dictionaries
"""

# questions = cmpt.find_all('g-accordion-expander')
# questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
questions = cmpt.find_all("div", {"class":"related-question-pair"})
details = [parse_question(q) for q in questions] if questions else None

parsed = BaseResult(
type='people_also_ask',
sub_rank=sub_rank,
details=details,
)
# questions = cmpt.find_all('g-accordion-expander')
# questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
questions = cmpt.find_all("div", {"class":"related-question-pair"})
parsed.details = [parse_question(q) for q in questions] if questions else None

return [parsed.model_dump()]


def parse_question(question):
"""Parse an individual question in a "People Also Ask" component"""

# Get query and URL fragments
parsed = {
'title': None,
'url': None,
}

# Get title
title_divs = [
Expand All @@ -43,20 +40,5 @@ def parse_question(question):
question.find('div', {'class': 'JlqpRe'}), # 2023-11-16
]
for title_div in filter(None, title_divs):
parsed['title'] = webutils.get_text(title_div)
parsed['url'] = webutils.get_link(title_div)

# Get citation
parsed['cite'] = webutils.get_text(question, 'cite')

# Get text
replace = ['qry', 'title', 'cite']
text = question.text.replace('Search for: ', '')
for key in replace:
if key in parsed.keys() and parsed[key]:
text = text.replace(parsed[key], '')
parsed['text'] = text if text else None

return parsed


text = webutils.get_text(title_div)
return text
1 change: 1 addition & 0 deletions WebSearcher/component_parsers/videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def parse_videos(cmpt):
divs.extend(webutils.find_all_divs(cmpt, 'g-inner-card'))
divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'VibNM'}))
divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'mLmaBd'}))
divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'RzdJxc'}))
# divs.extend(cmpt.find_all('div', {'class':'sI5x9c'})) # Selects a level too low, missing links.
divs = list(filter(None, divs))

Expand Down
135 changes: 135 additions & 0 deletions WebSearcher/extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
from .component_parsers.footer import extract_footer, extract_footer_components

def extract_results_column(soup):
    """Extract the main (left) column of result components from a SERP.

    Handles three observed page layouts:
    1. Standard: results are direct children of ``div#rso``.
    2. Shifted (a left side bar or top bar is present): results sit in
       ``div.sATSHe`` wrappers under ``div#rso``.
    3. No ``div#rso``: results are split across ``div.UDZeY.OTFaAf``
       sections plus an optional ``div.WvKfwe.a3spGf`` second section.

    Args:
        soup (bs4): BeautifulSoup SERP
    Returns:
        list: a list of ('main', component soup) tuples, in page order
    """

    # Tag names to exclude from results; None matches bare text nodes,
    # whose .name is None in bs4
    drop_tags = {'script', 'style', None}

    # Check if layout contains left side bar or top bar (layout shift)
    layout_shift = [
        soup.find('div', {'class': 'OeVqAd'}), # left side bar
        soup.find('div', {'class': 'M8OgIe'}), # top bar
    ]
    rso = soup.find('div', {'id':'rso'})
    column = []

    if not any(layout_shift) and rso:
        # Standard layout: iterate direct children of #rso
        for child in rso.children:
            if child.name in drop_tags:
                continue
            if not child.attrs:
                # Attribute-less wrapper div: promote its children instead
                column.extend(child.contents)
            else:
                column.append(child)
    elif rso:
        # Extract results from two div sections

        # Find section 1 results and append to rso list
        column = rso.find_all('div', {'class':'sATSHe'})
        column = [c for c in column if c.name not in drop_tags]

    else:
        # No #rso: fall back to legacy two-section layout
        section1 = soup.find_all('div', {'class':'UDZeY OTFaAf'})
        for div in section1:

            # Conditional handling for Twitter result
            if div.find('h2') and div.find('h2').text == "Twitter Results":
                column.append(div.find('div').parent)

            # Conditional handling for g-section with header
            elif div.find('g-section-with-header'):
                column.append(div.find('g-section-with-header').parent)

            # Include divs with a "View more" type of button
            elif div.find('g-more-link'):
                column.append(div)

            # Include footer components that appear in the main column
            elif div.find('div', {'class':'oIk2Cb'}):
                column.append(div)

            else:
                # Handle general results
                for child in div.find_all('div', {'class':'g'}):
                    column.append(child)

        # Find section 2 results and append to column list
        section2 = soup.find('div', {'class':'WvKfwe a3spGf'})
        if section2:
            for child in section2.children:
                column.append(child)
        column = [c for c in column if c.name not in drop_tags]

    # Drop empty components (matched on full text content)
    drop_text = {
        "Main results", # Remove empty rso component; hidden <h2> header
        "Twitter Results", # Remove empty Twitter component
        "", # Remove empty divs
    }
    column = [c for c in column if c.text not in drop_text]
    # Tag every component with its page location for downstream ranking
    column = list(zip(['main']*len(column), column))


    return column


def extract_components(soup):
    """Extract SERP components
    Args:
        soup (bs4): BeautifulSoup SERP
    Returns:
        list: a rank ordered top-to-bottom and left-to-right list of
              (component location, component soup) tuples
    """

    components = []

    # Top Image Carousel: needs a scrolling carousel plus a tag with a
    # static `src` attribute (no lazy-loaded `data-src`)
    appbar = soup.find('div', {'id':'appbar'})
    if appbar:
        static_img = appbar.find(
            lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')
        )
        if static_img and appbar.find('g-scrolling-carousel'):
            components.append(('top_image_carousel', appbar))

    # Shopping Ads
    shopping_div = soup.find('div', {'class': 'commercial-unit-desktop-top'})
    if shopping_div:
        components.append(('shopping_ad', shopping_div))

    # Top Ads
    top_ads = soup.find('div', {'id':'tads'})
    if top_ads:
        components.append(('ad', top_ads))

    # Main results column
    components.extend(extract_results_column(soup))

    # Bottom Ads
    bottom_ads = soup.find('div', {'id':'tadsb'})
    if bottom_ads:
        components.append(('ad', bottom_ads))

    # Footer results: keep only if the footer has parseable subcomponents
    footer = extract_footer(soup)
    if footer and extract_footer_components(footer):
        components.append(('footer', footer))

    # RHS Knowledge Panel
    rhs = soup.find('div', {'id': 'rhs'})
    if rhs:
        rhs_kp = rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel']})
        if rhs_kp:
            # reading from top-to-bottom, left-to-right
            components.append(('knowledge_rhs', rhs_kp))

    return components
Loading

0 comments on commit 5255e71

Please sign in to comment.