update: local results text and labels

gitronald · May 9, 2024 · 5658a04 · 5658a04
1 parent 6944dbd
commit 5658a04
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 82 deletions.
diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.3.12.dev2"
+__version__ = "0.3.12.dev3"
 from .searchers import SearchEngine
 from .parsers import parse_serp
 from .extractors import extract_components

diff --git a/WebSearcher/component_parsers/local_results.py b/WebSearcher/component_parsers/local_results.py
@@ -1,80 +1,100 @@
-from .. import utils
-from .. import webutils
-from ..models import BaseResult
-
-def parse_local_results(cmpt):
-    """Parse a "Local Results" component
-
-    These components contain an embedded map followed by vertically 
-    stacked subcomponents for locations. These locations are typically 
-    businesses relevant to the query.
-    
-    Args:
-        cmpt (bs4 object): A local results component
-    
-    Returns:
-        list : list of parsed subcomponent dictionaries
-    """
-    subs = cmpt.find_all('div', {'class': 'VkpGBb'})
-    parsed = [parse_local_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
-    if parsed:
-        return parsed
-    else:
-        return [BaseResult(
-        type='local_results',
-        sub_rank=0,
-        text=webutils.get_text(cmpt, 'div', {'class': 'n6tePd'}) # No results message
-    ).model_dump()]
-
-
-def parse_local_result(sub, sub_rank=0):
-    """Parse a "Local Results" subcomponent
-    
-    Args:
-        sub (bs4 object): A local results subcomponent
-    
-    Returns:
-        dict : parsed subresult
-    """
-
-    details = parse_local_details(sub)
-
-    parsed = BaseResult(
-        type='local_results',
-        sub_rank=sub_rank,
-        title=webutils.get_text(sub, 'div', {'class': 'dbg0pd'}),
-        url=details['website'] if 'website' in details else None,
-        details=details,
-    )
-    return parsed.model_dump()
-
-
-def parse_local_details(sub):
-
-    local_details = {}
-
-    # Extract summary details
-    detail_div = sub.find('span', {'class':'rllt__details'})
-    detail_divs = detail_div.find_all('div') if detail_div else None
-
-    # Extract rating and location type
-    if detail_divs:
-        rating_div = detail_divs[0]
-        rating = rating_div.find('span', {'class':'BTtC6e'})
-        if rating: 
-            local_details['rating'] = float(rating.text)
-            n_reviews = utils.get_between_parentheses(rating_div.text).replace(',','')
-            local_details['n_reviews'] = int(n_reviews)
-        local_details['loc_label'] = rating_div.text.split('·')[-1].strip()
-
-        # Extract contact details
-        if len(detail_divs) > 1:
-            contact_div = detail_divs[1]
-            local_details['contact'] = contact_div.text
-
-    # Extract various links
-    links = [a.attrs['href'] for a in sub.find_all('a') if 'href' in a.attrs]
-    links_text = [a.text.lower() for a in sub.find_all('a') if 'href' in a.attrs]
-    links_dict = dict(zip(links_text, links))
-    local_details.update(links_dict)
+from .. import utils
+from .. import webutils
+from ..models import BaseResult
+
+def parse_local_results(cmpt):
+    """Parse a "Local Results" component into a list of result dicts
+
+    These components contain an embedded map followed by vertically
+    stacked subcomponents for locations, typically businesses relevant
+    to the query. Each parsed location is tagged with a sub_type taken
+    from the first non-empty heading (e.g. "Places" -> "places").
+
+    Args:
+        cmpt (bs4 object): A local results component
+
+    Returns:
+        list : parsed subcomponent dictionaries, or a single entry with
+            the component's message text when no locations are found
+    """
+    locations = cmpt.find_all('div', {'class': 'VkpGBb'})
+    results = [parse_local_result(loc, rank) for rank, loc in enumerate(locations)]
+
+    if not results:
+        # No locations found: fall back to the "no results" message
+        message = webutils.get_text(cmpt, 'div', {'class': 'n6tePd'})
+        empty = BaseResult(type='local_results', sub_rank=0, text=message)
+        return [empty.model_dump()]
+
+    # Both heading lookups run; the first non-empty one becomes sub_type
+    headings = (
+        webutils.get_text(cmpt, "h2", {"role":"heading"}),
+        webutils.get_text(cmpt, 'div', {'aria-level':"2", "role":"heading"}),
+    )
+    heading = next((h for h in headings if h), None)
+    if heading is not None:
+        sub_type = str(heading).lower().replace(" ", "_")
+        for result in results:
+            result['sub_type'] = sub_type
+
+    return results
+
+def parse_local_result(sub, sub_rank=0):
+    """Parse a single "Local Results" subcomponent
+
+    Args:
+        sub (bs4 object): A local results subcomponent
+        sub_rank (int): Index of the subcomponent among its siblings
+
+    Returns:
+        dict : parsed subresult, validated through BaseResult
+    """
+    title = webutils.get_text(sub, 'div', {'class':'dbg0pd'})
+
+    # Map each link's lowercased anchor text to its href
+    anchors = [a for a in sub.find_all('a') if 'href' in a.attrs]
+    hrefs_by_text = {a.text.lower(): a.attrs['href'] for a in anchors}
+
+    # Detail text, with the label (if any) appended in <label> tags
+    text = webutils.get_text(sub, 'div', {'class':'rllt__details'}, separator='<|>')
+    label = webutils.get_text(sub, "span", {"class":"X0w5lc"})
+    if label:
+        text = f"{text} <label>{label}</label>"
+
+    validated = BaseResult(
+        type='local_results', sub_rank=sub_rank, title=title,
+        url=hrefs_by_text.get('website'), text=text,
+        details=parse_local_details(sub),
+    )
+    return validated.model_dump()
+
+
+def parse_local_details(sub):
+    """Collect rating, label, contact, and link details for a location
+
+    Args:
+        sub (bs4 object): A local results subcomponent
+
+    Returns:
+        dict : details found, plus one entry per link keyed by link text
+    """
+    local_details = {}
+    # Summary details: divs nested under the rllt__details span
+    summary = sub.find('span', {'class':'rllt__details'})
+    rows = summary.find_all('div') if summary else None
+    if rows:
+        # First row: rating, parenthesized review count, '·'-separated label
+        first = rows[0]
+        rating = first.find('span', {'class':'BTtC6e'})
+        if rating:
+            local_details['rating'] = float(rating.text)
+            n_reviews = utils.get_between_parentheses(first.text).replace(',','')
+            local_details['n_reviews'] = int(n_reviews)
+        local_details['loc_label'] = first.text.split('·')[-1].strip()
+        if len(rows) > 1:  # second row holds the contact details
+            local_details['contact'] = rows[1].text
+
+    # Add every link that has an href, keyed by its lowercased text
+    anchors = [a for a in sub.find_all('a') if 'href' in a.attrs]
+    local_details.update({a.text.lower(): a.attrs['href'] for a in anchors})
     return local_details
diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py
@@ -120,10 +120,14 @@ def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty
         divs = [c for c in divs if c.text != '']
     return divs
 
-def find_children(soup, name: str, attrs: dict = {}) -> list:
+def find_children(soup, name: str, attrs: dict = {}, filter_empty: bool = False):
     """Find all children of a div with a given name and attribute"""
     div = get_div(soup, name, attrs)
-    return div.children if div else []
+    children = div.children if div else []
+    if filter_empty:
+        # Drop falsy children, then any child whose text is empty
+        children = [c for c in children if c and c.text != '']
+    return children
 
 
 # URLs -------------------------------------------------------------------------