Commit

update: local results text and labels
gitronald committed May 9, 2024
1 parent 6944dbd commit 5658a04
Showing 3 changed files with 106 additions and 82 deletions.
2 changes: 1 addition & 1 deletion WebSearcher/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.3.12.dev2"
__version__ = "0.3.12.dev3"
from .searchers import SearchEngine
from .parsers import parse_serp
from .extractors import extract_components
178 changes: 99 additions & 79 deletions WebSearcher/component_parsers/local_results.py
@@ -1,80 +1,100 @@
from .. import utils
from .. import webutils
from ..models import BaseResult

def parse_local_results(cmpt):
"""Parse a "Local Results" component
These components contain an embedded map followed by vertically
stacked subcomponents for locations. These locations are typically
businesses relevant to the query.
Args:
cmpt (bs4 object): A local results component
Returns:
list : list of parsed subcomponent dictionaries
"""
subs = cmpt.find_all('div', {'class': 'VkpGBb'})
parsed = [parse_local_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
if parsed:
return parsed
else:
return [BaseResult(
type='local_results',
sub_rank=0,
text=webutils.get_text(cmpt, 'div', {'class': 'n6tePd'}) # No results message
).model_dump()]


def parse_local_result(sub, sub_rank=0):
"""Parse a "Local Results" subcomponent
Args:
sub (bs4 object): A local results subcomponent
Returns:
dict : parsed subresult
"""

details = parse_local_details(sub)

parsed = BaseResult(
type='local_results',
sub_rank=sub_rank,
title=webutils.get_text(sub, 'div', {'class': 'dbg0pd'}),
url=details['website'] if 'website' in details else None,
details=details,
)
return parsed.model_dump()


def parse_local_details(sub):

local_details = {}

# Extract summary details
detail_div = sub.find('span', {'class':'rllt__details'})
detail_divs = detail_div.find_all('div') if detail_div else None

# Extract rating and location type
if detail_divs:
rating_div = detail_divs[0]
rating = rating_div.find('span', {'class':'BTtC6e'})
if rating:
local_details['rating'] = float(rating.text)
n_reviews = utils.get_between_parentheses(rating_div.text).replace(',','')
local_details['n_reviews'] = int(n_reviews)
local_details['loc_label'] = rating_div.text.split('·')[-1].strip()

# Extract contact details
if len(detail_divs) > 1:
contact_div = detail_divs[1]
local_details['contact'] = contact_div.text

# Extract various links
links = [a.attrs['href'] for a in sub.find_all('a') if 'href' in a.attrs]
links_text = [a.text.lower() for a in sub.find_all('a') if 'href' in a.attrs]
links_dict = dict(zip(links_text, links))
local_details.update(links_dict)
from .. import utils
from .. import webutils
from ..models import BaseResult

def parse_local_results(cmpt):
"""Parse a "Local Results" component
These components contain an embedded map followed by vertically
stacked subcomponents for locations. These locations are typically
businesses relevant to the query.
Args:
cmpt (bs4 object): A local results component
Returns:
list : list of parsed subcomponent dictionaries
"""
subs = cmpt.find_all('div', {'class': 'VkpGBb'})
parsed_list = [parse_local_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
if parsed_list:

# Set first non-empty header as sub_type (e.g. "Places" -> places)
header_list = [
webutils.get_text(cmpt, "h2", {"role":"heading"}),
webutils.get_text(cmpt, 'div', {'aria-level':"2", "role":"heading"}),
]
header_list = list(filter(None, header_list))
if header_list:
sub_type = str(header_list[0]).lower().replace(" ", "_")
for parsed in parsed_list:
parsed.update({'sub_type':sub_type})

return parsed_list
else:
parsed = {
'type':'local_results',
'sub_rank':0,
'text':webutils.get_text(cmpt, 'div', {'class': 'n6tePd'}) # No results message
}
return [BaseResult(**parsed).model_dump()]

def parse_local_result(sub, sub_rank=0):
"""Parse a "Local Results" subcomponent
Args:
sub (bs4 object): A local results subcomponent
Returns:
dict : parsed subresult
"""

parsed = {'type':'local_results',
'sub_rank':sub_rank}
parsed['title'] = webutils.get_text(sub, 'div', {'class':'dbg0pd'})

# Extract URL
links = [a.attrs['href'] for a in sub.find_all('a') if 'href' in a.attrs]
links_text = [a.text.lower() for a in sub.find_all('a') if 'href' in a.attrs]
links_dict = dict(zip(links_text, links))
parsed['url'] = links_dict.get('website', None)

# Extract text and label
text = webutils.get_text(sub, 'div', {'class':'rllt__details'}, separator='<|>')
label = webutils.get_text(sub, "span", {"class":"X0w5lc"})
parsed['text'] = f"{text} <label>{label}</label>" if label else text
parsed['details'] = parse_local_details(sub)

validated = BaseResult(**parsed)
return validated.model_dump()


def parse_local_details(sub):

local_details = {}

# Extract summary details
detail_div = sub.find('span', {'class':'rllt__details'})
detail_divs = detail_div.find_all('div') if detail_div else None

# Extract rating and location type
if detail_divs:
rating_div = detail_divs[0]
rating = rating_div.find('span', {'class':'BTtC6e'})
if rating:
local_details['rating'] = float(rating.text)
n_reviews = utils.get_between_parentheses(rating_div.text).replace(',','')
local_details['n_reviews'] = int(n_reviews)
local_details['loc_label'] = rating_div.text.split('·')[-1].strip()

# Extract contact details
if len(detail_divs) > 1:
contact_div = detail_divs[1]
local_details['contact'] = contact_div.text

# Extract various links
links = [a.attrs['href'] for a in sub.find_all('a') if 'href' in a.attrs]
links_text = [a.text.lower() for a in sub.find_all('a') if 'href' in a.attrs]
links_dict = dict(zip(links_text, links))
local_details.update(links_dict)
return local_details
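
For context, here is a rough usage sketch of the updated parser. The HTML below is fabricated for illustration and only mirrors the class names referenced in this file; real SERP markup differs, and the import path and helper behavior (webutils.get_text, BaseResult) are assumed from this diff rather than verified.

from bs4 import BeautifulSoup
from WebSearcher.component_parsers.local_results import parse_local_results  # assumed import path

# Fabricated local-results component using the class names seen in this parser.
html = """
<div>
  <h2 role="heading">Places</h2>
  <div class="VkpGBb">
    <div class="dbg0pd">Example Coffee Shop</div>
    <div class="rllt__details">
      <div>4.6 (128) · Coffee shop</div>
      <div>123 Main St · Open now</div>
    </div>
    <span class="X0w5lc">Dine-in</span>
    <a href="https://www.example.com">Website</a>
  </div>
</div>
"""

cmpt = BeautifulSoup(html, "html.parser")
for result in parse_local_results(cmpt):
    # With this markup, expect sub_type "places" (from the heading), the title and
    # website URL, and a text field ending in "<label>Dine-in</label>".
    print(result.get("sub_type"), result["title"], result["url"])
    print(result["text"])
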
8 changes: 6 additions & 2 deletions WebSearcher/webutils.py
@@ -120,10 +120,14 @@ def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty
divs = [c for c in divs if c.text != '']
return divs

def find_children(soup, name: str, attrs: dict = {}) -> list:
def find_children(soup, name: str, attrs: dict = {}, filter_empty: bool = False):
"""Find all children of a div with a given name and attribute"""
div = get_div(soup, name, attrs)
return div.children if div else []
divs = div.children if div else []
if divs and filter_empty:
divs = [c for c in divs if c]
divs = [c for c in divs if c.text != '']
return divs


# URLs -------------------------------------------------------------------------
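
A quick sketch of the new filter_empty option on find_children (invented markup; assumes get_div resolves to a plain soup.find, as its use above suggests).

from bs4 import BeautifulSoup
from WebSearcher import webutils

# Invented markup: three children, one of them empty.
soup = BeautifulSoup('<div id="menu"><p>Home</p><p></p><p>About</p></div>', 'html.parser')

all_children = list(webutils.find_children(soup, 'div', {'id': 'menu'}))
non_empty = webutils.find_children(soup, 'div', {'id': 'menu'}, filter_empty=True)
# all_children keeps the empty <p>; non_empty should hold only the two text-bearing tags.
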
