Skip to content

Commit

Permalink
add: catch indented general subresults
Browse files Browse the repository at this point in the history
  • Loading branch information
gitronald committed Oct 13, 2023
1 parent 7e04755 commit 032c3d9
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 5 deletions.
10 changes: 9 additions & 1 deletion WebSearcher/component_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,19 @@ def classify_type(cmpt):
# If only class is 'g' then it is a general component
if cmpt.attrs["class"] == ["g"]:
cmpt_type = "general"
# If class includees 'g' check for extra class tags
# If class includes 'g' check for extra class tags
elif "g" in cmpt.attrs["class"]:
if any(s in ["Ww4FFb"] for s in cmpt.attrs["class"]):
cmpt_type = "general"

if cmpt_type == "unknown":
# A general result followed by an indented result from the same domain
mask_class1 = cmpt.find_all('div', {'class':'g'})
mask_class2 = cmpt.find_all('div', {'class':'d4rhi'})
mask_sum = len(mask_class1) + len(mask_class2)
if mask_sum > 1:
cmpt_type = "general_subresult"

# check for people also ask
if cmpt_type == "unknown":
cmpt_type = classify_people_also_ask(cmpt)
Expand Down
1 change: 1 addition & 0 deletions WebSearcher/component_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
('general', parse_general_results, 'General'),
('general_questions', parse_general_questions, 'General Questions'),
('general_menu', parse_general_results, 'General Submenu'),
('general_subresult', parse_general_results, 'General Subresult'),
('available_on', parse_available_on, 'Available On'),
('top_stories', parse_top_stories, 'Top Stories'),
('latest_from', parse_latest_from, 'Latest From'),
Expand Down
13 changes: 9 additions & 4 deletions WebSearcher/component_parsers/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ def parse_general_results(cmpt):
subs = cmpt.find_all('div', {'class':'g'})

# 2023.05.09 - finds subs
if cmpt.find_all('div', {'class': 'd4rhi'}):
additional = cmpt.find_all('div', {'class': 'd4rhi'})
if additional:
# Catch general_subresult
# This means that there is a sub-element with class d4rhi;
# the first div child of the div.g is the first sub-element.
first = cmpt.find('div')
additional = cmpt.find_all('div', {'class': 'd4rhi'})
subs = [first] + additional

# 2023.05.09 - handles duplicate .g tags within one component
Expand All @@ -36,7 +37,8 @@ def parse_general_results(cmpt):
subs = [parent_g]
subs = subs if subs else [cmpt]

return [parse_general_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
parsed = [parse_general_result(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
return parsed

def parse_general_result(sub, sub_rank=0):
"""Parse a general subcomponent
Expand Down Expand Up @@ -103,7 +105,10 @@ def parse_general_result(sub, sub_rank=0):
parsed['text'] = get_text(sub, 'div', {'class':'VwiC3b'})

# Check for subtype and parse
if sub.find('g-review-stars'):
if 'class' in sub.attrs:
if sub.attrs['class'] == 'd4rhi':
parsed['subtype'] == 'subresult'
elif sub.find('g-review-stars'):
parsed['subtype'] = 'submenu_rating'
sibling = sub.find('g-review-stars').next_sibling
if sibling:
Expand Down

0 comments on commit 032c3d9

Please sign in to comment.