Skip to content

Commit

Permalink
small fixes for 2023/24 data. consolidate some fields.
Browse files Browse the repository at this point in the history
  • Loading branch information
jlgleason committed Feb 2, 2024
1 parent f18d32e commit 4fdcdcb
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 35 deletions.
80 changes: 48 additions & 32 deletions WebSearcher/component_parsers/knowledge_rhs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,56 +3,64 @@ def parse_knowledge_rhs(cmpt, sub_rank=0):
Args:
cmpt (bs4 object): a right-hand-side knowledge component
Returns:
list: Return parsed dictionary in a list
"""

parsed_list = parse_knowledge_rhs_main(cmpt)
description = cmpt.find('h2', {'class':'Uo8X3b'})
description = cmpt.find('h2', {'class': 'Uo8X3b'})
if description and description.parent:
subs = [s for s in description.parent.next_siblings]
parsed_subs = [parse_knowledge_rhs_sub(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
parsed_subs = [
parse_knowledge_rhs_sub(sub, sub_rank) for sub_rank, sub in enumerate(subs)
]
parsed_list.extend(parsed_subs)
return parsed_list


def parse_knowledge_rhs_main(cmpt, sub_rank=0):
"""Parse the Right-Hand-Side Knowledge Panel main component"""

parsed = {
'type': 'knowledge',
'sub_type': 'panel_rhs',
'type': 'knowledge',
'sub_type': 'panel_rhs',
'sub_rank': sub_rank,
'title': '',
'subtitle': '',
'text': '',
'url': '',
'img_urls': [],
'details': None
'details': {},
'rhs_column': True
}

# images TODO missing single image to right of title
if cmpt.find('h3') and cmpt.find('h3').text == 'Images':
sibling = cmpt.find('h3').next_sibling
# images
if cmpt.find("h3") and cmpt.find("h3").text == "Images":
sibling = cmpt.find("h3").next_sibling
if sibling:
imgs = sibling.find_all('a')
parsed['img_urls'] = [img['href'] for img in imgs if 'href' in img.attrs]

imgs = sibling.find_all("a")
parsed["details"]["img_urls"] = [
img["href"] for img in imgs if "href" in img.attrs
]

# title, subtitle
if cmpt.find('h2', {'data-attrid':'title'}):
parsed['title'] = cmpt.find('h2', {'data-attrid':'title'}).text
if cmpt.find('div', {'data-attrid':'subtitle'}):
parsed['subtitle'] = cmpt.find('div', {'data-attrid':'subtitle'}).text
if cmpt.find('h2', {'data-attrid': 'title'}):
parsed['title'] = cmpt.find('h2', {'data-attrid': 'title'}).text
if cmpt.find('div', {'data-attrid': 'subtitle'}):
parsed['details']['subtitle'] = cmpt.find(
'div', {'data-attrid': 'subtitle'}
).text

# description
description = cmpt.find('h2', {'class':'Uo8X3b'})
if description and description.parent:
description = cmpt.find('h2', {'class': 'Uo8X3b'})
if description and description.parent:
if description.parent.find('span'):
parsed['text'] = description.parent.find('span').text
if description.parent.find('a') and 'href' in description.parent.find('a').attrs:
if (
description.parent.find('a')
and 'href' in description.parent.find('a').attrs
):
parsed['url'] = description.parent.find('a')['href']

description = cmpt.find('div', {'class':'kno-rdesc'})
description = cmpt.find('div', {'class': 'kno-rdesc'})
if description:
parsed['text'] = description.find('span').text
if description.find('a') and 'href' in description.find('a').attrs:
Expand All @@ -63,31 +71,39 @@ def parse_knowledge_rhs_main(cmpt, sub_rank=0):
alinks = description.parent.find_all('a')
if description.parent.previous_sibling:
alinks += description.parent.previous_sibling.find_all('a')
if len(alinks) > 1: # 1st match has main description
parsed['details'] = [parse_alink(a) for a in alinks[1:] if 'href' in a.attrs]
if len(alinks) > 1: # 1st match has main description
parsed['details']['urls'] = [
parse_alink(a) for a in alinks[1:] if 'href' in a.attrs
]

if not len(parsed['details']):
parsed['details'] = None

return [parsed]


def parse_knowledge_rhs_sub(sub, sub_rank=0):
    """Parse a Right-Hand-Side Knowledge Panel subcomponent

    Args:
        sub (bs4 object): a node that follows the panel description
        sub_rank (int): zero-based position of the subcomponent; it is
            stored incremented by one

    Returns:
        dict: parsed subcomponent. `details` is a list of link dicts built
        by `parse_alink`, or None when no link with an href was found
    """

    parsed = {
        'type': 'knowledge',
        'sub_type': 'panel_rhs',
        'sub_rank': sub_rank + 1,
        'title': '',
        'details': None,
        'rhs_column': True,
    }

    # Sibling iteration can also yield bare text nodes, which are not
    # searchable; return the empty record for them instead of raising.
    if not callable(getattr(sub, 'find_all', None)):
        return parsed

    heading = sub.find('div', {'role': 'heading'})
    if heading:
        parsed['title'] = heading.get_text(' ')

    links = [parse_alink(a) for a in sub.find_all('a') if 'href' in a.attrs]
    # Keep `details` as None (not an empty list) when nothing usable was found
    if links:
        parsed['details'] = links

    return parsed


def parse_alink(a):
    """Convert an anchor element into a `{'url', 'text'}` dictionary

    Args:
        a (bs4 object): an anchor element; it must carry an `href`
            attribute, otherwise the lookup raises KeyError

    Returns:
        dict: the link target under `url` and the anchor text under `text`
    """
    return {'url': a['href'], 'text': a.text}
6 changes: 4 additions & 2 deletions WebSearcher/component_parsers/videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,10 @@ def parse_video(sub, sub_rank=0):
def get_url(sub):
    """Get video URL by filtering for non-hash links

    Args:
        sub (bs4 object): a video subcomponent

    Returns:
        str or None: href of the first anchor whose href does not start
        with '#', or None when there is no such anchor
    """
    for link in sub.find_all('a'):
        # Look the href up in the attribute dict: a membership test on the
        # element itself inspects its children, not its attributes.
        href = link.attrs.get('href')
        if href is not None and not href.startswith('#'):
            return href
    return None


def get_div_text(soup, details):
Expand Down
2 changes: 1 addition & 1 deletion WebSearcher/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def extract_results_column(soup, drop_tags = {'script', 'style', None}):

else:
log.debug("layout: no-rso")
column = extract_from_no_rso(layout_dict, drop_tags)
column = extract_from_no_rso(soup, drop_tags)

# Drop empty components
drop_text = {
Expand Down

0 comments on commit 4fdcdcb

Please sign in to comment.