Skip to content

Commit

Permalink
Update extractors in new file, more model usage, logging defaults (#45)
Browse files Browse the repository at this point in the history
* update: use model, no timestamp, sub_type funcs

* add: general result with video sub type

* update: consistent args,docs in html parsing utils

* fix: optional parsed values

* fix: use no attrs safe get_link

* update: parse 'complementary results' with general

* add: standalone models file

* update: people also ask, use model

* update: use models, limit details, recent SERPs

* update: filter out hidden footer components, usually PAA

* update: use models, limit details, recent SERPs

* Bump to 0.3.3

* fix: tw card classifying, better header classifier

* update: use models, current SERPs

* update: limit details

* update: optional ID attrs at result level

* update: classifier for map and local

* update: use models, modern SERPs

* update: minimal serp attrs, ignore hidden surveys

* add: filter-notice, filter hidden footer cmpts

* update: classify knowledge components before general

* update: grab children of blank div in main column

* update: account for vertical top stories

* fix: convert filter to list for empty check

* Bump to 0.3.4

* update: turn off console log if log file provided

* update: extractors in new file

* update: keep only text ppa suggestions, need selenium for more

* update: use models in main parsers

* update: handle layout shift extraction

* add: local news wrapper for top stories

* update: additional video div class

* update: include parse lang in webutils

* update: switch main init to .extractors

* Bump to 0.3.5
  • Loading branch information
gitronald authored Nov 20, 2023
1 parent 491d605 commit 5255e71
Show file tree
Hide file tree
Showing 11 changed files with 191 additions and 203 deletions.
5 changes: 3 additions & 2 deletions WebSearcher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__version__ = "0.3.4"
__version__ = "0.3.5"
from .searchers import SearchEngine
from .parsers import parse_serp, extract_components
from .parsers import parse_serp
from .extractors import extract_components
from .locations import download_locations
from .component_classifier import classify_type
from .webutils import load_html, make_soup, load_soup
3 changes: 2 additions & 1 deletion WebSearcher/component_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def classify_header(cmpt: bs4.element.Tag, level):
'Resultados de la Web': 'general',
'Sports Results': 'knowledge',
'Top stories': 'top_stories',
'Local news': 'local_news',
'Translation Result': 'knowledge',
'Twitter Results': 'twitter',
'Unit Converter': 'knowledge',
Expand Down Expand Up @@ -111,7 +112,7 @@ def classify_header(cmpt: bs4.element.Tag, level):
# Check for string matches in header text e.g. `h2.text`
for header in filter(None, header_list):
for text, label in header_dict.items():
if header.text.startswith(text):
if header.text.strip().startswith(text):
return label

# Return unknown if no matches
Expand Down
2 changes: 2 additions & 0 deletions WebSearcher/component_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .knowledge_rhs import parse_knowledge_rhs
from .shopping_ads import parse_shopping_ads
from .perspectives import parse_perspectives
from .local_news import parse_local_news
from .banner import parse_banner

# Component details dataframe
Expand All @@ -34,6 +35,7 @@
('general_subresult', parse_general_results, 'General Subresult'),
('available_on', parse_available_on, 'Available On'),
('top_stories', parse_top_stories, 'Top Stories'),
('local_news', parse_local_news, 'Local News'),
('latest_from', parse_latest_from, 'Latest From'),
('view_more_news', parse_view_more_news, 'View More News'),
('news_quotes', parse_news_quotes, 'News Quotes'),
Expand Down
14 changes: 14 additions & 0 deletions WebSearcher/component_parsers/local_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from . import parse_top_stories

def parse_local_news(cmpt):
    """Parse a "Local news" component

    These components are the same as Top Stories, but have a different heading,
    so parsing is delegated to parse_top_stories with a 'local_news' type label.

    Args:
        cmpt (bs4 object): A local news component
    Returns:
        dict : parsed result
    """
    return parse_top_stories(cmpt, ctype='local_news')
36 changes: 9 additions & 27 deletions WebSearcher/component_parsers/people_also_ask.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,23 @@ def parse_people_also_ask(cmpt, sub_rank=0):
Returns:
list : list of parsed subcomponent dictionaries
"""

# questions = cmpt.find_all('g-accordion-expander')
# questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
questions = cmpt.find_all("div", {"class":"related-question-pair"})
details = [parse_question(q) for q in questions] if questions else None

parsed = BaseResult(
type='people_also_ask',
sub_rank=sub_rank,
details=details,
)
# questions = cmpt.find_all('g-accordion-expander')
# questions = cmpt.find('section').find_all('div', {'class':'yTrXHe'})
questions = cmpt.find_all("div", {"class":"related-question-pair"})
parsed.details = [parse_question(q) for q in questions] if questions else None

return [parsed.model_dump()]


def parse_question(question):
"""Parse an individual question in a "People Also Ask" component"""

# Get query and URL fragments
parsed = {
'title': None,
'url': None,
}

# Get title
title_divs = [
Expand All @@ -43,20 +40,5 @@ def parse_question(question):
question.find('div', {'class': 'JlqpRe'}), # 2023-11-16
]
for title_div in filter(None, title_divs):
parsed['title'] = webutils.get_text(title_div)
parsed['url'] = webutils.get_link(title_div)

# Get citation
parsed['cite'] = webutils.get_text(question, 'cite')

# Get text
replace = ['qry', 'title', 'cite']
text = question.text.replace('Search for: ', '')
for key in replace:
if key in parsed.keys() and parsed[key]:
text = text.replace(parsed[key], '')
parsed['text'] = text if text else None

return parsed


text = webutils.get_text(title_div)
return text
1 change: 1 addition & 0 deletions WebSearcher/component_parsers/videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def parse_videos(cmpt):
divs.extend(webutils.find_all_divs(cmpt, 'g-inner-card'))
divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'VibNM'}))
divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'mLmaBd'}))
divs.extend(webutils.find_all_divs(cmpt, 'div', {'class':'RzdJxc'}))
# divs.extend(cmpt.find_all('div', {'class':'sI5x9c'})) # Selects a level too low, missing links.
divs = list(filter(None, divs))

Expand Down
135 changes: 135 additions & 0 deletions WebSearcher/extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
from .component_parsers.footer import extract_footer, extract_footer_components

def extract_results_column(soup):
    """Extract the main (left) column of result components from a SERP.

    Handles three observed page layouts:
    1. Standard: results are direct children of ``div#rso``.
    2. Shifted (a left side bar or top bar is present): results sit in
       ``div.sATSHe`` wrappers under ``div#rso``.
    3. No ``div#rso``: results are split across ``div.UDZeY.OTFaAf``
       sections plus an optional ``div.WvKfwe.a3spGf`` second section.

    Args:
        soup (bs4): BeautifulSoup SERP
    Returns:
        list: a list of ('main', component soup) tuples, in page order
    """

    # Tag names to exclude from results; None matches bare text nodes,
    # whose .name is None in bs4
    drop_tags = {'script', 'style', None}

    # Check if layout contains left side bar or top bar (layout shift)
    layout_shift = [
        soup.find('div', {'class': 'OeVqAd'}), # left side bar
        soup.find('div', {'class': 'M8OgIe'}), # top bar
    ]
    rso = soup.find('div', {'id':'rso'})
    column = []

    if not any(layout_shift) and rso:
        # Standard layout: iterate direct children of #rso
        for child in rso.children:
            if child.name in drop_tags:
                continue
            if not child.attrs:
                # Attribute-less wrapper div: promote its children instead
                column.extend(child.contents)
            else:
                column.append(child)
    elif rso:
        # Extract results from two div sections

        # Find section 1 results and append to rso list
        column = rso.find_all('div', {'class':'sATSHe'})
        column = [c for c in column if c.name not in drop_tags]

    else:
        # No #rso: fall back to legacy two-section layout
        section1 = soup.find_all('div', {'class':'UDZeY OTFaAf'})
        for div in section1:

            # Conditional handling for Twitter result
            if div.find('h2') and div.find('h2').text == "Twitter Results":
                column.append(div.find('div').parent)

            # Conditional handling for g-section with header
            elif div.find('g-section-with-header'):
                column.append(div.find('g-section-with-header').parent)

            # Include divs with a "View more" type of button
            elif div.find('g-more-link'):
                column.append(div)

            # Include footer components that appear in the main column
            elif div.find('div', {'class':'oIk2Cb'}):
                column.append(div)

            else:
                # Handle general results
                for child in div.find_all('div', {'class':'g'}):
                    column.append(child)

        # Find section 2 results and append to column list
        section2 = soup.find('div', {'class':'WvKfwe a3spGf'})
        if section2:
            for child in section2.children:
                column.append(child)
        column = [c for c in column if c.name not in drop_tags]

    # Drop empty components (matched on full text content)
    drop_text = {
        "Main results", # Remove empty rso component; hidden <h2> header
        "Twitter Results", # Remove empty Twitter component
        "", # Remove empty divs
    }
    column = [c for c in column if c.text not in drop_text]
    # Tag every component with its page location for downstream ranking
    column = list(zip(['main']*len(column), column))


    return column


def extract_components(soup):
    """Extract SERP components
    Args:
        soup (bs4): BeautifulSoup SERP
    Returns:
        list: a rank ordered top-to-bottom and left-to-right list of
              (component location, component soup) tuples
    """

    components = []

    # Top Image Carousel: needs a scrolling carousel plus a tag with a
    # static `src` attribute (no lazy-loaded `data-src`)
    appbar = soup.find('div', {'id':'appbar'})
    if appbar:
        static_img = appbar.find(
            lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')
        )
        if static_img and appbar.find('g-scrolling-carousel'):
            components.append(('top_image_carousel', appbar))

    # Shopping Ads
    shopping_div = soup.find('div', {'class': 'commercial-unit-desktop-top'})
    if shopping_div:
        components.append(('shopping_ad', shopping_div))

    # Top Ads
    top_ads = soup.find('div', {'id':'tads'})
    if top_ads:
        components.append(('ad', top_ads))

    # Main results column
    components.extend(extract_results_column(soup))

    # Bottom Ads
    bottom_ads = soup.find('div', {'id':'tadsb'})
    if bottom_ads:
        components.append(('ad', bottom_ads))

    # Footer results: keep only if the footer has parseable subcomponents
    footer = extract_footer(soup)
    if footer and extract_footer_components(footer):
        components.append(('footer', footer))

    # RHS Knowledge Panel
    rhs = soup.find('div', {'id': 'rhs'})
    if rhs:
        rhs_kp = rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel']})
        if rhs_kp:
            # reading from top-to-bottom, left-to-right
            components.append(('knowledge_rhs', rhs_kp))

    return components
Loading

0 comments on commit 5255e71

Please sign in to comment.