Skip to content

Commit

Permalink
Merge dev: v0.4.1 (#61)
Browse files Browse the repository at this point in the history
* add: initial extract from left-bar layout

* update: classify and parse url for images-multimedia

* update: clean code

* version: 0.4.1.dev0

* update: add handling for layouts circa Mar 2024

* version: 0.4.1.dev1

* update: rename text-based header classifier for clarity

* update: rename header file for additional clarity

* add: pipeline for query notices component

* update: filter empty divs util function

* fix: initialize output dict

* update: move top image parser to header section parsers

* version: 0.4.1.dev2

* fix: missing cmpt_ranks due to empty ad components, filter before adding to the list

* fix: broader filtering, sub_types, better title and url parser for medium

* fix: handle ads and shopping ads extracted from same serp

* update: handling for no subcomponents, pass error and text

* clean: quotation formatting

* update: readme example

* version: 0.4.1.dev3

* update: add query suggestion variation, handle multiple suggestions, drop internal url

* update: assert parsed list is not empty

* update: reorg, clearer header extractors, handle shopping ads in ads

* update: catch location query notices

* update: rename query_notice to notice, includes location notices

* version: 0.4.1.dev4

* fix: renaming, include more query edit notices

* version: 0.4.1.dev5

* update: refactor notices parser as class

* version: 0.4.1.dev6

* update: add language tip sub type

* update: grab notice divs more directly

* fix: wrong get_url usage for images urls

* version: 0.4.1.dev7

* Bump to 0.4.1
  • Loading branch information
gitronald authored Aug 26, 2024
1 parent 7f43f9f commit 2d3e607
Show file tree
Hide file tree
Showing 15 changed files with 526 additions and 257 deletions.
54 changes: 30 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,54 +50,60 @@ se = ws.SearchEngine()
vars(se)
```
```python
{'url': 'https://www.google.com/search',
'params': {},
{'version': '0.4.1',
'base_url': 'https://www.google.com/search',
'headers': {'Host': 'www.google.com',
'Referer': 'https://www.google.com/',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip,deflate,br',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'},
'Accept-Language': 'en-US,en;q=0.5',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'},
'sesh': <requests.sessions.Session at 0x7f9ac018ece0>,
'ssh_tunnel': None,
'sesh': <requests.sessions.Session at 0x7f7bad8efba8>,
'log': <Logger WebSearcher.searchers (DEBUG)>,
'unzip': True,
'params': {},
'qry': None,
'loc': None,
'num_results': None,
'url': None,
'timestamp': None,
'serp_id': None,
'crawl_id': None,
'response': None,
'html': None,
'results': [],
'results_html': []}
'log': <Logger WebSearcher.searchers (DEBUG)>}
```

#### Conduct a search

```python
# Conduct Search
se.search('immigration')
se.search('immigration news')
```
```
2019-08-14 01:25:38,267 | 2688 | INFO | WebSearcher.searchers | 200 | Searching immigration
2024-08-19 14:09:18.502 | INFO | WebSearcher.searchers | 200 | immigration news
```

```python
# Parse Results
se.parse_results()
```
```
2019-08-14 01:25:42,208 | 2688 | INFO | WebSearcher.parsers | Parsing SERP 4d4fe27fe6b6466041e326622719b03ccc6542427c577c69740ae7fc
```

```python
se.results[0]
{'cite': 'The New York Times',
{'section': 'main',
'cmpt_rank': 0,
'details': {'img_url': None, 'live_stamp': False, 'orient': 'h'},
'lang': 'en',
'qry': 'immigration',
'serp_id': '4d4fe27fe6b6466041e326622719b03ccc6542427c577c69740ae7fc',
'serp_rank': 0,
'sub_rank': 0,
'timestamp': '1 day ago',
'title': 'Trump Policy Favors Wealthier Immigrants for Green Cards',
'type': 'top_stories',
'url': 'https://www.nytimes.com/2019/08/12/us/politics/trump-immigration-policy.html'}
'sub_type': None,
'title': 'Biden citizenship program for migrant spouses in US launches',
'url': 'https://www.newsnationnow.com/us-news/immigration/biden-citizenship-program-migrant-spouses-us-launches/',
'text': None,
'cite': 'NewsNation',
'details': None,
'error': None,
'serp_rank': 0}
```

### Save a Search
Expand Down Expand Up @@ -140,9 +146,9 @@ Happy to have help! If you see a component that we aren't covering yet, please a

### Add a Parser

1. Add classifier to `component_classifier.py`, as `'cmpt_name'`
2. Add parser file in `/component_parsers` as `cmpt_name.py`, with function `parse_cmpt_name`.
3. Add import for `parse_cmpt_name` in `/component_parsers/__init__.py`
1. Add classifier to `classifiers/{main,footer,headers}.py`
2. Add parser as new file in `/component_parsers`
3. Add new parser to imports and catalogue in `/component_parsers/__init__.py`

### Testing
Run tests:
Expand Down
2 changes: 1 addition & 1 deletion WebSearcher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.4.0"
__version__ = "0.4.1"
from .searchers import SearchEngine
from .parsers import parse_serp
from .extractors import Extractor
Expand Down
3 changes: 2 additions & 1 deletion WebSearcher/classifiers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .headers import ClassifyByHeader
from .header_text import ClassifyHeaderText
from .header_components import ClassifyHeaderComponent
from .main import ClassifyMain
from .footer import ClassifyFooter
15 changes: 15 additions & 0 deletions WebSearcher/classifiers/header_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .. import webutils
import bs4


class ClassifyHeaderComponent:
    """Classify a component from the header section based on its bs4.element.Tag"""

    @staticmethod
    def classify(cmpt: bs4.element.Tag) -> str:
        """Return the component type for a header-section tag.

        Components whose `id` attribute is "taw" or "topstuff" are labeled
        "notice"; anything else falls through to "unknown".
        """
        is_notice = webutils.check_dict_value(cmpt.attrs, "id", ["taw", "topstuff"])
        return "notice" if is_notice else "unknown"
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
import bs4

class ClassifyByHeader:
class ClassifyHeaderText:
"""Classify components based on header text (e.g. <h2>title</h2>)"""

@staticmethod
def classify(cmpt: bs4.element.Tag, levels: list[int] = [2, 3]) -> str:
for level in levels:
header = ClassifyByHeader._classify_header(cmpt, level)
header = ClassifyHeaderText._classify_header(cmpt, level)
if header != "unknown":
return header
return "unknown"

@staticmethod
def classify_header_lvl2(cmpt: bs4.element.Tag) -> str:
return ClassifyByHeader._classify_header(cmpt, level=2)
return ClassifyHeaderText._classify_header(cmpt, level=2)

@staticmethod
def classify_header_lvl3(cmpt: bs4.element.Tag) -> str:
return ClassifyByHeader._classify_header(cmpt, level=3)
return ClassifyHeaderText._classify_header(cmpt, level=3)

@staticmethod
def _classify_header(cmpt: bs4.element.Tag, level: int) -> str:
"""Check text in common headers for dict matches"""
header_dict = ClassifyByHeader._get_header_level_mapping(level)
header_dict = ClassifyHeaderText._get_header_level_mapping(level)

# Collect list of potential header divs
header_list = []
header_list.extend(cmpt.find_all(f"h{level}", {"role":"heading"}))
header_list.extend(cmpt.find_all(f"h{level}", {"class":["O3JH7", "q8U8x"]}))
header_list.extend(cmpt.find_all("div", {'aria-level':f"{level}", "role":"heading"}))
header_list.extend(cmpt.find_all("div", {'aria-level':f"{level}", "class":"XmmGVd"}))
header_list.extend(cmpt.find_all("div", {"aria-level":f"{level}", "role":"heading"}))
header_list.extend(cmpt.find_all("div", {"aria-level":f"{level}", "class":"XmmGVd"}))

# Check header text for known title matches
for header in filter(None, header_list):
Expand All @@ -42,8 +42,8 @@ def _classify_header(cmpt: bs4.element.Tag, level: int) -> str:
@staticmethod
def _get_header_level_mapping(level) -> dict:
"""Return mapping of header level to header text"""
options = {2: ClassifyByHeader.TYPE_TO_H2_MAPPING,
3: ClassifyByHeader.TYPE_TO_H3_MAPPING}
options = {2: ClassifyHeaderText.TYPE_TO_H2_MAPPING,
3: ClassifyHeaderText.TYPE_TO_H3_MAPPING}
return options.get(level, {})

# WS type -> header level 2 text (e.g., <h2>title</h2>)
Expand All @@ -54,6 +54,7 @@ def _get_header_level_mapping(level) -> dict:
"Resultados de la Web",
"Web Result with Site Links",
"Web results"],
"images": ["Images"],
"jobs": ["Jobs"],
"knowledge": ["Calculator Result",
"Featured snippet from the web",
Expand Down Expand Up @@ -86,17 +87,17 @@ def _get_header_level_mapping(level) -> dict:

# WS type -> header level 2 text (e.g., <h3>title</h3>)
TYPE_TO_H3_MAPPING = {
'images': ['Images for'],
'latest_from': ['Latest from'],
'products': ['Popular products'],
'news_quotes': ['Quotes in the news'],
'recipes': ['Recipes'],
'searches_related': ['Related searches'],
'scholarly_articles': ['Scholarly articles for'],
'top_stories': ['Top stories'],
'videos': ['Videos'],
'view_more_news': ['View more news'],
'view_more_videos': ['View more videos']
"images": ["Images for"],
"latest_from": ["Latest from"],
"products": ["Popular products"],
"news_quotes": ["Quotes in the news"],
"recipes": ["Recipes"],
"searches_related": ["Related searches"],
"scholarly_articles": ["Scholarly articles for"],
"top_stories": ["Top stories"],
"videos": ["Videos"],
"view_more_news": ["View more news"],
"view_more_videos": ["View more videos"]
}

# Invert from {label: [text, ...]} to [{text: label}, ...]
Expand Down
4 changes: 2 additions & 2 deletions WebSearcher/classifiers/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .headers import ClassifyByHeader
from .header_text import ClassifyHeaderText
from .. import webutils
import bs4

Expand All @@ -11,7 +11,7 @@ def classify(cmpt: bs4.element.Tag) -> str:
# Ordered list of classifiers to try
component_classifiers = [
ClassifyMain.top_stories, # Check top stories
ClassifyByHeader.classify, # Check levels 2 & 3 header text
ClassifyHeaderText.classify, # Check levels 2 & 3 header text
ClassifyMain.img_cards, # Check image cards
ClassifyMain.images, # Check images
ClassifyMain.knowledge_panel, # Check knowledge panel
Expand Down
14 changes: 12 additions & 2 deletions WebSearcher/component_parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@

from .notices import parse_notices
from .top_image_carousel import parse_top_image_carousel

from .ads import parse_ads
from .available_on import parse_available_on
from .banner import parse_banner
Expand All @@ -19,7 +23,6 @@
from .scholarly_articles import parse_scholarly_articles
from .searches_related import parse_searches_related
from .shopping_ads import parse_shopping_ads
from .top_image_carousel import parse_top_image_carousel
from .twitter_cards import parse_twitter_cards
from .twitter_result import parse_twitter_result
from .videos import parse_videos
Expand All @@ -28,6 +31,14 @@
from .footer import Footer
from .knowledge_rhs import parse_knowledge_rhs

# Header parsers
header_parsers = [
("notice", parse_notices, "Notices"),
('top_image_carousel', parse_top_image_carousel, 'Top Image Carousel'),
]
header_parser_dict = {i[0]:i[1] for i in header_parsers} # Format {type: function}
header_parser_labels = {i[0]:i[2] for i in header_parsers} # Format {type: label}

# Component details dataframe
columns = ['type', 'func', 'label']
main_parsers = [
Expand All @@ -49,7 +60,6 @@
('scholarly_articles', parse_scholarly_articles, 'Scholar Articles'),
('searches_related', parse_searches_related, 'Related Searches'),
('shopping_ads', parse_shopping_ads, 'Shopping Ad'),
('top_image_carousel', parse_top_image_carousel, 'Top Image Carousel'),
('top_stories', parse_top_stories, 'Top Stories'),
('twitter_cards', parse_twitter_cards, 'Twitter Cards'),
('twitter_result', parse_twitter_result, 'Twitter Result'),
Expand Down
61 changes: 41 additions & 20 deletions WebSearcher/component_parsers/ads.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,43 @@

from .. import webutils
from ..models import BaseResult
from .shopping_ads import parse_shopping_ads
import bs4

def parse_ads(cmpt: bs4.element.Tag):
def parse_ads(cmpt: bs4.element.Tag) -> list:
"""Parse ads from ad component"""

if cmpt.find_all('li', {'class':'ads-ad'}):
# Check for legacy ad format
subs = cmpt.find_all('li', {'class':'ads-ad'})
parser = parse_ad_legacy
elif cmpt.find_all('li', {'class':'ads-fr'}):
# Check for secondary ad format
subs = cmpt.find_all('li', {'class':'ads-fr'})
parser = parse_ad_secondary
else:
# Check for latest ad format
subs = cmpt.find_all('div', {'class':'uEierd'})
parser = parse_ad

return [parser(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
parsed = []
sub_type = classify_ad_type(cmpt)

if sub_type == 'legacy':
subs = cmpt.find_all('li', {'class': 'ads-ad'})
parsed = [parse_ad_legacy(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
elif sub_type == 'secondary':
subs = cmpt.find_all('li', {'class': 'ads-fr'})
parsed = [parse_ad_secondary(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
elif sub_type == 'standard':
subs = webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']})
for sub in subs:
sub_classes = sub.attrs.get("class", [])
if "commercial-unit-desktop-top" in sub_classes:
parsed.extend(parse_shopping_ads(sub))
elif "uEierd" in sub_classes:
parsed.append(parse_ad(sub))
return [BaseResult(**parsed_item).model_dump() for parsed_item in parsed]


def classify_ad_type(cmpt: bs4.element.Tag) -> str:
    """Classify the type of ad component.

    Looks for the marker divs of each known ad layout and returns the first
    label (in insertion order: legacy -> secondary -> standard) whose markers
    are present; returns 'unknown' when none match.
    """
    matches = {
        "legacy": webutils.find_all_divs(cmpt, 'div', {'class': 'ad_cclk'}),
        "secondary": webutils.find_all_divs(cmpt, 'div', {'class': 'd5oMvf'}),
        "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}),
    }
    return next((label for label, divs in matches.items() if divs), 'unknown')


def parse_ad(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
Expand All @@ -56,8 +74,7 @@ def parse_ad(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
parsed['sub_type'] = 'submenu'
parsed['details'] = submenu

validated = BaseResult(**parsed)
return validated.model_dump()
return parsed


def parse_ad_menu(sub: bs4.element.Tag) -> list:
Expand All @@ -81,7 +98,9 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list:
def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
"""Parse details of a single ad subcomponent, similar to general"""

parsed = {'type':'ad', 'sub_rank':sub_rank}
parsed = {"type": "ad",
"sub_type": "secondary",
"sub_rank": sub_rank}
parsed['title'] = sub.find('div', {'role':'heading'}).text
parsed['url'] = sub.find('div', {'class':'d5oMvf'}).find('a')['href']
parsed['cite'] = sub.find('span', {'class':'gBIQub'}).text
Expand All @@ -103,10 +122,12 @@ def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:

return parsed

def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
def parse_ad_legacy(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
"""[legacy] Parse details of a single ad subcomponent, similar to general"""

parsed = {'type':'ad', 'sub_rank':sub_rank}
parsed = {"type": "ad",
"sub_type": "legacy",
"sub_rank": sub_rank}
header = sub.find('div', {'class':'ad_cclk'})
parsed['title'] = header.find('h3').text
parsed['url'] = header.find('cite').text
Expand Down
Loading

0 comments on commit 2d3e607

Please sign in to comment.