Skip to content

Commit

Permalink
Fix SERP snippet parsing
Browse files: browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Nov 21, 2023
1 parent c26951a commit 140dd17
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
1 change: 1 addition & 0 deletions archive_query_log/orm.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -406,6 +406,7 @@ class InnerSerp(InnerDocument):
class Result(BaseDocument):
archive: InnerArchive = Object(InnerArchive)
provider: InnerProvider = Object(InnerProvider)
capture: InnerCapture = Object(InnerCapture)
serp: InnerSerp = Object(InnerSerp)
snippet: Snippet = Object(Snippet)
snippet_parser: InnerParser | None = Object(InnerParser)
Expand Down
10 changes: 9 additions & 1 deletion archive_query_log/parsers/warc_snippets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -71,6 +71,7 @@ def add_warc_snippets_parser(

def _parse_warc_snippets(
parser: WarcSnippetsParser,
serp_id: str,
capture_url: str,
warc_store: WarcS3Store,
warc_location: WarcLocation,
Expand Down Expand Up @@ -121,6 +122,7 @@ def _parse_warc_snippets(
with_tail=True,
)
snippet_id_components = (
serp_id,
parser.id,
str(hash(content)),
str(i),
Expand Down Expand Up @@ -175,7 +177,12 @@ def _parse_serp_warc_snippets_action(
for parser in _warc_snippets_parsers(config, serp.provider.id):
# Try to parse the snippets.
warc_snippets = _parse_warc_snippets(
parser, serp.capture.url, config.s3.warc_store, serp.warc_location)
parser=parser,
serp_id=serp.id,
capture_url=serp.capture.url,
warc_store=config.s3.warc_store,
warc_location=serp.warc_location,
)
if warc_snippets is None:
# Parsing was not successful, e.g., URL pattern did not match.
continue
Expand All @@ -188,6 +195,7 @@ def _parse_serp_warc_snippets_action(
meta={"id": snippet.id},
archive=serp.archive,
provider=serp.provider,
capture=serp.capture,
serp=InnerSerp(
id=serp.id,
),
Expand Down

0 comments on commit 140dd17

Please sign in to comment.