From c888e1ac2e0b82ae0ae7a0a590f0b416cfa845b6 Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 6 Sep 2023 11:32:34 -0700 Subject: [PATCH] Add more logging for notion connector + add retries --- .../danswer/connectors/notion/connector.py | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py index d522b36154f..f976246c4c6 100644 --- a/backend/danswer/connectors/notion/connector.py +++ b/backend/danswer/connectors/notion/connector.py @@ -7,6 +7,7 @@ from typing import Optional import requests +from retry import retry from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource @@ -16,6 +17,9 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.utils.logger import setup_logger + +logger = setup_logger() @dataclass @@ -68,19 +72,23 @@ def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: "Notion-Version": "2022-06-28", } + @retry(tries=3, delay=1, backoff=2) + def _fetch_block(self, block_id: str) -> dict[str, Any]: + """Fetch a single block via the Notion API.""" + logger.debug(f"Fetching block with ID '{block_id}'") + block_url = f"https://api.notion.com/v1/blocks/{block_id}/children" + query_dict: Dict[str, Any] = {} + res = requests.get(block_url, headers=self.headers, json=query_dict) + res.raise_for_status() + return res.json() + def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str: """Reads blocks for a page""" done = False result_lines_arr = [] cur_block_id = block_id while not done: - block_url = f"https://api.notion.com/v1/blocks/{cur_block_id}/children" - query_dict: Dict[str, Any] = {} - - res = requests.request( - "GET", block_url, headers=self.headers, json=query_dict - ) - data = res.json() + data = self._fetch_block(cur_block_id) for result in data["results"]: result_type = result["type"] @@ -130,6 +138,7 @@ def _read_pages(self, pages: List[NotionPage]) -> List[Document]: """Reads pages for rich text content and generates Documents""" docs_batch = [] for page in pages: + logger.info(f"Reading page with ID '{page.id}', with url {page.url}") page_text = self._read_blocks(page.id) page_title = self._read_page_title(page) docs_batch.append( @@ -143,8 +152,11 @@ def _read_pages(self, pages: List[NotionPage]) -> List[Document]: ) return docs_batch + @retry(tries=3, delay=1, backoff=2) def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse: - """Search for pages from a Notion database.""" + """Search for pages from a Notion database. Includes some small number of + retries to handle misc, flakey failures.""" + logger.debug(f"Searching for pages in Notion with query_dict: {query_dict}") res = requests.post( "https://api.notion.com/v1/search", headers=self.headers,