Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more logging for notion connector + add retries #405

Merged
merged 1 commit into from
Sep 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions backend/danswer/connectors/notion/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Optional

import requests
from retry import retry

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
Expand All @@ -16,6 +17,9 @@
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()


@dataclass
Expand Down Expand Up @@ -68,19 +72,23 @@ def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
"Notion-Version": "2022-06-28",
}

@retry(tries=3, delay=1, backoff=2)
def _fetch_block(self, block_id: str) -> dict[str, Any]:
    """Fetch the children of a block via the Notion API.

    Sends a GET request to the block's ``/children`` endpoint using the
    connector's headers and returns the decoded JSON payload. The method is
    wrapped in ``@retry(tries=3, delay=1, backoff=2)``, so a call that raises
    is re-attempted by the decorator before the error reaches the caller.

    Args:
        block_id: ID of the block (or page) whose child blocks are requested.

    Returns:
        The JSON body of the response, decoded into a dictionary.

    Raises:
        requests.HTTPError: If the response has an error status code
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within the timeout.
    """
    logger.debug(f"Fetching block with ID '{block_id}'")
    block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
    query_dict: Dict[str, Any] = {}
    # `requests` has no default timeout; without one a stalled connection
    # would block forever and the retry decorator would never get a chance
    # to re-issue the call.
    res = requests.get(
        block_url,
        headers=self.headers,
        json=query_dict,
        timeout=30,
    )
    # Turn 4xx/5xx responses into exceptions so they are retried/surfaced
    # instead of being parsed as if they were a valid block listing.
    res.raise_for_status()
    return res.json()

def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str:
"""Reads blocks for a page"""
done = False
result_lines_arr = []
cur_block_id = block_id
while not done:
block_url = f"https://api.notion.com/v1/blocks/{cur_block_id}/children"
query_dict: Dict[str, Any] = {}

res = requests.request(
"GET", block_url, headers=self.headers, json=query_dict
)
data = res.json()
data = self._fetch_block(cur_block_id)

for result in data["results"]:
result_type = result["type"]
Expand Down Expand Up @@ -130,6 +138,7 @@ def _read_pages(self, pages: List[NotionPage]) -> List[Document]:
"""Reads pages for rich text content and generates Documents"""
docs_batch = []
for page in pages:
logger.info(f"Reading page with ID '{page.id}', with url {page.url}")
page_text = self._read_blocks(page.id)
page_title = self._read_page_title(page)
docs_batch.append(
Expand All @@ -143,8 +152,11 @@ def _read_pages(self, pages: List[NotionPage]) -> List[Document]:
)
return docs_batch

@retry(tries=3, delay=1, backoff=2)
def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse:
"""Search for pages from a Notion database."""
"""Search for pages from a Notion database. Includes some small number of
retries to handle misc, flakey failures."""
logger.debug(f"Searching for pages in Notion with query_dict: {query_dict}")
res = requests.post(
"https://api.notion.com/v1/search",
headers=self.headers,
Expand Down
Loading