Skip to content

Commit

Permalink
Add support for GH prs as well
Browse files Browse the repository at this point in the history
  • Loading branch information
gagb committed Dec 13, 2024
1 parent 0b65547 commit 8a30fca
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 12 deletions.
78 changes: 66 additions & 12 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,22 +846,25 @@ def _get_mlm_description(self, local_path, extension, client, model, prompt=None


class GitHubIssueConverter(DocumentConverter):
"""Converts GitHub issues to Markdown."""
"""Converts GitHub issues and pull requests to Markdown."""

def convert(self, issue_url, github_token) -> Union[None, DocumentConverterResult]:
# Bail if not a valid GitHub issue URL
if issue_url:
parsed_url = urlparse(issue_url)
def convert(self, github_url, github_token) -> Union[None, DocumentConverterResult]:
# Bail if not a valid GitHub issue or pull request URL
if github_url:
parsed_url = urlparse(github_url)
path_parts = parsed_url.path.strip("/").split("/")
if len(path_parts) < 4 or path_parts[2] != "issues":
if len(path_parts) < 4 or path_parts[2] not in ["issues", "pull"]:
return None

if not github_token:
raise ValueError(
"GitHub token is not set. Cannot convert GitHub issue."
"GitHub token is not set. Cannot convert GitHub issue or pull request."
)

return self._convert_github_issue(issue_url, github_token)
if path_parts[2] == "issues":
return self._convert_github_issue(github_url, github_token)
elif path_parts[2] == "pull":
return self._convert_github_pr(github_url, github_token)

return None

Expand Down Expand Up @@ -914,6 +917,55 @@ def _convert_github_issue(
text_content=markdown_content,
)

def _convert_github_pr(
    self, pr_url: str, github_token: str
) -> DocumentConverterResult:
    """
    Convert a GitHub pull request to a markdown document.

    The markdown contains the pull request title and description, its state,
    its creation/update timestamps, and one bullet per issue-style comment.

    Args:
        pr_url (str): The URL of the GitHub pull request to convert.
        github_token (str): A GitHub token with access to the repository.

    Returns:
        DocumentConverterResult: The result containing the pull request title and markdown content.

    Raises:
        ImportError: If the PyGithub library is not installed.
        ValueError: If the provided URL is not a valid GitHub pull request URL
            (wrong path shape or a non-numeric pull request number).
    """
    if not IS_GITHUB_ISSUE_CAPABLE:
        raise ImportError(
            "PyGithub is not installed. Please install it to use this feature."
        )

    # Expected path shape: /<owner>/<repo>/pull/<number>[/...]
    path_parts = urlparse(pr_url).path.strip("/").split("/")
    if len(path_parts) < 4 or path_parts[2] != "pull":
        raise ValueError("Invalid GitHub pull request URL")

    owner, repo_name, _, pr_number_text = path_parts[:4]
    try:
        pr_number = int(pr_number_text)
    except ValueError:
        # Surface the same message as the path-shape check instead of the
        # bare "invalid literal for int()" error.
        raise ValueError("Invalid GitHub pull request URL") from None

    # Authenticate with GitHub and fetch the pull request
    g = Github(github_token)
    repository = g.get_repo(f"{owner}/{repo_name}")
    pr = repository.get_pull(pr_number)

    # A pull request without a description has no body; render it as empty
    # text rather than the literal string "None".
    description = pr.body or ""

    # Collect the pieces and join once instead of repeated string +=.
    sections = [
        f"# {pr.title}\n\n{description}\n\n",
        f"**State:** {pr.state}\n",
        f"**Created at:** {pr.created_at}\n",
        f"**Updated at:** {pr.updated_at}\n",
        "**Comments:**\n",
    ]
    sections.extend(
        f"- {comment.user.login} ({comment.created_at}): {comment.body}\n"
        for comment in pr.get_issue_comments()
    )

    return DocumentConverterResult(
        title=pr.title,
        text_content="".join(sections),
    )


class FileConversionException(BaseException):
    """Exception type for file conversion failures.

    NOTE(review): this derives from ``BaseException`` rather than
    ``Exception``, so ``except Exception`` handlers do not catch it.
    Confirm that is intentional before relying on broad handlers.
    """
Expand Down Expand Up @@ -984,16 +1036,18 @@ def convert(
def convert_url(
self, url: str, **kwargs: Any
) -> DocumentConverterResult: # TODO: fix kwargs type
# Handle GitHub issue URLs directly
# Handle GitHub issue and pull request URLs directly
parsed_url = urlparse(url)
if parsed_url.hostname == "github.com" and "/issues/" in parsed_url.path:
if parsed_url.hostname == "github.com" and any(
x in parsed_url.path for x in ["/issues/", "/pull/"]
):
github_token = kwargs.get("github_token", os.getenv("GITHUB_TOKEN"))
if not github_token:
raise ValueError(
"GitHub token is required for GitHub issue conversion."
"GitHub token is required for GitHub issue or pull request conversion."
)
return GitHubIssueConverter().convert(
issue_url=url, github_token=github_token
github_url=url, github_token=github_token
)

# Send a HTTP request to the URL
Expand Down
13 changes: 13 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
]

GITHUB_ISSUE_URL = "https://github.com/microsoft/autogen/issues/1421"
GITHUB_PR_URL = "https://github.com/microsoft/autogen/pull/194"
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")


Expand Down Expand Up @@ -195,9 +196,21 @@ def test_markitdown_github_issue() -> None:
assert "Comments:" in result.text_content


@pytest.mark.skipif(not GITHUB_TOKEN, reason="GitHub token not provided")
def test_markitdown_github_pr() -> None:
    """Convert the sample pull request URL and check the produced markdown.

    Skipped when GITHUB_TOKEN is empty, since the conversion is called with
    that token.
    """
    converter = MarkItDown()
    conversion = converter.convert(GITHUB_PR_URL, github_token=GITHUB_TOKEN)
    pr_markdown = conversion.text_content
    # Echo the converted text so it is visible in the test output.
    print(pr_markdown)
    assert "faq" in pr_markdown


if __name__ == "__main__":
    # Runs this file's tests from the command line, in declaration order.
    for run_test in (
        test_markitdown_remote,
        test_markitdown_local,
        test_markitdown_exiftool,
        test_markitdown_github_issue,
        test_markitdown_github_pr,
    ):
        run_test()

0 comments on commit 8a30fca

Please sign in to comment.