Skip to content

Commit

Permalink
Improve description parsing logic
Browse files Browse the repository at this point in the history
  • Loading branch information
lgarber-akamai committed Nov 13, 2024
1 parent f5d1092 commit 4c2b626
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 23 deletions.
52 changes: 30 additions & 22 deletions linodecli/baked/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import functools
import re
from html import unescape
from typing import List, Tuple
from typing import List, Optional, Tuple

# Sentence delimiter, split on a period followed by any type of
# whitespace (space, new line, tab, etc.) or by the end of a line
REGEX_SENTENCE_DELIMITER = re.compile(r"\W(?:\s|$)")
REGEX_SENTENCE_DELIMITER = re.compile(r"\.(?:\s|$)", flags=re.M)

# Matches on pattern __prefix__ at the beginning of a description
# or after a comma
REGEX_TECHDOCS_PREFIX = re.compile(r"(?:, |\A)__([\w-]+)__")
REGEX_TECHDOCS_PREFIX = re.compile(r"(?:, |\A)__([^_]+)__")

# Matches on pattern [link title](https://.../)
REGEX_MARKDOWN_LINK = re.compile(r"\[(?P<text>.*?)]\((?P<link>.*?)\)")
Expand Down Expand Up @@ -121,23 +121,35 @@ def get_short_description(description: str) -> str:
:rtype: str
"""

target_lines = description.splitlines()
relevant_lines = None

for i, line in enumerate(target_lines):
def __simplify(sentence: str) -> Optional[str]:
# Edge case for descriptions starting with a note
if line.lower().startswith("__note__"):
continue
if sentence.lower().startswith("__note__"):
return None

sentence = strip_techdocs_prefixes(sentence)

relevant_lines = target_lines[i:]
break
# Check that the sentence still has content after stripping prefixes
if len(sentence) < 2:
return None

if relevant_lines is None:
return sentence + "."

# Find the first relevant sentence
result = next(
simplified
for simplified in iter(
__simplify(sentence)
for sentence in REGEX_SENTENCE_DELIMITER.split(description)
)
if simplified is not None
)

if result is None:
raise ValueError(
f"description does not contain any relevant lines: {description}",
)

return REGEX_SENTENCE_DELIMITER.split("\n".join(relevant_lines), 1)[0] + "."
return result


def strip_techdocs_prefixes(description: str) -> str:
Expand All @@ -150,11 +162,7 @@ def strip_techdocs_prefixes(description: str) -> str:
:returns: The stripped description
:rtype: str
"""
result_description = REGEX_TECHDOCS_PREFIX.sub(
"", description.lstrip()
).lstrip()

return result_description
return REGEX_TECHDOCS_PREFIX.sub("", description.lstrip()).lstrip()


def process_arg_description(description: str) -> Tuple[str, str]:
Expand All @@ -173,12 +181,12 @@ def process_arg_description(description: str) -> Tuple[str, str]:
return "", ""

result = get_short_description(description)
result = strip_techdocs_prefixes(result)
result = result.replace("\n", " ").replace("\r", " ")

description, links = extract_markdown_links(result)
# NOTE: Links are only separated out (and re-appended as a "See: ..." suffix)
# for the Rich markup output; the other returned description keeps them as-is
result_no_links, links = extract_markdown_links(result)

if len(links) > 0:
description += f" See: {'; '.join(links)}"
result_no_links += f" See: {'; '.join(links)}"

return unescape(markdown_to_rich_markup(description)), unescape(description)
return unescape(markdown_to_rich_markup(result_no_links)), unescape(result)
2 changes: 1 addition & 1 deletion tests/unit/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_get_first_sentence(self):

assert (
get_short_description(
"__Note__. This might be a sentence.\nThis is a sentence."
"__Note__ This might be a sentence.\nThis is a sentence."
)
== "This is a sentence."
)
Expand Down

0 comments on commit 4c2b626

Please sign in to comment.