Skip to content

Commit

Permalink
Markdown parser updates (#18)
Browse files Browse the repository at this point in the history
* Parse standalone URLs too

* move single-quote back into the set

* Tighten URL characters for CFM

* Improve headers formatting

* Simplify expressions

* Update example

* Parse URLs regardless of casing

* open links in new tabs

* Put atx-style back into docstring

* Absorb more spaces and optional closing on headers

* Refine bold and italics expressions by ensuring:
  * Marker matches on both sides
  * Opener and closer are not escaped
  * First character inside is not a space
  • Loading branch information
BrutuZ authored Jun 5, 2023
1 parent 83c9cec commit a29456d
Showing 1 changed file with 42 additions and 28 deletions.
70 changes: 42 additions & 28 deletions proxy/source/markdown_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# The supported tags are:
# - URLs
# - Headers (atx-style, parses to a single level)
# - Headers (atx-style; parsed into span blocks with font-size, capped at 4 levels)
# - Inline emphasis (bold, italics)
# - Inline code

Expand All @@ -21,35 +21,31 @@
#############################################################################
# text = """
# [This is a link](http://example.net/).

# Another inline link [here](http://example.net/).

# # Header

# ## Another header

# *some italics* _more italics_

# **some bold** __more bold__

# One more link for good measure: http://example.net/
# # Header 1
# ## Header 2
# ### Header 3
# #### Header 4
# ##### Header 5 - Capped Size
# *some italics* and _more italics_
# **some bold** and __more bold__
# `code`
# """

# result = parse_html(text)

# print(result)
# | <a href="http://example.net/">This is a link</a>.
# |
# | Another inline link <a href="http://example.net/">here</a>.
# |
# | <h3>Header</h3>
# |
# | <h3>Another header</h3>
# |
# | <em>some italics</em> <em>more italics</em>
# |
# | <strong>some bold</strong> <strong>more bold</strong>
# |
# | One more link for good measure: <a href="http://example.net/">http://example.net/</a>
# | <span style="font-size: 1.8em;">Header 1</span>
# | <span style="font-size: 1.7em;">Header 2</span>
# | <span style="font-size: 1.6em;">Header 3</span>
# | <span style="font-size: 1.5em;">Header 4</span>
# | <span style="font-size: 1.5em;">Header 5 - Capped Size</span>
# | <em>some italics</em> and <em>more italics</em>
# | <strong>some bold</strong> and <strong>more bold</strong>
# | <code>code</code>


Expand All @@ -58,22 +54,36 @@ def _convert_crlf(input_str: str) -> str:


def _parse_links(input_str: str) -> str:
input_str = re.sub(
r"\[(.+?)\]\((https?:\/\/[-a-zA-Z0-9._~:/?#@!$&()*+,;=%']+)\)",
r'<a href="\2" target="_blank" rel="nofollow noreferrer noopener">\1</a>',
input_str,
flags=re.MULTILINE|re.IGNORECASE,
)
return re.sub(
r"\[([\w\W]+?)\]\(([\w\W]+?)\)",
r'<a href="\2">\1</a>',
r"(?<!href=\")(https?:\/\/[-a-zA-Z0-9._~:/?#@!$&()*+,;=%']+)",
r'<a href="\1" target="_blank" rel="nofollow noreferrer noopener">\1</a>',
input_str,
flags=re.MULTILINE,
flags=re.MULTILINE|re.IGNORECASE,
)


def _parse_headers(input_str: str) -> str:
return re.sub(r"^#+ +([\w\W]+?)\n", r"<h3>\1</h3>\n", input_str, flags=re.MULTILINE)
search = re.finditer(r"^(#{1,5}) +(.+)[# ]?$", input_str, re.MULTILINE)
for i in search:
h = 2 - (min(len(i.group(1)) + 1, 5) / 10)
input_str = re.sub(
i.group(),
f'<span style="font-size: {h}em;">{i.group(2)}</span>',
input_str,
)
return input_str


def _parse_strong_emphasis(input_str: str) -> str:
return "\n".join(
re.sub(
r"(?:\*\*|\_\_)([\w]+?[\w\W]+?[\w]+?)(?:\*\*|\_\_)",
r"(?<!\\)\*\*(\w.*?)(?<!\\)\*\*|(?<!\\)__(\w.*?)(?<!\\)__",
r"<strong>\1</strong>",
l,
)
Expand All @@ -83,14 +93,18 @@ def _parse_strong_emphasis(input_str: str) -> str:

def _parse_em_emphasis(input_str: str) -> str:
return "\n".join(
re.sub(r"(?:\*|\_)([\w]+?[\w\W]+?[\w]+?)(?:\*|\_)", r"<em>\1</em>", l)
re.sub(
r"(?<!\\)\*(\w.*?)(?<!\\)\*|(?<!\\)_(\w.*?)(?<!\\)_",
r"<em>\1</em>",
l,
)
for l in input_str.splitlines()
)


def _parse_code(input_str: str) -> str:
return "\n".join(
re.sub(r"(?:\`)([\w]+?[\w\W]+?[\w]+?)(?:\`)", r"<code>\1</code>", l)
re.sub(r"`(.+?)`", r"<code>\1</code>", l)
for l in input_str.splitlines()
)

Expand Down

0 comments on commit a29456d

Please sign in to comment.