From a29456d9b842bd21daa7229216f7bb1abbd6d598 Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Mon, 5 Jun 2023 00:28:35 -0300 Subject: [PATCH] Markdown parser updates (#18) * Parse standalone URLs too * move single-quote back into the set * Tighten URL characters for CFM * Improve headers formatting * Simplify expressions * Update example * Parse URLs regardless of casing * open links in new tabs * Put atx-style back into docstring * Absorb more spaces and optional closing on headers * Refine bold and italics expressions by ensuring: * Marker matches on both sides * Opener and closer is not escaped * First character inside is not a space --- proxy/source/markdown_parser.py | 70 ++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/proxy/source/markdown_parser.py b/proxy/source/markdown_parser.py index af36aba2..1fa4de1e 100644 --- a/proxy/source/markdown_parser.py +++ b/proxy/source/markdown_parser.py @@ -9,7 +9,7 @@ # The supported tags are: # - URLs -# - Headers (atx-style, parses to a single level) +# - Headers (atx-style. Parsed into span blocks with font-size. Capped at 4 levels) # - Inline emphasis (bold, italics) # - Inline code @@ -21,17 +21,15 @@ ############################################################################# # text = """ # [This is a link](http://example.net/). - # Another inline link [here](http://example.net/). - -# # Header - -# ## Another header - -# *some italics* _more italics_ - -# **some bold** __more bold__ - +# One more link for good measure: http://example.net/ +# # Header 1 +# ## Header 2 +# ### Header 3 +# #### Header 4 +# ##### Header 5 - Capped Size +# *some italics* and _more italics_ +# **some bold** and __more bold__ # `code` # """ @@ -39,17 +37,15 @@ # print(result) # | This is a link. -# | # | Another inline link here. -# | -# |

Header

-# | -# |

Another header

-# | -# | some italics more italics -# | -# | some bold more bold -# | +# | One more link for good measure: http://example.net/ +# | Header 1 +# | Header 2 +# | Header 3 +# | Header 4 +# | Header 5 - Capped Size +# | some italics and more italics +# | some bold and more bold # | code @@ -58,22 +54,36 @@ def _convert_crlf(input_str: str) -> str: def _parse_links(input_str: str) -> str: + input_str = re.sub( + r"\[(.+?)\]\((https?:\/\/[-a-zA-Z0-9._~:/?#@!$&()*+,;=%']+)\)", + r'\1', + input_str, + flags=re.MULTILINE|re.IGNORECASE, + ) return re.sub( - r"\[([\w\W]+?)\]\(([\w\W]+?)\)", - r'\1', + r"(?\1', input_str, - flags=re.MULTILINE, + flags=re.MULTILINE|re.IGNORECASE, ) def _parse_headers(input_str: str) -> str: - return re.sub(r"^#+ +([\w\W]+?)\n", r"

\1

\n", input_str, flags=re.MULTILINE) + search = re.finditer(r"^(#{1,5}) +(.+)[# ]?$", input_str, re.MULTILINE) + for i in search: + h = 2 - (min(len(i.group(1)) + 1, 5) / 10) + input_str = re.sub( + i.group(), + f'{i.group(2)}', + input_str, + ) + return input_str def _parse_strong_emphasis(input_str: str) -> str: return "\n".join( re.sub( - r"(?:\*\*|\_\_)([\w]+?[\w\W]+?[\w]+?)(?:\*\*|\_\_)", + r"(?\1", l, ) @@ -83,14 +93,18 @@ def _parse_strong_emphasis(input_str: str) -> str: def _parse_em_emphasis(input_str: str) -> str: return "\n".join( - re.sub(r"(?:\*|\_)([\w]+?[\w\W]+?[\w]+?)(?:\*|\_)", r"\1", l) + re.sub( + r"(?\1", + l, + ) for l in input_str.splitlines() ) def _parse_code(input_str: str) -> str: return "\n".join( - re.sub(r"(?:\`)([\w]+?[\w\W]+?[\w]+?)(?:\`)", r"\1", l) + re.sub(r"`(.+?)`", r"\1", l) for l in input_str.splitlines() )