diff --git a/mail_editor/models.py b/mail_editor/models.py index 222300e..d427e2c 100644 --- a/mail_editor/models.py +++ b/mail_editor/models.py @@ -183,9 +183,9 @@ def build_message( """ subject, body = self.render(context, subj_context) - body, cid_attachments = process_html(body, settings.BASE_HOST) + result = process_html(body, settings.BASE_HOST) - text_body = txt or strip_tags(body) + text_body = txt or strip_tags(result.html) email_message = EmailMultiAlternatives( subject=subject, @@ -195,7 +195,7 @@ def build_message( cc=cc_addresses, bcc=bcc_addresses, ) - email_message.attach_alternative(body, "text/html") + email_message.attach_alternative(result.html, "text/html") email_message.mixed_subtype = "related" if attachments: @@ -209,8 +209,8 @@ def build_message( else: email_message.attach(*attachment) - if cid_attachments: - for att in cid_attachments: + if result.cid_attachments: + for att in result.cid_attachments: subtype = att.content_type.split("/", maxsplit=1) assert subtype[0] == "image" mime_image = MIMEImage(att.content, _subtype=subtype[1]) diff --git a/mail_editor/process.py b/mail_editor/process.py index c4b12aa..50459d7 100644 --- a/mail_editor/process.py +++ b/mail_editor/process.py @@ -24,26 +24,9 @@ ] -def _html_inline_css(html: str) -> str: - inliner = css_inline.CSSInliner( - inline_style_tags=True, - keep_style_tags=False, - keep_link_tags=False, - extra_css=None, - load_remote_stylesheets=True, - # link urls will have been transformed earlier - base_url=f"file://{FILE_ROOT}/", - ) - try: - html = inliner.inline(html) - return html - except css_inline.InlineError as e: - # we never want errors to block important mail - # maybe we should log though - if settings.DEBUG: - raise e - else: - return html +class FileData(NamedTuple): + content: bytes + content_type: str class CIDAttachment(NamedTuple): @@ -52,17 +35,23 @@ class CIDAttachment(NamedTuple): content_type: str +class ProcessedHTML(NamedTuple): + html: str + cid_attachments: list[CIDAttachment] + + def process_html( html: str, base_url: str, extract_attachments: bool = True, inline_css: bool = True, -) -> tuple[str, list[CIDAttachment]]: +) -> ProcessedHTML: # TODO handle errors in cosmetics and make sure we always produce something parser = etree.HTMLParser() root = etree.fromstring(html, parser) static_url = make_url_absolute(settings.STATIC_URL, base_url) + media_url = make_url_absolute(settings.MEDIA_URL, base_url) image_attachments = dict() url_cid_cache = dict() @@ -93,14 +82,14 @@ def process_html( # we remembered this was a bad url continue else: - content, content_type = load_image(url, base_url) - if not content: + data = load_image(url, base_url, static_url, media_url) + if not data: # if we can't load the image leave element as-is and remember bad url url_cid_cache[url] = None continue - cid = cid_for_bytes(content) + cid = cid_for_bytes(data.content) url_cid_cache[url] = cid - image_attachments[cid] = (content, content_type) + image_attachments[cid] = CIDAttachment(cid, data.content, data.content_type) elem.set("src", f"cid:{cid}") @@ -110,7 +99,7 @@ def process_html( if not url: continue # resolving back to paths is needed so css-inliner's can use a file:// base_url - partial_file_path = find_static_path_for_inliner(url, static_url) + partial_file_path = _find_static_path_for_inliner(url, static_url) if partial_file_path: elem.set("href", partial_file_path) else: @@ -124,55 +113,20 @@ def process_html( if inline_css: result = _html_inline_css(result) - attachments = [ - CIDAttachment(cid, content, ct) - for cid, (content, ct) in image_attachments.items() - ] - - return result, attachments + return ProcessedHTML(result, list(image_attachments.values())) def cid_for_bytes(content: bytes) -> str: - # hash content for de-duplication h = hashlib.sha1(usedforsecurity=False) h.update(content) return h.hexdigest() -def read_image_file(path: str) -> tuple[Optional[bytes], str]: - try: - with open(path, "rb") as f: - content = f.read() - # is guess_type() what we want or do we look in the content? - content_type, _encoding = guess_type(path) - return content, content_type - except Exception: - # TODO stricter exception types - # we never want errors to block important mail - # maybe we should log though - return None, "" - - -def find_static_path_for_inliner(url: str, static_url: str) -> Optional[str]: - if url.startswith(static_url): - file_name = url[len(static_url) :] - file_path = os.path.join(settings.STATIC_ROOT, file_name) - if os.path.exists(file_path): - return str(os.path.relpath(file_path, start=FILE_ROOT)) - else: - file_path = finders.find(file_name) - if file_path: - return str(os.path.relpath(file_path, start=FILE_ROOT)) - - # we don't allow external links - return None - - -def load_image(url: str, base_url: str) -> tuple[Optional[bytes], str]: +def load_image( + url: str, base_url: str, static_url: str, media_url: str +) -> Optional[FileData]: # TODO support data urls? steal from mailcleaner - static_url = make_url_absolute(settings.STATIC_URL, base_url) - media_url = make_url_absolute(settings.MEDIA_URL, base_url) - content, content_type = None, "" + data = None url = make_url_absolute(url, base_url) @@ -180,19 +134,20 @@ def load_image(url: str, base_url: str) -> tuple[Optional[bytes], str]: file_name = url[len(static_url) :] file_path = os.path.join(settings.STATIC_ROOT, file_name) - content, content_type = read_image_file(file_path) - if not content: + data = read_image_file(file_path) + if not data: # fallback for development file_path = finders.find(file_name) if file_path: - content, content_type = read_image_file(file_path) + data = read_image_file(file_path) elif url.startswith(media_url): file_name = url[len(media_url) :] file_path = os.path.join(settings.MEDIA_ROOT, file_name) - content, content_type = read_image_file(file_path) + data = read_image_file(file_path) # else: + # NOTE: loading from http is currently not supported because of http overhead # url = make_url_absolute(base_url) # # TODO check domains? # # TODO check status @@ -201,11 +156,25 @@ def load_image(url: str, base_url: str) -> tuple[Optional[bytes], str]: # content_type = r.headers["Content-Type"] # content = r.content - if not content or content_type not in supported_types: - # TODO lets log - return None, "" + if data and data.content and data.content_type in supported_types: + return data else: - return content, content_type + # TODO lets log/? + return None + + +def read_image_file(path: str) -> Optional[FileData]: + try: + with open(path, "rb") as f: + content = f.read() + # is guess_type() what we want or do we look in the content? + content_type, _encoding = guess_type(path) + return FileData(content, content_type) + except Exception: + # TODO stricter exception types + # we never want errors to block important mail + # maybe we should log though + return None def make_url_absolute(url: str, base_url: str = "") -> str: @@ -233,3 +202,40 @@ def make_url_absolute(url: str, base_url: str = "") -> str: if url[0] != "/": url = f"/{url}" return base_url + url + + +def _html_inline_css(html: str) -> str: + inliner = css_inline.CSSInliner( + inline_style_tags=True, + keep_style_tags=False, + keep_link_tags=False, + extra_css=None, + load_remote_stylesheets=True, + # link urls will have been transformed earlier + base_url=f"file://{FILE_ROOT}/", + ) + try: + html = inliner.inline(html) + return html + except css_inline.InlineError as e: + # we never want errors to block important mail + # maybe we should log though + if settings.DEBUG: + raise e + else: + return html + + +def _find_static_path_for_inliner(url: str, static_url: str) -> Optional[str]: + if url.startswith(static_url): + file_name = url[len(static_url) :] + file_path = os.path.join(settings.STATIC_ROOT, file_name) + if os.path.exists(file_path): + return str(os.path.relpath(file_path, start=FILE_ROOT)) + else: + file_path = finders.find(file_name) + if file_path: + return str(os.path.relpath(file_path, start=FILE_ROOT)) + + # we don't allow external links + return None diff --git a/tests/test_process_html.py b/tests/test_process_html.py index ca7d481..9ff7d62 100644 --- a/tests/test_process_html.py +++ b/tests/test_process_html.py @@ -5,7 +5,7 @@ from django.test import TestCase from mail_editor.process import ( - cid_for_bytes, + FileData, cid_for_bytes, load_image, make_url_absolute, process_html, @@ -14,72 +14,113 @@ class ProcessTestCase(TestCase): - # TODO use better html markup & assertion style - @patch("mail_editor.process.cid_for_bytes", return_value="MY_CID") - @patch("mail_editor.process.load_image", return_value=(b"abc", "image/jpg")) + @patch("mail_editor.process.load_image", return_value=FileData(b"abc", "image/jpg")) def test_extract_images(self, m1, m2): - html = '
' - result, objects = process_html(html, "https://example.com") - - expected_html = ( - '' - ) - - self.assertEqual(result.rstrip(), expected_html) - self.assertEqual(objects, [("MY_CID", b"abc", "image/jpg")]) - - @patch("mail_editor.process.load_image", return_value=(None, "")) + html = """ + + + + """ + expected_html = """ + + + + """ + result = process_html(html, "https://example.com") + self.assertHTMLEqual(result.html, expected_html) + self.assertEqual(result.cid_attachments, [("MY_CID", b"abc", "image/jpg")]) + + @patch("mail_editor.process.load_image", return_value=None) def test_extract_images__keeps_absolute_url_when_not_loadable(self, m): - html = '' - result, objects = process_html(html, "https://example.com") - - expected_html = '' - - self.assertEqual(result.rstrip(), expected_html) - self.assertEqual(objects, []) + html = """ + + + + """ + expected_html = """ + + + + """ + result = process_html(html, "https://example.com") + self.assertHTMLEqual(result.html, expected_html) + self.assertEqual(result.cid_attachments, []) def test_fix_anchor_urls(self): - html = 'barbar' - result, objects = process_html(html, "https://example.com") - - expected_html = '
' - - self.assertEqual(result.rstrip(), expected_html) - self.assertEqual(objects, []) + html = """ + + bar + bar + + """ + expected_html = """ + + bar + bar + + """ + result = process_html(html, "https://example.com") + self.assertHTMLEqual(result.html, expected_html) + self.assertEqual(result.cid_attachments, []) def test_inlines_css_from_style(self): - html = "