diff --git a/mail_editor/models.py b/mail_editor/models.py index 222300e..d427e2c 100644 --- a/mail_editor/models.py +++ b/mail_editor/models.py @@ -183,9 +183,9 @@ def build_message( """ subject, body = self.render(context, subj_context) - body, cid_attachments = process_html(body, settings.BASE_HOST) + result = process_html(body, settings.BASE_HOST) - text_body = txt or strip_tags(body) + text_body = txt or strip_tags(result.html) email_message = EmailMultiAlternatives( subject=subject, @@ -195,7 +195,7 @@ def build_message( cc=cc_addresses, bcc=bcc_addresses, ) - email_message.attach_alternative(body, "text/html") + email_message.attach_alternative(result.html, "text/html") email_message.mixed_subtype = "related" if attachments: @@ -209,8 +209,8 @@ def build_message( else: email_message.attach(*attachment) - if cid_attachments: - for att in cid_attachments: + if result.cid_attachments: + for att in result.cid_attachments: subtype = att.content_type.split("/", maxsplit=1) assert subtype[0] == "image" mime_image = MIMEImage(att.content, _subtype=subtype[1]) diff --git a/mail_editor/process.py b/mail_editor/process.py index c4b12aa..865775c 100644 --- a/mail_editor/process.py +++ b/mail_editor/process.py @@ -3,6 +3,7 @@ from mimetypes import guess_type from typing import NamedTuple, Optional from urllib.parse import urlparse +from urllib.request import urlopen from django.conf import settings from django.contrib.staticfiles import finders @@ -24,26 +25,9 @@ ] -def _html_inline_css(html: str) -> str: - inliner = css_inline.CSSInliner( - inline_style_tags=True, - keep_style_tags=False, - keep_link_tags=False, - extra_css=None, - load_remote_stylesheets=True, - # link urls will have been transformed earlier - base_url=f"file://{FILE_ROOT}/", - ) - try: - html = inliner.inline(html) - return html - except css_inline.InlineError as e: - # we never want errors to block important mail - # maybe we should log though - if settings.DEBUG: - raise e - else: - return html +class FileData(NamedTuple): + content: bytes + content_type: str class CIDAttachment(NamedTuple): @@ -52,17 +36,23 @@ class CIDAttachment(NamedTuple): content_type: str +class ProcessedHTML(NamedTuple): + html: str + cid_attachments: list[CIDAttachment] + + def process_html( html: str, base_url: str, extract_attachments: bool = True, inline_css: bool = True, -) -> tuple[str, list[CIDAttachment]]: +) -> ProcessedHTML: # TODO handle errors in cosmetics and make sure we always produce something parser = etree.HTMLParser() root = etree.fromstring(html, parser) static_url = make_url_absolute(settings.STATIC_URL, base_url) + media_url = make_url_absolute(settings.MEDIA_URL, base_url) image_attachments = dict() url_cid_cache = dict() @@ -93,14 +83,16 @@ def process_html( # we remembered this was a bad url continue else: - content, content_type = load_image(url, base_url) - if not content: + data = load_image(url, base_url, static_url, media_url) + if not data: # if we can't load the image leave element as-is and remember bad url url_cid_cache[url] = None continue - cid = cid_for_bytes(content) + cid = cid_for_bytes(data.content) url_cid_cache[url] = cid - image_attachments[cid] = (content, content_type) + image_attachments[cid] = CIDAttachment( + cid, data.content, data.content_type + ) elem.set("src", f"cid:{cid}") @@ -109,8 +101,8 @@ def process_html( url = elem.get("href") if not url: continue - # resolving back to paths is needed so css-inliner's can use a file:// base_url - partial_file_path = find_static_path_for_inliner(url, static_url) + # resolving back to paths is needed so css-inliner can use a file:// base_url + partial_file_path = _find_static_path_for_inliner(url, static_url) if partial_file_path: elem.set("href", partial_file_path) else: @@ -124,75 +116,46 @@ def process_html( if inline_css: result = _html_inline_css(result) - attachments = [ - CIDAttachment(cid, content, ct) - for cid, (content, ct) in image_attachments.items() - ] + # TODO support inlining CSS referenced images? - return result, attachments + return ProcessedHTML(result, list(image_attachments.values())) def cid_for_bytes(content: bytes) -> str: - # hash content for de-duplication h = hashlib.sha1(usedforsecurity=False) h.update(content) return h.hexdigest() -def read_image_file(path: str) -> tuple[Optional[bytes], str]: - try: - with open(path, "rb") as f: - content = f.read() - # is guess_type() what we want or do we look in the content? - content_type, _encoding = guess_type(path) - return content, content_type - except Exception: - # TODO stricter exception types - # we never want errors to block important mail - # maybe we should log though - return None, "" - - -def find_static_path_for_inliner(url: str, static_url: str) -> Optional[str]: - if url.startswith(static_url): - file_name = url[len(static_url) :] - file_path = os.path.join(settings.STATIC_ROOT, file_name) - if os.path.exists(file_path): - return str(os.path.relpath(file_path, start=FILE_ROOT)) - else: - file_path = finders.find(file_name) - if file_path: - return str(os.path.relpath(file_path, start=FILE_ROOT)) - - # we don't allow external links - return None - - -def load_image(url: str, base_url: str) -> tuple[Optional[bytes], str]: +def load_image( + url: str, base_url: str, static_url: str, media_url: str +) -> Optional[FileData]: # TODO support data urls? steal from mailcleaner - static_url = make_url_absolute(settings.STATIC_URL, base_url) - media_url = make_url_absolute(settings.MEDIA_URL, base_url) - content, content_type = None, "" + data = None url = make_url_absolute(url, base_url) - if url.startswith(static_url): + if url.startswith("data:"): + data = read_data_uri(url) + + elif url.startswith(static_url): file_name = url[len(static_url) :] file_path = os.path.join(settings.STATIC_ROOT, file_name) - content, content_type = read_image_file(file_path) - if not content: + data = read_image_file(file_path) + if not data: # fallback for development file_path = finders.find(file_name) if file_path: - content, content_type = read_image_file(file_path) + data = read_image_file(file_path) elif url.startswith(media_url): file_name = url[len(media_url) :] file_path = os.path.join(settings.MEDIA_ROOT, file_name) - content, content_type = read_image_file(file_path) + data = read_image_file(file_path) # else: + # NOTE: loading from http is currently not supported because of http overhead # url = make_url_absolute(base_url) # # TODO check domains? # # TODO check status @@ -201,18 +164,49 @@ def load_image(url: str, base_url: str) -> tuple[Optional[bytes], str]: # content_type = r.headers["Content-Type"] # content = r.content - if not content or content_type not in supported_types: - # TODO lets log - return None, "" + if data and data.content and data.content_type in supported_types: + return data else: - return content, content_type + # TODO lets log/? + return None + + +def read_data_uri(uri: str) -> Optional[FileData]: + assert uri.startswith("data:") + + try: + with urlopen(uri) as response: + content = response.read() + content_type = response.headers.get("Content-Type") + if content and content_type: + return FileData(content, content_type) + except Exception: + # TODO stricter exception types + # we never want errors to block important mail + # maybe we should log though + pass + + return None + + +def read_image_file(path: str) -> Optional[FileData]: + try: + with open(path, "rb") as f: + content = f.read() + # is guess_type() what we want or do we look in the content? + content_type, _encoding = guess_type(path) + return FileData(content, content_type) + except Exception: + # TODO stricter exception types + # we never want errors to block important mail + # maybe we should log though + return None def make_url_absolute(url: str, base_url: str = "") -> str: """ base_url: https://domain """ - # TODO surely there is a standard and proper way to do this? # TODO we're using the path part as file path so we should handle sneaky attempts to use relative ".." try: parse = urlparse(url) @@ -233,3 +227,40 @@ def make_url_absolute(url: str, base_url: str = "") -> str: if url[0] != "/": url = f"/{url}" return base_url + url + + +def _html_inline_css(html: str) -> str: + inliner = css_inline.CSSInliner( + inline_style_tags=True, + keep_style_tags=False, + keep_link_tags=False, + extra_css=None, + load_remote_stylesheets=True, + # link urls will have been transformed earlier + base_url=f"file://{FILE_ROOT}/", + ) + try: + html = inliner.inline(html) + return html + except css_inline.InlineError as e: + # we never want errors to block important mail + # maybe we should log though + if settings.DEBUG: + raise e + else: + return html + + +def _find_static_path_for_inliner(url: str, static_url: str) -> Optional[str]: + if url.startswith(static_url): + file_name = url[len(static_url) :] + file_path = os.path.join(settings.STATIC_ROOT, file_name) + if os.path.exists(file_path): + return str(os.path.relpath(file_path, start=FILE_ROOT)) + else: + file_path = finders.find(file_name) + if file_path: + return str(os.path.relpath(file_path, start=FILE_ROOT)) + + # we don't allow external links + return None diff --git a/testapp/static/image.svg b/testapp/static/image.svg new file mode 100644 index 0000000..5f074ce --- /dev/null +++ b/testapp/static/image.svg @@ -0,0 +1,7 @@ + diff --git a/tests/test_process_helpers.py b/tests/test_process_helpers.py new file mode 100644 index 0000000..b827e72 --- /dev/null +++ b/tests/test_process_helpers.py @@ -0,0 +1,147 @@ +import base64 +import os + +from django.conf import settings +from django.test import TestCase + +from mail_editor.process import ( + cid_for_bytes, + load_image, + make_url_absolute, + read_data_uri, + read_image_file, +) + + +class ProcessHelpersTestCase(TestCase): + def test_make_url_absolute(self): + tests = [ + ("http://example.com", "/foo", "http://example.com/foo"), + ("http://example.com", "foo", "http://example.com/foo"), + ("http://example.com", "", "http://example.com"), + ("http://example.com", "http://example.com/foo", "http://example.com/foo"), + ("http://example.com", "", "http://example.com"), + ("", "http://example.com/foo", "http://example.com/foo"), + ("", "foo", "/foo"), + ("", "", "/"), + # extras + ("http://example.com", "tel:123456789", "tel:123456789"), + ("http://example.com", "mailto:foo@example.com", "mailto:foo@example.com"), + ( + "http://example.com", + "data:image/png;base64,xyz", + "data:image/png;base64,xyz", + ), + ] + for i, (base, url, expected) in enumerate(tests): + with self.subTest((i, base, url)): + self.assertEqual(make_url_absolute(url, base), expected) + + def test_cid_for_bytes(self): + self.assertEqual(cid_for_bytes(b"abc"), cid_for_bytes(b"abc")) + self.assertNotEqual(cid_for_bytes(b"12356"), cid_for_bytes(b"abc")) + + def test_read_image_file(self): + with self.subTest("exists"): + path = os.path.join(settings.STATIC_ROOT, "logo.png") + with open(path, "rb") as f: + expected = f.read() + + data = read_image_file(path) + self.assertEqual(data.content, expected) + self.assertEqual(data.content_type, "image/png") + + with self.subTest("not exists"): + path = os.path.join(settings.STATIC_ROOT, "not_exists.png") + data = read_image_file(path) + self.assertIsNone(data) + + def test_read_data_uri(self): + path = os.path.join(settings.STATIC_ROOT, "logo.png") + with open(path, "rb") as f: + png_data = f.read() + png_b64 = base64.b64encode(png_data).decode("utf8") + + with self.subTest("valid"): + datauri = f"data:image/png;base64,{png_b64}" + data = read_data_uri(datauri) + self.assertEqual(data.content, png_data) + self.assertEqual(data.content_type, "image/png") + + with self.subTest("bad content"): + datauri = "data:image/png;base64,xxxxxxxxxxx" + data = read_data_uri(datauri) + self.assertIsNone(data) + + def test_load_image(self): + # TODO properly test both collected STATIC_ROOT and the development staticfiles.finders fallback + + static_url = make_url_absolute(settings.STATIC_URL, "http://testserver") + media_url = make_url_absolute(settings.MEDIA_URL, "http://testserver") + + with self.subTest("static URL & PNG"): + with open(os.path.join(settings.STATIC_ROOT, "logo.png"), "rb") as f: + expected = f.read() + + data = load_image( + "/static/logo.png", "http://testserver", static_url, media_url + ) + self.assertEqual(data.content, expected) + self.assertEqual(data.content_type, "image/png") + + with self.subTest("static URL & SVG"): + data = load_image( + "/static/image.svg", "http://testserver", static_url, media_url + ) + self.assertIsNone(data) + + with self.subTest("media URL & JPG"): + with open(os.path.join(settings.MEDIA_ROOT, "logo.jpg"), "rb") as f: + expected = f.read() + + data = load_image( + "/media/logo.jpg", "http://testserver", static_url, media_url + ) + self.assertEqual(data.content, expected) + self.assertEqual(data.content_type, "image/jpeg") + + with self.subTest("datauri & PNG"): + path = os.path.join(settings.STATIC_ROOT, "logo.png") + with open(path, "rb") as f: + png_data = f.read() + png_b64 = base64.b64encode(png_data).decode("utf8") + + data = load_image( + f"data:image/png;base64,{png_b64}", + "http://testserver", + static_url, + media_url, + ) + self.assertEqual(data.content, png_data) + self.assertEqual(data.content_type, "image/png") + + with self.subTest("datauri & SVG"): + path = os.path.join(settings.STATIC_ROOT, "image.svg") + with open(path, "rb") as f: + svg_data = f.read() + svg_b64 = base64.b64encode(svg_data).decode("utf8") + + data = load_image( + f"data:image/svg+xml;base64,{svg_b64}", + "http://testserver", + static_url, + media_url, + ) + self.assertIsNone(data) + + with self.subTest("static not exists"): + data = load_image( + "/static/not_exists.png", "http://testserver", static_url, media_url + ) + self.assertIsNone(data) + + with self.subTest("media not exists"): + data = load_image( + "/media/not_exists.png", "http://testserver", static_url, media_url + ) + self.assertIsNone(data) diff --git a/tests/test_process_html.py b/tests/test_process_html.py index ca7d481..8d8b0fd 100644 --- a/tests/test_process_html.py +++ b/tests/test_process_html.py @@ -1,162 +1,119 @@ -import os from unittest.mock import patch -from django.conf import settings from django.test import TestCase -from mail_editor.process import ( - cid_for_bytes, - load_image, - make_url_absolute, - process_html, - read_image_file, -) +from mail_editor.process import FileData, process_html class ProcessTestCase(TestCase): - # TODO use better html markup & assertion style + """ + note the patched helpers are individually and more exhaustively tested elsewhere + """ @patch("mail_editor.process.cid_for_bytes", return_value="MY_CID") - @patch("mail_editor.process.load_image", return_value=(b"abc", "image/jpg")) + @patch("mail_editor.process.load_image", return_value=FileData(b"abc", "image/jpg")) def test_extract_images(self, m1, m2): - html = '
' - result, objects = process_html(html, "https://example.com") - - expected_html = ( - '' - ) - - self.assertEqual(result.rstrip(), expected_html) - self.assertEqual(objects, [("MY_CID", b"abc", "image/jpg")]) - - @patch("mail_editor.process.load_image", return_value=(None, "")) + html = """ + + + + """ + expected_html = """ + + + + """ + result = process_html(html, "https://example.com") + self.assertHTMLEqual(result.html, expected_html) + self.assertEqual(result.cid_attachments, [("MY_CID", b"abc", "image/jpg")]) + + @patch("mail_editor.process.load_image", return_value=None) def test_extract_images__keeps_absolute_url_when_not_loadable(self, m): - html = '' - result, objects = process_html(html, "https://example.com") - - expected_html = '' - - self.assertEqual(result.rstrip(), expected_html) - self.assertEqual(objects, []) + html = """ + + + + """ + expected_html = """ + + + + """ + result = process_html(html, "https://example.com") + self.assertHTMLEqual(result.html, expected_html) + self.assertEqual(result.cid_attachments, []) def test_fix_anchor_urls(self): - html = 'barbar' - result, objects = process_html(html, "https://example.com") - - expected_html = '
' - - self.assertEqual(result.rstrip(), expected_html) - self.assertEqual(objects, []) + html = """ + + foo + bar + + """ + expected_html = """ + + foo + bar + + """ + result = process_html(html, "https://example.com") + self.assertHTMLEqual(result.html, expected_html) + self.assertEqual(result.cid_attachments, []) def test_inlines_css_from_style(self): - html = "