msg html bodies now rely on chardet. Also does a better job at inject…

…ing embedded attached images
UAlbanyArchives · Feb 5, 2024 · 7aa5bb6 · 7aa5bb6
1 parent d90d09b
commit 7aa5bb6
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 4 deletions.
diff --git a/mailbagit/formats/msg.py b/mailbagit/formats/msg.py
@@ -92,7 +92,21 @@ def messages(self, iteration_only=False):
                 html_encoding = None
                 text_encoding = None
                 # encoding check priorities
-                encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}}
+                encodings = {}
+                """
+                The listed values are apparently unreliable for HTML bodies.
+                Thus with the encodings dict empty, chardet will be used, which is apparently the least bad option.
+                try:
+                    LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE = int("0x3fde", base=16)
+                    LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE = int("0x3ffd", base=16)
+                    message_body_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE)]
+                    message_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE)]
+                    encodings[1] = {"name": message_body_codepage, "label": "PidTagInternetCodepage"}
+                    encodings[2] = {"name": message_codepage, "label": "PidTagMessageCodepage"}
+                except:
+                    desc = "Error reading codepages"
+                    errors = common.handle_error(errors, e, desc)
+                """
                 try:
                     try:
                         if mail.htmlBody:

diff --git a/mailbagit/helper/derivative.py b/mailbagit/helper/derivative.py
@@ -214,15 +214,20 @@ def htmlFormatting(message, external_css, headers=True):
         # HT to extract_msg for this approach
         # https://github.com/TeamMsgExtractor/msg-extractor/blob/6bed8213de1a7a41739fcf5c9363322508711fce/extract_msg/message_base.py#L403-L414
         tags = (tag for tag in soup.findAll("img") if tag.get("src") and tag.get("src").startswith("cid:"))
-        data = None
         for tag in tags:
             # Iterate through the attachments until we get the right one.
+            data = None
             cid = tag["src"][4:]
 
             for attachment in message.Attachments:
                 if attachment.Name:
                     if attachment.Name in cid:
                         data = attachment.File
+            if data == None:
+                for attachment in message.Attachments:
+                    if attachment.Content_ID:
+                        if attachment.Content_ID in cid:
+                            data = attachment.File
 
             # If we found anything, inject it.
             if data:

diff --git a/mailbagit/helper/format.py b/mailbagit/helper/format.py
@@ -67,7 +67,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
             try:
                 valid_encoding = codecs.lookup(encodings[priority]["name"]).name.lower()
                 valid.append(valid_encoding)
-                text = binary_text.decode(valid_encoding)
+                text = binary_text.decode(valid_encoding, errors="strict")
                 used = encodings[priority]["name"]
                 success = True
                 break
@@ -78,7 +78,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
     if success == False:
         try:
             detected = chardet.detect(binary_text)["encoding"]
-            text = binary_text.decode(detected)
+            text = binary_text.decode(detected, errors="strict")
             used = detected
             if len(valid) < 1:
                 # desc = "No valid listed encodings, but successfully decoded " + body_type + " body with detected encoding " + detected