Skip to content

Commit

Permalink
msg html bodies now rely on chardet. Also does a better job at inject…
Browse files Browse the repository at this point in the history
…ing embedded attached images
  • Loading branch information
gwiedeman committed Feb 5, 2024
1 parent d90d09b commit 7aa5bb6
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 4 deletions.
16 changes: 15 additions & 1 deletion mailbagit/formats/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,21 @@ def messages(self, iteration_only=False):
html_encoding = None
text_encoding = None
# encoding check priorities
encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}}
encodings = {}
"""
The listed values are apparently unreliable for HTML bodies.
Thus with the encodings dict empty, chardet will be used, which is apparently the least bad option.
try:
LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE = int("0x3fde", base=16)
LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE = int("0x3ffd", base=16)
message_body_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE)]
message_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE)]
encodings[1] = {"name": message_body_codepage, "label": "PidTagInternetCodepage"}
encodings[2] = {"name": message_codepage, "label": "PidTagMessageCodepage"}
except:
desc = "Error reading codepages"
errors = common.handle_error(errors, e, desc)
"""
try:
try:
if mail.htmlBody:
Expand Down
7 changes: 6 additions & 1 deletion mailbagit/helper/derivative.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,15 +214,20 @@ def htmlFormatting(message, external_css, headers=True):
# HT to extract_msg for this approach
# https://github.com/TeamMsgExtractor/msg-extractor/blob/6bed8213de1a7a41739fcf5c9363322508711fce/extract_msg/message_base.py#L403-L414
tags = (tag for tag in soup.findAll("img") if tag.get("src") and tag.get("src").startswith("cid:"))
data = None
for tag in tags:
# Iterate through the attachments until we get the right one.
data = None
cid = tag["src"][4:]

for attachment in message.Attachments:
if attachment.Name:
if attachment.Name in cid:
data = attachment.File
if data == None:
for attachment in message.Attachments:
if attachment.Content_ID:
if attachment.Content_ID in cid:
data = attachment.File

# If we found anything, inject it.
if data:
Expand Down
4 changes: 2 additions & 2 deletions mailbagit/helper/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
try:
valid_encoding = codecs.lookup(encodings[priority]["name"]).name.lower()
valid.append(valid_encoding)
text = binary_text.decode(valid_encoding)
text = binary_text.decode(valid_encoding, errors="strict")
used = encodings[priority]["name"]
success = True
break
Expand All @@ -78,7 +78,7 @@ def safely_decode(body_type, binary_text, encodings, errors):
if success == False:
try:
detected = chardet.detect(binary_text)["encoding"]
text = binary_text.decode(detected)
text = binary_text.decode(detected, errors="strict")
used = detected
if len(valid) < 1:
# desc = "No valid listed encodings, but successfully decoded " + body_type + " body with detected encoding " + detected
Expand Down

0 comments on commit 7aa5bb6

Please sign in to comment.