diff --git a/mailbagit/formats/msg.py b/mailbagit/formats/msg.py index 5a6f246..c12c7c3 100644 --- a/mailbagit/formats/msg.py +++ b/mailbagit/formats/msg.py @@ -92,7 +92,21 @@ def messages(self, iteration_only=False): html_encoding = None text_encoding = None # encoding check priorities - encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}} + encodings = {} + """ + The listed values are apparently unreliable for HTML bodies. + Thus with the encodings dict empty, chardet will be used, which is apparently the least bad option. + try: + LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE = int("0x3fde", base=16) + LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE = int("0x3ffd", base=16) + message_body_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_BODY_CODEPAGE)] + message_codepage = extract_msg.encoding._CODE_PAGES[mail.getPropertyVal(LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE)] + encodings[1] = {"name": message_body_codepage, "label": "PidTagInternetCodepage"} + encodings[2] = {"name": message_codepage, "label": "PidTagMessageCodepage"} + except: + desc = "Error reading codepages" + errors = common.handle_error(errors, e, desc) + """ try: try: if mail.htmlBody: diff --git a/mailbagit/helper/derivative.py b/mailbagit/helper/derivative.py index 46c6d99..61b4761 100644 --- a/mailbagit/helper/derivative.py +++ b/mailbagit/helper/derivative.py @@ -214,15 +214,20 @@ def htmlFormatting(message, external_css, headers=True): # HT to extract_msg for this approach # https://github.com/TeamMsgExtractor/msg-extractor/blob/6bed8213de1a7a41739fcf5c9363322508711fce/extract_msg/message_base.py#L403-L414 tags = (tag for tag in soup.findAll("img") if tag.get("src") and tag.get("src").startswith("cid:")) - data = None for tag in tags: # Iterate through the attachments until we get the right one. + data = None cid = tag["src"][4:] for attachment in message.Attachments: if attachment.Name: if attachment.Name in cid: data = attachment.File + if data == None: + for attachment in message.Attachments: + if attachment.Content_ID: + if attachment.Content_ID in cid: + data = attachment.File # If we found anything, inject it. if data: diff --git a/mailbagit/helper/format.py b/mailbagit/helper/format.py index 45d8022..306da69 100644 --- a/mailbagit/helper/format.py +++ b/mailbagit/helper/format.py @@ -67,7 +67,7 @@ def safely_decode(body_type, binary_text, encodings, errors): try: valid_encoding = codecs.lookup(encodings[priority]["name"]).name.lower() valid.append(valid_encoding) - text = binary_text.decode(valid_encoding) + text = binary_text.decode(valid_encoding, errors="strict") used = encodings[priority]["name"] success = True break @@ -78,7 +78,7 @@ def safely_decode(body_type, binary_text, encodings, errors): if success == False: try: detected = chardet.detect(binary_text)["encoding"] - text = binary_text.decode(detected) + text = binary_text.decode(detected, errors="strict") used = detected if len(valid) < 1: # desc = "No valid listed encodings, but successfully decoded " + body_type + " body with detected encoding " + detected