From e41fa565b5fac2bb913a71d7b3b1db42ff97e71f Mon Sep 17 00:00:00 2001 From: alfons Date: Wed, 4 Jan 2023 18:14:37 +0100 Subject: [PATCH] =?UTF-8?q?Initial=20commit=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 59 ++++ LICENSE | 21 ++ MANIFEST.in | 2 + mailparser_reply/__init__.py | 1 + mailparser_reply/constants.py | 99 ++++++ mailparser_reply/parser.py | 269 +++++++++++++++ mailparser_reply/version.py | 1 + main.py | 14 + setup.py | 38 +++ test/__init__.py | 0 test/emails/caution.txt | 50 +++ test/emails/caution2.txt | 12 + test/emails/correct_sig.txt | 4 + test/emails/email_1_1.txt | 13 + test/emails/email_1_3.txt | 55 ++++ test/emails/email_1_4.txt | 8 + test/emails/email_1_5.txt | 15 + test/emails/email_1_6.txt | 15 + test/emails/email_1_7.txt | 12 + test/emails/email_1_8.txt | 37 +++ test/emails/email_2_1.txt | 25 ++ test/emails/email_2_2.txt | 10 + test/emails/email_2_3.txt | 20 ++ test/emails/email_3_1.txt | 56 ++++ test/emails/email_bullets.txt | 22 ++ test/emails/email_gmail.txt | 14 + test/emails/email_headers_no_delimiter.txt | 15 + test/emails/email_one_is_not_on.txt | 10 + test/emails/email_partial_quote_header.txt | 13 + test/emails/email_sent_from_BlackBerry.txt | 3 + test/emails/email_sent_from_iPhone.txt | 3 + ...ail_sent_from_multi_word_mobile_device.txt | 3 + test/emails/email_sent_from_not_signature.txt | 3 + .../email_sig_delimiter_in_middle_of_line.txt | 7 + test/emails/forward.txt | 6 + test/emails/greedy_on.txt | 16 + test/emails/multi_header.txt | 41 +++ test/emails/multiline_on.txt | 14 + test/emails/multiline_on_de.txt | 14 + test/emails/outlook.txt | 9 + test/emails/pathological.txt | 20 ++ test/test_email_reply_parser.py | 309 ++++++++++++++++++ 42 files changed, 1358 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 mailparser_reply/__init__.py create mode 100644 
mailparser_reply/constants.py create mode 100644 mailparser_reply/parser.py create mode 100644 mailparser_reply/version.py create mode 100644 main.py create mode 100644 setup.py create mode 100644 test/__init__.py create mode 100644 test/emails/caution.txt create mode 100644 test/emails/caution2.txt create mode 100644 test/emails/correct_sig.txt create mode 100644 test/emails/email_1_1.txt create mode 100644 test/emails/email_1_3.txt create mode 100644 test/emails/email_1_4.txt create mode 100644 test/emails/email_1_5.txt create mode 100644 test/emails/email_1_6.txt create mode 100644 test/emails/email_1_7.txt create mode 100644 test/emails/email_1_8.txt create mode 100644 test/emails/email_2_1.txt create mode 100644 test/emails/email_2_2.txt create mode 100644 test/emails/email_2_3.txt create mode 100644 test/emails/email_3_1.txt create mode 100644 test/emails/email_bullets.txt create mode 100644 test/emails/email_gmail.txt create mode 100644 test/emails/email_headers_no_delimiter.txt create mode 100644 test/emails/email_one_is_not_on.txt create mode 100644 test/emails/email_partial_quote_header.txt create mode 100644 test/emails/email_sent_from_BlackBerry.txt create mode 100644 test/emails/email_sent_from_iPhone.txt create mode 100644 test/emails/email_sent_from_multi_word_mobile_device.txt create mode 100644 test/emails/email_sent_from_not_signature.txt create mode 100644 test/emails/email_sig_delimiter_in_middle_of_line.txt create mode 100644 test/emails/forward.txt create mode 100644 test/emails/greedy_on.txt create mode 100644 test/emails/multi_header.txt create mode 100644 test/emails/multiline_on.txt create mode 100644 test/emails/multiline_on_de.txt create mode 100644 test/emails/outlook.txt create mode 100644 test/emails/pathological.txt create mode 100644 test/test_email_reply_parser.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18783b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,59 @@ +# Static Files +/test/advanced +.idea/ + 
+# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Unit test / coverage reports +/test-results/ +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# pyenv +.python-version + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# macOS Zeug +.DS_Store +/static/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..29424e2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 RAUSYS, Rau Systemberatung GmbH + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
"""Regular-expression building blocks used to split a mail body into replies.

Every constant below is a regex *source* string (or a list of alternative
source strings). Nothing is compiled here; the parser joins and compiles them.
"""
from typing import Dict, List, Union

#: Fallback language if no other language is specified
MAIL_LANGUAGE_DEFAULT = 'en'

#: Matches text-mail quotation markers (one or more ">"), as in "> Hello world"
QUOTED_REGEX = r'(>+)'
#: Regex to remove a leading quotation marker (">" plus following spaces)
QUOTED_REMOVAL_REGEX = r'^(> *)'
#: Allow to match within (multi)-quoted body
#: e.g. allowing regex to match *inside* lines starting with "> > ..."
QUOTED_MATCH_INCLUDE = r'(?:> ?)*'

#: Outlook-style mail separator: at least two newlines, an optional space and
#: then a run of 32 or more underscores or hyphens; also occasionally used
#: within signatures
OUTLOOK_MAIL_SEPARATOR = r'(\n{2,} ?[_-]{32,})'
#: "-----Original Message-----" style separator (old Outlook)
GENERIC_MAIL_SEPARATOR = r'^-{5,} ?Original Message ?-{5,}$'  # unused

#: Default signature start. After optional leading whitespace and optional
#: quote markers the line is either exactly two hyphens/underscores directly
#: followed by a newline, or a hyphen (optionally followed by one space) and a
#: word character:
# 1) --\nJohn Doe
# 2) -John Doe
DEFAULT_SIGNATURE_REGEX = rf'\s*^{QUOTED_MATCH_INCLUDE}(?:[-_]{{2}}\n|- ?\w).*'

#: All kinds of whitespaces incl special characters; used for Disclaimers, because they
#: are usually either added in post by a mailserver or scrambled due to their higher complexity.
SINGLE_SPACE_VARIATIONS = '[ \u200b\xA0\t]'
#: Up to three such whitespaces on either side of an optional line break,
#: optionally wrapped in a comma or parenthesis
OPTIONAL_LINEBREAK = f'[,()]?{SINGLE_SPACE_VARIATIONS}{{0,3}}[\n\r]?{SINGLE_SPACE_VARIATIONS}{{0,3}}[,()]?'
#: Start of a sentence: a line break, sentence-ending punctuation or the start
#: of a line, followed by up to three whitespaces
SENTENCE_START = f'(?:[\n\r.!?]|^){SINGLE_SPACE_VARIATIONS}{{0,3}}'

#: Matching regex for all languages. Keys per language:
#:   wrote_header – "On <date>, <sender> wrote:" style reply header
#:   from_header  – block of "From:/Sent:/To:/..." header lines
#:   disclaimers  – list of disclaimer lead-ins
#:   signatures   – list of signature lead-ins
#:   sent_from    – "Sent from my ..." style device footers
MAIL_LANGUAGES: Dict[str, Dict[str, Union[str, List[str]]]] = {
    'de': {
        'wrote_header': r'^(?!Am.*Am\s.+?schrieb.*:)(' + QUOTED_MATCH_INCLUDE + r'Am\s(?:.+?\s?)schrieb\s(?:.+?\s?.+?):)$',
        'from_header': r'((?:(?:^|\n|\n' + QUOTED_MATCH_INCLUDE + r')[* ]*(?:Von|Gesendet|An|Betreff|Datum):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
        'disclaimers': [
            '(?:Wichtiger )?Hinweis:',
            'Achtung:',
        ],
        'signatures': [
            r'Mit freundlichen Gr\u00fc\u00DFen',
            r'Mit freundlichen Gr\u00fc\u00DFen / (?:Best|Kind) regards,',
            r'(?:(?:Beste(?:n)?|Liebe|Viele) )?(?:Gr(?:\u00fc|ue)(?:\u00DF|ss)(?:e)?|Gru\u00DF|Gruss)',
        ],
        'sent_from': 'Gesendet von',
    },
    'en': {
        # ^(?!On[.\s]*On\s(.+?\s?.+?)\swrote:) – Negative lookahead, see:
        #   https://github.com/github/email_reply_parser/pull/31
        # QUOTED_MATCH_INCLUDE – allow matching this inside quoted levels
        # On\s(?:.+?\s?.+?)\s?wrote:) – match "On 01.01.2025, John Doe wrote:"
        # See multiline_on.txt for example data
        'wrote_header': r'^(?!On[.\s]*On\s(.+?\s?.+?)\swrote:)(' + QUOTED_MATCH_INCLUDE + r'On\s(?:.+?\s?.+?)\s?wrote:)$',
        # (?:(?:^|\n)[* ]*(?:From|Sent|To|Subject|Date|Cc):[ *]* – match From:/*From*:, ... headers
        # (?:\s{,2}).*){2,} – allow multi-line headers; some clients split the headers up into multiple lines.
        #   Also require at least two occurrences of the above pattern; e.g. From: ...\n Sent: ...
        # (?:\n.*){,1} – allow optional subject or other broken multi-line at the end
        'from_header': r'((?:(?:^|\n|\n' + QUOTED_MATCH_INCLUDE + r')[* ]*(?:From|Sent|To|Subject|Date|Cc):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
        'disclaimers': [
            'CAUTION:',
            'Disclaimer:',
            'Warning:',
            'Confidential:',
            'CONFIDENTIALITY:',
            '(?:Privileged|Confidential|Private|Sensitive|Important) (?:Notice|Note|Information):',
            # Raw string: "\*" is not a valid escape in a plain string literal.
            # The resulting pattern text is unchanged.
            r'[\* ]*Disclaimer[\* ]*',
        ],
        'signatures': [
            'Best regards',
            'Kind Regards',
            'Thanks,',
            'Thank you,',
            'Best,',
            'All the best',
            'regards,',
        ],
        'sent_from': 'Sent from my|Get Outlook for',
    },
    'fr': {
        'wrote_header': r'(?!Le.*Le\s.+?a \u00e9crit[a-zA-Z0-9.:;<>()&@ -]*:)(' + QUOTED_MATCH_INCLUDE + r'Le\s(.+?)a \u00e9crit[a-zA-Z0-9.:;<>()&@ -]*:)',
        # NOTE(review): the bare " " alternative between "Objet " and "Cc " looks
        # like a dropped keyword (possibly "Date ") – confirm before changing.
        'from_header': r'((?:(?:^|\n|\n' + QUOTED_MATCH_INCLUDE + r')[* ]*(?:De |Envoy\u00e9 |\u00C0 |Objet | |Cc ):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
        'signatures': [
            'cordialement',
            'salutations',
            r'bonne r[\u00e9e]ception',
            r'bonne journ[\u00e9e]e',
        ],
        'sent_from': r'Envoy\u00e9 depuis',
    },
    'david': {
        # Custom Software Headers – also kind of like a language, right?
        'from_header': r'((?:^ *Original Message processed by david.+?$\n{,7})(?:.*\n){,3}(?:(?:^|\n)[* ]*(?:Von|An|Cc)(?:\s{,2}).*){2,})',
    },
}
# See the License for the specific language governing
# permissions and limitations under the License.
#
# Selling exceptions may apply as per written contractual agreements between
# commercial entities and Rau Systemberatung GmbH, releasing the former from
# the License obligations.

import logging
import re
from dataclasses import dataclass, field
from itertools import chain
# ``typing.re`` was removed in Python 3.13; ``typing.Pattern`` is the same alias.
from typing import List, Optional, Pattern, Tuple, Union

from .constants import MAIL_LANGUAGES, MAIL_LANGUAGE_DEFAULT, OUTLOOK_MAIL_SEPARATOR, QUOTED_REMOVAL_REGEX, \
    SINGLE_SPACE_VARIATIONS, SENTENCE_START, OPTIONAL_LINEBREAK, DEFAULT_SIGNATURE_REGEX, QUOTED_MATCH_INCLUDE

logger = logging.getLogger(__name__)


@dataclass
class EmailReplyParser:
    """Easy EmailMessage parsing interface.

    ``languages`` is normalised on construction: entries are lower-cased and
    stripped, entries that are not keys of ``MAIL_LANGUAGES`` are dropped, and
    if nothing remains ``[default_language]`` is used. A single language may
    also be given as a plain string (``languages='de'``).
    """
    languages: List[str] = field(default_factory=list)
    default_language: str = MAIL_LANGUAGE_DEFAULT

    def __post_init__(self):
        requested = self.languages or []
        # A bare string would otherwise be iterated character by character and
        # silently discarded ('e', 'n' are not language keys).
        if isinstance(requested, str):
            requested = [requested]
        normalized = (language.lower().strip() for language in requested)
        self.languages = [language for language in normalized if language in MAIL_LANGUAGES]
        if not self.languages:
            self.languages = [self.default_language]

    def read(self, text: str) -> 'EmailMessage':
        """Build an ``EmailMessage`` from ``text`` and split it into replies.

        text - A string email body
        """
        # Pass a copy: EmailMessage may append 'en' to its language list, which
        # must not leak back into this parser's configuration.
        return EmailMessage(text=text, languages=list(self.languages)).read()

    def parse_reply(self, text: str) -> Optional[str]:
        """Return the latest reply portion of the email (``None`` if there is none).

        text - A string email body
        """
        return self.read(text).latest_reply
@dataclass
class EmailMessage:
    """An email message represents a parsed email body.

    On construction the text is normalised (see ``_normalize_text``). Calling
    ``read()`` splits it at every detected reply header into ``EmailReply``
    objects stored in ``replies``, newest (top-most) first.
    """
    #: Email message text body
    text: str

    #: Languages used to detect common mail client email headers, separating replies.
    #: This is used to fragment the mail into its single replies.
    languages: List[str] = field(default_factory=list)

    #: Standalone replies mail is made out of
    replies: List['EmailReply'] = field(default_factory=list)

    #: Whether to automatically include English versions too; desirable in multi-language environments
    include_english: bool = True

    #: Whether to remove quotes on standalone replies (aka replies, that do not *include* quoted content,
    #: but are completely quoted by themselves)
    remove_quotes_replies: bool = False  # TODO: Implement?

    #: Fallback language when other languages don't have dict entry
    default_language: str = MAIL_LANGUAGE_DEFAULT
    # Lazily compiled patterns, filled by the upper-case properties below
    _header_regex: Union[Pattern, None] = None
    _disclaimers_regex: Union[Pattern, None] = None
    _signature_regex: Union[Pattern, None] = None

    def __post_init__(self):
        # Work on a private copy so appending 'en' never mutates the caller's list
        self.languages = list(self.languages)
        if self.include_english and 'en' not in self.languages:
            self.languages.append('en')
        self._normalize_text()

    @property
    def latest_reply(self) -> Union[str, None]:
        """Content of the first entry in ``replies``, or ``None`` when there are no replies."""
        if not self.replies:
            return None
        return self.replies[0].content

    def _get_language_regex(self, language: str, regex_key: str) -> str:
        """Return the regex source stored under ``regex_key`` for ``language``.

        List values are joined with ``|``. When the language has no such entry:
        if ``default_language`` is itself one of ``self.languages`` an empty
        string is returned (the default's pattern is contributed by its own
        entry); otherwise the default language's pattern is used, or ``''`` if
        the default language does not define the key either.
        """
        patterns = MAIL_LANGUAGES.get(language, {}).get(regex_key)
        if patterns is None:
            if self.default_language in self.languages:
                return ''
            # Fallback; language does not have regex_key defined; use global fallback language's regex key
            patterns = MAIL_LANGUAGES.get(self.default_language, {}).get(regex_key, '')
        return '|'.join(patterns) if isinstance(patterns, list) else patterns

    def _collect_language_regexes(self, regex_key: str) -> List[str]:
        """Return the non-empty ``regex_key`` patterns of all configured languages, in order."""
        candidates = (self._get_language_regex(language=language, regex_key=regex_key) for language in self.languages)
        return [pattern for pattern in candidates if pattern]

    @property
    def DISCLAIMERS_REGEX(self) -> Pattern:
        """Compiled (and cached) regex matching disclaimers.

        A match is a sentence start, one of the configured disclaimer lead-ins,
        and one or two stretches of text that each contain the word "mail".
        Compiled with MULTILINE and IGNORECASE.
        """
        if self._disclaimers_regex is not None:
            return self._disclaimers_regex

        ALLOW_ANY_EXTENSION = r'[a-zA-Z0-9\u00C0-\u017F:;.,?!<>()@&/\'\"\“\” \u200b\xA0\t\-]*'
        # Every literal space in a lead-in may be any of the whitespace variations.
        # NOTE(review): this also rewrites spaces *inside* character classes
        # (e.g. "[\* ]*Disclaimer[\* ]*"), producing a nested set – confirm intended.
        disclaimers = '|'.join(self._collect_language_regexes('disclaimers')).replace(' ', SINGLE_SPACE_VARIATIONS)

        self._disclaimers_regex = re.compile(
            f'{SENTENCE_START}(?:{disclaimers})(?:{OPTIONAL_LINEBREAK}{ALLOW_ANY_EXTENSION}?(?:mail){ALLOW_ANY_EXTENSION}){{1,2}}',
            flags=re.MULTILINE | re.IGNORECASE
        )
        logger.debug('Mail Disclaimer RegEx: "%r"', self._disclaimers_regex.pattern)
        return self._disclaimers_regex

    @property
    def HEADER_REGEX(self) -> Pattern:
        """Compiled (and cached) regex detecting reply headers.

        Alternation of every language's ``wrote_header`` followed by every
        language's ``from_header``; compiled with MULTILINE and IGNORECASE.
        """
        if self._header_regex is not None:
            return self._header_regex
        regex_headers = self._collect_language_regexes('wrote_header')
        regex_headers += self._collect_language_regexes('from_header')
        self._header_regex = re.compile('|'.join(regex_headers), flags=re.MULTILINE | re.IGNORECASE)
        logger.debug('Mail Header RegEx: "%r"', self._header_regex.pattern)
        return self._header_regex

    @property
    def SIGNATURE_REGEX(self) -> Pattern:
        """Compiled (and cached) regex matching a signature start plus everything after it."""
        if self._signature_regex is not None:
            return self._signature_regex
        sent_from_regex = '|'.join(self._collect_language_regexes('sent_from'))
        signatures = '|'.join(self._collect_language_regexes('signatures'))
        # Matches the following signatures – when a signature is matched it's considered to move all the way
        # until the end of the mail body. Might be dangerous; but honestly how github/email_reply_parser works too
        #   1) Outlook-style signatures
        #   2) Idiot-filter phone email_reply_parser "Sent from my ..." (usually 1-3 words)
        #   3) Get Outlook for... / Sent from Outlook for iOS
        #   4) Regular signature-indicating stuff; e.g. "Best regards, ..."
        # TODO: Add quotation optional
        # The {1,3} quantifier must be written {{1,3}}: inside an f-string a bare
        # {1,3} is evaluated as the tuple (1, 3) and rendered as the text "(1, 3)".
        self._signature_regex = re.compile(
            fr'(({DEFAULT_SIGNATURE_REGEX}|{OUTLOOK_MAIL_SEPARATOR}|'  # 1)
            fr'\s*^{QUOTED_MATCH_INCLUDE}(?:{sent_from_regex}) ?(?:(?:[\w.<>:// ]+)|(?:\w+ ){{1,3}})$|'  # 2) + 3)
            fr'^{QUOTED_MATCH_INCLUDE}(?:{signatures}))(.|\s)*)',  # 4)
            flags=re.MULTILINE | re.IGNORECASE
        )
        logger.debug('Mail Signature RegEx: "%r"', self._signature_regex.pattern)

        # TODO: Always match whole signature until the next fragment/regex or until end of text
        return self._signature_regex

    def _normalize_text(self):
        """Normalise ``self.text`` in place: unify line endings and strip every line."""
        # Normalize Line Endings
        self.text = self.text.replace("\r\n", "\n")
        # Remove invisible characters and dead line-beginnings/-endings
        self.text = '\n'.join(line.strip() for line in self.text.split('\n'))

        # Some users may reply directly above a line of underscores (see email_2_2.txt).
        # The substitution replaces "<char><OUTLOOK_MAIL_SEPARATOR>" with "<char>\n".
        # NOTE(review): that drops the separator run rather than inserting newlines
        # before it – confirm this is the intended normalisation.
        # ``flags`` must be passed by keyword: the fourth positional argument of
        # re.sub is ``count``, so re.MULTILINE there meant "at most 8 replacements".
        self.text = re.sub(f'([^\n]){OUTLOOK_MAIL_SEPARATOR}', '\\1\n', self.text, flags=re.MULTILINE)

    def _process_signatures_disclaimers(self, text: str) -> Tuple[List[str], str]:
        """Return ``(disclaimers, signature)`` found in ``text``.

        ``disclaimers`` is every DISCLAIMERS_REGEX match; ``signature`` is the
        first SIGNATURE_REGEX match, or ``''`` when there is none.
        """
        disclaimers = self.DISCLAIMERS_REGEX.findall(text)
        signatures = self.SIGNATURE_REGEX.search(text)
        return disclaimers, signatures.group() if signatures else ''

    def _build_reply(self, header: str, content: str) -> 'EmailReply':
        """Create the EmailReply for ``content`` introduced by ``header``."""
        disclaimers, signatures = self._process_signatures_disclaimers(content)
        return EmailReply(headers=header, content=content, signatures=signatures, disclaimers=disclaimers)

    def read(self):
        """Split the mail body into EmailReplies at every detected header.

        The text before the first header becomes the first reply (with no
        header). Each following reply starts at its header and runs to the next
        one. Replies with empty content are skipped, except for the final
        fragment, which is always appended. Returns ``self``.
        """
        # Start from a clean slate so a second read() does not duplicate replies
        self.replies = []

        # findall yields one tuple per match when the pattern has several capture
        # groups, but a plain string when it has at most one; normalise both.
        matches = self.HEADER_REGEX.findall(self.text)
        groups = chain.from_iterable((match,) if isinstance(match, str) else match for match in matches)
        headers = [header for header in groups if header]

        current_position = 0
        previous_header = ''

        # Delimits eMail body by headers
        for header in headers:
            search_from = current_position + 1 if current_position > 0 else current_position
            position = self.text.find(header, search_from)
            if position < 0:
                # Header text not found past the current position; using -1 as
                # a slice bound would silently cut the last character instead.
                continue

            _reply = self._build_reply(previous_header, self.text[current_position:position])
            current_position = position
            previous_header = header
            if _reply.content:
                self.replies.append(_reply)

        # Add last reply element that is otherwise skipped due to the way we're iterating over headers.
        # This also adds the message body as a whole, in case there are no email headers at all
        self.replies.append(self._build_reply(previous_header, self.text[current_position:]))

        return self
@dataclass
class EmailReply:
    """A reply is a standalone part of an Email Message, including headers, body, signatures and disclaimers.

    All text fields are stripped of surrounding whitespace on construction;
    ``None`` for ``headers``, ``signatures`` or ``disclaimers`` is treated as empty.
    """

    #: Unprocessed mail text body
    content: str

    #: Headers element within text body
    headers: Optional[str] = ''
    #: Signature element within text body
    signatures: Optional[str] = ''
    #: Disclaimers within text body
    disclaimers: Optional[List[str]] = field(default_factory=list)

    def __post_init__(self):
        self.content = self.content.strip()
        # The Optional fields may legitimately be None; normalise to empty values
        self.headers = (self.headers or '').strip()
        self.signatures = (self.signatures or '').strip()
        self.disclaimers = [disclaimer.strip() for disclaimer in (self.disclaimers or [])]

    @property
    def body(self) -> str:
        """Returns the message's body without the headers, signatures and disclaimers"""
        _body = self.content
        for disclaimer in self.disclaimers:
            if disclaimer:
                # str.replace needs the replacement argument; calling it with
                # only the search text raises TypeError.
                _body = _body.replace(disclaimer, '')
        if self.signatures:
            _body = _body.replace(self.signatures, '')
        if self.headers:
            _body = _body.replace(self.headers, '')
        return _body.strip()

    @property
    def full_body(self) -> str:
        """Returns the message's body without the headers, but with signatures and disclaimers"""
        if not self.headers:
            return self.content.strip()
        return self.content.replace(self.headers, '').strip()
# This is a sample Python script.


def get_mail(name: str, base_dir: str = 'test/emails') -> str:
    """Return the text of the fixture ``<base_dir>/<name>.txt``.

    name - fixture file name without the ``.txt`` extension
    base_dir - directory holding the fixtures (defaults to ``test/emails``,
               relative to the current working directory)

    Raises whatever ``open`` raises (e.g. FileNotFoundError) if the file is missing.
    """
    # Read explicitly as UTF-8 instead of relying on the platform default encoding
    with open(f'{base_dir}/{name}.txt', encoding='utf-8') as f:
        return f.read()


if __name__ == '__main__':
    # Keep the demo out of import time: importing this module must not read files.
    from mailparser_reply.parser import EmailReplyParser

    # NOTE(review): no 'delete.txt' fixture is part of this commit – presumably a
    # local scratch file; confirm or point this at an existing fixture.
    x = get_mail('delete')
    # ``languages`` is declared as a list of language codes, not a bare string
    p = EmailReplyParser(languages=['en'])
    f = p.read(x)
mode 100644 index 0000000..67c69bb --- /dev/null +++ b/test/emails/caution.txt @@ -0,0 +1,50 @@ +CAUTION: This email originated from outside of this company. Do not click links or open attachments unless you recognize the sender and know the content is safe. + + + +Hi lads's Team, + + +Below THIS and THAT file for 31st August 2019 is still not available on "lads.lads.com" could you please check and advise us ASAP. + +pfg.Zip +pfg02.Zip + + +part01_07.Zip +_part02_07.Zip +_part03_07.Zip +_part04_07.Zip +_part05_07.Zip +_part06_07.Zip +_part07_07.Zip + + +job at our end. + + +Thanks, +___________________ +HAHA LOLO +Markets Application Production Services - Reference Data +ROLE +Bank of LADS - LADS LADS +BUILDING 5B,HAHA - THIS THAT, COUNTRY +Direct: (+00)00-000-0000 Mobile: (+00)000000 +______________________________________________________________________________ + +To report an issue or request for technical assistance with Product Reference Data applications, please send email to LADS SUPPORT. This is the only OO being monitored by the Product Reference Data support team. No other DGs or Mailboxes are being actively monitored. +Please make a note of this to avoid any delays. + +Escalation: LADS MANAGEMENT +___________________________________________________________ + +---------------------------------------------------------------------- +This message w/attachments (message) is intended solely for the use of the intended recipient(s) and may contain information that is privileged, confidential or proprietary. If you are not an intended recipient, please notify the sender, and then please delete and destroy all copies and attachments, and be advised that any review or dissemination of, or the taking of any action in reliance on, the information contained in or attached to this message is prohibited. 
+Unless specifically indicated, this message is not an offer to sell or a solicitation of any investment products or other financial product or service, an official confirmation of any transaction, or an official statement of Sender. Subject to applicable law, Sender may intercept, monitor, review and retain e-communications (EC) traveling through its networks/systems and may produce any such EC to regulators, law enforcement, in litigation and as required by law. +The laws of the country of each sender/recipient may impact the handling of EC, and EC may be archived, supervised and produced in countries other than the country in which you are located. This message cannot be guaranteed to be secure or free of errors or viruses. Attachments that are part of this EC may have additional important disclosures and disclaimers, which you should read. By messaging with Sender you consent to the foregoing. +----------------------------------------- + +lads's monitors email communications through its networks for regulatory compliance purposes and to protect its customers, employees and business and where allowed to do so by applicable law. The information contained in this e-mail message, and any attachment thereto, is confidential and may not be disclosed without our express permission. If you are not the intended recipient or an employee or agent responsible for delivering this message to the intended recipient, you are hereby notified that you have received this message in error and that any review, dissemination, distribution or copying of this message, or any attachment thereto, in whole or in part, is strictly prohibited. If you have received this message in error, please immediately notify us by telephone, fax or e-mail and delete the message and all of its attachments. Every effort is made to keep our network free from viruses. You should, however, review this e-mail message, as well as any attachment thereto, for viruses. 
We take no responsibility and have no liability for any computer virus which may be transferred via this e-mail message. + +----------------------------------------- diff --git a/test/emails/caution2.txt b/test/emails/caution2.txt new file mode 100644 index 0000000..8624943 --- /dev/null +++ b/test/emails/caution2.txt @@ -0,0 +1,12 @@ +Hello, + +I am trying to place an order and it keeps tell me my order cannot be processed at this time. I tried using two different consultants and it still won't work. I am not sure if it's a technical problem. My order is time sensitive. Thank you for your help! + +Person Person. + + +Disclaimer + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by Thing Ltd, an innovator in Software as a Service (SaaS) for business. Providing a safer and more useful place for your human generated data. Specializing in; Security, archiving and compliance. To find out more Click Here (http://www.thisthat.com/things/) . diff --git a/test/emails/correct_sig.txt b/test/emails/correct_sig.txt new file mode 100644 index 0000000..4bb8a2a --- /dev/null +++ b/test/emails/correct_sig.txt @@ -0,0 +1,4 @@ +this is an email with a correct -- signature. + +-- +rick diff --git a/test/emails/email_1_1.txt b/test/emails/email_1_1.txt new file mode 100644 index 0000000..fd56b25 --- /dev/null +++ b/test/emails/email_1_1.txt @@ -0,0 +1,13 @@ +Hi folks + +What is the best way to clear a Riak bucket of all key, values after +running a test? +I am currently using the Java HTTP API. 
+ +-Abhishek Kona + + +_______________________________________________ +riak-users mailing list +riak-users@lists.basho.com +http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com diff --git a/test/emails/email_1_3.txt b/test/emails/email_1_3.txt new file mode 100644 index 0000000..f7ae6f3 --- /dev/null +++ b/test/emails/email_1_3.txt @@ -0,0 +1,55 @@ +Oh thanks. + +Having the function would be great. + +-Abhishek Kona + +On 01/03/11 7:07 PM, Russell Brown wrote: +> Hi, +> On Tue, 2011-03-01 at 18:02 +0530, Abhishek Kona wrote: +>> Hi folks +>> +>> What is the best way to clear a Riak bucket of all key, values after +>> running a test? +>> I am currently using the Java HTTP API. +> You can list the keys for the bucket and call delete for each. Or if you +> put the keys (and kept track of them in your test) you can delete them +> one at a time (without incurring the cost of calling list first.) +> +> Something like: +> +> String bucket = "my_bucket"; +> BucketResponse bucketResponse = riakClient.listBucket(bucket); +> RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo(); +> +> for(String key : bucketInfo.getKeys()) { +> riakClient.delete(bucket, key); +> } +> +> +> would do it. +> +> See also +> +> http://wiki.basho.com/REST-API.html#Bucket-operations +> +> which says +> +> "At the moment there is no straightforward way to delete an entire +> Bucket. There is, however, an open ticket for the feature. To delete all +> the keys in a bucket, you’ll need to delete them all individually." 
+> +>> -Abhishek Kona +>> +>> +>> _______________________________________________ +>> riak-users mailing list +>> riak-users@lists.basho.com +>> http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com +> + + +_______________________________________________ +riak-users mailing list +riak-users@lists.basho.com +http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com diff --git a/test/emails/email_1_4.txt b/test/emails/email_1_4.txt new file mode 100644 index 0000000..2e55ded --- /dev/null +++ b/test/emails/email_1_4.txt @@ -0,0 +1,8 @@ +Awesome! I haven't had another problem with it. + +On Aug 22, 2011, at 7:37 PM, defunkt wrote: + + + + +> Loader seems to be working well. diff --git a/test/emails/email_1_5.txt b/test/emails/email_1_5.txt new file mode 100644 index 0000000..2498775 --- /dev/null +++ b/test/emails/email_1_5.txt @@ -0,0 +1,15 @@ +One: Here's what I've got. + +- This would be the first bullet point that wraps to the second line +to the next +- This is the second bullet point and it doesn't wrap +- This is the third bullet point and I'm having trouble coming up with enough +to say +- This is the fourth bullet point + +Two: +- Here is another bullet point +- And another one + +This is a paragraph that talks about a bunch of stuff. It goes on and on +for a while. diff --git a/test/emails/email_1_6.txt b/test/emails/email_1_6.txt new file mode 100644 index 0000000..8f8f564 --- /dev/null +++ b/test/emails/email_1_6.txt @@ -0,0 +1,15 @@ +I get proper rendering as well. + +Sent from a magnificent torch of pixels + +On Dec 16, 2011, at 12:47 PM, Corey Donohoe + +wrote: + +> Was this caching related or fixed already? I get proper rendering here. 
+> +> ![](https://img.skitch.com/20111216-m9munqjsy112yqap5cjee5wr6c.jpg) +> +> --- +> Reply to this email directly or view it on GitHub: +> https://github.com/github/github/issues/2278#issuecomment-3182418 diff --git a/test/emails/email_1_7.txt b/test/emails/email_1_7.txt new file mode 100644 index 0000000..fb86822 --- /dev/null +++ b/test/emails/email_1_7.txt @@ -0,0 +1,12 @@ +:+1: + +On Tue, Sep 25, 2012 at 8:59 AM, Chris Wanstrath +wrote: + +> Steps 0-2 are in prod. Gonna let them sit for a bit then start cleaning up +> the old code with 3 & 4. +> +> +> Reply to this email directly or view it on GitHub. +> +> diff --git a/test/emails/email_1_8.txt b/test/emails/email_1_8.txt new file mode 100644 index 0000000..d4e4121 --- /dev/null +++ b/test/emails/email_1_8.txt @@ -0,0 +1,37 @@ +On Tue, Apr 29, 2014 at 4:22 PM, Example Dev wrote: + +> okay. Well, here's some stuff I can write. +> +> And if I write a 2 second line you and maybe reply under this? +> +> Or if you didn't really feel like it, you could reply under this line.Or +> if you didn't really feel like it, you could reply under this line. Or if +> you didn't really feel like it, you could reply under this line. Or if you +> didn't really feel like it, you could reply under this line. +> + +I will reply under this one + +> +> okay? +> + +and under this. + +> +> -- Tim +> +> On Tue, April 29, 2014 at 4:21 PM, Tim Haines wrote: +> > hi there +> > +> > After you reply to this I'm going to send you some inline responses. 
+> > +> > -- +> > Hey there, this is my signature +> +> +> + + +-- +Hey there, this is my signature diff --git a/test/emails/email_2_1.txt b/test/emails/email_2_1.txt new file mode 100644 index 0000000..adffd08 --- /dev/null +++ b/test/emails/email_2_1.txt @@ -0,0 +1,25 @@ +Outlook with a reply + + + ------------------------------ + +*From:* Google Apps Sync Team [mailto:mail-noreply@google.com] +*Sent:* Thursday, February 09, 2012 1:36 PM +*To:* jow@xxxx.com +*Subject:* Google Apps Sync was updated! + + + +Dear Google Apps Sync user, + +Google Apps Sync for Microsoft Outlook® was recently updated. Your computer +now has the latest version (version 2.5). This release includes bug fixes +to improve product reliability. For more information about these and other +changes, please see the help article here: + +http://www.google.com/support/a/bin/answer.py?answer=153463 + +Sincerely, + +The Google Apps Sync Team. + diff --git a/test/emails/email_2_2.txt b/test/emails/email_2_2.txt new file mode 100644 index 0000000..cae8050 --- /dev/null +++ b/test/emails/email_2_2.txt @@ -0,0 +1,10 @@ +Outlook with a reply directly above line +________________________________________ +From: CRM Comments [crm-comment@example.com] +Sent: Friday, 23 March 2012 5:08 p.m. +To: John S. Greene +Subject: [contact:106] John Greene + +A new comment has been added to the Contact named 'John Greene': + +I am replying to a comment. diff --git a/test/emails/email_2_3.txt b/test/emails/email_2_3.txt new file mode 100644 index 0000000..26912ee --- /dev/null +++ b/test/emails/email_2_3.txt @@ -0,0 +1,20 @@ +Outlook with a reply above headers using unusual format + +*From:* Kim via Site [mailto:noreply@site.com] +*Sent:* Monday, January 15, 2018 2:15 AM +*To:* user@xxxxx.com +*Subject:* You have a new message! + +Respond to Kim by replying directly to this email + +New message from Kim on Site: + + Ei tale aliquam eum, at vel tale sensibus, an sit vero magna. 
Vis no veri + clita, movet detraxit inciderint te est. Mel nusquam perfecto repudiandae + ei. Error paulo pri ea. At partem offendit appetere sea, no vis audiam + latine appellantur. + + Sea id aperiam accusam, vel dico omnesque qualisque cu. Cu nec alii euismod + eloquentiam. Ea nisl utinam vis. Est impetus intellegam dissentiet et. Nec + ea rationibus percipitur, eam fugit impetus ad, ad possit semper recusabo + quo. diff --git a/test/emails/email_3_1.txt b/test/emails/email_3_1.txt new file mode 100644 index 0000000..355d7bf --- /dev/null +++ b/test/emails/email_3_1.txt @@ -0,0 +1,56 @@ +First reply. + +This is fairly common; nothing too hard here! + + +Am So., 1. Jan. 2023 um 17:22 Uhr schrieb John Doe < +noreply@github.com>: + +> This is my second test reply. Should be the content. +> +> +> Begin forwarded message: +> +> *From:* Veeam Support +> *Date:* 30. December 2022 at 06:54:30 CET +> *To:* noreply@github.com +> *Subject:* *Backup copy fails with: Failed to process method +> Transform.Patch - Longer multiline subject +> +> Hello John, +> +> I've been working on adding a third reply to this mail's body. So far it +> has been working great. Let's see how it handles with multi-line splitting. +> +> If we can decrypt it in the lab try to do the same on a production server. +> +> Have a nice day. +> +> +> Best Regards, +> Unknown Bandit +> Veeam Cloud & Service Providers Support +> +> --------------- Original Message --------------- +> +> Note added: Done. [ ref:asdasdasd:ref ] +> +> + +Original Message processed by david® +FW: Mails Resync: Encoding Konsistenz 9. August 2022, 12:44 Uhr +Von Marine Ho +An John Doe +Cc (2) nobdoy|test + +Fourth reply. Not sure what to write anymore, but eh. + + +On 07.08.2022 um 23:55, John Doe wrote: + + +Fifth reply. Or actually the original message? Who reads from +bottom to top anyways in tests? 
+ + + diff --git a/test/emails/email_bullets.txt b/test/emails/email_bullets.txt new file mode 100644 index 0000000..fb124ab --- /dev/null +++ b/test/emails/email_bullets.txt @@ -0,0 +1,22 @@ +test 2 this should list second + +and have spaces + +and retain this formatting + + + - how about bullets + - and another + + +On Fri, Feb 24, 2012 at 10:19 AM, wrote: + +> Give us an example of how you applied what they learned to achieve +> something in your organization + + + + +-- + +*Joe Smith | Director, Product Management* diff --git a/test/emails/email_gmail.txt b/test/emails/email_gmail.txt new file mode 100644 index 0000000..a371f04 --- /dev/null +++ b/test/emails/email_gmail.txt @@ -0,0 +1,14 @@ +This is a test for inbox replying to a github message. + + + +On Wed, May 18, 2016 at 11:10 PM Steven Scott +wrote: + + +That way people can tell how outdated their version is, mostly because I'm +personally too lazy to increment a version number all the time 👍 + +—- +You are receiving this because you are subscribed to this thread. +Reply to this email directly or view it on GitHub diff --git a/test/emails/email_headers_no_delimiter.txt b/test/emails/email_headers_no_delimiter.txt new file mode 100644 index 0000000..6994b9c --- /dev/null +++ b/test/emails/email_headers_no_delimiter.txt @@ -0,0 +1,15 @@ +And another reply! + +From: Dan Watson [mailto:user@host.com] +Sent: Monday, November 26, 2012 10:48 AM +To: Watson, Dan +Subject: Re: New Issue + +A reply + +-- +Sent from my iPhone + +On Nov 26, 2012, at 10:27 AM, "Watson, Dan" wrote: +This is a message. +With a second line. diff --git a/test/emails/email_one_is_not_on.txt b/test/emails/email_one_is_not_on.txt new file mode 100644 index 0000000..ffff964 --- /dev/null +++ b/test/emails/email_one_is_not_on.txt @@ -0,0 +1,10 @@ +Thank, this is really helpful. + +One outstanding question I had: + +Locally (on development), when I run... 
+ +On Oct 1, 2012, at 11:55 PM, Dave Tapley wrote: + +> The good news is that I've found a much better query for lastLocation. +> diff --git a/test/emails/email_partial_quote_header.txt b/test/emails/email_partial_quote_header.txt new file mode 100644 index 0000000..97ef99a --- /dev/null +++ b/test/emails/email_partial_quote_header.txt @@ -0,0 +1,13 @@ +On your remote host you can run: + + telnet 127.0.0.1 52698 + +This should connect to TextMate (on your Mac, via the tunnel). If that +fails, the tunnel is not working. + +On 9 Jan 2014, at 2:47, George Plymale wrote: + +> I am having an odd issue wherein suddenly port forwarding stopped +> working in a particular scenario for me. By default I have ssh set to +> use the following config (my ~/.ssh/config file): +> […] diff --git a/test/emails/email_sent_from_BlackBerry.txt b/test/emails/email_sent_from_BlackBerry.txt new file mode 100644 index 0000000..9cf4824 --- /dev/null +++ b/test/emails/email_sent_from_BlackBerry.txt @@ -0,0 +1,3 @@ +Here is another email + +Sent from my BlackBerry diff --git a/test/emails/email_sent_from_iPhone.txt b/test/emails/email_sent_from_iPhone.txt new file mode 100644 index 0000000..e5d2169 --- /dev/null +++ b/test/emails/email_sent_from_iPhone.txt @@ -0,0 +1,3 @@ +Here is another email + +Sent from my iPhone diff --git a/test/emails/email_sent_from_multi_word_mobile_device.txt b/test/emails/email_sent_from_multi_word_mobile_device.txt new file mode 100644 index 0000000..c9f89e2 --- /dev/null +++ b/test/emails/email_sent_from_multi_word_mobile_device.txt @@ -0,0 +1,3 @@ +Here is another email + +Sent from my Verizon Wireless BlackBerry diff --git a/test/emails/email_sent_from_not_signature.txt b/test/emails/email_sent_from_not_signature.txt new file mode 100644 index 0000000..ce97004 --- /dev/null +++ b/test/emails/email_sent_from_not_signature.txt @@ -0,0 +1,3 @@ +Here is another email + +Sent from my desk, is much easier than my mobile phone. 
diff --git a/test/emails/email_sig_delimiter_in_middle_of_line.txt b/test/emails/email_sig_delimiter_in_middle_of_line.txt new file mode 100644 index 0000000..68f00f8 --- /dev/null +++ b/test/emails/email_sig_delimiter_in_middle_of_line.txt @@ -0,0 +1,7 @@ +Hi there! + +Stuff happened. + +And here is a fix -- this is not a signature. + +kthxbai diff --git a/test/emails/forward.txt b/test/emails/forward.txt new file mode 100644 index 0000000..a715ce3 --- /dev/null +++ b/test/emails/forward.txt @@ -0,0 +1,6 @@ +FW: YYY Arrival Notice XYZ - YYY ELA/XYZ ETA: 2020-06-08 +This is a follow-up to your previous request #12345 "RE: XYZ and Manifest amendm..." +Hello team, +Can I get 1 Arrival Notice without PU# and invoice? +Thank you. +Best regards diff --git a/test/emails/greedy_on.txt b/test/emails/greedy_on.txt new file mode 100644 index 0000000..9b60aa6 --- /dev/null +++ b/test/emails/greedy_on.txt @@ -0,0 +1,16 @@ +On your remote host you can run: + + telnet 127.0.0.1 52698 + +This should connect to TextMate (on your Mac, via the tunnel). If that +fails, the tunnel is not working. + +On 9 Jan 2014, at 2:47, George Plymale wrote: + +> I am having an odd issue wherein suddenly port forwarding stopped +> working in a particular scenario for me. By default I have ssh set to +> use the following config (my ~/.ssh/config file): +> […] +> --- +> Reply to this email directly or view it on GitHub: +> https://github.com/textmate/rmate/issues/29 diff --git a/test/emails/multi_header.txt b/test/emails/multi_header.txt new file mode 100644 index 0000000..8eeded0 --- /dev/null +++ b/test/emails/multi_header.txt @@ -0,0 +1,41 @@ +No problem. I’ll just start a new order. + +On May 30, 2020, at 4:24 PM, XYZ wrote: + + +Hi XYZ, + +Unfortunately, we are unable to add items to your order, but if you would like we can cancel your order and issue a full refund so that you may order again with your preferred selection of pastries. 
This may result in a later delivery date, but please let us know if you would like us to cancel your order and we will set that up for you. + + + +On May 30, 2020, 4:15 PM XYZ xyz@xyz.com wrote: + +No worries and thank you. I wanted to add a couple of new items to the same shipment. Would that Be possible or should I just order thru the website? + +On May 30, 2020, at 3:54 PM, XYZ wrote: + + +Hi XYZ, + +We apologize for the incorrect product and for any inconvenience this may have caused. + +We have placed a replacement order of 1 X for delivery on June 4, 2020. You will be receiving an email confirmation for this new order and your tracking number will be emailed 1-2 nights before the delivery date. + +Thank you for your patience! + + + +On May 30, 2020, 10:01 AM XYZ xyz@xyz.com wrote: + +Hello XYZ, + +Thank you for emailing the Team! + +We are writing to you to confirm that we have received your email. + +We apologize for any inconvenience and assure you that we will find a solution for any question, concern, or comment you may have. + +We appreciate your patience during these times. + +-Team diff --git a/test/emails/multiline_on.txt b/test/emails/multiline_on.txt new file mode 100644 index 0000000..5ca3ef0 --- /dev/null +++ b/test/emails/multiline_on.txt @@ -0,0 +1,14 @@ +On Dec 16, 2011, at 12:47 PM, John Doe + +wrote: + +On Dec 16, 2011, at 12:47 PM, John Doe wrote: +asd + +On So., 1. Jan. 2023 um 17:22 Uhr John Doe < +areply@reply.github.com> wrote: + + +On this day +On So., 1. Jan. 2023 um 17:22 Uhr John Doe < +areply@reply.github.com> wrote: diff --git a/test/emails/multiline_on_de.txt b/test/emails/multiline_on_de.txt new file mode 100644 index 0000000..a13b7e0 --- /dev/null +++ b/test/emails/multiline_on_de.txt @@ -0,0 +1,14 @@ +Am heutigen Tage +Am So., 1. Jan. 2023 um 17:22 Uhr schrieb John Doe < +noreply@github.com>: + +Am heutigen Tage +Am So., 1. Jan. 2023 um 17:22 Uhr +schrieb John Doe < +noreply@github.com>: + +Am heutigen Tage +Am So., 1. Jan. 
2023 um 17:22 Uhr schrieb John Doe : + + +Am So., 1. Jan. 2023 um 17:22 Uhr schrieb Straßburg Dößä : diff --git a/test/emails/outlook.txt b/test/emails/outlook.txt new file mode 100644 index 0000000..4111b57 --- /dev/null +++ b/test/emails/outlook.txt @@ -0,0 +1,9 @@ +Dear Team, + +Hello + +What is the best way to clear a Riak bucket of all key, values after +running a test? + +My number is: +00 0000 000 000 +Get Outlook for iOS diff --git a/test/emails/pathological.txt b/test/emails/pathological.txt new file mode 100644 index 0000000..f5845b7 --- /dev/null +++ b/test/emails/pathological.txt @@ -0,0 +1,20 @@ +I think you're onto something. I will try to fix the problem as soon as I +get back to a computer. +On Dec 8, 2013 2:10 PM, "John Sullivan" wrote: + +> I think your code is shortening the reference sequence you return to be +> the same size as the query sequence, and we end up losing data. Here's some +> debugging output from me putzing around... +> +> name: gi|253409428|ref|GQ227366.1| Influenza A virus (A/pika/Qinghai/BI/2007(H5N1)) segment 1 polymerase PB2 (PB2) gene, complete cds +> score: 39.0 +> +> organism.sequence: 
ATGGAGAGAATAAAGGAATTAAGAGATCTAATGTCACAGTCCCGCACTCGCGAGATACTAACAAAGACCACTGTGGACCATATGGCCATAATCAAGAAATACACATCAGGAAGACAAGAGAAGAACCCTGCTCTCAGAATGAAATGGATGATGGCAATGAAATATCCAATCACAGCGGACAAGAGAATAATAGAGATGATTCCTGAAAGGAATGAACAAGGACAGACACTCTGGAGCAAGACAAATGATGCTGGATCGGACAGGGTGATGGTGTCTCCCCTAGCTGTAACTTGGTGGAATAGGAATGGGCCGACGACAAGTACAGTTCATTATCCAAAGGTTTACAAAACATACTTTGAGAAGGTTGAAAGGTTAAAACATGGAACCTTCGGTCCCGTTCATTTCCGAAACCAAGTTAAAATACGCCGCCGAGTTGATACAAATCCTGGCCATGCAGATCTCAGTGCTAAAGAAGCACAAGATGTCATCATGGAGGTCGTTTTCCCAAATGAAGTGGGAGCTAGAATATTGACTTCAGAGTCACAGTTGACAATAACGAAAGAGAAAAAAGAAGAGCTCCAAGATTGTAAGATTGCTCCCTTAATGGTTGCATACATGTTGGAAAGGGAACTGGTCCGCAAAACCAGATTCCTACCAGTAGCAGGCGGAACAAGCAGTGTGTACATTGAGGTATTGCATTTGACTCAAGGAACCTGCTGGGCACAGATGTACACTCCAGGCGGAGAAGTAAGAAATGACGATGTTGACCAGAGTTTGATCATTGCTGCCAGAAACATTGTTAGGAGAGCAACGGTATCAGCGGATCCACTGGCATCACTGCTGGAGATGTGTCACAGCACACAAATTGGTGGGATAAGGATGGTGGACATCCTTAGGCAAACTCCAACTGAGGAACAAGCTGTGGATATATGCAAAGCAGCAATGGGTCTGAGGATTAGTTCATCCTTTAGCTTTGGAG +> GCTTCACTTTCAAAAGAACAAGTGGATCATCCGCCACGAAGGAAGAGGAAGTGCTTACAGGCAACCTCCAAACATTGAAAATAAGAGTACATGAGGGGTATGAGGAGTTCACAATGGTTGGGCAGAGGGCAACAGCTATCCTGAGGAAAGCAACTAGAAGGCTGATTCAGTTGATAGTAAGTGGAAGAAACGAACAATCAATCGCTGAGGCAATCATTGTAGCAATGGTGTTCTCACAGGAGGATCGCATGATAAAAGCAGTCCGAGGCGATCTGAATTTCGTAAACAGAGCAAACCAAAGATTAAACCCCATGCATCAACTCCTGAGACATTTTCAAAAGGACGCAAAAGTGCTATTTCAGAATTGGGGAACTGAGCCAATTGATAATGTCATGGGGATGATCGGAATATTACCTGACATGACTCCCAGCACAGAAACGTCACTGAGAGGAGTGAGAGTTAGTAAAATGGGAGTAGATGAGTATTCCAGCACTGAGAGAGTAGTTGTAAGCATTGACCGCTTCTTAAGGGTTCGAGACCAGCGGGGGAACGTACTCTTATCTCCCGAAGAGGTCAGCGAAACCCAGGGAACAGAGAAGTTGACAATAACATATTCATCATCAATGATGTGGGAAATCAACGGTCCTGAGTCAGTGCTTGTTAACACTTACCAATGGATCATTAGAAACTGGGAGACCGTGAAAATTCAGTGGTCTCAGGACCCCACGATGTTGTACAATAAGATGGAGTTTGAACCGTTCCAATCCTTGGTACCTAAAGCTGCCAGAGGTCAATACAGTGGATTTGTGAGAACATTATTCCAACAAATGCGTGACGTACTGGGGACATTTGATACTGTCCAGATAATAAAGCTGCTACCATTTGCAGCAGCCCCACCGAAGCAGAGCAGAATGCAGTTTTCTTCTCTAACTGTGAATGTGAGAGGCTCAGGAATGAGAATACTCATAAGGGGCAATTCCCCTGTGTTCAACTACAA +> 
TAAGGCAACCCAAAGACTTACCGTTCTTGGAAAGGACGCAGGTGCATTAACAGAGGATCCAGATGAGGGGACAGCCGGAGTGGAATCTGCAGTACTGAGGGGGTTCCTAATTCTAGGCAAGGAGGACAAAAGATATGGACCAGCATTGAGCATCAATGAACTGAGCAATCTTGCAAAAGGGGAGAAAGCTAATGTGCTGATAGGGCAAGGAGACGTGGTGTTGGTAATGAAACGGAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAGTGTCGAATTGTTTAAAAACGACCTTGTTTCTACT +> reference_alignment: ________________________________________________ +> +> query: AGCGAAAGCAGGTCAAATATATTCAATATGGAGAGAATAAAAGAATTAAG +> +> query_alignment: GCGAAAGCAGGTCAAATATATTCAATATGGAGAGAATAAAAGAATTAAG +> diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py new file mode 100644 index 0000000..751d186 --- /dev/null +++ b/test/test_email_reply_parser.py @@ -0,0 +1,309 @@ +import os +import sys +import unittest +import re + +from mailparser_reply.constants import MAIL_LANGUAGE_DEFAULT + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) +from mailparser_reply import EmailReplyParser + + +class EmailMessageTest(unittest.TestCase): + def test_simple_body(self): + mail = self.get_email('email_1_1', parse=True, languages=['en']) + self.assertEqual(1, len(mail.replies)) + self.assertTrue("riak-users" in mail.replies[0].content) + self.assertTrue("riak-users" in mail.replies[0].signatures) + self.assertTrue("riak-users" not in mail.replies[0].body) + + def test_simple_quoted_body(self): + mail = self.get_email('email_1_3', parse=True, languages=['en']) + self.assertEqual(3, len(mail.replies)) + self.assertTrue("On 01/03/11 7:07 PM, Russell Brown wrote:" in mail.replies[1].content) + self.assertTrue("On 01/03/11 7:07 PM, Russell Brown wrote:" not in mail.replies[1].body) + self.assertTrue("-Abhishek Kona" in mail.replies[0].signatures) + self.assertTrue("-Abhishek Kona" not in mail.replies[0].body) + + self.assertTrue("> Hi," == mail.replies[1].body) + # test if matching quoted signatures works + self.assertTrue(">> -Abhishek Kona" in mail.replies[2].content) + self.assertTrue(">> -Abhishek 
Kona" in mail.replies[2].signatures) + self.assertTrue(">> -Abhishek Kona" not in mail.replies[2].body) + + def test_simple_scrambled_body(self): + mail = self.get_email('email_1_4', parse=True, languages=['en']) + self.assertEqual(2, len(mail.replies)) + self.assertTrue("defunkt" in mail.replies[1].content) + self.assertTrue("defunkt" in mail.replies[1].headers) + + def test_simple_longer_mail(self): + mail = self.get_email('email_1_5', parse=True, languages=['en', 'de', 'david']) + self.assertEqual(1, len(mail.replies)) + self.assertTrue(len(mail.latest_reply.split('\n')) == 15) + + def test_simple_scrambled_header(self): + mail = self.get_email('email_1_6', parse=True, languages=['en']) + self.assertEqual(2, len(mail.replies)) + self.assertTrue("" in mail.replies[1].headers) + + def test_simple_scrambled_header2(self): + mail = self.get_email('email_1_7', parse=True, languages=['en']) + self.assertEqual(2, len(mail.replies)) + self.assertTrue("wrote:" in mail.replies[1].headers) + + def test_simple_quoted_reply(self): + mail = self.get_email('email_1_8', parse=True, languages=['en']) + # TODO: Should this *actually* be the desired behaviour? 
tbh, nobody sends mails including this header tho + # Maybe otherwise: 1) Negative lookahead unquoted message + # 2) Unless message is disclaimer/signature (scan from behind) + self.assertEqual(2, len(mail.replies)) + # self.assertTrue("--\nHey there, this is my signature" == mail.replies[1].signatures) + + def test_gmail_header(self): + mail = self.get_email('email_2_1', parse=True, languages=['en']) + self.assertEqual(2, len(mail.replies)) + self.assertTrue("Outlook with a reply\n\n\n------------------------------" == mail.replies[0].body) + self.assertTrue("Google Apps Sync Team [mailto:mail-noreply@google.com]" in mail.replies[1].headers) + self.assertTrue("Google Apps Sync Team [mailto:mail-noreply@google.com]" not in mail.replies[1].body) + + def test_gmail_indented(self): + mail = self.get_email('email_2_3', parse=True, languages=['en']) + self.assertEqual(2, len(mail.replies)) + self.assertTrue("Outlook with a reply above headers using unusual format" == mail.replies[0].body) + # _normalize_body flattens the lines + self.assertTrue("Ei tale aliquam eum, at vel tale sensibus, an sit vero magna. Vis no veri" in mail.replies[1].body) + + def test_complex_mail_thread(self): + mail = self.get_email('email_3_1', parse=True, languages=['en', 'de', 'david']) + self.assertEqual(5, len(mail.replies)) + + def test_multiline_on(self): + mail = self.get_email('multiline_on', parse=True, languages=['en', 'de']) + self.assertEqual(4, len(mail.replies)) + + def test_header_no_delimiter(self): + mail = self.get_email('email_headers_no_delimiter', parse=True, languages=['en',]) + self.assertEqual(3, len(mail.replies)) + self.assertTrue("And another reply!" == mail.replies[0].body) + self.assertTrue("A reply" == mail.replies[1].body) + self.assertTrue("--\nSent from my iPhone" == mail.replies[1].signatures) + self.assertTrue("This is a message.\nWith a second line." 
== mail.replies[2].body) + + def test_sent_from_junk1(self): + mail = self.get_email('email_sent_from_iPhone', parse=True, languages=['en']) + self.assertEqual(1, len(mail.replies)) + self.assertTrue("Here is another email" == mail.replies[0].body) + self.assertTrue("Sent from my iPhone" == mail.replies[0].signatures) + + def test_sent_from_junk2(self): + mail = self.get_email('email_sent_from_multi_word_mobile_device', parse=True, languages=['en']) + self.assertEqual(1, len(mail.replies)) + self.assertTrue("Here is another email" == mail.replies[0].body) + self.assertTrue("Sent from my Verizon Wireless BlackBerry" == mail.replies[0].signatures) + + def test_sent_from_junk3(self): + mail = self.get_email('email_sent_from_BlackBerry', parse=True, languages=['en']) + self.assertEqual(1, len(mail.replies)) + self.assertTrue("Here is another email" == mail.replies[0].body) + self.assertTrue("Sent from my BlackBerry" == mail.replies[0].signatures) + + def test_sent_from_junk4(self): + mail = self.get_email('email_sent_from_not_signature', parse=True, languages=['en']) + self.assertEqual(1, len(mail.replies)) + self.assertTrue("Here is another email\n\nSent from my desk, is much easier than my mobile phone." 
== mail.replies[0].body) + self.assertTrue("" == mail.replies[0].signatures) + + def get_email(self, name: str, parse: bool = True, languages: list = None): + """ Return EmailMessage instance or text content """ + with open(f'emails/{name}.txt') as f: + text = f.read() + return EmailReplyParser( + languages=languages or [MAIL_LANGUAGE_DEFAULT] + ).read(text) if parse else text + + +# class EmailMessageTest(unittest.TestCase): +# def test_simple_body(self): +# message = self.get_email('email_1_1') +# +# self.assertEqual(3, len(message.fragments)) +# self.assertEqual( +# [False, True, True], +# [f.signature for f in message.fragments] +# ) +# self.assertEqual( +# [False, True, True], +# [f.hidden for f in message.fragments] +# ) +# self.assertTrue("folks" in message.fragments[0].content) +# self.assertTrue("riak-users" in message.fragments[2].content) +# +# def test_reads_bottom_message(self): +# message = self.get_email('email_1_2') +# +# self.assertEqual(6, len(message.fragments)) +# self.assertEqual( +# [False, True, False, True, False, False], +# [f.quoted for f in message.fragments] +# ) +# +# self.assertEqual( +# [False, False, False, False, False, True], +# [f.signature for f in message.fragments] +# ) +# +# self.assertEqual( +# [False, False, False, True, True, True], +# [f.hidden for f in message.fragments] +# ) +# +# self.assertTrue("Hi," in message.fragments[0].content) +# self.assertTrue("On" in message.fragments[1].content) +# self.assertTrue(">" in message.fragments[3].content) +# self.assertTrue("riak-users" in message.fragments[5].content) +# +# def test_reads_inline_replies(self): +# message = self.get_email('email_1_8') +# self.assertEqual(7, len(message.fragments)) +# +# self.assertEqual( +# [True, False, True, False, True, False, False], +# [f.quoted for f in message.fragments] +# ) +# +# self.assertEqual( +# [False, False, False, False, False, False, True], +# [f.signature for f in message.fragments] +# ) +# +# self.assertEqual( +# [False, False, 
False, False, True, True, True], +# [f.hidden for f in message.fragments] +# ) +# +# def test_reads_top_post(self): +# message = self.get_email('email_1_3') +# self.assertEqual(5, len(message.fragments)) +# +# def test_multiline_reply_headers(self): +# message = self.get_email('email_1_6') +# self.assertTrue('I get' in message.fragments[0].content) +# self.assertTrue('On' in message.fragments[1].content) +# +# def test_captures_date_string(self): +# message = self.get_email('email_1_4') +# +# self.assertTrue('Awesome' in message.fragments[0].content) +# self.assertTrue('On' in message.fragments[1].content) +# self.assertTrue('Loader' in message.fragments[1].content) +# +# def test_complex_body_with_one_fragment(self): +# message = self.get_email('email_1_5') +# +# self.assertEqual(1, len(message.fragments)) +# +# def test_verify_reads_signature_correct(self): +# message = self.get_email('correct_sig') +# self.assertEqual(2, len(message.fragments)) +# +# self.assertEqual( +# [False, False], +# [f.quoted for f in message.fragments] +# ) +# +# self.assertEqual( +# [False, True], +# [f.signature for f in message.fragments] +# ) +# +# self.assertEqual( +# [False, True], +# [f.hidden for f in message.fragments] +# ) +# +# self.assertTrue('--' in message.fragments[1].content) +# +# def test_deals_with_windows_line_endings(self): +# msg = self.get_email('email_1_7') +# +# self.assertTrue(':+1:' in msg.fragments[0].content) +# self.assertTrue('On' in msg.fragments[1].content) +# self.assertTrue('Steps 0-2' in msg.fragments[1].content) +# +# def test_reply_is_parsed(self): +# message = self.get_email('email_1_2') +# self.assertTrue("You can list the keys for the bucket" in message.reply) +# +# def test_reply_from_gmail(self): +# with open('test/emails/email_gmail.txt') as f: +# self.assertEqual('This is a test for inbox replying to a github message.', +# EmailReplyParser().parse_reply(f.read())) +# +# def test_parse_out_just_top_for_outlook_reply(self): +# with 
open('test/emails/email_2_1.txt') as f: +# self.assertEqual("Outlook with a reply", EmailReplyParser().parse_reply(f.read())) +# +# def test_parse_out_just_top_for_outlook_with_reply_directly_above_line(self): +# with open('test/emails/email_2_2.txt') as f: +# self.assertEqual("Outlook with a reply directly above line", EmailReplyParser().parse_reply(f.read())) +# +# def test_parse_out_just_top_for_outlook_with_unusual_headers_format(self): +# with open('test/emails/email_2_3.txt') as f: +# self.assertEqual( +# "Outlook with a reply above headers using unusual format", +# EmailReplyParser().parse_reply(f.read())) +# +# def test_sent_from_iphone(self): +# with open('test/emails/email_iPhone.txt') as email: +# +# self.assertTrue("Sent from my iPhone" not in EmailReplyParser().parse_reply(email.read())) +# +# def test_email_one_is_not_on(self): +# with open('test/emails/email_one_is_not_on.txt') as email: +# self.assertTrue( +# "On Oct 1, 2012, at 11:55 PM, Dave Tapley wrote:" not in EmailReplyParser().parse_reply(email.read())) +# +# def test_partial_quote_header(self): +# message = self.get_email('email_partial_quote_header') +# self.assertTrue("On your remote host you can run:" in message.reply) +# self.assertTrue("telnet 127.0.0.1 52698" in message.reply) +# self.assertTrue("This should connect to TextMate" in message.reply) +# +# def test_email_headers_no_delimiter(self): +# message = self.get_email('email_headers_no_delimiter') +# self.assertEqual(message.reply.strip(), 'And another reply!') +# +# def test_multiple_on(self): +# message = self.get_email("greedy_on") +# self.assertTrue(re.match('^On your remote host', message.fragments[0].content)) +# self.assertTrue(re.match('^On 9 Jan 2014', message.fragments[1].content)) +# +# self.assertEqual( +# [False, True, False], +# [fragment.quoted for fragment in message.fragments] +# ) +# +# self.assertEqual( +# [False, False, False], +# [fragment.signature for fragment in message.fragments] +# ) +# +# 
self.assertEqual( +# [False, True, True], +# [fragment.hidden for fragment in message.fragments] +# ) +# +# def test_pathological_emails(self): +# t0 = time.time() +# message = self.get_email("pathological") +# self.assertTrue(time.time() - t0 < 1, "Took too long") +# +# def test_doesnt_remove_signature_delimiter_in_mid_line(self): +# message = self.get_email('email_sig_delimiter_in_middle_of_line') +# self.assertEqual(1, len(message.fragments)) +# + + +if __name__ == '__main__': + unittest.main()