From 4814b0403eda0041939d4f4f1bb78da4fc382bae Mon Sep 17 00:00:00 2001 From: Anna Larch Date: Mon, 9 Oct 2023 19:54:05 +0200 Subject: [PATCH] fixup! fix(encoding): better character encoding --- lib/IMAP/Charset/Converter.php | 73 ++++++++++++++++ lib/IMAP/ImapMessageFetcher.php | 37 ++------ tests/Unit/IMAP/Charset/ConverterTest.php | 102 ++++++++++++++++++++++ 3 files changed, 180 insertions(+), 32 deletions(-) create mode 100644 lib/IMAP/Charset/Converter.php create mode 100644 tests/Unit/IMAP/Charset/ConverterTest.php diff --git a/lib/IMAP/Charset/Converter.php b/lib/IMAP/Charset/Converter.php new file mode 100644 index 0000000000..1fa9849a02 --- /dev/null +++ b/lib/IMAP/Charset/Converter.php @@ -0,0 +1,73 @@ + + * * + * * @author Anna Larch + * * + * * This library is free software; you can redistribute it and/or + * * modify it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE + * * License as published by the Free Software Foundation; either + * * version 3 of the License, or any later version. + * * + * * This library is distributed in the hope that it will be useful, + * * but WITHOUT ANY WARRANTY; without even the implied warranty of + * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * * GNU AFFERO GENERAL PUBLIC LICENSE for more details. + * * + * * You should have received a copy of the GNU Affero General Public + * * License along with this library. If not, see . + * * + * + */ + +namespace OCA\Mail\IMAP\Charset; + +use Horde_Mime_Part; +use OCA\Mail\Exception\ServiceException; + +class Converter { + + /** + * @param Horde_Mime_Part $p + * @return string + * @throws ServiceException + */ + public function convert(Horde_Mime_Part $p): string { + $data = $p->getContents(); + if ($data === null) { + return ''; + } + + // Only convert encoding if it is explicitly specified in the header because text/calendar + // data is utf-8 by default. + $charset = $p->getCharset(); + if($charset !== null && strtoupper($charset) === 'UTF-8') { + return $data; + } + + // No charset specified + $charset = mb_detect_encoding($data, 'UTF-8', true); + if (!$charset) { + // Fallback + $charset = mb_detect_encoding($data, null, true); + } + + // Still UTF8, no need to convert + if($charset !== null && strtoupper($charset) === 'UTF-8') { + return $data; + } + + $converted = @mb_convert_encoding($data, 'UTF-8', $charset); + if (!$converted) { + // Might be a charset that PHP mb doesn't know how to handle, fall back to iconv + $converted = iconv($charset, 'UTF-8', $data); + } + + if ($converted === false) { + throw new ServiceException('Could not detect message charset'); + } + return $converted; + } +} diff --git a/lib/IMAP/ImapMessageFetcher.php b/lib/IMAP/ImapMessageFetcher.php index ef3e7ed699..da5124e56c 100644 --- a/lib/IMAP/ImapMessageFetcher.php +++ b/lib/IMAP/ImapMessageFetcher.php @@ -40,6 +40,7 @@ use Horde_Mime_Part; use OCA\Mail\AddressList; use OCA\Mail\Exception\ServiceException; +use OCA\Mail\IMAP\Charset\Converter; use OCA\Mail\Model\IMAPMessage; use OCA\Mail\Service\Html; use OCA\Mail\Service\SmimeService; @@ -80,7 +81,8 @@ public function __construct(int $uid, Horde_Imap_Client_Base $client, string $userId, Html $htmlService, - SmimeService $smimeService) { + SmimeService $smimeService, + private Converter $converter) { $this->uid = $uid; $this->mailbox = $mailbox; $this->client = $client; @@ -432,6 +434,7 @@ private function handleHtmlMessage(Horde_Mime_Part $p, string $partNo, bool $isF * @throws DoesNotExistException * @throws Horde_Imap_Client_Exception * @throws Horde_Imap_Client_Exception_NoSupportExtension + * @throws ServiceException */ private function loadBodyData(Horde_Mime_Part $p, string $partNo, bool $isFetched): string { if (!$isFetched) { @@ -462,37 +465,7 @@ private function loadBodyData(Horde_Mime_Part $p, string $partNo, bool $isFetche $p->setContents($data); } - $data = $p->getContents(); - if ($data === null) { - return ''; - } - - // Only convert encoding if it is explicitly specified in the header because text/calendar - // data is utf-8 by default. - $charset = mb_detect_encoding($data, 'UTF-8', true); - if (!$charset) { - $charset = $p->getCharset(); - } - - // Try the email encoding first - $converted = @mb_convert_encoding($data, 'UTF-8', $p->getCharset()); - if ($converted) { - return $converted; - } - - // Fallback - $charset = mb_detect_encoding($data, null, true); - $converted = @mb_convert_encoding($data, 'UTF-8', $charset); - if ($converted) { - return $converted; - } - - // Might be a charset that PHP mb doesn't know how to handle, fall back to iconv - $converted = iconv($charset, 'UTF-8', $data); - if ($converted === false) { - throw new ServiceException('Could not detect message charset'); - } - return $converted; + return $this->converter->convert($p); } private function hasAttachments(Horde_Mime_Part $part): bool { diff --git a/tests/Unit/IMAP/Charset/ConverterTest.php b/tests/Unit/IMAP/Charset/ConverterTest.php new file mode 100644 index 0000000000..5625672c5a --- /dev/null +++ b/tests/Unit/IMAP/Charset/ConverterTest.php @@ -0,0 +1,102 @@ + + * @author Anna Larch + * + * Mail + * + * This code is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License, version 3, + * along with this program. If not, see + * + */ + +namespace OCA\Mail\Tests\Unit\IMAP\Charset; + +use ChristophWurst\Nextcloud\Testing\TestCase; +use Horde_Imap_Client; +use Horde_Imap_Client_Data_Fetch; +use Horde_Imap_Client_Fetch_Query; +use Horde_Imap_Client_Fetch_Results; +use Horde_Imap_Client_Ids; +use Horde_Imap_Client_Socket; +use Horde_Mime_Part; +use OCA\Mail\Db\Mailbox; +use OCA\Mail\IMAP\Charset\Converter; +use OCA\Mail\IMAP\ImapMessageFetcher; +use OCA\Mail\IMAP\ImapMessageFetcherFactory; +use OCA\Mail\IMAP\MessageMapper; +use OCA\Mail\Model\IMAPMessage; +use OCA\Mail\Service\SmimeService; +use OCA\Mail\Support\PerformanceLoggerTask; +use PHPUnit\Framework\MockObject\MockObject; +use Psr\Log\LoggerInterface; +use function range; + +class ConverterTest extends TestCase { + + public Converter $converter; + protected function setUp(): void { + parent::setUp(); + + $this->converter = new Converter(); + } + + /** + * @dataProvider dataProviderMimeParts + */ + public function testConvert($mimePart, $expected): void { + $actual = $this->converter->convert($mimePart); + $this->assertEquals($expected, $actual); + $isUtf8 = mb_check_encoding($actual, 'UTF-8'); + $this->assertTrue($isUtf8); + } + + public function dataProviderMimeParts(): array { + // UTF8 + $utfMimePart = new Horde_Mime_Part(); + $utfMimePart->setCharset('UTF-8'); + $utfMimePart->setContents('😊'); + // Hebrew + $iso88591MimePart = new Horde_Mime_Part(); + $iso88591MimePart->setCharset('ISO 8859-1'); + $iso88591MimePart->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'); + $iso88591MimePart_noCharset = new Horde_Mime_Part(); + $iso88591MimePart_noCharset->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'); + // Japanese + $iso2022jpMimePart = new Horde_Mime_Part(); + $iso2022jpMimePart->setCharset('ISO-2022-JP'); + $iso2022jpMimePart->setContents('外せ園査リツハワ題'); + $iso2022jpMimePart_noCharset = new Horde_Mime_Part(); + $iso2022jpMimePart_noCharset->setContents('外せ園査リツハワ題'); + // Korean - not in mb + $iso106461MimePart = new Horde_Mime_Part(); + $iso106461MimePart->setCharset('ISO 10646-1'); + $iso106461MimePart->setContents('언론·출판은 타인의 명'); + // Arabic - not in mb + $windowsMimePart = new Horde_Mime_Part(); + $windowsMimePart->setCharset('Windows-1256'); + $windowsMimePart->setContents('قام زهاء أوراقهم ما,'); + + return[ + [$utfMimePart, '😊'], + [$iso88591MimePart, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'], + [$iso88591MimePart_noCharset, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'], + [$iso2022jpMimePart, '外せ園査リツハワ題'], + [$iso2022jpMimePart_noCharset, '外せ園査リツハワ題'], + [$iso106461MimePart, '언론·출판은 타인의 명'], + [$windowsMimePart, 'قام زهاء أوراقهم ما,'] + ]; + } +}