Skip to content

Commit

Permalink
fixup! fix(encoding): better character encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
miaulalala committed Oct 9, 2023
1 parent b557369 commit 4814b04
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 32 deletions.
73 changes: 73 additions & 0 deletions lib/IMAP/Charset/Converter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
<?php
/*
* *
* *
* * @copyright 2023 Anna Larch <[email protected]>
* *
* * @author Anna Larch <[email protected]>
* *
* * This library is free software; you can redistribute it and/or
* * modify it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE
* * License as published by the Free Software Foundation; either
* * version 3 of the License, or any later version.
* *
* * This library is distributed in the hope that it will be useful,
* * but WITHOUT ANY WARRANTY; without even the implied warranty of
* * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* * GNU AFFERO GENERAL PUBLIC LICENSE for more details.
* *
* * You should have received a copy of the GNU Affero General Public
* * License along with this library. If not, see <http://www.gnu.org/licenses/>.
* *
*
*/

namespace OCA\Mail\IMAP\Charset;

use Horde_Mime_Part;
use OCA\Mail\Exception\ServiceException;

class Converter {

/**
* @param Horde_Mime_Part $p
* @return string
* @throws ServiceException
*/
public function convert(Horde_Mime_Part $p): string {
$data = $p->getContents();
if ($data === null) {
return '';
}

// Only convert encoding if it is explicitly specified in the header because text/calendar
// data is utf-8 by default.
$charset = $p->getCharset();
if($charset !== null && strtoupper($charset) === 'UTF-8') {
return $data;
}

// No charset specified
$charset = mb_detect_encoding($data, 'UTF-8', true);
if (!$charset) {
// Fallback
$charset = mb_detect_encoding($data, null, true);
}

// Still UTF8, no need to convert
if($charset !== null && strtoupper($charset) === 'UTF-8') {

Check failure on line 58 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-master

RedundantCondition

lib/IMAP/Charset/Converter.php:58:6: RedundantCondition: false|string can never contain null (see https://psalm.dev/122)

Check failure on line 58 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-stable27

RedundantCondition

lib/IMAP/Charset/Converter.php:58:6: RedundantCondition: false|string can never contain null (see https://psalm.dev/122)

Check failure on line 58 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-stable26

RedundantCondition

lib/IMAP/Charset/Converter.php:58:6: RedundantCondition: false|string can never contain null (see https://psalm.dev/122)
return $data;
}

$converted = @mb_convert_encoding($data, 'UTF-8', $charset);
if (!$converted) {
// Might be a charset that PHP mb doesn't know how to handle, fall back to iconv
$converted = iconv($charset, 'UTF-8', $data);
}

if ($converted === false) {
throw new ServiceException('Could not detect message charset');
}
return $converted;

Check failure on line 71 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-master

InvalidReturnStatement

lib/IMAP/Charset/Converter.php:71:10: InvalidReturnStatement: The inferred type 'non-empty-array<array-key, mixed|string>|string' does not match the declared return type 'string' for OCA\Mail\IMAP\Charset\Converter::convert (see https://psalm.dev/128)

Check failure on line 71 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-stable27

InvalidReturnStatement

lib/IMAP/Charset/Converter.php:71:10: InvalidReturnStatement: The inferred type 'non-empty-array<array-key, mixed|string>|string' does not match the declared return type 'string' for OCA\Mail\IMAP\Charset\Converter::convert (see https://psalm.dev/128)

Check failure on line 71 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / Nextcloud dev-stable26

InvalidReturnStatement

lib/IMAP/Charset/Converter.php:71:10: InvalidReturnStatement: The inferred type 'non-empty-array<array-key, mixed|string>|string' does not match the declared return type 'string' for OCA\Mail\IMAP\Charset\Converter::convert (see https://psalm.dev/128)
}
}
37 changes: 5 additions & 32 deletions lib/IMAP/ImapMessageFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
use Horde_Mime_Part;
use OCA\Mail\AddressList;
use OCA\Mail\Exception\ServiceException;
use OCA\Mail\IMAP\Charset\Converter;
use OCA\Mail\Model\IMAPMessage;
use OCA\Mail\Service\Html;
use OCA\Mail\Service\SmimeService;
Expand Down Expand Up @@ -80,7 +81,8 @@ public function __construct(int $uid,
Horde_Imap_Client_Base $client,
string $userId,
Html $htmlService,
SmimeService $smimeService) {
SmimeService $smimeService,
private Converter $converter) {
$this->uid = $uid;
$this->mailbox = $mailbox;
$this->client = $client;
Expand Down Expand Up @@ -432,6 +434,7 @@ private function handleHtmlMessage(Horde_Mime_Part $p, string $partNo, bool $isF
* @throws DoesNotExistException
* @throws Horde_Imap_Client_Exception
* @throws Horde_Imap_Client_Exception_NoSupportExtension
* @throws ServiceException
*/
private function loadBodyData(Horde_Mime_Part $p, string $partNo, bool $isFetched): string {
if (!$isFetched) {
Expand Down Expand Up @@ -462,37 +465,7 @@ private function loadBodyData(Horde_Mime_Part $p, string $partNo, bool $isFetche
$p->setContents($data);
}

$data = $p->getContents();
if ($data === null) {
return '';
}

// Only convert encoding if it is explicitly specified in the header because text/calendar
// data is utf-8 by default.
$charset = mb_detect_encoding($data, 'UTF-8', true);
if (!$charset) {
$charset = $p->getCharset();
}

// Try the email encoding first
$converted = @mb_convert_encoding($data, 'UTF-8', $p->getCharset());
if ($converted) {
return $converted;
}

// Fallback
$charset = mb_detect_encoding($data, null, true);
$converted = @mb_convert_encoding($data, 'UTF-8', $charset);
if ($converted) {
return $converted;
}

// Might be a charset that PHP mb doesn't know how to handle, fall back to iconv
$converted = iconv($charset, 'UTF-8', $data);
if ($converted === false) {
throw new ServiceException('Could not detect message charset');
}
return $converted;
return $this->converter->convert($p);
}

private function hasAttachments(Horde_Mime_Part $part): bool {
Expand Down
102 changes: 102 additions & 0 deletions tests/Unit/IMAP/Charset/ConverterTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<?php

declare(strict_types=1);

/**
* @copyright 2023 Anna Larch <[email protected]>
* @author Anna Larch <[email protected]>
*
* Mail
*
* This code is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License, version 3,
* along with this program. If not, see <http://www.gnu.org/licenses/>
*
*/

namespace OCA\Mail\Tests\Unit\IMAP\Charset;

use ChristophWurst\Nextcloud\Testing\TestCase;
use Horde_Imap_Client;
use Horde_Imap_Client_Data_Fetch;
use Horde_Imap_Client_Fetch_Query;
use Horde_Imap_Client_Fetch_Results;
use Horde_Imap_Client_Ids;
use Horde_Imap_Client_Socket;
use Horde_Mime_Part;
use OCA\Mail\Db\Mailbox;
use OCA\Mail\IMAP\Charset\Converter;
use OCA\Mail\IMAP\ImapMessageFetcher;
use OCA\Mail\IMAP\ImapMessageFetcherFactory;
use OCA\Mail\IMAP\MessageMapper;
use OCA\Mail\Model\IMAPMessage;
use OCA\Mail\Service\SmimeService;
use OCA\Mail\Support\PerformanceLoggerTask;
use PHPUnit\Framework\MockObject\MockObject;
use Psr\Log\LoggerInterface;
use function range;

class ConverterTest extends TestCase {

public Converter $converter;
protected function setUp(): void {
parent::setUp();

$this->converter = new Converter();
}

/**
* @dataProvider dataProviderMimeParts
*/
public function testConvert($mimePart, $expected): void {
$actual = $this->converter->convert($mimePart);
$this->assertEquals($expected, $actual);
$isUtf8 = mb_check_encoding($actual, 'UTF-8');
$this->assertTrue($isUtf8);
}

public function dataProviderMimeParts(): array {
// UTF8
$utfMimePart = new Horde_Mime_Part();
$utfMimePart->setCharset('UTF-8');
$utfMimePart->setContents('😊');
// Hebrew
$iso88591MimePart = new Horde_Mime_Part();
$iso88591MimePart->setCharset('ISO 8859-1');
$iso88591MimePart->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת');
$iso88591MimePart_noCharset = new Horde_Mime_Part();
$iso88591MimePart_noCharset->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת');
// Japanese
$iso2022jpMimePart = new Horde_Mime_Part();
$iso2022jpMimePart->setCharset('ISO-2022-JP');
$iso2022jpMimePart->setContents('外せ園査リツハワ題');
$iso2022jpMimePart_noCharset = new Horde_Mime_Part();
$iso2022jpMimePart_noCharset->setContents('外せ園査リツハワ題');
// Korean - not in mb
$iso106461MimePart = new Horde_Mime_Part();
$iso106461MimePart->setCharset('ISO 10646-1');
$iso106461MimePart->setContents('언론·출판은 타인의 명');
// Arabic - not in mb
$windowsMimePart = new Horde_Mime_Part();
$windowsMimePart->setCharset('Windows-1256');
$windowsMimePart->setContents('قام زهاء أوراقهم ما,');

return[
[$utfMimePart, '😊'],
[$iso88591MimePart, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'],
[$iso88591MimePart_noCharset, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'],
[$iso2022jpMimePart, '外せ園査リツハワ題'],
[$iso2022jpMimePart_noCharset, '外せ園査リツハワ題'],
[$iso106461MimePart, '언론·출판은 타인의 명'],
[$windowsMimePart, 'قام زهاء أوراقهم ما,']
];
}
}

0 comments on commit 4814b04

Please sign in to comment.