Skip to content

Commit

Permalink
fix(encoding): better character encoding
Browse files Browse the repository at this point in the history
Signed-off-by: Anna Larch <[email protected]>
  • Loading branch information
miaulalala authored and ChristophWurst committed Feb 27, 2024
1 parent a537687 commit 176e074
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 15 deletions.
87 changes: 87 additions & 0 deletions lib/IMAP/Charset/Converter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
<?php

declare(strict_types=1);

/**
* @author Anna Larch <[email protected]>
*
* Mail
*
* This code is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License, version 3,
* along with this program. If not, see <http://www.gnu.org/licenses/>
*
*/

namespace OCA\Mail\IMAP\Charset;

use Horde_Mime_Part;
use OCA\Mail\Exception\ServiceException;
use function in_array;
use function is_string;

class Converter {

	/**
	 * Return the contents of a MIME part as a UTF-8 string.
	 *
	 * Resolution order:
	 *  1. The part has no contents: an empty string is returned.
	 *  2. The part declares UTF-8: the contents are returned untouched.
	 *  3. The part declares another charset: the contents are converted with
	 *     mbstring if the name is listed by mb_list_encodings(), with iconv otherwise.
	 *  4. The declared charset is missing or its conversion failed: mbstring is asked
	 *     to detect the charset. UTF-8 data is returned untouched, anything else is
	 *     converted from the detected charset (or, if detection failed, from the
	 *     declared one as a last resort).
	 *
	 * @param Horde_Mime_Part $p
	 * @return string
	 * @throws ServiceException if none of the conversion attempts produced a string
	 */
	public function convert(Horde_Mime_Part $p): string {
		/** @var null|string $data */
		$data = $p->getContents();
		if ($data === null) {
			return '';
		}

		// Data that is declared as UTF-8 is passed through as is
		$charset = $p->getCharset();
		if ($charset !== null && strtoupper($charset) === 'UTF-8') {
			return $data;
		}

		// The part specifies a charset
		if ($charset !== null) {
			// The lookup is intentionally exact: names mbstring only knows as an alias
			// go through iconv, which fails (instead of substituting characters) on
			// bytes that do not belong to the declared charset, so that the detection
			// below still gets a chance.
			if (in_array($charset, mb_list_encodings(), true)) {
				$converted = mb_convert_encoding($data, 'UTF-8', $charset);
			} else {
				$converted = iconv($charset, 'UTF-8', $data);
			}

			if (is_string($converted)) {
				return $converted;
			}
		}

		// No (usable) charset specified, let's ask mb if this could be UTF-8
		$detectedCharset = mb_detect_encoding($data, 'UTF-8', true);
		if ($detectedCharset === false) {
			// Fallback, non UTF-8
			$detectedCharset = mb_detect_encoding($data, null, true);
		}
		// Still UTF8, no need to convert
		if ($detectedCharset !== false && strtoupper($detectedCharset) === 'UTF-8') {
			return $data;
		}

		// Prefer what the detection found. If it found nothing, fall back to the declared
		// charset; that may be null, in which case mbstring uses its internal encoding.
		$sourceCharset = $detectedCharset !== false ? $detectedCharset : $charset;

		try {
			$converted = mb_convert_encoding($data, 'UTF-8', $sourceCharset);
		} catch (\ValueError $e) {
			// PHP 8 throws instead of returning false for a charset mbstring doesn't know
			$converted = false;
		}
		if (!is_string($converted) && $sourceCharset !== null) {
			// Might be a charset that PHP mb doesn't know how to handle, fall back to iconv
			$converted = iconv($sourceCharset, 'UTF-8', $data);
		}

		if (!is_string($converted)) {
			throw new ServiceException('Could not detect message charset');
		}
		return $converted;
	}
}
19 changes: 5 additions & 14 deletions lib/IMAP/ImapMessageFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
use Horde_Mime_Part;
use OCA\Mail\AddressList;
use OCA\Mail\Exception\ServiceException;
use OCA\Mail\IMAP\Charset\Converter;
use OCA\Mail\Model\IMAPMessage;
use OCA\Mail\Service\Html;
use OCA\Mail\Service\SmimeService;
Expand Down Expand Up @@ -79,7 +80,8 @@ public function __construct(int $uid,
Horde_Imap_Client_Base $client,
string $userId,
Html $htmlService,
SmimeService $smimeService) {
SmimeService $smimeService,
private Converter $converter) {
$this->uid = $uid;
$this->mailbox = $mailbox;
$this->client = $client;
Expand Down Expand Up @@ -434,6 +436,7 @@ private function handleHtmlMessage(Horde_Mime_Part $p, string $partNo, bool $isF
* @throws DoesNotExistException
* @throws Horde_Imap_Client_Exception
* @throws Horde_Imap_Client_Exception_NoSupportExtension
* @throws ServiceException
*/
private function loadBodyData(Horde_Mime_Part $p, string $partNo, bool $isFetched): string {
if (!$isFetched) {
Expand Down Expand Up @@ -461,22 +464,10 @@ private function loadBodyData(Horde_Mime_Part $p, string $partNo, bool $isFetche
}

$data = $fetch->getBodyPart($partNo);

$p->setContents($data);
}

$data = $p->getContents();
if ($data === null) {
return '';
}

// Only convert encoding if it is explicitly specified in the header because text/calendar
// data is utf-8 by default.
$charset = $p->getContentTypeParameter('charset');
if ($charset !== null && strtoupper($charset) !== 'UTF-8') {
$data = mb_convert_encoding($data, 'UTF-8', $charset);
}
return (string)$data;
return $this->converter->convert($p);
}

private function hasAttachments(Horde_Mime_Part $part): bool {
Expand Down
7 changes: 6 additions & 1 deletion lib/IMAP/ImapMessageFetcherFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,21 @@
namespace OCA\Mail\IMAP;

use Horde_Imap_Client_Base;
use OCA\Mail\IMAP\Charset\Converter;
use OCA\Mail\Service\Html;
use OCA\Mail\Service\SmimeService;

class ImapMessageFetcherFactory {
private Html $htmlService;
private SmimeService $smimeService;
private Converter $charsetConverter;

public function __construct(Html $htmlService,
SmimeService $smimeService) {
SmimeService $smimeService,
Converter $charsetConverter) {
$this->htmlService = $htmlService;
$this->smimeService = $smimeService;
$this->charsetConverter = $charsetConverter;
}

public function build(int $uid,
Expand All @@ -51,6 +55,7 @@ public function build(int $uid,
$userId,
$this->htmlService,
$this->smimeService,
$this->charsetConverter,
);
}
}
101 changes: 101 additions & 0 deletions tests/Unit/IMAP/Charset/ConverterTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<?php

declare(strict_types=1);

/**
* @copyright 2023 Anna Larch <[email protected]>
* @author Anna Larch <[email protected]>
*
* Mail
*
* This code is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License, version 3,
* along with this program. If not, see <http://www.gnu.org/licenses/>
*
*/

namespace OCA\Mail\Tests\Unit\IMAP\Charset;

use ChristophWurst\Nextcloud\Testing\TestCase;
use Horde_Mime_Part;
use OCA\Mail\IMAP\Charset\Converter;
use function fopen;
use function fwrite;
use function mb_convert_encoding;

/**
 * Unit tests for the charset converter: every data set is a MIME part together with
 * the UTF-8 string the converter is expected to produce for it.
 */
class ConverterTest extends TestCase {

	public Converter $converter;

	protected function setUp(): void {
		parent::setUp();

		$this->converter = new Converter();
	}

	/**
	 * @dataProvider dataProviderMimeParts
	 */
	public function testConvert(Horde_Mime_Part $mimePart, string $expected): void {
		$actual = $this->converter->convert($mimePart);

		// Strings have to match byte for byte, hence assertSame
		$this->assertSame($expected, $actual);
		$this->assertTrue(mb_check_encoding($actual, 'UTF-8'));
	}

	/**
	 * @return array<string, array{0: Horde_Mime_Part, 1: string}>
	 */
	public static function dataProviderMimeParts(): array {
		// UTF-8, declared
		$utfMimePart = new Horde_Mime_Part();
		$utfMimePart->setType('text/plain');
		$utfMimePart->setCharset('UTF-8');
		$utfMimePart->setContents('😊');

		// UTF-8, declared, contents handed over as a stream
		$utfMimeStreamPart = new Horde_Mime_Part();
		$utfMimeStreamPart->setType('text/plain');
		$utfMimeStreamPart->setCharset('UTF-8');
		$fh = fopen('php://temp', 'r+');
		fwrite($fh, '💦');
		$utfMimeStreamPart->setContents($fh, [ 'usestream' => true, ]);

		// German umlaut, declared as ISO-8859-1
		$iso88591MimePart = new Horde_Mime_Part();
		$iso88591MimePart->setType('text/plain');
		$iso88591MimePart->setCharset('ISO-8859-1');
		$iso88591MimePart->setContents(mb_convert_encoding('Ümlaut', 'ISO-8859-1', 'UTF-8'));

		// Hebrew, UTF-8 bytes without any charset set on the part
		$hebrewMimePart_noCharset = new Horde_Mime_Part();
		$hebrewMimePart_noCharset->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת');

		// Japanese, declared as ISO-2022-JP
		$iso2022jpMimePart = new Horde_Mime_Part();
		$iso2022jpMimePart->setType('text/plain');
		$iso2022jpMimePart->setCharset('ISO-2022-JP');
		$iso2022jpMimePart->setContents(mb_convert_encoding('外せ園査リツハワ題', 'ISO-2022-JP', 'UTF-8'));

		// Japanese, UTF-8 bytes without any charset set on the part
		$japaneseMimePart_noCharset = new Horde_Mime_Part();
		$japaneseMimePart_noCharset->setContents('外せ園査リツハワ題');

		// Korean as "ISO 10646-1" is not covered: according to the original author's
		// note that charset is available in neither mb nor iconv.

		// Arabic - not in mb
		$windowsMimePart = new Horde_Mime_Part();
		$windowsMimePart->setType('text/plain');
		$windowsMimePart->setCharset('Windows-1256');
		$windowsMimePart->setContents(iconv('UTF-8', 'Windows-1256', 'قام زهاء أوراقهم ما,'));

		return [
			'UTF-8' => [$utfMimePart, '😊'],
			'UTF-8 stream' => [$utfMimeStreamPart, '💦'],
			'ISO-8859-1' => [$iso88591MimePart, 'Ümlaut'],
			'ISO-2022-JP' => [$iso2022jpMimePart, '外せ園査リツハワ題'],
			'Hebrew without charset' => [$hebrewMimePart_noCharset, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'],
			'Japanese without charset' => [$japaneseMimePart_noCharset, '外せ園査リツハワ題'],
			'Windows-1256' => [$windowsMimePart, 'قام زهاء أوراقهم ما,'],
		];
	}
}

0 comments on commit 176e074

Please sign in to comment.