-
Notifications
You must be signed in to change notification settings - Fork 262
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5882 from nextcloud/fix/detect-char-encoding
Change how charsets are detected for emails
- Loading branch information
Showing
4 changed files
with
199 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
/** | ||
* @author Anna Larch <[email protected]> | ||
* | ||
* | ||
* This code is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License, version 3, | ||
* as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License, version 3, | ||
* along with this program. If not, see <http://www.gnu.org/licenses/> | ||
* | ||
*/ | ||
|
||
namespace OCA\Mail\IMAP\Charset; | ||
|
||
use Horde_Mime_Part; | ||
use OCA\Mail\Exception\ServiceException; | ||
use function in_array; | ||
use function is_string; | ||
|
||
class Converter {
	/**
	 * Return the contents of a MIME part as a UTF-8 string.
	 *
	 * Resolution order:
	 *  1. A part without contents yields an empty string.
	 *  2. A part that declares UTF-8 is returned untouched.
	 *  3. A part that declares another charset is converted with mbstring if
	 *     mb_list_encodings() lists that exact name, otherwise with iconv.
	 *  4. If no charset is declared, or step 3 failed, the charset is detected
	 *     with mb_detect_encoding(). Data detected as UTF-8 is returned as is;
	 *     otherwise the declared charset and then the detected charset are
	 *     tried as conversion source.
	 *  5. If there is neither a declared nor a detected charset, the data is
	 *     converted from mbstring's internal encoding as a best effort.
	 *
	 * @param Horde_Mime_Part $p
	 * @return string
	 * @throws ServiceException if none of the candidate charsets could be converted
	 */
	public function convert(Horde_Mime_Part $p): string {
		/** @var null|string $data */
		$data = $p->getContents();
		if ($data === null) {
			return '';
		}

		// Only convert encoding if it is explicitly specified in the header because text/calendar
		// data is utf-8 by default.
		$charset = $p->getCharset();
		if ($charset !== null && strtoupper($charset) === 'UTF-8') {
			return $data;
		}

		// The part specifies a charset
		if ($charset !== null) {
			if (in_array($charset, mb_list_encodings(), true)) {
				$converted = mb_convert_encoding($data, 'UTF-8', $charset);
			} else {
				$converted = iconv($charset, 'UTF-8', $data);
			}

			if (is_string($converted)) {
				return $converted;
			}
		}

		// No usable charset so far, let's ask mb if this could be UTF-8
		$detectedCharset = mb_detect_encoding($data, 'UTF-8', true);
		if ($detectedCharset === false) {
			// Fallback, non UTF-8
			$detectedCharset = mb_detect_encoding($data, null, true);
		}
		// Still UTF8, no need to convert
		if ($detectedCharset !== false && strtoupper($detectedCharset) === 'UTF-8') {
			return $data;
		}

		// Candidate source charsets, most authoritative first. The declared
		// charset is retried because the step above only used mbstring when
		// the exact name was listed, so aliases never reached mbstring.
		$candidates = [];
		if ($charset !== null) {
			$candidates[] = $charset;
		}
		if ($detectedCharset !== false) {
			$candidates[] = $detectedCharset;
		}

		if ($candidates === []) {
			// Nothing declared and nothing detected: keep the best-effort
			// behaviour of converting from mbstring's internal encoding
			// instead of handing a null charset to iconv.
			$converted = mb_convert_encoding($data, 'UTF-8');
			if (is_string($converted)) {
				return $converted;
			}
		}

		foreach ($candidates as $candidate) {
			$converted = $this->convertFrom($data, $candidate);
			if ($converted !== null) {
				return $converted;
			}
		}

		throw new ServiceException('Could not detect message charset');
	}

	/**
	 * Try to convert $data from $sourceCharset to UTF-8, first with mbstring
	 * and then with iconv.
	 *
	 * @param string $data
	 * @param string $sourceCharset
	 * @return string|null the converted string, or null if both libraries failed
	 */
	private function convertFrom(string $data, string $sourceCharset): ?string {
		try {
			// The @ silences the warning older PHP versions emit for an
			// unknown encoding; PHP 8 throws a ValueError instead.
			$converted = @mb_convert_encoding($data, 'UTF-8', $sourceCharset);
		} catch (\ValueError $e) {
			$converted = false;
		}

		if (!is_string($converted)) {
			// Might be a charset that PHP mb doesn't know how to handle, fall back to iconv
			$converted = iconv($sourceCharset, 'UTF-8', $data);
		}

		return is_string($converted) ? $converted : null;
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
/** | ||
* @copyright 2023 Anna Larch <[email protected]> | ||
* @author Anna Larch <[email protected]> | ||
* | ||
* | ||
* This code is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License, version 3, | ||
* as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License, version 3, | ||
* along with this program. If not, see <http://www.gnu.org/licenses/> | ||
* | ||
*/ | ||
|
||
namespace OCA\Mail\Tests\Unit\IMAP\Charset; | ||
|
||
use ChristophWurst\Nextcloud\Testing\TestCase; | ||
use Horde_Mime_Part; | ||
use OCA\Mail\IMAP\Charset\Converter; | ||
use function fopen; | ||
use function fwrite; | ||
use function mb_convert_encoding; | ||
|
||
/**
 * Unit tests for the charset Converter: every data set must come back as the
 * expected text and be valid UTF-8.
 */
class ConverterTest extends TestCase {
	private Converter $converter;

	protected function setUp(): void {
		parent::setUp();

		$this->converter = new Converter();
	}

	/**
	 * @dataProvider dataProviderMimeParts
	 */
	public function testConvert(Horde_Mime_Part $mimePart, string $expected): void {
		$actual = $this->converter->convert($mimePart);

		// Strings are compared strictly so type juggling cannot hide a mismatch
		$this->assertSame($expected, $actual);
		$this->assertTrue(mb_check_encoding($actual, 'UTF-8'));
	}

	/**
	 * Builds MIME parts in several charsets, with and without a declared
	 * charset, together with the UTF-8 text each one must convert to.
	 *
	 * @return array<string, array{0: Horde_Mime_Part, 1: string}>
	 */
	public function dataProviderMimeParts(): array {
		// UTF-8, declared
		$utf8Part = new Horde_Mime_Part();
		$utf8Part->setType('text/plain');
		$utf8Part->setCharset('UTF-8');
		$utf8Part->setContents('😊');

		// UTF-8, declared, contents supplied as a stream
		$utf8StreamPart = new Horde_Mime_Part();
		$utf8StreamPart->setType('text/plain');
		$utf8StreamPart->setCharset('UTF-8');
		$stream = fopen('php://temp', 'r+');
		fwrite($stream, '💦');
		$utf8StreamPart->setContents($stream, ['usestream' => true]);

		// ISO-8859-1 (German umlaut), declared
		$latin1Part = new Horde_Mime_Part();
		$latin1Part->setType('text/plain');
		$latin1Part->setCharset('ISO-8859-1');
		$latin1Part->setContents(mb_convert_encoding('Ümlaut', 'ISO-8859-1', 'UTF-8'));

		// Hebrew as UTF-8 text, no type or charset set on the part
		$hebrewNoCharsetPart = new Horde_Mime_Part();
		$hebrewNoCharsetPart->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת');

		// Japanese as ISO-2022-JP, declared
		$iso2022jpPart = new Horde_Mime_Part();
		$iso2022jpPart->setType('text/plain');
		$iso2022jpPart->setCharset('ISO-2022-JP');
		$iso2022jpPart->setContents(mb_convert_encoding('外せ園査リツハワ題', 'ISO-2022-JP', 'UTF-8'));

		// Japanese as UTF-8 text, no type or charset set on the part
		$japaneseNoCharsetPart = new Horde_Mime_Part();
		$japaneseNoCharsetPart->setContents('外せ園査リツハワ題');

		// Korean as 'ISO 10646-1' is intentionally not covered: that charset
		// is available in neither mb nor iconv.

		// Arabic as Windows-1256, declared - not in mb, so built with iconv
		$windows1256Part = new Horde_Mime_Part();
		$windows1256Part->setType('text/plain');
		$windows1256Part->setCharset('Windows-1256');
		$windows1256Part->setContents(iconv('UTF-8', 'Windows-1256', 'قام زهاء أوراقهم ما,'));

		return [
			'utf-8' => [$utf8Part, '😊'],
			'utf-8 stream' => [$utf8StreamPart, '💦'],
			'iso-8859-1' => [$latin1Part, 'Ümlaut'],
			'iso-2022-jp' => [$iso2022jpPart, '外せ園査リツハワ題'],
			'hebrew without charset' => [$hebrewNoCharsetPart, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'],
			'japanese without charset' => [$japaneseNoCharsetPart, '外せ園査リツハワ題'],
			'windows-1256' => [$windows1256Part, 'قام زهاء أوراقهم ما,'],
		];
	}
}