-
Notifications
You must be signed in to change notification settings - Fork 262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Change how charsets are detected for emails #5882
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
/** | ||
* @author Anna Larch <[email protected]> | ||
* | ||
* | ||
* This code is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License, version 3, | ||
* as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License, version 3, | ||
* along with this program. If not, see <http://www.gnu.org/licenses/> | ||
* | ||
*/ | ||
|
||
namespace OCA\Mail\IMAP\Charset; | ||
|
||
use Horde_Mime_Part; | ||
use OCA\Mail\Exception\ServiceException; | ||
use function in_array; | ||
use function is_string; | ||
|
||
/**
 * Converts the contents of a MIME part to UTF-8.
 *
 * The charset declared on the part is preferred. If the part declares none,
 * or the declared one cannot be used, the charset is detected with mbstring.
 */
class Converter {
	/**
	 * Return the contents of the given MIME part as a UTF-8 string.
	 *
	 * @param Horde_Mime_Part $p the part whose contents should be converted
	 * @return string the converted contents, or an empty string if the part has no contents
	 * @throws ServiceException if the contents could not be converted to UTF-8
	 */
	public function convert(Horde_Mime_Part $p): string {
		/** @var null|string $data */
		$data = $p->getContents();
		if ($data === null) {
			return '';
		}

		// Only convert encoding if it is explicitly specified in the header because text/calendar
		// data is utf-8 by default.
		$charset = $p->getCharset();
		if ($charset !== null) {
			if (strtoupper($charset) === 'UTF-8') {
				return $data;
			}

			// The part specifies a charset, try that one first
			$converted = $this->tryConvert($data, $charset);
			if ($converted !== null) {
				return $converted;
			}
		}

		// No (usable) charset specified, let's ask mb if this could be UTF-8
		$detectedCharset = mb_detect_encoding($data, 'UTF-8', true);
		if ($detectedCharset === false) {
			// Fallback, non UTF-8: use the configured detection order
			$detectedCharset = mb_detect_encoding($data, null, true);
		}

		if ($detectedCharset !== false) {
			// Still UTF8, no need to convert
			if (strtoupper($detectedCharset) === 'UTF-8') {
				return $data;
			}

			// Convert from the charset that was actually detected. The declared one
			// is either missing or has already failed above.
			$converted = $this->tryConvert($data, $detectedCharset);
			if ($converted !== null) {
				return $converted;
			}
		} elseif ($charset === null) {
			// Nothing declared and nothing detected: best effort, interpret the data
			// in mbstring's internal encoding so the caller still gets a string back.
			$converted = mb_convert_encoding($data, 'UTF-8', mb_internal_encoding());
			if (is_string($converted)) {
				return $converted;
			}
		}

		throw new ServiceException('Could not detect message charset');
	}

	/**
	 * Convert $data from the given charset to UTF-8.
	 *
	 * mbstring is only used for encodings it lists as supported, everything
	 * else is handed to iconv. That way mb_convert_encoding is never called
	 * with an encoding name it does not know.
	 *
	 * @param string $data the raw data
	 * @param string $from the charset $data is assumed to be encoded in
	 * @return string|null the converted string, or null if the conversion failed
	 */
	private function tryConvert(string $data, string $from): ?string {
		if (in_array($from, mb_list_encodings(), true)) {
			$converted = mb_convert_encoding($data, 'UTF-8', $from);
		} else {
			// Might be a charset that PHP mb doesn't know how to handle, fall back to iconv
			$converted = iconv($from, 'UTF-8', $data);
		}

		return is_string($converted) ? $converted : null;
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
/** | ||
* @copyright 2023 Anna Larch <[email protected]> | ||
* @author Anna Larch <[email protected]> | ||
* | ||
* | ||
* This code is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License, version 3, | ||
* as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License, version 3, | ||
* along with this program. If not, see <http://www.gnu.org/licenses/> | ||
* | ||
*/ | ||
|
||
namespace OCA\Mail\Tests\Unit\IMAP\Charset; | ||
|
||
use ChristophWurst\Nextcloud\Testing\TestCase; | ||
use Horde_Mime_Part; | ||
use OCA\Mail\IMAP\Charset\Converter; | ||
use function fopen; | ||
use function fwrite; | ||
use function mb_convert_encoding; | ||
|
||
/**
 * Unit tests for the charset Converter: every data set must come back as the
 * expected, valid UTF-8 string.
 */
class ConverterTest extends TestCase {
	public Converter $converter;

	protected function setUp(): void {
		parent::setUp();

		$this->converter = new Converter();
	}

	/**
	 * @dataProvider dataProviderMimeParts
	 */
	public function testConvert(Horde_Mime_Part $mimePart, string $expected): void {
		$actual = $this->converter->convert($mimePart);

		// Strings are compared strictly, and the result must be valid UTF-8
		$this->assertSame($expected, $actual);
		$this->assertTrue(mb_check_encoding($actual, 'UTF-8'));
	}

	/**
	 * Build MIME parts in various charsets together with the UTF-8 string
	 * the converter is expected to produce for each of them.
	 *
	 * @return array<string, array{0: Horde_Mime_Part, 1: string}>
	 */
	public function dataProviderMimeParts(): array {
		// UTF-8, charset declared
		$utfMimePart = new Horde_Mime_Part();
		$utfMimePart->setType('text/plain');
		$utfMimePart->setCharset('UTF-8');
		$utfMimePart->setContents('😊');

		// UTF-8, charset declared, contents passed as a stream
		$utfMimeStreamPart = new Horde_Mime_Part();
		$utfMimeStreamPart->setType('text/plain');
		$utfMimeStreamPart->setCharset('UTF-8');
		$fh = fopen('php://temp', 'r+');
		fwrite($fh, '💦');
		$utfMimeStreamPart->setContents($fh, ['usestream' => true]);

		// ISO-8859-1 (Latin-1), charset declared
		$iso88591MimePart = new Horde_Mime_Part();
		$iso88591MimePart->setType('text/plain');
		$iso88591MimePart->setCharset('ISO-8859-1');
		$iso88591MimePart->setContents(mb_convert_encoding('Ümlaut', 'ISO-8859-1', 'UTF-8'));

		// Hebrew text as UTF-8 bytes, no charset declared
		$hebrewNoCharsetPart = new Horde_Mime_Part();
		$hebrewNoCharsetPart->setContents('בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת');

		// Japanese, ISO-2022-JP, charset declared
		$iso2022jpMimePart = new Horde_Mime_Part();
		$iso2022jpMimePart->setType('text/plain');
		$iso2022jpMimePart->setCharset('ISO-2022-JP');
		$iso2022jpMimePart->setContents(mb_convert_encoding('外せ園査リツハワ題', 'ISO-2022-JP', 'UTF-8'));

		// Japanese text as UTF-8 bytes, no charset declared
		$japaneseNoCharsetPart = new Horde_Mime_Part();
		$japaneseNoCharsetPart->setContents('外せ園査リツハワ題');

		// Korean in ISO 10646-1 is intentionally not covered: the charset could not be
		// made to work with either mbstring or iconv and is not mentioned in any report.

		// Arabic, Windows-1256 - not in mb, exercises the iconv path
		$windowsMimePart = new Horde_Mime_Part();
		$windowsMimePart->setType('text/plain');
		$windowsMimePart->setCharset('Windows-1256');
		$windowsMimePart->setContents(iconv('UTF-8', 'Windows-1256', 'قام زهاء أوراقهم ما,'));

		return [
			'UTF-8' => [$utfMimePart, '😊'],
			'UTF-8 stream' => [$utfMimeStreamPart, '💦'],
			'ISO-8859-1' => [$iso88591MimePart, 'Ümlaut'],
			'ISO-2022-JP' => [$iso2022jpMimePart, '外せ園査リツハワ題'],
			'Hebrew without charset' => [$hebrewNoCharsetPart, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'],
			'Japanese without charset' => [$japaneseNoCharsetPart, '外せ園査リツハワ題'],
			'Windows-1256' => [$windowsMimePart, 'قام زهاء أوراقهم ما,'],
		];
	}
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
None of the tests is able to reach any line below this one.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is still the case: either the charset parameter is set and used, or mb detects the string as UTF-8.