From 9103f4f99b2da250d0fe2e50e0965464f6ea3680 Mon Sep 17 00:00:00 2001 From: Eduardo Gulias Davis Date: Sat, 29 Nov 2014 09:45:00 +0100 Subject: [PATCH] #30 - Improved control for UTF8 chars --- src/Egulias/EmailValidator/EmailLexer.php | 43 ++++++++++++++----- src/Egulias/EmailValidator/EmailParser.php | 10 +++-- .../Tests/EmailValidator/EmailLexerTest.php | 13 +++--- .../EmailValidator/EmailValidatorTest.php | 2 +- 4 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/Egulias/EmailValidator/EmailLexer.php b/src/Egulias/EmailValidator/EmailLexer.php index 2dd4d1f..3e0f2d8 100644 --- a/src/Egulias/EmailValidator/EmailLexer.php +++ b/src/Egulias/EmailValidator/EmailLexer.php @@ -143,7 +143,6 @@ protected function getCatchablePatterns() '\r\n', '::', '\s+?', - '[\x10-\x1F]+', '.', ); } @@ -155,7 +154,7 @@ protected function getCatchablePatterns() */ protected function getNonCatchablePatterns() { - return array('[\x7f-\xff]+'); + return array('[\xA0-\xff]+'); } /** @@ -167,16 +166,20 @@ protected function getNonCatchablePatterns() */ protected function getType(&$value) { - if ($this->isNullType($value)) { return self::C_NUL; } - if (isset($this->charValue[$value])) { + if ($this->isValid($value)) { return $this->charValue[$value]; } - if ($this->isInvalid($value)) { + if ($this->isUTF8Invalid($value)) { + $this->hasInvalidTokens = true; + return self::INVALID; + } + + if ($this->isASCIIInvalid($value)) { $this->hasInvalidTokens = true; return self::INVALID; } @@ -184,8 +187,18 @@ protected function getType(&$value) return self::GENERIC; } + protected function isValid($value) + { + if (isset($this->charValue[$value])) { + return true; + } + + return false; + } + /** - * @param string $value + * @param $value + * @return bool */ protected function isNullType($value) { @@ -197,15 +210,25 @@ protected function isNullType($value) } /** - * @param string $value + * @param $value + * @return bool */ - protected function isInvalid($value) + protected function isASCIIInvalid($value) { - if (preg_match('/[\x10-\x1F\x{0001}-\x{000F}\x{0080}-\x{009F}]+/', $value)) { + if (isset($this->invalidASCII[ord($value)])) { return true; } - if (isset($this->invalidASCII[ord($value)])) { + return false; + } + + /** + * @param $value + * @return bool + */ + protected function isUTF8Invalid($value) + { + if (preg_match('/\p{Cc}+/u', $value)) { return true; } diff --git a/src/Egulias/EmailValidator/EmailParser.php b/src/Egulias/EmailValidator/EmailParser.php index 0c5d156..5f4c4aa 100644 --- a/src/Egulias/EmailValidator/EmailParser.php +++ b/src/Egulias/EmailValidator/EmailParser.php @@ -29,7 +29,8 @@ public function __construct(EmailLexer $lexer) } /** - * @param string $str + * @param $str + * @return array */ public function parse($str) { @@ -39,15 +40,16 @@ public function parse($str) throw new \InvalidArgumentException('ERR_NOLOCALPART'); } - if ($this->lexer->hasInvalidTokens()) { - throw new \InvalidArgumentException('ERR_INVALID_ATEXT'); - } $this->localPartParser->parse($str); $this->domainPartParser->parse($str); $this->setParts($str); + if ($this->lexer->hasInvalidTokens()) { + throw new \InvalidArgumentException('ERR_INVALID_ATEXT'); + } + return array('local' => $this->localPart, 'domain' => $this->domainPart); } diff --git a/tests/egulias/Tests/EmailValidator/EmailLexerTest.php b/tests/egulias/Tests/EmailValidator/EmailLexerTest.php index c48071a..bcae105 100644 --- a/tests/egulias/Tests/EmailValidator/EmailLexerTest.php +++ b/tests/egulias/Tests/EmailValidator/EmailLexerTest.php @@ -54,8 +54,8 @@ public function invalidUTF8CharsProvider() { $chars = array(); for ($i = 0; $i < 0x100; ++$i) { - $c = $this->utf8_chr($i); - if (preg_match('/[\x{0001}-\x{000F}\x{0080}-\x{009F}]/u', $c)) { + $c = $this->utf8Chr($i); + if (preg_match('/(?=\p{Cc})(?=[^\t\n\n\r])/u', $c) && !preg_match('/\x{0000}/u', $c)) { $chars[] = array($c); } } @@ -63,7 +63,8 @@ public function invalidUTF8CharsProvider() return $chars; } - protected function utf8_chr($code_point) { + protected function utf8Chr($code_point) + { if ($code_point < 0 || 0x10FFFF < $code_point || (0xD800 <= $code_point && $code_point <= 0xDFFF)) { return ''; @@ -72,16 +73,16 @@ protected function utf8_chr($code_point) { if ($code_point < 0x80) { $hex[0] = $code_point; $ret = chr($hex[0]); - } else if ($code_point < 0x800) { + } elseif ($code_point < 0x800) { $hex[0] = 0x1C0 | $code_point >> 6; $hex[1] = 0x80 | $code_point & 0x3F; $ret = chr($hex[0]).chr($hex[1]); - } else if ($code_point < 0x10000) { + } elseif ($code_point < 0x10000) { $hex[0] = 0xE0 | $code_point >> 12; $hex[1] = 0x80 | $code_point >> 6 & 0x3F; $hex[2] = 0x80 | $code_point & 0x3F; $ret = chr($hex[0]).chr($hex[1]).chr($hex[2]); - } else { + } else { $hex[0] = 0xF0 | $code_point >> 18; $hex[1] = 0x80 | $code_point >> 12 & 0x3F; $hex[2] = 0x80 | $code_point >> 6 & 0x3F; diff --git a/tests/egulias/Tests/EmailValidator/EmailValidatorTest.php b/tests/egulias/Tests/EmailValidator/EmailValidatorTest.php index 89f547f..9107598 100644 --- a/tests/egulias/Tests/EmailValidator/EmailValidatorTest.php +++ b/tests/egulias/Tests/EmailValidator/EmailValidatorTest.php @@ -26,7 +26,7 @@ public function testValidEmails($email) $this->assertTrue($this->validator->isValid($email)); } - public function testInvalidUTF8Emails() + public function testInvalidUTF8Email() { $validator = new EmailValidator; $email = "\x80\x81\x82@\x83\x84\x85.\x86\x87\x88";