Skip to content

Commit

Permalink
#30 - Improved control for UTF8 chars
Browse files Browse the repository at this point in the history
  • Loading branch information
egulias committed Nov 29, 2014
1 parent 833eb65 commit 9103f4f
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 21 deletions.
43 changes: 33 additions & 10 deletions src/Egulias/EmailValidator/EmailLexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ protected function getCatchablePatterns()
'\r\n',
'::',
'\s+?',
'[\x10-\x1F]+',
'.',
);
}
Expand All @@ -155,7 +154,7 @@ protected function getCatchablePatterns()
*/
protected function getNonCatchablePatterns()
{
return array('[\x7f-\xff]+');
return array('[\xA0-\xff]+');
}

/**
Expand All @@ -167,25 +166,39 @@ protected function getNonCatchablePatterns()
*/
protected function getType(&$value)
{

if ($this->isNullType($value)) {
return self::C_NUL;
}

if (isset($this->charValue[$value])) {
if ($this->isValid($value)) {
return $this->charValue[$value];
}

if ($this->isInvalid($value)) {
if ($this->isUTF8Invalid($value)) {
$this->hasInvalidTokens = true;
return self::INVALID;
}

if ($this->isASCIIInvalid($value)) {
$this->hasInvalidTokens = true;
return self::INVALID;
}

return self::GENERIC;
}

/**
 * Reports whether the given token text has an entry in the charValue map.
 *
 * @param string $value token text to look up
 * @return bool true when $this->charValue has a non-null entry keyed by $value
 */
protected function isValid($value)
{
    // isset() already yields the boolean we need, so return it directly.
    return isset($this->charValue[$value]);
}

/**
* @param string $value
* @param $value
* @return bool
*/
protected function isNullType($value)
{
Expand All @@ -197,15 +210,25 @@ protected function isNullType($value)
}

/**
* @param string $value
* @param $value
* @return bool
*/
protected function isInvalid($value)
protected function isASCIIInvalid($value)
{
if (preg_match('/[\x10-\x1F\x{0001}-\x{000F}\x{0080}-\x{009F}]+/', $value)) {
if (isset($this->invalidASCII[ord($value)])) {
return true;
}

if (isset($this->invalidASCII[ord($value)])) {
return false;
}

/**
* @param $value
* @return bool
*/
protected function isUTF8Invalid($value)
{
if (preg_match('/\p{Cc}+/u', $value)) {
return true;
}

Expand Down
10 changes: 6 additions & 4 deletions src/Egulias/EmailValidator/EmailParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ public function __construct(EmailLexer $lexer)
}

/**
* @param string $str
* @param $str
* @return array
*/
public function parse($str)
{
Expand All @@ -39,15 +40,16 @@ public function parse($str)
throw new \InvalidArgumentException('ERR_NOLOCALPART');
}

if ($this->lexer->hasInvalidTokens()) {
throw new \InvalidArgumentException('ERR_INVALID_ATEXT');
}

$this->localPartParser->parse($str);
$this->domainPartParser->parse($str);

$this->setParts($str);

if ($this->lexer->hasInvalidTokens()) {
throw new \InvalidArgumentException('ERR_INVALID_ATEXT');
}

return array('local' => $this->localPart, 'domain' => $this->domainPart);
}

Expand Down
13 changes: 7 additions & 6 deletions tests/egulias/Tests/EmailValidator/EmailLexerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,17 @@ public function invalidUTF8CharsProvider()
{
$chars = array();
for ($i = 0; $i < 0x100; ++$i) {
$c = $this->utf8_chr($i);
if (preg_match('/[\x{0001}-\x{000F}\x{0080}-\x{009F}]/u', $c)) {
$c = $this->utf8Chr($i);
if (preg_match('/(?=\p{Cc})(?=[^\t\n\n\r])/u', $c) && !preg_match('/\x{0000}/u', $c)) {
$chars[] = array($c);
}
}

return $chars;
}

protected function utf8_chr($code_point) {
protected function utf8Chr($code_point)
{

if ($code_point < 0 || 0x10FFFF < $code_point || (0xD800 <= $code_point && $code_point <= 0xDFFF)) {
return '';
Expand All @@ -72,16 +73,16 @@ protected function utf8_chr($code_point) {
if ($code_point < 0x80) {
$hex[0] = $code_point;
$ret = chr($hex[0]);
} else if ($code_point < 0x800) {
} elseif ($code_point < 0x800) {
$hex[0] = 0x1C0 | $code_point >> 6;
$hex[1] = 0x80 | $code_point & 0x3F;
$ret = chr($hex[0]).chr($hex[1]);
} else if ($code_point < 0x10000) {
} elseif ($code_point < 0x10000) {
$hex[0] = 0xE0 | $code_point >> 12;
$hex[1] = 0x80 | $code_point >> 6 & 0x3F;
$hex[2] = 0x80 | $code_point & 0x3F;
$ret = chr($hex[0]).chr($hex[1]).chr($hex[2]);
} else {
} else {
$hex[0] = 0xF0 | $code_point >> 18;
$hex[1] = 0x80 | $code_point >> 12 & 0x3F;
$hex[2] = 0x80 | $code_point >> 6 & 0x3F;
Expand Down
2 changes: 1 addition & 1 deletion tests/egulias/Tests/EmailValidator/EmailValidatorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public function testValidEmails($email)
$this->assertTrue($this->validator->isValid($email));
}

public function testInvalidUTF8Emails()
public function testInvalidUTF8Email()
{
$validator = new EmailValidator;
$email = "\x80\x81\x82@\x83\x84\x85.\x86\x87\x88";
Expand Down

0 comments on commit 9103f4f

Please sign in to comment.