From ccdbf6da61665b4b527a7b580a31b78b8593e1db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Mon, 27 May 2019 11:49:07 +0200 Subject: [PATCH] Support GSM National Language Shift Tables (#11) Support for GSM Language Shift Tables, information can be found here: en.wikipedia.org/wiki/GSM_03.38 Added support for Turkish, Spanish and Portuguese only, should be fairly easy to extend to other languages --- README.md | 12 ++- SMSCounter.php | 178 ++++++++++++++++++++++++++++++++++++--- Tests/SMSCounterTest.php | 73 ++++++++++++++++ 3 files changed, 251 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index fe94fd5..1629e0e 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ use Instasent\SMSCounter\SMSCounter; $smsCounter = new SMSCounter(); $smsCounter->count('some-string-to-be-counted'); +$smsCounter->countWithShiftTables('some-string-to-be-counted'); ``` which returns @@ -26,7 +27,7 @@ stdClass Object ) ``` -You can sanitize your text to be a valid GSM 03.38 charset +You can sanitize your text to be a valid strict GSM 03.38 charset ```php use Instasent\SMSCounter\SMSCounter; @@ -35,6 +36,15 @@ $smsCounter = new SMSCounter(); $smsCounter->sanitizeToGSM('dadáó'); //return dadao ``` +#### National Language Shift Tables + +Starting release 8 of GSM 03.38 some additional charsets are allowed. This is the list of such National Language Shift Tables currently supported + +- [Turkish](https://en.wikipedia.org/wiki/GSM_03.38#Turkish_language_(Latin_script)) +- [Spanish](https://en.wikipedia.org/wiki/GSM_03.38#Spanish_language_(Latin_script)) +- [Portuguese](https://en.wikipedia.org/wiki/GSM_03.38#Portuguese_language_(Latin_script)) + + ## Installation `sms-counter-php` is available via [composer](http://getcomposer.org) on [packagist](https://packagist.org/packages/instasent/sms-counter-php). 
diff --git a/SMSCounter.php b/SMSCounter.php index 950d6b7..13a12e3 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -105,18 +105,110 @@ public function getGsm7bitExMap() ); } + public function getTurkishGsm7bitMap() + { + return [ + 10, 12, 13, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 97, 98, 99, 100, 101, + 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, + 116, 117, 118, 119, 120, 121, 122, + 123, 124, 125, 126, 163, 164, 165, + 167, 196, 197, 199, 201, 209, 214, + 220, 223, 224, 228, 229, 231, 233, + 241, 242, 246, 249, 252, 286, 287, + 304, 305, 350, 351, 915, 916, 920, + 923, 926, 928, 931, 934, 936, 937, + 8364, + ]; + } + + public function getAddedTurkishGsm7bitExMap() + { + return [12, 91, 92, 93, 94, 123, 124, 125, 126, 286, 287, 304, 305, 350, 351, 8364]; + } + + public function getAddedSpanishGsm7bitExMap() + { + return [12, 91, 92, 93, 94, 123, 124, 125, 126, 193, 205, 211, 218, 225, 231, 237, 243, 250, 8364]; + } + + public function getPortugueseGsm7bitMap() + { + return [ + 10, 12, 13, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 163, 165, 167, 170, 186, 192, + 193, 194, 195, 199, 201, 202, 205, 211, + 212, 213, 218, 220, 224, 225, 226, 227, + 231, 233, 234, 237, 242, 243, 244, 245, + 250, 252, 915, 916, 920, 928, 931, 934, + 936, 937, 8364, 8734, + ]; + } + + public function getAddedPortugueseGsm7bitExMap() + { + 
return [ + 12, 91, 92, 93, 94, 123, 124, 125, + 126, 193, 194, 195, 202, 205, 211, 212, + 213, 218, 225, 226, 227, 231, 234, 237, + 242, 243, 245, 250, 915, 920, 928, 931, + 934, 936, 937, 8364, + ]; + } + /** * Detects the encoding, Counts the characters, message length, remaining characters. * * @return \stdClass Object with params encoding,length, per_message, remaining, messages */ public function count($text) + { + return $this->doCount($text, false); + } + + /** + * Detects the encoding, Counts the characters, message length, remaining characters. + * Supports language shift tables characters. + * + * @return \stdClass Object with params encoding,length, per_message, remaining, messages + */ + public function countWithShiftTables($text) + { + return $this->doCount($text, true); + } + + /** + * @return \stdClass Object with params encoding,length, per_message, remaining, messages + */ + private function doCount($text, $supportShiftTables) { $unicodeArray = $this->utf8ToUnicode($text); // variable to catch if any ex chars while encoding detection. $exChars = []; - $encoding = $this->detectEncoding($unicodeArray, $exChars); + $encoding = $supportShiftTables + ? $this->detectEncodingWithShiftTables($text, $exChars) + : $this->detectEncoding($text, $exChars); + $length = count($unicodeArray); if ($encoding === self::GSM_7BIT_EX) { @@ -173,17 +265,56 @@ public function count($text) public function detectEncoding($text, &$exChars) { if (!is_array($text)) { - $text = self::utf8ToUnicode($text); + $text = $this->utf8ToUnicode($text); } $utf16Chars = array_diff($text, $this->getGsm7bitExMap()); - if (count($utf16Chars)) { return self::UTF16; } $exChars = array_intersect($text, $this->getAddedGsm7bitExMap()); + if (count($exChars)) { + return self::GSM_7BIT_EX; + } + + return self::GSM_7BIT; + } + /** + * Detects the encoding of a particular text. + * Supports language shift tables characters. 
+ * + * @return string (GSM_7BIT|GSM_7BIT_EX|UTF16) + */ + public function detectEncodingWithShiftTables($text, &$exChars) + { + if (!is_array($text)) { + $text = $this->utf8ToUnicode($text); + } + + $gsmCharMap = array_merge( + $this->getGsm7bitExMap(), + $this->getTurkishGsm7bitMap(), + $this->getAddedTurkishGsm7bitExMap(), + $this->getAddedSpanishGsm7bitExMap(), + $this->getPortugueseGsm7bitMap(), + $this->getAddedPortugueseGsm7bitExMap() + ); + + $utf16Chars = array_diff($text, $gsmCharMap); + if (count($utf16Chars)) { + return self::UTF16; + } + + $addedGsmCharMap = array_merge( + $this->getAddedGsm7bitExMap(), + $this->getAddedTurkishGsm7bitExMap(), + $this->getAddedSpanishGsm7bitExMap(), + $this->getAddedPortugueseGsm7bitExMap() + ); + + $exChars = array_intersect($text, $addedGsmCharMap); if (count($exChars)) { return self::GSM_7BIT_EX; } @@ -296,7 +427,7 @@ public function removeNonGsmChars($str) public function replaceNonGsmChars($str, $replacement = null) { $validChars = $this->getGsm7bitExMap(); - $allChars = self::utf8ToUnicode($str); + $allChars = $this->utf8ToUnicode($str); if (strlen($replacement) > 1) { return false; @@ -525,27 +656,50 @@ public function removeAccents($str) * the encoding an multipart limits to apply the truncate. * * @param string $str Message text - * @param int $messages Number of SMS allowed + * @param int $limitSms Number of SMS allowed * * @return string Truncated message */ public function truncate($str, $limitSms) { - $count = $this->count($str); + return $this->doTruncate($str, $limitSms, false); + } + + /** + * Truncated to the limit of chars allowed by number of SMS. It will detect + * the encoding an multipart limits to apply the truncate. + * Supports language shift tables characters. 
+ * + * @param string $str Message text + * @param int $limitSms Number of SMS allowed + * + * @return string Truncated message + */ + public function truncateWithShiftTables($str, $limitSms) + { + return $this->doTruncate($str, $limitSms, true); + } + + /** + * @return string Truncated message + */ + private function doTruncate($str, $limitSms, $supportShiftTables) + { + $count = $supportShiftTables + ? $this->countWithShiftTables($str) + : $this->count($str); if ($count->messages <= $limitSms) { return $str; } - if ($count->encoding == 'UTF16') { + if ($count->encoding === 'UTF16') { $limit = self::UTF16_LEN; if ($limitSms > 2) { $limit = self::UTF16_LEN_MULTIPART; } - } - - if ($count->encoding != 'UTF16') { + } else { $limit = self::GSM_7BIT_LEN; if ($limitSms > 2) { @@ -555,7 +709,9 @@ public function truncate($str, $limitSms) do { $str = mb_substr($str, 0, $limit * $limitSms); - $count = $this->count($str); + $count = $supportShiftTables + ? $this->countWithShiftTables($str) + : $this->count($str); $limit = $limit - 1; } while ($count->messages > $limitSms); diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index 9376bc0..93695ca 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -24,6 +24,57 @@ public function testGSM() $this->assertEquals($expected, $count); } + public function testGSM_TR() + { + $text = 'a GSM TR ç Text'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->countWithShiftTables($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::GSM_7BIT_EX; + $expected->length = 16; + $expected->per_message = 160; + $expected->remaining = 144; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + + public function testGSM_ES() + { + $text = 'a GSM ES Ú Text'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->countWithShiftTables($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::GSM_7BIT_EX; + $expected->length = 16; + 
$expected->per_message = 160; + $expected->remaining = 144; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + + public function testGSM_PT() + { + $text = 'a GSM PT Ã Text'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->countWithShiftTables($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::GSM_7BIT_EX; + $expected->length = 16; + $expected->per_message = 160; + $expected->remaining = 144; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + public function testGSMSymbols() { $text = 'a GSM +Text'; @@ -162,6 +213,17 @@ public function testTruncate1SmsGSM7() $this->assertEquals($expectedTExt, $output); } + public function testTruncate1SmsGSM7ShiftTable() + { + $text = 'ÚLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem.'; + $expectedTExt = 'ÚLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturien'; + + $smsCounter = new SMSCounter(); + $output = $smsCounter->truncateWithShiftTables($text, 1); + + $this->assertEquals($expectedTExt, $output); + } + public function testTruncate2SmsGSM7() { $text = 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient'; @@ -173,6 +235,17 @@ public function testTruncate2SmsGSM7() $this->assertEquals($expectedTExt, $output); } + public function testTruncate2SmsGSM7ShiftTable() + { + $text = 'çLorem ipsum dolor sit amet, consectetuer adipiscing elit. 
Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturie'; + $expectedTExt = 'çLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magni'; + + $smsCounter = new SMSCounter(); + $output = $smsCounter->truncateWithShiftTables($text, 2); + + $this->assertEquals($expectedTExt, $output); + } + public function testTruncate1SmsUnicode() { $text = 'Snowman shows off! ☃ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa';