From 0340d1995f8270704b725d7f3e9bde24125d58c9 Mon Sep 17 00:00:00 2001 From: John Hooks Date: Thu, 14 Dec 2023 16:42:16 -0800 Subject: [PATCH] feat: add stemmers from tntsearch --- bin/local-wp-shell.sh | 3 + phpstan.neon.dist | 4 + .../WP_HTML_Tag_Processor.php | 44 +- src/Exceptions/Exception.php | 5 - src/Exceptions/IndexNotFoundException.php | 5 + src/Exceptions/IndexedSearchException.php | 7 + src/Exceptions/TokenizationException.php | 5 + src/Helpers/Str.php | 30 +- src/Index/IndexRepository.php | 8 + src/Stemmer/ArabicStemmer.php | 180 +++ src/Stemmer/CroatianStemmer.php | 340 +++++ src/Stemmer/FrenchStemmer.php | 720 ++++++++++ src/Stemmer/GermanStemmer.php | 258 ++++ src/Stemmer/ItalianStemmer.php | 463 ++++++ src/Stemmer/LatvianStemmer.php | 212 +++ src/Stemmer/NoStemmer.php | 11 + src/Stemmer/PolishStemmer.php | 163 +++ src/Stemmer/PorterStemmer.php | 403 ++++++ src/Stemmer/PortugeseStemmer.php | 766 ++++++++++ src/Stemmer/RussianStemmer.php | 112 ++ src/Stemmer/Stemmer.php | 8 + src/Stemmer/UkrainianStemmer.php | 113 ++ src/Stopwords/croatian.json | 182 +++ src/Stopwords/english.json | 187 +++ src/Stopwords/french.json | 159 ++ src/Stopwords/german.json | 234 +++ src/Stopwords/italian.json | 281 ++++ src/Stopwords/latvian.json | 165 +++ src/Stopwords/russian.json | 153 ++ src/Stopwords/spanish.json | 315 ++++ src/Stopwords/ukrainian.json | 1279 +++++++++++++++++ src/Tokenizer/HtmlTokenizer.php | 90 ++ src/Tokenizer/Tokenizer.php | 4 +- 33 files changed, 6890 insertions(+), 19 deletions(-) create mode 100755 bin/local-wp-shell.sh rename src/{Tokenizer => Compat}/WP_HTML_Tag_Processor.php (98%) delete mode 100644 src/Exceptions/Exception.php create mode 100644 src/Exceptions/IndexNotFoundException.php create mode 100644 src/Exceptions/IndexedSearchException.php create mode 100644 src/Exceptions/TokenizationException.php create mode 100644 src/Index/IndexRepository.php create mode 100644 src/Stemmer/ArabicStemmer.php create mode 100644 
src/Stemmer/CroatianStemmer.php create mode 100644 src/Stemmer/FrenchStemmer.php create mode 100644 src/Stemmer/GermanStemmer.php create mode 100644 src/Stemmer/ItalianStemmer.php create mode 100644 src/Stemmer/LatvianStemmer.php create mode 100644 src/Stemmer/NoStemmer.php create mode 100644 src/Stemmer/PolishStemmer.php create mode 100644 src/Stemmer/PorterStemmer.php create mode 100644 src/Stemmer/PortugeseStemmer.php create mode 100644 src/Stemmer/RussianStemmer.php create mode 100644 src/Stemmer/Stemmer.php create mode 100644 src/Stemmer/UkrainianStemmer.php create mode 100644 src/Stopwords/croatian.json create mode 100644 src/Stopwords/english.json create mode 100644 src/Stopwords/french.json create mode 100644 src/Stopwords/german.json create mode 100644 src/Stopwords/italian.json create mode 100644 src/Stopwords/latvian.json create mode 100644 src/Stopwords/russian.json create mode 100644 src/Stopwords/spanish.json create mode 100644 src/Stopwords/ukrainian.json create mode 100644 src/Tokenizer/HtmlTokenizer.php diff --git a/bin/local-wp-shell.sh b/bin/local-wp-shell.sh new file mode 100755 index 0000000..652f377 --- /dev/null +++ b/bin/local-wp-shell.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +docker compose --file docker-compose.yml --env-file .env exec -w /var/www/html/wp-content/plugins/indexed-search wordpress bash -c "vendor/bin/wp --allow-root shell" diff --git a/phpstan.neon.dist b/phpstan.neon.dist index af0014e..41ae88d 100644 --- a/phpstan.neon.dist +++ b/phpstan.neon.dist @@ -13,3 +13,7 @@ parameters: scanDirectories: - tests - vendor-prod + excludePaths: + - src/Compat/WP_HTML_Tag_Processor.php + - src/Stemmer/* + - tests/*/_wordpress/* diff --git a/src/Tokenizer/WP_HTML_Tag_Processor.php b/src/Compat/WP_HTML_Tag_Processor.php similarity index 98% rename from src/Tokenizer/WP_HTML_Tag_Processor.php rename to src/Compat/WP_HTML_Tag_Processor.php index 34d9e0d..cdc8a2c 100644 --- a/src/Tokenizer/WP_HTML_Tag_Processor.php +++ 
b/src/Compat/WP_HTML_Tag_Processor.php @@ -1,6 +1,6 @@ get_attribute('class'); if (!is_string($class)) { @@ -912,7 +914,7 @@ public function release_bookmark($name) * Skips contents of generic rawtext elements. * * @since 6.3.2 - * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm + * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm * * @param string $tag_name The uppercase tag name which will close the RAWTEXT region. * @@ -932,7 +934,7 @@ private function skip_rawtext($tag_name) * Skips contents of RCDATA elements, namely title and textarea tags. * * @since 6.2.0 - * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state * * @param string $tag_name The uppercase tag name which will close the RCDATA region. * @@ -1518,8 +1520,8 @@ private function after_tag() * (they are accumulated in different data formats for performance). * * @since 6.2.0 - * @see WP_HTML_Tag_Processor::$lexical_updates - * @see WP_HTML_Tag_Processor::$classname_updates + * @see WP_HTML_Tag_Processor::$lexical_updates + * @see WP_HTML_Tag_Processor::$classname_updates */ private function class_name_updates_to_attributes_updates() { @@ -1551,7 +1553,6 @@ private function class_name_updates_to_attributes_updates() * attribute, skipping removed classes on the way, and then appending * added classes at the end. Only when finished processing will the * value contain the final new value. - * * @var string $class */ @@ -1999,7 +2000,7 @@ public function get_attribute($name) * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.2.0 - * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * * @param string $prefix Prefix of requested attribute names. 
* @@ -2374,7 +2375,7 @@ public function remove_class($class_name) * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 - * @see WP_HTML_Tag_Processor::get_updated_html() + * @see WP_HTML_Tag_Processor::get_updated_html() * * @return string The processed HTML. */ @@ -2567,4 +2568,27 @@ private function matches() return true; } + + /** + * Modifications by WpBlocks\Search + */ + public function get_token_starts_at() + { + return $this->token_starts_at; + } + + public function get_token_ends_at() + { + return $this->token_starts_at + $this->token_length; + } + + public function get_is_closing_tag() + { + return $this->is_closing_tag; + } + + public function substr(int $offset, ?int $length = null) + { + return substr($this->html, $offset, $length); + } } diff --git a/src/Exceptions/Exception.php b/src/Exceptions/Exception.php deleted file mode 100644 index 37931fb..0000000 --- a/src/Exceptions/Exception.php +++ /dev/null @@ -1,5 +0,0 @@ - */ public static function splitOnWhitespace(string $text): array { - return preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY); + $result = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY); + + if ($result === false) { + throw new \WpBlocks\Search\Exceptions\TokenizationException(); + } + + return $result; + } + + /** + * @return array + */ + public static function splitWords(string $text): array + { + $result = preg_split(self::$splitWordsPattern, $text, -1, PREG_SPLIT_NO_EMPTY); + + if ($result === false) { + throw new \WpBlocks\Search\Exceptions\TokenizationException(); + } + + return $result; } } diff --git a/src/Index/IndexRepository.php b/src/Index/IndexRepository.php new file mode 100644 index 0000000..ba1bfb8 --- /dev/null +++ b/src/Index/IndexRepository.php @@ -0,0 +1,8 @@ + + * + * Simple stemmer for arabic language rewritten by Nenad Tičarić + * + * @link https://github.com/teamtnt/tntsearch + * + * Copyright (c) 2016 Nenad Tičarić nticaric@gmail.com + * + * Permission is hereby granted, free 
of charge, to any person obtaining + * a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +namespace WpBlocks\Search\Stemmer; + +class ArabicStemmer implements Stemmer +{ + private static $_verbPre = 'وأسفلي'; + + private static $_verbPost = 'ومكانيه'; + + private static $_verbMay; + + private static $_verbMaxPre = 4; + + private static $_verbMaxPost = 6; + + private static $_verbMinStem = 2; + + private static $_nounPre = 'ابفكلوأ'; + + private static $_nounPost = 'اتةكمنهوي'; + + private static $_nounMay; + + private static $_nounMaxPre = 4; + + private static $_nounMaxPost = 6; + + private static $_nounMinStem = 2; + + /** + * Loads initialize values + * + * @ignore + */ + public function __construct() + { + self::$_verbMay = self::$_verbPre . self::$_verbPost; + self::$_nounMay = self::$_nounPre . 
self::$_nounPost; + } + + /** + * Get rough stem of the given Arabic word. The affix alphabets are concatenated here because $_nounMay/$_verbMay are only set by the constructor, which a static call never runs. + * + * @param string $word Arabic word you would like to get its stem + * + * @return string|null Shorter of the noun and verb rough stems (the verb stem on a tie); null when that rough stem is null + * + * @author Khaled Al-Sham'aa + */ + public static function stem($word) + { + $nounStem = self::roughStem( + $word, + self::$_nounPre . self::$_nounPost, + self::$_nounPre, + self::$_nounPost, + self::$_nounMaxPre, + self::$_nounMaxPost, + self::$_nounMinStem + ); + $verbStem = self::roughStem( + $word, + self::$_verbPre . self::$_verbPost, + self::$_verbPre, + self::$_verbPost, + self::$_verbMaxPre, + self::$_verbMaxPost, + self::$_verbMinStem + ); + + if (mb_strlen((string) $nounStem, 'UTF-8') < mb_strlen((string) $verbStem, 'UTF-8')) { + $stem = $nounStem; + } else { + $stem = $verbStem; + } + + return $stem; + } + + /** + * Get rough stem of the given Arabic word (under specific rules) + * + * @param string $word Arabic word you would like to get its stem + * @param string $notChars Arabic chars that may appear in a prefix or postfix; any other char marks the stem + * @param string $preChars Arabic chars that may exist in the prefix + * @param string $postChars Arabic chars that may exist in the postfix + * @param integer $maxPre Max prefix length + * @param integer $maxPost Max postfix length + * @param integer $minStem Min stem length + * + * @return string|null Arabic stem of the word under the given rules, or null when the remaining span is too short for $minStem + * + * @author Khaled Al-Sham'aa + */ + protected static function roughStem( + $word, + $notChars, + $preChars, + $postChars, + $maxPre, + $maxPost, + $minStem + ) { + $right = -1; + $left = -1; + $max = mb_strlen($word, 'UTF-8'); + + for ($i = 0; $i < $max; $i++) { + $needle = mb_substr($word, $i, 1, 'UTF-8'); + if (mb_strpos($notChars, $needle, 0, 'UTF-8') === false) { + if ($right == -1) { + $right = $i; + } + $left = $i; + } + } + + if ($right > $maxPre) { + $right = $maxPre; + } + + if ($max - $left - 1 > $maxPost) { + $left = $max - $maxPost - 1; + } + + for ($i = 0; $i < $right; $i++) { + $needle = mb_substr($word, $i, 1, 'UTF-8'); + if
(mb_strpos($preChars, $needle, 0, 'UTF-8') === false) { + $right = $i; + break; + } + } + + for ($i = $max - 1; $i > $left; $i--) { + $needle = mb_substr($word, $i, 1, 'UTF-8'); + if (mb_strpos($postChars, $needle, 0, 'UTF-8') === false) { + $left = $i; + break; + } + } + + if ($left - $right >= $minStem) { + $stem = mb_substr($word, $right, $left - $right + 1, 'UTF-8'); + } else { + $stem = null; + } + + return $stem; + } +} diff --git a/src/Stemmer/CroatianStemmer.php b/src/Stemmer/CroatianStemmer.php new file mode 100644 index 0000000..d44cae0 --- /dev/null +++ b/src/Stemmer/CroatianStemmer.php @@ -0,0 +1,340 @@ + 0) { + return true; + } + + return false; + } + + public static function transformiraj($pojavnica) + { + foreach (self::$transformations as $trazi => $zamijeni) { + if (self::endsWith($pojavnica, $trazi)) { + return substr($pojavnica, 0, -1 * strlen($trazi)) . $zamijeni; + } + } + return $pojavnica; + } + + public static function korjenuj($pojavnica) + { + foreach (self::$rules as $rule) { + $rules = explode(' ', $rule); + $osnova = $rules[0]; + $nastavak = $rules[1]; + preg_match('/^(' . $osnova . ')(' . $nastavak . 
')$/', $pojavnica, $dioba); + if (!empty($dioba)) { + if (self::imaSamoglasnik($dioba[1]) && strlen($dioba[1]) > 1) { + return $dioba[1]; + } + } + } + return $pojavnica; + } + + public static function endsWith($haystack, $needle) + { + // search forward starting from end minus needle length characters + return $needle === '' || (($temp = strlen($haystack) - strlen($needle)) >= 0 && strpos($haystack, $needle, $temp) !== false); + } + + protected static $transformations = [ + 'lozi' => 'loga', + 'lozima' => 'loga', + 'pjesi' => 'pjeh', + 'pjesima' => 'pjeh', + 'vojci' => 'vojka', + 'bojci' => 'bojka', + 'jaci' => 'jak', + 'jacima' => 'jak', + 'čajan' => 'čajni', + 'ijeran' => 'ijerni', + 'laran' => 'larni', + 'ijesan' => 'ijesni', + 'anjac' => 'anjca', + 'ajac' => 'ajca', + 'ajaca' => 'ajca', + 'ljaca' => 'ljca', + 'ljac' => 'ljca', + 'ejac' => 'ejca', + 'ejaca' => 'ejca', + 'ojac' => 'ojca', + 'ojaca' => 'ojca', + 'ajaka' => 'ajka', + 'ojaka' => 'ojka', + 'šaca' => 'šca', + 'šac' => 'šca', + 'inzima' => 'ing', + 'inzi' => 'ing', + 'tvenici' => 'tvenik', + 'tetici' => 'tetika', + 'teticima' => 'tetika', + 'nstava' => 'nstva', + 'nicima' => 'nik', + 'ticima' => 'tik', + 'zicima' => 'zik', + 'snici' => 'snik', + 'kuse' => 'kusi', + 'kusan' => 'kusni', + 'kustava' => 'kustva', + 'dušan' => 'dušni', + 'antan' => 'antni', + 'bilan' => 'bilni', + 'tilan' => 'tilni', + 'avilan' => 'avilni', + 'silan' => 'silni', + 'gilan' => 'gilni', + 'rilan' => 'rilni', + 'nilan' => 'nilni', + 'alan' => 'alni', + 'ozan' => 'ozni', + 'rave' => 'ravi', + 'stavan' => 'stavni', + 'pravan' => 'pravni', + 'tivan' => 'tivni', + 'sivan' => 'sivni', + 'atan' => 'atni', + 'cenata' => 'centa', + 'denata' => 'denta', + 'genata' => 'genta', + 'lenata' => 'lenta', + 'menata' => 'menta', + 'jenata' => 'jenta', + 'venata' => 'venta', + 'tetan' => 'tetni', + 'pletan' => 'pletni', + 'šave' => 'šavi', + 'manata' => 'manta', + 'tanata' => 'tanta', + 'lanata' => 'lanta', + 'sanata' => 'santa', + 'ačak' => 
'ačka', + 'ačaka' => 'ačka', + 'ušak' => 'uška', + 'atak' => 'atka', + 'ataka' => 'atka', + 'atci' => 'atka', + 'atcima' => 'atka', + 'etak' => 'etka', + 'etaka' => 'etka', + 'itak' => 'itka', + 'itaka' => 'itka', + 'itci' => 'itka', + 'otak' => 'otka', + 'otaka' => 'otka', + 'utak' => 'utka', + 'utaka' => 'utka', + 'utci' => 'utka', + 'utcima' => 'utka', + 'eskan' => 'eskna', + 'tičan' => 'tični', + 'ojsci' => 'ojska', + 'esama' => 'esma', + 'metara' => 'metra', + 'centar' => 'centra', + 'centara' => 'centra', + 'istara' => 'istra', + 'istar' => 'istra', + 'ošću' => 'osti', + 'daba' => 'dba', + 'čcima' => 'čka', + 'čci' => 'čka', + 'mac' => 'mca', + 'maca' => 'mca', + 'naca' => 'nca', + 'nac' => 'nca', + 'voljan' => 'voljni', + 'anaka' => 'anki', + 'vac' => 'vca', + 'vaca' => 'vca', + 'saca' => 'sca', + 'sac' => 'sca', + 'naca' => 'nca', + 'nac' => 'nca', + 'raca' => 'rca', + 'rac' => 'rca', + 'aoca' => 'alca', + 'alaca' => 'alca', + 'alac' => 'alca', + 'elaca' => 'elca', + 'elac' => 'elca', + 'olaca' => 'olca', + 'olac' => 'olca', + 'olce' => 'olca', + 'njac' => 'njca', + 'njaca' => 'njca', + 'ekata' => 'ekta', + 'ekat' => 'ekta', + 'izam' => 'izma', + 'izama' => 'izma', + 'jebe' => 'jebi', + 'baci' => 'baci', + 'ašan' => 'ašni', + ]; + + protected static $rules = [ + '.+(s|š)k ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u', + '.+(s|š)tv ima|om|o|a|u', + // N + '.+(t|m|p|r|g)anij ama|ima|om|a|u|e|i| ', + '.+an inom|ina|inu|ine|ima|in|om|u|i|a|e| ', + '.+in ima|ama|om|a|e|i|u|o| ', + '.+on ovima|ova|ove|ovi|ima|om|a|e|i|u| ', + '.+n ijima|ijega|ijemu|ijeg|ijem|ijim|ijih|ijoj|iji|ije|ija|iju|ima|ome|omu|oga|oj|om|ih|im|og|o|e|a|u|i| ', + // Ć + '.+(a|e|u)ć oga|ome|omu|ega|emu|ima|oj|ih|om|eg|em|og|uh|im|e|a', + // G + '.+ugov ima|i|e|a', + '.+ug ama|om|a|e|i|u|o', + '.+log ama|om|a|u|e| ', + '.+[^eo]g ovima|ama|ovi|ove|ova|om|a|e|i|u|o| ', + // I + '.+(rrar|ott|ss|ll)i jem|ja|ju|o| ', + // J + '.+uj 
ući|emo|ete|mo|em|eš|e|u| ', + '.+(c|č|ć|đ|l|r)aj evima|evi|eva|eve|ama|ima|em|a|e|i|u| ', + '.+(b|c|d|l|n|m|ž|g|f|p|r|s|t|z)ij ima|ama|om|a|e|i|u|o| ', + // L + //.+al inom|ina|inu|ine|ima|om|in|i|a|e + //.+[^(lo|ž)]il ima|om|a|e|u|i| + '.+[^z]nal ima|ama|om|a|e|i|u|o| ', + '.+ijal ima|ama|om|a|e|i|u|o| ', + '.+ozil ima|om|a|e|u|i| ', + '.+olov ima|i|a|e', + '.+ol ima|om|a|u|e|i| ', + // M + '.+lem ama|ima|om|a|e|i|u|o| ', + '.+ram ama|om|a|e|i|u|o', + //.+(es|e|u)m ama|om|a|e|i|u|o + // R + //.+(a|d|e|o|u)r ama|ima|om|u|a|e|i| + '.+(a|d|e|o)r ama|ima|om|u|a|e|i| ', + // S + '.+(e|i)s ima|om|e|a|u', + // Š + '.+(t|n|j|k|j|t|b|g|v)aš ama|ima|om|em|a|u|i|e| ', + '.+(e|i)š ima|ama|om|em|i|e|a|u| ', + // T + '.+ikat ima|om|a|e|i|u|o| ', + '.+lat ima|om|a|e|i|u|o| ', + '.+et ama|ima|om|a|e|i|u|o| ', + //.+ot ama|ima|om|a|u|e|i| + '.+(e|i|k|o)st ima|ama|om|a|e|i|u|o| ', + '.+išt ima|em|a|e|u', + //.+ut ovima|evima|ove|ovi|ova|eve|evi|eva|ima|om|a|u|e|i| + // V + '.+ova smo|ste|hu|ti|še|li|la|le|lo|t|h|o', + '.+(a|e|i)v ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|ama|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o| ', + '.+[^dkml]ov ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o| ', + '.+(m|l)ov ima|om|a|u|e|i| ', + // PRIDJEVI + '.+el ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o| ', + '.+(a|e|š)nj ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|ega|emu|eg|em|im|ih|oj|om|og|a|e|i|o|u', + '.+čin ama|ome|omu|oga|ima|og|om|im|ih|oj|a|u|i|o|e| ', + '.+roši vši|smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o', + '.+oš ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e| ', + '.+(e|o)vit ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u| ', + //.+tit ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|e|o|a|u|i| + 
'.+ast ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u| ', + '.+k ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o| ', + // GLAGOLI + '.+(e|a|i|u)va jući|smo|ste|jmo|jte|ju|la|le|li|lo|mo|na|ne|ni|no|te|ti|še|hu|h|j|m|n|o|t|v|š| ', + '.+ir ujemo|ujete|ujući|ajući|ivat|ujem|uješ|ujmo|ujte|avši|asmo|aste|ati|amo|ate|aju|aše|ahu|ala|alo|ali|ale|uje|uju|uj|al|an|am|aš|at|ah|ao', + '.+ač ismo|iste|iti|imo|ite|iše|eći|ila|ilo|ili|ile|ena|eno|eni|ene|io|im|iš|it|ih|en|i|e', + '.+ača vši|smo|ste|smo|ste|hu|ti|mo|te|še|la|lo|li|le|ju|na|no|ni|ne|o|m|š|t|h|n', + //.+ači smo|ste|ti|li|la|lo|le|mo|te|še|m|š|t|h|o| + // Druga_vrsta + '.+n uvši|usmo|uste|ući|imo|ite|emo|ete|ula|ulo|ule|uli|uto|uti|uta|em|eš|uo|ut|e|u|i', + '.+ni vši|smo|ste|ti|mo|te|mo|te|la|lo|le|li|m|š|o', + // A + '.+((a|r|i|p|e|u)st|[^o]g|ik|uc|oj|aj|lj|ak|ck|čk|šk|uk|nj|im|ar|at|et|št|it|ot|ut|zn|zv)a jući|vši|smo|ste|jmo|jte|jem|mo|te|je|ju|ti|še|hu|la|li|le|lo|na|no|ni|ne|t|h|o|j|n|m|š', + '.+ur ajući|asmo|aste|ajmo|ajte|amo|ate|aju|ati|aše|ahu|ala|ali|ale|alo|ana|ano|ani|ane|al|at|ah|ao|aj|an|am|aš', + '.+(a|i|o)staj asmo|aste|ahu|ati|emo|ete|aše|ali|ući|ala|alo|ale|mo|ao|em|eš|at|ah|te|e|u| ', + '.+(b|c|č|ć|d|e|f|g|j|k|n|r|t|u|v)a lama|lima|lom|lu|li|la|le|lo|l', + '.+(t|č|j|ž|š)aj evima|evi|eva|eve|ama|ima|em|a|e|i|u| ', + //.+(e|j|k|r|u|v)al ama|ima|om|u|i|a|e|o| + //.+(e|j|k|r|t|u|v)al ih|im + '.+([^o]m|ič|nč|uč|b|c|ć|d|đ|h|j|k|l|n|p|r|s|š|v|z|ž)a jući|vši|smo|ste|jmo|jte|mo|te|ju|ti|še|hu|la|li|le|lo|na|no|ni|ne|t|h|o|j|n|m|š', + '.+(a|i|o)sta dosmo|doste|doše|nemo|demo|nete|dete|nimo|nite|nila|vši|nem|dem|neš|deš|doh|de|ti|ne|nu|du|la|li|lo|le|t|o', + '.+ta smo|ste|jmo|jte|vši|ti|mo|te|ju|še|la|lo|le|li|na|no|ni|ne|n|j|o|m|š|t|h', + '.+inj asmo|aste|ati|emo|ete|ali|ala|alo|ale|aše|ahu|em|eš|at|ah|ao', + '.+as temo|tete|timo|tite|tući|tem|teš|tao|te|li|ti|la|lo|le', + // I + 
'.+(elj|ulj|tit|ac|ič|od|oj|et|av|ov)i vši|eći|smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o', + '.+(tit|jeb|ar|ed|uš|ič)i jemo|jete|jem|ješ|smo|ste|jmo|jte|vši|mo|še|te|ti|ju|je|la|lo|li|le|t|m|š|h|j|o', + '.+(b|č|d|l|m|p|r|s|š|ž)i jemo|jete|jem|ješ|smo|ste|jmo|jte|vši|mo|lu|še|te|ti|ju|je|la|lo|li|le|t|m|š|h|j|o', + '.+luč ujete|ujući|ujemo|ujem|uješ|ismo|iste|ujmo|ujte|uje|uju|iše|iti|imo|ite|ila|ilo|ili|ile|ena|eno|eni|ene|uj|io|en|im|iš|it|ih|e|i', + '.+jeti smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o', + '.+e lama|lima|lom|lu|li|la|le|lo|l', + '.+i lama|lima|lom|lu|li|la|le|lo|l', + // Pridjev_t + '.+at ijega|ijemu|ijima|ijeg|ijem|ijih|ijim|ima|oga|ome|omu|iji|ije|ija|iju|oj|og|om|im|ih|a|u|i|e|o| ', + // Pridjev + '.+et avši|ući|emo|imo|em|eš|e|u|i', + '.+ ajući|alima|alom|avši|asmo|aste|ajmo|ajte|ivši|amo|ate|aju|ati|aše|ahu|ali|ala|ale|alo|ana|ano|ani|ane|am|aš|at|ah|ao|aj|an', + '.+ anje|enje|anja|enja|enom|enoj|enog|enim|enih|anom|anoj|anog|anim|anih|eno|ovi|ova|oga|ima|ove|enu|anu|ena|ama', + '.+ nijega|nijemu|nijima|nijeg|nijem|nijim|nijih|nima|niji|nije|nija|niju|noj|nom|nog|nim|nih|an|na|nu|ni|ne|no', + '.+ om|og|im|ih|em|oj|an|u|o|i|e|a', + ]; +} diff --git a/src/Stemmer/FrenchStemmer.php b/src/Stemmer/FrenchStemmer.php new file mode 100644 index 0000000..45d0897 --- /dev/null +++ b/src/Stemmer/FrenchStemmer.php @@ -0,0 +1,720 @@ +analyze($word); + } + + public function analyze($word) + { + $this->word = mb_strtolower($word); + + $this->plainVowels = implode('', static::$vowels); + + $this->step0(); + + $this->rv(); + $this->r1(); + $this->r2(); + + // to know if step1, 2a or 2b have altered the word + $this->originalWord = $this->word; + + $nextStep = $this->step1(); + + // Do step 2a if either no ending was removed by step 1, or if one of endings amment, emment, ment, ments was found. 
+ if (($nextStep == 2) || ($this->originalWord === $this->word)) { + $modified = $this->step2a(); + + if (!$modified) { + $this->step2b(); + } + } + + if ($this->word != $this->originalWord) { + $this->step3(); + } else { + $this->step4(); + } + + $this->step5(); + $this->step6(); + $this->finish(); + + return $this->word; + } + + /** + * Assume the word is in lower case. + * Then put into upper case u or i preceded and followed by a vowel, and y preceded or followed by a vowel. + * u after q is also put into upper case. For example, + * jouer -> joUer + * ennuie -> ennuIe + * yeux -> Yeux + * quand -> qUand + */ + private function step0() + { + $this->word = preg_replace('#([q])u#u', '$1U', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word); + $this->word = preg_replace('#y(['.$this->plainVowels.'])#u', 'Y$1', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); + } + + /** + * Step 1 + * Search for the longest among the following suffixes, and perform the action indicated. 
+ * + * @return integer Next step number + */ + private function step1() + { + // ance iqUe isme able iste eux ances iqUes ismes ables istes + // delete if in R2 + if (($position = $this->search([ + 'ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe', 'isme', 'able', 'iste', 'eux', + ])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } + + return 3; + } + + // atrice ateur ation atrices ateurs ations + // delete if in R2 + // if preceded by ic, delete if in R2, else replace by iqU + if (($position = $this->search(['atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + + if (($position2 = $this->searchIfInR2(['ic'])) !== false) { + $this->word = mb_substr($this->word, 0, $position2); + } else { + $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); + } + } + + return 3; + } + + // logie logies + // replace with log if in R2 + if (($position = $this->search(['logies', 'logie'])) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word); + } + + return 3; + } + + // usion ution usions utions + // replace with u if in R2 + if (($position = $this->search(['usions', 'utions', 'usion', 'ution'])) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word); + } + + return 3; + } + + // ence ences + // replace with ent if in R2 + if (($position = $this->search(['ences', 'ence'])) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word); + } + + return 3; + } + + // issement issements + // delete if in R1 and preceded by a non-vowel + if (($position = $this->search(['issements', 'issement'])) != false) { + if ($this->inR1($position)) { + $before = $position - 1; + $letter = mb_substr($this->word, $before, 1); + + if 
(!in_array($letter, static::$vowels)) { + $this->word = mb_substr($this->word, 0, $position); + } + } + + return 3; + } + + // ement ements + // delete if in RV + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, + // if preceded by abl or iqU, delete if in R2, otherwise, + // if preceded by ièr or Ièr, replace by i if in RV + if (($position = $this->search(['ements', 'ement'])) !== false) { + if ($this->inRv($position)) { + $this->word = mb_substr($this->word, 0, $position); + } + + if (($position = $this->searchIfInR2(['iv'])) !== false) { + $this->word = mb_substr($this->word, 0, $position); + + if (($position2 = $this->searchIfInR2(['at'])) !== false) { + $this->word = mb_substr($this->word, 0, $position2); + } + } elseif (($position = $this->search(['eus'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } elseif ($this->inR1($position)) { + $this->word = preg_replace('#(eus)$#u', 'eux', $this->word); + } + } elseif (($position = $this->searchIfInR2(['abl', 'iqU'])) !== false) { + $this->word = mb_substr($this->word, 0, $position); + } elseif (($this->searchIfInRv(['ièr', 'Ièr'])) !== false) { + $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word); + } + + return 3; + } + + // ité ités + // delete if in R2 + // if preceded by abil, delete if in R2, else replace by abl, otherwise, + // if preceded by ic, delete if in R2, else replace by iqU, otherwise, + // if preceded by iv, delete if in R2 + if (($position = $this->search(['ités', 'ité'])) !== false) { + // delete if in R2 + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } + + // if preceded by abil, delete if in R2, else replace by abl, otherwise, + if (($position = $this->search(['abil'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, 
$position); + } else { + $this->word = preg_replace('#(abil)$#u', 'abl', $this->word); + } + + // if preceded by ic, delete if in R2, else replace by iqU, otherwise, + } elseif (($position = $this->search(['ic'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } else { + $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); + } + + // if preceded by iv, delete if in R2 + } elseif (($position = $this->searchIfInR2(['iv'])) !== false) { + $this->word = mb_substr($this->word, 0, $position); + } + + return 3; + } + + // if ive ifs ives + // delete if in R2 + // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU) + if (($position = $this->search(['ifs', 'ives', 'if', 'ive'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } + + if (($position = $this->searchIfInR2(['at'])) !== false) { + $this->word = mb_substr($this->word, 0, $position); + + if (($position2 = $this->search(['ic'])) !== false) { + if ($this->inR2($position2)) { + $this->word = mb_substr($this->word, 0, $position2); + } else { + $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); + } + } + } + + return 3; + } + + // eaux + // replace with eau + if (($this->search(['eaux'])) !== false) { + $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word); + + return 3; + } + + // aux + // replace with al if in R1 + if (($position = $this->search(['aux'])) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(aux)$#u', 'al', $this->word); + } + + return 3; + } + + // euse euses + // delete if in R2, else replace by eux if in R1 + if (($position = $this->search(['euses', 'euse'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } elseif ($this->inR1($position)) { + $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word); + } + + return 3; + } + + // 
amment + // replace with ant if in RV + if (($position = $this->search(['amment'])) !== false) { + if ($this->inRv($position)) { + $this->word = preg_replace('#(amment)$#u', 'ant', $this->word); + } + return 2; + } + + // emment + // replace with ent if in RV + if (($position = $this->search(['emment'])) !== false) { + if ($this->inRv($position)) { + $this->word = preg_replace('#(emment)$#u', 'ent', $this->word); + } + + return 2; + } + + // ment ments + // delete if preceded by a vowel in RV + if (($position = $this->search(['ments', 'ment'])) != false) { + $before = $position - 1; + $letter = mb_substr($this->word, $before, 1); + + if ($this->inRv($before) && (in_array($letter, static::$vowels))) { + $this->word = mb_substr($this->word, 0, $position); + } + + return 2; + } + + return 2; + } + + /** + * Step 2a: Verb suffixes beginning i + * In steps 2a and 2b all tests are confined to the RV region. + * Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel. + * îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez iriez + * irions irons iront is issaIent issais issait issant issante issantes issants isse + * issent isses issez issiez issions issons it + * (Note that the non-vowel itself must also be in RV.) 
+ */ + private function step2a() + { + if (($position = $this->searchIfInRv([ + 'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez', + 'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants', + 'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'])) !== false) { + $before = $position - 1; + $letter = mb_substr($this->word, $before, 1); + + if ($this->inRv($before) && (!in_array($letter, static::$vowels))) { + $this->word = mb_substr($this->word, 0, $position); + + return true; + } + } + + return false; + } + + /** + * Do step 2b if step 2a was done, but failed to remove a suffix. + * Step 2b: Other verb suffixes + */ + private function step2b() + { + // é ée ées és èrent er era erai eraIent erais erait eras erez eriez erions erons eront ez iez + // delete + if (($position = $this->searchIfInRv([ + 'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez', + 'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez', 'és', 'ée', 'é'])) !== false) { + $this->word = mb_substr($this->word, 0, $position); + + return true; + } + + // âmes ât âtes a ai aIent ais ait ant ante antes ants as asse assent asses assiez assions + // delete + // if preceded by e, delete + if (($position = $this->searchIfInRv([ + 'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant', + 'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'])) !== false) { + $before = $position - 1; + $letter = mb_substr($this->word, $before, 1); + + if ($this->inRv($before) && ($letter === 'e')) { + $this->word = mb_substr($this->word, 0, $before); + } else { + $this->word = mb_substr($this->word, 0, $position); + } + + return true; + } + + // ions + // delete if in R2 + if (($position = $this->searchIfInRv(['ions'])) !== false) { + if ($this->inR2($position)) { + $this->word = mb_substr($this->word, 0, $position); + } + + return 
true; + } + + return false; + } + + /** + * Step 3: Replace final Y with i or final ç with c + */ + private function step3() + { + $this->word = preg_replace('#(Y)$#u', 'i', $this->word); + $this->word = preg_replace('#(ç)$#u', 'c', $this->word); + } + + /** + * Step 4: Residual suffix + */ + private function step4() + { + //If the word ends s, not preceded by a, i, o, u, è or s, delete it. + if (preg_match('#[^aiouès]s$#', $this->word)) { + $this->word = mb_substr($this->word, 0, -1); + } + + // In the rest of step 4, all tests are confined to the RV region. + // ion + // delete if in R2 and preceded by s or t + if ((($position = $this->searchIfInRv(['ion'])) !== false) && ($this->inR2($position))) { + $before = $position - 1; + $letter = mb_substr($this->word, $before, 1); + + if ($this->inRv($before) && (($letter === 's') || ($letter === 't'))) { + $this->word = mb_substr($this->word, 0, $position); + } + + return true; + } + + // ier ière Ier Ière + // replace with i + if (($this->searchIfInRv(['ier', 'ière', 'Ier', 'Ière'])) !== false) { + $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word); + + return true; + } + + // e + // delete + if (($this->searchIfInRv(['e'])) !== false) { + $this->word = mb_substr($this->word, 0, -1); + + return true; + } + + // ë + // if preceded by gu, delete + if (($position = $this->searchIfInRv(['guë'])) !== false) { + if ($this->inRv($position + 2)) { + $this->word = mb_substr($this->word, 0, -1); + + return true; + } + } + + return false; + } + + /** + * Step 5: Undouble + * If the word ends enn, onn, ett, ell or eill, delete the last letter + */ + private function step5() + { + if ($this->search(['enn', 'onn', 'ett', 'ell', 'eill']) !== false) { + $this->word = mb_substr($this->word, 0, -1); + } + } + + /** + * Step 6: Un-accent + * If the words ends é or è followed by at least one non-vowel, remove the accent from the e. 
+ */ + private function step6() + { + $this->word = preg_replace('#(é|è)([^'.$this->plainVowels.']+)$#u', 'e$2', $this->word); + } + + /** + * And finally: + * Turn any remaining I, U and Y letters in the word back into lower case. + */ + private function finish() + { + $this->word = str_replace(['I', 'U', 'Y'], ['i', 'u', 'y'], $this->word); + } + + /** + * If the word begins with two vowels, RV is the region after the third letter, + * otherwise the region after the first vowel not at the beginning of the word, + * or the end of the word if these positions cannot be found. + * (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) + */ + protected function rv() + { + $length = mb_strlen($this->word); + + $this->rv = ''; + $this->rvIndex = $length; + + if ($length < 3) { + return true; + } + + // If the word begins with two vowels, RV is the region after the third letter + $first = mb_substr($this->word, 0, 1); + $second = mb_substr($this->word, 1, 1); + + if ((in_array($first, static::$vowels)) && (in_array($second, static::$vowels))) { + $this->rv = mb_substr($this->word, 3); + $this->rvIndex = 3; + + return true; + } + + // (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) 
+ $begin3 = mb_substr($this->word, 0, 3); + + if (in_array($begin3, ['par', 'col', 'tap'])) { + $this->rv = mb_substr($this->word, 3); + $this->rvIndex = 3; + + return true; + } + + // otherwise the region after the first vowel not at the beginning of the word, + for ($i = 1; $i < $length; ++$i) { + $letter = mb_substr($this->word, $i, 1); + + if (in_array($letter, static::$vowels)) { + $this->rv = mb_substr($this->word, ($i + 1)); + $this->rvIndex = $i + 1; + + return true; + } + } + + return false; + } + + protected function inRv($position) + { + return ($position >= $this->rvIndex); + } + + protected function inR1($position) + { + return ($position >= $this->r1Index); + } + + protected function inR2($position) + { + return ($position >= $this->r2Index); + } + + protected function searchIfInRv($suffixes) + { + return $this->search($suffixes, $this->rvIndex); + } + + protected function searchIfInR2($suffixes) + { + return $this->search($suffixes, $this->r2Index); + } + + protected function search($suffixes, $offset = 0) + { + $length = mb_strlen($this->word); + + if ($offset > $length) { + return false; + } + + foreach ($suffixes as $suffixe) { + if ((($position = mb_strrpos($this->word, $suffixe, $offset)) !== false) + && ((mb_strlen($suffixe) + $position) == $length)) { + return $position; + } + } + + return false; + } + + /** + * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. + */ + protected function r1() + { + [$this->r1Index, $this->r1] = $this->rx($this->word); + } + + /** + * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. 
+ */ + protected function r2() + { + [$index, $value] = $this->rx($this->r1); + + $this->r2 = $value; + $this->r2Index = $this->r1Index + $index; + } + + /** + * Common function for R1 and R2 + * Search the region after the first non-vowel following a vowel in $word, or the end of the word if there is no such non-vowel. + * R1 : $in = $this->word + * R2 : $in = R1 + */ + protected function rx($in) + { + $length = mb_strlen($in); + + // defaults + $value = ''; + $index = $length; + + // we search all vowels + $vowels = []; + + for ($i = 0; $i < $length; ++$i) { + $letter = mb_substr($in, $i, 1); + + if (in_array($letter, static::$vowels)) { + $vowels[] = $i; + } + } + + // search the non-vowel following a vowel + foreach ($vowels as $position) { + $after = $position + 1; + $letter = mb_substr($in, $after, 1); + + if (!in_array($letter, static::$vowels)) { + $index = $after + 1; + $value = mb_substr($in, ($after + 1)); + + break; + } + } + + return [$index, $value]; + } +} diff --git a/src/Stemmer/GermanStemmer.php b/src/Stemmer/GermanStemmer.php new file mode 100644 index 0000000..e79f6a0 --- /dev/null +++ b/src/Stemmer/GermanStemmer.php @@ -0,0 +1,258 @@ + + * @author Pascal Landau + */ + +class GermanStemmer implements Stemmer +{ + /** + * R1 and R2 regions (see the Porter algorithm) + */ + private static $R1; + + private static $R2; + + private static $cache = []; + + private static $vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü']; + + private static $s_ending = ['b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't']; + + private static $st_ending = ['b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't']; + + /** + * Gets the stem of $word. + * + * @param string $word + * + * @return string + */ + public static function stem($word) + { + $word = mb_strtolower($word); + //check for invalid characters + preg_match('#.#u', $word); + if (preg_last_error() !== 0) { + throw new \InvalidArgumentException("Word '$word' seems to be errornous. 
Error code from preg_last_error(): " . preg_last_error()); + } + if (!isset(self::$cache[$word])) { + $result = self::getStem($word); + self::$cache[$word] = $result; + } + + return self::$cache[$word]; + } + + /** + * @param $word + * + * @return string + */ + private static function getStem($word) + { + $word = self::step0a($word); + $word = self::step1($word); + $word = self::step2($word); + $word = self::step3($word); + $word = self::step0b($word); + + return $word; + } + + /** + * Replaces to protect some characters + * + * @param string $word + * + * @return string mixed + */ + private static function step0a($word) + { + $vstr = implode('', self::$vowels); + $word = preg_replace('#([' . $vstr . '])u([' . $vstr . '])#u', '$1U$2', $word); + $word = preg_replace('#([' . $vstr . '])y([' . $vstr . '])#u', '$1Y$2', $word); + + return $word; + } + + /** + * Undo the initial replaces + * + * @param string $word + * + * @return string + */ + private static function step0b($word) + { + $word = str_replace(['ä', 'ö', 'ü', 'U', 'Y'], ['a', 'o', 'u', 'u', 'y'], $word); + + return $word; + } + + private static function step1($word) + { + $word = str_replace('ß', 'ss', $word); + + self::getR($word); + + $replaceCount = 0; + + $arr = ['em', 'ern', 'er']; + foreach ($arr as $s) { + self::$R1 = preg_replace('#' . $s . '$#u', '', self::$R1, -1, $replaceCount); + if ($replaceCount > 0) { + $word = preg_replace('#' . $s . '$#u', '', $word); + } + } + + $arr = ['en', 'es', 'e']; + foreach ($arr as $s) { + self::$R1 = preg_replace('#' . $s . '$#u', '', self::$R1, -1, $replaceCount); + if ($replaceCount > 0) { + $word = preg_replace('#' . $s . '$#u', '', $word); + $word = preg_replace('#niss$#u', 'nis', $word); + } + } + + $word = preg_replace('/([' . implode('', self::$s_ending) . 
'])s$/u', '$1', $word); + + return $word; + } + + private static function step2($word) + { + self::getR($word); + + $replaceCount = 0; + + $arr = ['est', 'er', 'en']; + foreach ($arr as $s) { + self::$R1 = preg_replace('#' . $s . '$#u', '', self::$R1, -1, $replaceCount); + if ($replaceCount > 0) { + $word = preg_replace('#' . $s . '$#u', '', $word); + } + } + + if (strpos(self::$R1, 'st') !== false) { + self::$R1 = preg_replace('#st$#u', '', self::$R1); + $word = preg_replace('#(...[' . implode('', self::$st_ending) . '])st$#u', '$1', $word); + } + + return $word; + } + + private static function step3($word) + { + self::getR($word); + + $replaceCount = 0; + + $arr = ['end', 'ung']; + foreach ($arr as $s) { + if (preg_match('#' . $s . '$#u', self::$R2)) { + $word = preg_replace('#([^e])' . $s . '$#u', '$1', $word, -1, $replaceCount); + if ($replaceCount > 0) { + self::$R2 = preg_replace('#' . $s . '$#u', '', self::$R2, -1, $replaceCount); + } + } + } + + $arr = ['isch', 'ik', 'ig']; + foreach ($arr as $s) { + if (preg_match('#' . $s . '$#u', self::$R2)) { + $word = preg_replace('#([^e])' . $s . '$#u', '$1', $word, -1, $replaceCount); + if ($replaceCount > 0) { + self::$R2 = preg_replace('#' . $s . '$#u', '', self::$R2); + } + } + } + + $arr = ['lich', 'heit']; + foreach ($arr as $s) { + self::$R2 = preg_replace('#' . $s . '$#u', '', self::$R2, -1, $replaceCount); + if ($replaceCount > 0) { + $word = preg_replace('#' . $s . '$#u', '', $word); + } else { + if (preg_match('#' . $s . '$#u', self::$R1)) { + $word = preg_replace('#(er|en)' . $s . '$#u', '$1', $word, -1, $replaceCount); + if ($replaceCount > 0) { + self::$R1 = preg_replace('#' . $s . '$#u', '', self::$R1); + } + } + } + } + + $arr = ['keit']; + foreach ($arr as $s) { + self::$R2 = preg_replace('#' . $s . '$#u', '', self::$R2, -1, $replaceCount); + if ($replaceCount > 0) { + $word = preg_replace('#' . $s . 
'$#u', '', $word); + } + } + + return $word; + } + + /** + * Find R1 and R2 + * + * @param string $word + */ + private static function getR($word) + { + self::$R1 = ''; + self::$R2 = ''; + + $vowels = implode('', self::$vowels); + $vowelGroup = "[{$vowels}]"; + $nonVowelGroup = "[^{$vowels}]"; + // R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. + $pattern = "#(?P.*?{$vowelGroup}{$nonVowelGroup})(?P.*)#u"; + if (preg_match($pattern, $word, $match)) { + $rest = $match['rest']; + $r1 = $match['r']; + // [...], but then R1 is adjusted so that the region before it contains at least 3 letters. + $cutOff = 3 - mb_strlen($rest); + if ($cutOff > 0) { + $r1 = mb_substr($r1, $cutOff); + } + self::$R1 = $r1; + } + + //R2 is the region after the first non-vowel following a vowel in R1, or is the null region at the end of the word if there is no such non-vowel. + if (preg_match($pattern, self::$R1, $match)) { + self::$R2 = $match['r']; + } + } +} diff --git a/src/Stemmer/ItalianStemmer.php b/src/Stemmer/ItalianStemmer.php new file mode 100644 index 0000000..40b8bb6 --- /dev/null +++ b/src/Stemmer/ItalianStemmer.php @@ -0,0 +1,463 @@ +, + * was originally written by Roberto Mirizzi (, + * ) in February 2007. It was the PHP5 implementation + * of Martin Porter's stemming algorithm for Italian language. This algorithm can be found + * at the address: . + * + * It was rewritten in March 2017 for TNTSearch by GaspariLab S.r.l., . + */ + +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +class ItalianStemmer implements Stemmer +{ + private static $cache = []; + + private static $vocali = ['a', 'e', 'i', 'o', 'u', 'à', 'è', 'ì', 'ò', 'ù']; + + private static $consonanti = [ + 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z', + 'I', 'U', + ]; + + private static $accenti_acuti = ['á', 'é', 'í', 'ó', 'ú']; + + private static $accenti_gravi = ['à', 'è', 'ì', 'ò', 'ù']; + + private static $suffissi_step0 = [ + 'ci', 'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'sene', + 'gliela', 'gliele', 'glieli', 'glielo', 'gliene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', + 'teli', 'telo', 'tene', 'cela', 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', + ]; + + private static $suffissi_step1_a = [ + 'anza', 'anze', 'ico', 'ici', 'ica', 'ice', 'iche', 'ichi', 'ismo', 'ismi', 'abile', 'abili', 'ibile', + 'ibili', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'oso', 'osi', 'osa', 'ose', 'mente', 'atrice', + 'atrici', 'ante', 'anti', + ]; + + private static $suffissi_step1_b = ['azione', 'azioni', 'atore', 'atori']; + + private static $suffissi_step1_c = ['logia', 'logie']; + + private static $suffissi_step1_d = ['uzione', 'uzioni', 'usione', 'usioni']; + + private static $suffissi_step1_e = ['enza', 'enze']; + + private static $suffissi_step1_f = ['amento', 'amenti', 'imento', 'imenti']; + + private static $suffissi_step1_g = ['amente']; + + private static $suffissi_step1_h = ['ità']; + + private static $suffissi_step1_i = ['ivo', 'ivi', 'iva', 'ive']; + + private static $suffissi_step2 = [ + 'ammo', 'ando', 'ano', 'are', 'arono', 'asse', 'assero', 'assi', 'assimo', 'ata', 'ate', 'ati', 'ato', 'ava', + 'avamo', 'avano', 'avate', 'avi', 'avo', 'emmo', 'enda', 'ende', 'endi', 'endo', 
'erà', 'erai', 'eranno', + 'ere', 'erebbe', 'erebbero', 'erei', 'eremmo', 'eremo', 'ereste', 'eresti', 'erete', 'erò', 'erono', 'essero', + 'ete', 'eva', 'evamo', 'evano', 'evate', 'evi', 'evo', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'iranno', 'ire', + 'irebbe', 'irebbero', 'irei', 'iremmo', 'iremo', 'ireste', 'iresti', 'irete', 'irò', 'irono', 'isca', + 'iscano', 'isce', 'isci', 'isco', 'iscono', 'issero', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivamo', 'ivano', + 'ivate', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'ar', 'ir', + ]; + + private static $ante_suff_a = ['ando', 'endo']; + + private static $ante_suff_b = ['ar', 'er', 'ir']; + + public function __construct() + { + usort(self::$suffissi_step0, function ($a, $b) { return mb_strlen($a) > mb_strlen($b) ? -1 : 1; }); + usort(self::$suffissi_step1_a, function ($a, $b) { return mb_strlen($a) > mb_strlen($b) ? -1 : 1;}); + usort(self::$suffissi_step2, function ($a, $b) { return mb_strlen($a) > mb_strlen($b) ? -1 : 1;}); + } + + /** + * Gets the stem of $word. + * + * @param string $word + * + * @return string + */ + public static function stem($word) + { + $word = mb_strtolower($word); + + // Check for invalid characters + preg_match('#.#u', $word); + if (preg_last_error() !== 0) { + throw new \InvalidArgumentException('Word "'.$word.'" seems to be errornous. 
+ Error code from preg_last_error(): '.preg_last_error()); + } + + if (!isset(self::$cache[$word])) { + $result = self::getStem($word); + self::$cache[$word] = $result; + } + + return self::$cache[$word]; + } + + /** + * @param $word + * + * @return string + */ + private static function getStem($word) + { + $str = self::trim($word); + $str = self::toLower($str); + $str = self::replaceAccAcuti($str); + $str = self::putUAfterQToUpper($str); + $str = self::IUBetweenVowToUpper($str); + $step0 = self::step0($str); + $step1 = self::step1($step0); + $step2 = self::step2($step0, $step1); + $step3a = self::step3a($step2); + $step3b = self::step3b($step3a); + $step4 = self::step4($step3b); + + return $step4; + } + + private static function trim($str) + { + return trim($str); + } + + private static function toLower($str) + { + return strtolower($str); + } + + private static function replaceAccAcuti($str) + { + return str_replace(self::$accenti_acuti, self::$accenti_gravi, $str); //strtr + } + + private static function putUAfterQToUpper($str) + { + return str_replace('qu', 'qU', $str); + } + + private static function IUBetweenVowToUpper($str) + { + $pattern = '/([aeiouàèìòù])([iu])([aeiouàèìòù])/'; + + return preg_replace_callback($pattern, function ($matches) { + return strtoupper($matches[0]); + }, $str); + } + + private static function returnRV($str) + { + /* + If the second letter is a consonant, RV is the region after the next following vowel, + or if the first two letters are vowels, RV is the region after the next consonant, and otherwise + (consonant-vowel case) RV is the region after the third letter. + But RV is the end of the word if these positions cannot be found. 
Example: + m a c h o [ho] o l i v a [va] t r a b a j o [bajo] á u r e o [eo] prezzo sprezzante + */ + + if (mb_strlen($str) < 2) { + return ''; + } //$str; + + if (in_array($str[1], self::$consonanti)) { + $str = mb_substr($str, 2); + $str = strpbrk($str, implode(self::$vocali)); + + return mb_substr($str, 1); //secondo me devo mettere 1 + } elseif (in_array($str[0], self::$vocali) && in_array($str[1], self::$vocali)) { + $str = strpbrk($str, implode(self::$consonanti)); + + return mb_substr($str, 1); + } elseif (in_array($str[0], self::$consonanti) && in_array($str[1], self::$vocali)) { + return mb_substr($str, 3); + } + } + + private static function returnR1($str) + { + /* + R1 is the region after the first non-vowel following a vowel, or is the null region at the end + of the word if there is no such non-vowel. Example: + beautiful [iful] beauty [y] beau [NULL] animadversion [imadversion] sprinkled [kled] eucharist [harist] + */ + + $pattern = '/['.implode(self::$vocali).']+'.'['.implode(self::$consonanti).']'.'(.*)/'; + preg_match($pattern, $str, $matches); + + return count($matches) >= 1 ? $matches[1] : ''; + } + + private static function returnR2($str) + { + /* + R2 is the region after the first non-vowel following a vowel in R1, or is the null region at the end + of the word if there is no such non-vowel. Example: + beautiful [ul] beauty [NULL] beau [NULL] animadversion [adversion] sprinkled [NULL] eucharist [ist] + */ + + $R1 = self::returnR1($str); + + $pattern = '/['.implode(self::$vocali).']+'.'['.implode(self::$consonanti).']'.'(.*)/'; + preg_match($pattern, $R1, $matches); + + return count($matches) >= 1 ? 
$matches[1] : ''; + } + + private static function step0($str) + { + //Step 0: Attached pronoun + //Always do steps 0 + + $str_len = mb_strlen($str); + $rv = self::returnRV($str); + $rv_len = mb_strlen($rv); + + $pos = 0; + foreach (self::$suffissi_step0 as $suff) { + if ($rv_len - mb_strlen($suff) < 0) { + continue; + } + $pos = mb_strpos($rv, $suff, $rv_len - mb_strlen($suff)); + if ($pos !== false) { + break; + } + } + + $ante_suff = mb_substr($rv, 0, $pos); + $ante_suff_len = mb_strlen($ante_suff); + + foreach (self::$ante_suff_a as $ante_a) { + if ($ante_suff_len - mb_strlen($ante_a) < 0) { + continue; + } + $pos_a = mb_strpos($ante_suff, $ante_a, $ante_suff_len - mb_strlen($ante_a)); + if ($pos_a !== false) { + return mb_substr($str, 0, $pos + $str_len - $rv_len); + } + } + + foreach (self::$ante_suff_b as $ante_b) { + if ($ante_suff_len - mb_strlen($ante_b) < 0) { + continue; + } + $pos_b = mb_strpos($ante_suff, $ante_b, $ante_suff_len - mb_strlen($ante_b)); + if ($pos_b !== false) { + return mb_substr($str, 0, $pos + $str_len - $rv_len).'e'; + } + } + + return $str; + } + + private static function deleteStuff($arr_suff, $str, $str_len, $where, $ovunque = false) + { + if ($where === 'r2') { + $r = self::returnR2($str); + } elseif ($where === 'rv') { + $r = self::returnRV($str); + } elseif ($where === 'r1') { + $r = self::returnR1($str); + } + + $r_len = mb_strlen($r); + + if ($ovunque) { + foreach ($arr_suff as $suff) { + if ($str_len - mb_strlen($suff) < 0) { + continue; + } + $pos = mb_strpos($str, $suff, $str_len - mb_strlen($suff)); + if ($pos !== false) { + $pattern = '/'.$suff.'$/'; + $ret_str = preg_match($pattern, $r) ? 
mb_substr($str, 0, $pos) : ''; + if ($ret_str !== '') { + return $ret_str; + } + break; + } + } + } else { + foreach ($arr_suff as $suff) { + if ($r_len - mb_strlen($suff) < 0) { + continue; + } + $pos = mb_strpos($r, $suff, $r_len - mb_strlen($suff)); + if ($pos !== false) { + return mb_substr($str, 0, $pos + $str_len - $r_len); + } + } + } + } + + private static function step1($str) + { + // Step 1: Standard suffix removal + // Always do steps 1 + + $str_len = mb_strlen($str); + + // Delete if in R1, if preceded by 'iv', delete if in R2 (and if further preceded by 'at', delete if in R2), + // otherwise, if preceded by 'os', 'ic' or 'abil', delete if in R2 + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_g, $str, $str_len, 'r1'))) { + if (!empty($ret_str1 = self::deleteStuff(['iv'], $ret_str, mb_strlen($ret_str), 'r2'))) { + if (!empty($ret_str2 = self::deleteStuff(['at'], $ret_str1, mb_strlen($ret_str1), 'r2'))) { + return $ret_str2; + } else { + return $ret_str1; + } + } elseif (!empty( + $ret_str1 = self::deleteStuff(['os', 'ic', 'abil'], $ret_str, mb_strlen($ret_str), 'r2') + )) { + return $ret_str1; + } else { + return $ret_str; + } + } + + // Delete if in R2 + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_a, $str, $str_len, 'r2', true))) { + return $ret_str; + } + + // Delete if in R2, if preceded by 'ic', delete if in R2 + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_b, $str, $str_len, 'r2'))) { + if (!empty($ret_str1 = self::deleteStuff(['ic'], $ret_str, mb_strlen($ret_str), 'r2'))) { + return $ret_str1; + } else { + return $ret_str; + } + } + + // Replace with 'log' if in R2 + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_c, $str, $str_len, 'r2'))) { + return $ret_str.'log'; + } + + // Replace with 'u' if in R2 + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_d, $str, $str_len, 'r2'))) { + return $ret_str.'u'; + } + + // Replace with 'ente' if in R2 + if (!empty($ret_str = 
self::deleteStuff(self::$suffissi_step1_e, $str, $str_len, 'r2'))) { + return $ret_str.'ente'; + } + + // Delete if in RV + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_f, $str, $str_len, 'rv'))) { + return $ret_str; + } + + // Delete if in R2, if preceded by 'abil', 'ic' or 'iv', delete if in R2 + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_h, $str, $str_len, 'r2'))) { + if (!empty($ret_str1 = self::deleteStuff(['abil', 'ic', 'iv'], $ret_str, mb_strlen($ret_str), 'r2'))) { + return $ret_str1; + } else { + return $ret_str; + } + } + + // Delete if in R2, if preceded by 'at', delete if in R2 (and if further preceded by 'ic', delete if in R2) + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step1_i, $str, $str_len, 'r2'))) { + if (!empty($ret_str1 = self::deleteStuff(['at'], $ret_str, mb_strlen($ret_str), 'r2'))) { + if (!empty($ret_str2 = self::deleteStuff(['ic'], $ret_str1, mb_strlen($ret_str1), 'r2'))) { + return $ret_str2; + } else { + return $ret_str1; + } + } else { + return $ret_str; + } + } + + return $str; + } + + private static function step2($str, $str_step1) + { + //Step 2: Verb suffixes + //Do step 2 if no ending was removed by step 1 + + if ($str != $str_step1) { + return $str_step1; + } + + $str_len = mb_strlen($str); + + if (!empty($ret_str = self::deleteStuff(self::$suffissi_step2, $str, $str_len, 'rv'))) { + return $ret_str; + } + + return $str; + } + + private static function step3a($str) + { + // Step 3a: Delete a final 'a', 'e', 'i', 'o',' à', 'è', 'ì' or 'ò' if it is in RV, + // and a preceding 'i' if it is in RV ('crocchi' -> 'crocch', 'crocchio' -> 'crocch') + // Always do steps 3a + + $vocale_finale = ['a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò']; + + $str_len = mb_strlen($str); + + if (!empty($ret_str = self::deleteStuff($vocale_finale, $str, $str_len, 'rv'))) { + if (!empty($ret_str1 = self::deleteStuff(['i'], $ret_str, mb_strlen($ret_str), 'rv'))) { + return $ret_str1; + } else { + return $ret_str; + 
} + } + + return $str; + } + + private static function step3b($str) + { + // Step 3b: Replace final 'ch' (or 'gh') with 'c' (or 'g') if in 'RV' ('crocch' -> 'crocc') + // Always do steps 3b + + $rv = self::returnRV($str); + + $pattern = '/([cg])h$/'; + + return mb_substr($str, 0, mb_strlen($str) - mb_strlen($rv)) + . preg_replace_callback( + $pattern, + function ($matches) { + return $matches[0]; + }, + $rv + ); + } + + private static function step4($str) + { + // Step 4: Finally, turn I and U back into lower case + + return strtolower($str); + } +} diff --git a/src/Stemmer/LatvianStemmer.php b/src/Stemmer/LatvianStemmer.php new file mode 100644 index 0000000..4ec249b --- /dev/null +++ b/src/Stemmer/LatvianStemmer.php @@ -0,0 +1,212 @@ + + * This is a light version of the algorithm in Karlis Kreslin's PhD thesis + * A stemming algorithm for Latvian with the following modifications: + *
    + *
  • Only explicitly stems noun and adjective morphology + *
  • Stricter length/vowel checks for the resulting stems (verb etc. suffix stripping is removed) + *
  • Removes only the primary inflectional suffixes: case and number for nouns + * case, number, gender, and definiteness for adjectives. + *
  • Palatalization is only handled when a declension II,V,VI noun suffix is removed. + *
+ */ + +class LatvianStemmer implements Stemmer +{ + private static $affixes = [ + ['ajiem', 3, false], + ['ajai', 3, false], + ['ajam', 2, false], + ['ajām', 2, false], + ['ajos', 2, false], + ['ajās', 2, false], + ['iem', 2, true], + ['ajā', 2, false], + ['ais', 2, false], + ['ai', 2, false], + ['ei', 2, false], + ['ām', 1, false], + ['am', 1, false], + ['ēm', 1, false], + ['īm', 1, false], + ['im', 1, false], + ['um', 1, false], + ['us', 1, true], + ['as', 1, false], + ['ās', 1, false], + ['es', 1, false], + ['os', 1, true], + ['ij', 1, false], + ['īs', 1, false], + ['ēs', 1, false], + ['is', 1, false], + ['ie', 1, false], + ['u', 1, true], + ['a', 1, true], + ['i', 1, true], + ['e', 1, false], + ['ā', 1, false], + ['ē', 1, false], + ['ī', 1, false], + ['ū', 1, false], + ['o', 1, false], + ['s', 0, false], + ['š', 0, false], + ]; + + private static $VOWELS = 'aāeēiīouū'; + + /** + * @param $word string + * + * @return string + */ + public static function stem($word) + { + $word = mb_strtolower($word); + $s = mb_str_split($word); + $numVowels = self::numVowels($s); + $length = count($s); + + foreach (self::$affixes as $affix) { + if ($numVowels > $affix[1] and $length >= mb_strlen($affix[0]) + 3 and self::endswith( + $s, + $length, + $affix[0] + )) { + $length -= mb_strlen($affix[0]); + if ($affix[2]) { + $s = self::unPalatalize($s, $length); + } else { + $s = array_slice($s, 0, $length); + } + break; + } + } + return implode('', $s); + } + + /** + * @param $s array + * + * @return int + */ + private static function numVowels($s) + { + $count = 0; + foreach ($s as $char) { + if (mb_substr_count(self::$VOWELS, $char) > 0) { + $count++; + } + } + return $count; + } + + /** + * @param $s array + * @param $length integer + * @param $suffix string + * + * @return bool + */ + public static function endswith($s, $length, $suffix) + { + return str_ends_with(implode('', array_splice($s, 0, $length)), $suffix); + } + + /** + * @param $s array + * @param $length integer + * 
+ * @return array + */ + public static function unPalatalize($s, $length) + { + // we check the character removed: if its -u then + // its 2,5, or 6 gen pl., and these two can only apply then. + if ($s[$length] === 'u') { + // kš -> kst + if (self::endswith($s, $length, 'kš')) { + $length += 1; + $s[$length - 2] = 's'; + $s[$length - 1] = 't'; + return array_splice($s, 0, $length); + } elseif (self::endswith($s, $length, 'ņņ')) { + $s[$length - 2] = 'n'; + $s[$length - 1] = 'n'; + return array_splice($s, 0, $length); + } + } + // otherwise all other rules + if (self::endswith($s, $length, 'pj') or self::endswith($s, $length, 'bj') or self::endswith( + $s, + $length, + 'mj' + ) or self::endswith($s, $length, 'vj')) { + $length--; + } elseif (self::endswith($s, $length, 'šņ')) { + $s[$length - 2] = 's'; + $s[$length - 1] = 'n'; + } elseif (self::endswith($s, $length, 'žņ')) { + $s[$length - 2] = 'z'; + $s[$length - 1] = 'n'; + } elseif (self::endswith($s, $length, 'šļ')) { + $s[$length - 2] = 's'; + $s[$length - 1] = 'l'; + } elseif (self::endswith($s, $length, 'žļ')) { + $s[$length - 2] = 'z'; + $s[$length - 1] = 'l'; + } elseif (self::endswith($s, $length, 'ļņ')) { + $s[$length - 2] = 'l'; + $s[$length - 1] = 'n'; + } elseif (self::endswith($s, $length, 'ļļ')) { + $s[$length - 2] = 'l'; + $s[$length - 1] = 'l'; + } elseif (self::endswith($s, $length, 'č')) { + $s[$length - 1] = 'c'; + } elseif (self::endswith($s, $length, 'ļ')) { + $s[$length - 1] = 'l'; + } elseif (self::endswith($s, $length, 'ņ')) { + $s[$length - 1] = 'n'; + } + return array_splice($s, 0, $length); + } +} diff --git a/src/Stemmer/NoStemmer.php b/src/Stemmer/NoStemmer.php new file mode 100644 index 0000000..150ee38 --- /dev/null +++ b/src/Stemmer/NoStemmer.php @@ -0,0 +1,11 @@ + 7 && in_array(mb_substr($word, -5), ['zacja', 'zacją', 'zacji'])) { + return mb_substr($word, 0, -4); + } + if (strlen($word) > 6 && in_array(mb_substr($word, -4), ['acja', 'acji', 'acją', 'tach', 'anie', 'enie', 'eniu', 
'aniu'])) { + return mb_substr($word, 0, -4); + } + if (strlen($word) > 6 && (mb_substr($word, -4) == 'tyka')) { + return mb_substr($word, 0, -2); + } + if (strlen($word) > 5 && in_array(mb_substr($word, -3), ['ach', 'ami', 'nia', 'niu', 'cia', 'ciu'])) { + return mb_substr($word, 0, -3); + } + if (strlen($word) > 5 && in_array(mb_substr($word, -3), ['cji', 'cja', 'cją'])) { + return mb_substr($word, 0, -2); + } + if (strlen($word) > 5 && in_array(mb_substr($word, -2), ['ce', 'ta'])) { + return mb_substr($word, 0, -2); + } + return $word; + } + + public static function removeDiminutive($word) + { + if (strlen($word) > 6) { + if (in_array(mb_substr($word, -5), ['eczek', 'iczek', 'iszek', 'aszek', 'uszek'])) { + return mb_substr($word, 0, -5); + } + if (in_array(mb_substr($word, -4), ['enek', 'ejek', 'erek'])) { + return mb_substr($word, 0, -2); + } + } + if (strlen($word) > 4) { + if (in_array(mb_substr($word, -2), ['ek', 'ak'])) { + return mb_substr($word, 0, -2); + } + } + return $word; + } + + public static function removeAdjectiveEnds($word) + { + if (strlen($word) > 7 && (mb_substr($word, 0, 3) == 'naj') && in_array(mb_substr($word, -3), ['sze', 'szy'])) { + return mb_substr($word, 3, -3); + } + if (strlen($word) > 7 && (mb_substr($word, 0, 3) == 'naj') && (mb_substr($word, 0, 5) == 'szych')) { + return mb_substr($word, 3, -5); + } + if (strlen($word) > 6 && (mb_substr($word, -4) == 'czny')) { + return mb_substr($word, 0, -4); + } + if (strlen($word) > 5 && in_array(mb_substr($word, -3), ['owy', 'owa', 'owe', 'ych', 'ego'])) { + return mb_substr($word, 0, -3); + } + if (strlen($word) > 5 && (mb_substr($word, -2) == 'ej')) { + return mb_substr($word, 0, -2); + } + return $word; + } + + public static function removeVerbsEnds($word) + { + if (strlen($word) > 5 && (mb_substr($word, -3) == 'bym')) { + return mb_substr($word, 0, -3); + } + if (strlen($word) > 5 && in_array(mb_substr($word, -3), ['esz', 'asz', 'cie', 'eść', 'aść', 'łem', 'amy', 'emy'])) { + return 
mb_substr($word, 0, -3); + } + if (strlen($word) > 3 && in_array(mb_substr($word, -3), ['esz', 'asz', 'eść', 'aść', 'eć', 'ać'])) { + return mb_substr($word, 0, -2); + } + if (strlen($word) > 3 && in_array(mb_substr($word, -2), ['aj'])) { + return mb_substr($word, 0, -1); + } + if (strlen($word) > 3 && in_array(mb_substr($word, -2), ['ać', 'em', 'am', 'ał', 'ił', 'ić', 'ąc'])) { + return mb_substr($word, 0, -2); + } + return $word; + } + + public static function removeAdverbsEnds($word) + { + if (strlen($word) > 4 && in_array(mb_substr($word, -3), ['nie', 'wie', 'rze'])) { + return mb_substr($word, 0, -2); + } + return $word; + } + + public static function removePluralForms($word) + { + if (strlen($word) > 4 && in_array(mb_substr($word, -2), ['ów', 'om'])) { + return mb_substr($word, 0, -2); + } + if (strlen($word) > 4 && (mb_substr($word, -3) == 'ami')) { + return mb_substr($word, 0, -3); + } + return $word; + } + + public static function removeGeneralEnds($word) + { + if (strlen($word) > 4 && in_array(substr($word, -2), ['ia', 'ie'])) { + return substr($word, 0, -2); + } + if (strlen($word) > 4 && in_array(substr($word, -1), ['u', 'ą', 'i', 'a', 'ę', 'y', 'ę', 'ł'])) { + return substr($word, 0, -1); + } + return $word; + } + + public static function stem($word) + { + $word = mb_strtolower($word); + + $stem = $word; + + $stem = self::removeNouns($stem); + $stem = self::removeDiminutive($stem); + $stem = self::removeAdjectiveEnds($stem); + $stem = self::removeVerbsEnds($stem); + $stem = self::removeAdverbsEnds($stem); + $stem = self::removePluralForms($stem); + $stem = self::removeGeneralEnds($stem); + + return $stem; + } +} diff --git a/src/Stemmer/PorterStemmer.php b/src/Stemmer/PorterStemmer.php new file mode 100644 index 0000000..a7646a9 --- /dev/null +++ b/src/Stemmer/PorterStemmer.php @@ -0,0 +1,403 @@ + 1) { + self::replace($word, 'e', ''); + } elseif (self::m(substr($word, 0, -1)) == 1) { + if (!self::cvc(substr($word, 0, -1))) { + self::replace($word, 'e', 
''); + } + } + } + + // Part b + if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) == 'l') { + $word = substr($word, 0, -1); + } + + return $word; + } + + /** + * Replaces the first string with the second, at the end of the string. If third + * arg is given, then the preceding string must match that m count at least. + * + * @param string $str String to check + * @param string $check Ending to check for + * @param string $repl Replacement string + * @param int $m Optional minimum number of m() to meet + * + * @return bool Whether the $check string was at the end + * of the $str string. True does not necessarily mean + * that it was replaced. + */ + private static function replace(&$str, $check, $repl, $m = null) + { + $len = 0 - strlen($check); + + if (substr($str, $len) == $check) { + $substr = substr($str, 0, $len); + if (is_null($m) || self::m($substr) > $m) { + $str = $substr.$repl; + } + + return true; + } + + return false; + } + + /** + * What, you mean it's not obvious from the name? + * + * Measures the number of consonant sequences in $str. if c is + * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + * presence, + * + * gives 0 + * vc gives 1 + * vcvc gives 2 + * vcvcvc gives 3 + * + * @param string $str The string to return the m count for + * + * @return int The m count + */ + private static function m($str) + { + $c = self::$regex_consonant; + $v = self::$regex_vowel; + + $str = preg_replace("#^$c+#", '', $str); + $str = preg_replace("#$v+$#", '', $str); + + preg_match_all("#($v+$c+)#", $str, $matches); + + return count($matches[1]); + } + + /** + * Returns true/false as to whether the given string contains two + * of the same consonant next to each other at the end of the string. 
+ * + * @param string $str String to check + * + * @return bool Result + */ + private static function doubleConsonant($str) + { + $c = self::$regex_consonant; + + return preg_match("#$c{2}$#", $str, $matches) && $matches[0][0] == $matches[0][1]; + } + + /** + * Checks for ending CVC sequence where second C is not W, X or Y + * + * @param string $str String to check + * + * @return bool Result + */ + private static function cvc($str) + { + $c = self::$regex_consonant; + $v = self::$regex_vowel; + + $matchFound = preg_match("#($c$v$c)$#", $str, $matches); + + $return = false; + + if ($matchFound && strlen($matches[1]) == 3) { + $return = true; + if (in_array($matches[1][2], ['w', 'x', 'y'])) { + $return = false; + } + } + + return $return; + } +} diff --git a/src/Stemmer/PortugeseStemmer.php b/src/Stemmer/PortugeseStemmer.php new file mode 100644 index 0000000..956bab4 --- /dev/null +++ b/src/Stemmer/PortugeseStemmer.php @@ -0,0 +1,766 @@ +. + */ + +/** + * This is a reimplementation of the Porter Stemmer Algorithm for Portuguese. + * This script is based on the implementation found on + * and has been rewriten to work with TNTSearch by Lucas Padilha + * + * Takes a word and reduces it to its Portuguese stem using the Porter stemmer algorithm. 
+ * + * References: + * - http://snowball.tartarus.org/algorithms/porter/stemmer.html + * - http://snowball.tartarus.org/algorithms/portuguese/stemmer.html + * + * Usage: + * $stem = PortugueseStemmer::stem($word); + * + * @author Lucas Padilha + */ + +class PortugueseStemmer implements Stemmer +{ + /** + * UTF-8 Case lookup table + * + * This lookuptable defines the upper case letters to their correspponding + * lower case letter in UTF-8 + * + * @author Andreas Gohr + */ + private static $utf8_lower_to_upper = [ + 0x0061 => 0x0041, 0x03C6 => 0x03A6, 0x0163 => 0x0162, 0x00E5 => 0x00C5, 0x0062 => 0x0042, + 0x013A => 0x0139, 0x00E1 => 0x00C1, 0x0142 => 0x0141, 0x03CD => 0x038E, 0x0101 => 0x0100, + 0x0491 => 0x0490, 0x03B4 => 0x0394, 0x015B => 0x015A, 0x0064 => 0x0044, 0x03B3 => 0x0393, + 0x00F4 => 0x00D4, 0x044A => 0x042A, 0x0439 => 0x0419, 0x0113 => 0x0112, 0x043C => 0x041C, + 0x015F => 0x015E, 0x0144 => 0x0143, 0x00EE => 0x00CE, 0x045E => 0x040E, 0x044F => 0x042F, + 0x03BA => 0x039A, 0x0155 => 0x0154, 0x0069 => 0x0049, 0x0073 => 0x0053, 0x1E1F => 0x1E1E, + 0x0135 => 0x0134, 0x0447 => 0x0427, 0x03C0 => 0x03A0, 0x0438 => 0x0418, 0x00F3 => 0x00D3, + 0x0440 => 0x0420, 0x0454 => 0x0404, 0x0435 => 0x0415, 0x0449 => 0x0429, 0x014B => 0x014A, + 0x0431 => 0x0411, 0x0459 => 0x0409, 0x1E03 => 0x1E02, 0x00F6 => 0x00D6, 0x00F9 => 0x00D9, + 0x006E => 0x004E, 0x0451 => 0x0401, 0x03C4 => 0x03A4, 0x0443 => 0x0423, 0x015D => 0x015C, + 0x0453 => 0x0403, 0x03C8 => 0x03A8, 0x0159 => 0x0158, 0x0067 => 0x0047, 0x00E4 => 0x00C4, + 0x03AC => 0x0386, 0x03AE => 0x0389, 0x0167 => 0x0166, 0x03BE => 0x039E, 0x0165 => 0x0164, + 0x0117 => 0x0116, 0x0109 => 0x0108, 0x0076 => 0x0056, 0x00FE => 0x00DE, 0x0157 => 0x0156, + 0x00FA => 0x00DA, 0x1E61 => 0x1E60, 0x1E83 => 0x1E82, 0x00E2 => 0x00C2, 0x0119 => 0x0118, + 0x0146 => 0x0145, 0x0070 => 0x0050, 0x0151 => 0x0150, 0x044E => 0x042E, 0x0129 => 0x0128, + 0x03C7 => 0x03A7, 0x013E => 0x013D, 0x0442 => 0x0422, 0x007A => 0x005A, 0x0448 => 0x0428, + 
0x03C1 => 0x03A1, 0x1E81 => 0x1E80, 0x016D => 0x016C, 0x00F5 => 0x00D5, 0x0075 => 0x0055, + 0x0177 => 0x0176, 0x00FC => 0x00DC, 0x1E57 => 0x1E56, 0x03C3 => 0x03A3, 0x043A => 0x041A, + 0x006D => 0x004D, 0x016B => 0x016A, 0x0171 => 0x0170, 0x0444 => 0x0424, 0x00EC => 0x00CC, + 0x0169 => 0x0168, 0x03BF => 0x039F, 0x006B => 0x004B, 0x00F2 => 0x00D2, 0x00E0 => 0x00C0, + 0x0434 => 0x0414, 0x03C9 => 0x03A9, 0x1E6B => 0x1E6A, 0x00E3 => 0x00C3, 0x044D => 0x042D, + 0x0436 => 0x0416, 0x01A1 => 0x01A0, 0x010D => 0x010C, 0x011D => 0x011C, 0x00F0 => 0x00D0, + 0x013C => 0x013B, 0x045F => 0x040F, 0x045A => 0x040A, 0x00E8 => 0x00C8, 0x03C5 => 0x03A5, + 0x0066 => 0x0046, 0x00FD => 0x00DD, 0x0063 => 0x0043, 0x021B => 0x021A, 0x00EA => 0x00CA, + 0x03B9 => 0x0399, 0x017A => 0x0179, 0x00EF => 0x00CF, 0x01B0 => 0x01AF, 0x0065 => 0x0045, + 0x03BB => 0x039B, 0x03B8 => 0x0398, 0x03BC => 0x039C, 0x045C => 0x040C, 0x043F => 0x041F, + 0x044C => 0x042C, 0x00FE => 0x00DE, 0x00F0 => 0x00D0, 0x1EF3 => 0x1EF2, 0x0068 => 0x0048, + 0x00EB => 0x00CB, 0x0111 => 0x0110, 0x0433 => 0x0413, 0x012F => 0x012E, 0x00E6 => 0x00C6, + 0x0078 => 0x0058, 0x0161 => 0x0160, 0x016F => 0x016E, 0x03B1 => 0x0391, 0x0457 => 0x0407, + 0x0173 => 0x0172, 0x00FF => 0x0178, 0x006F => 0x004F, 0x043B => 0x041B, 0x03B5 => 0x0395, + 0x0445 => 0x0425, 0x0121 => 0x0120, 0x017E => 0x017D, 0x017C => 0x017B, 0x03B6 => 0x0396, + 0x03B2 => 0x0392, 0x03AD => 0x0388, 0x1E85 => 0x1E84, 0x0175 => 0x0174, 0x0071 => 0x0051, + 0x0437 => 0x0417, 0x1E0B => 0x1E0A, 0x0148 => 0x0147, 0x0105 => 0x0104, 0x0458 => 0x0408, + 0x014D => 0x014C, 0x00ED => 0x00CD, 0x0079 => 0x0059, 0x010B => 0x010A, 0x03CE => 0x038F, + 0x0072 => 0x0052, 0x0430 => 0x0410, 0x0455 => 0x0405, 0x0452 => 0x0402, 0x0127 => 0x0126, + 0x0137 => 0x0136, 0x012B => 0x012A, 0x03AF => 0x038A, 0x044B => 0x042B, 0x006C => 0x004C, + 0x03B7 => 0x0397, 0x0125 => 0x0124, 0x0219 => 0x0218, 0x00FB => 0x00DB, 0x011F => 0x011E, + 0x043E => 0x041E, 0x1E41 => 0x1E40, 0x03BD => 0x039D, 0x0107 => 
0x0106, 0x03CB => 0x03AB, + 0x0446 => 0x0426, 0x00FE => 0x00DE, 0x00E7 => 0x00C7, 0x03CA => 0x03AA, 0x0441 => 0x0421, + 0x0432 => 0x0412, 0x010F => 0x010E, 0x00F8 => 0x00D8, 0x0077 => 0x0057, 0x011B => 0x011A, + 0x0074 => 0x0054, 0x006A => 0x004A, 0x045B => 0x040B, 0x0456 => 0x0406, 0x0103 => 0x0102, + 0x03BB => 0x039B, 0x00F1 => 0x00D1, 0x043D => 0x041D, 0x03CC => 0x038C, 0x00E9 => 0x00C9, + 0x00F0 => 0x00D0, 0x0457 => 0x0407, 0x0123 => 0x0122, + ]; + + private static $vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'â', 'ê', 'ô']; + + public static function stem($word) + { + // we do ALL in UTF-8 + if (!self::check($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $word = self::strtolower($word); + $word = self::str_replace(['ã', 'õ'], ['a~', 'o~'], $word); + + $rv = ''; + $rvIndex = ''; + self::rv($word, $rv, $rvIndex); + + $r1 = ''; + $r1Index = ''; + self::r1($word, $r1, $r1Index); + + $r2 = ''; + $r2Index = ''; + self::r2($r1, $r1Index, $r2, $r2Index); + + $initialWord = $word; + + self::step1($word, $r1Index, $r2Index, $rvIndex); + + if ($initialWord == $word) { + self::step2($word, $rvIndex); + } + + if ($initialWord != $word) { + self::step3($word, $rvIndex); + } else { + self::step4($word, $rvIndex); + } + + self::step5($word, $rvIndex); + + self::finish($word); + + return $word; + } + + /** + * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. + */ + private static function r1($word, &$r1, &$r1Index) + { + [$index, $value] = self::rx($word); + + $r1 = $value; + $r1Index = $index; + + return true; + } + + /** + * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. 
+     */
+    private static function r2($r1, $r1Index, &$r2, &$r2Index)
+    {
+        // R2 is searched for inside R1, so its offset in the whole word is
+        // R1's offset plus the offset found within R1.
+        [$index, $value] = self::rx($r1);
+
+        $r2 = $value;
+        $r2Index = $r1Index + $index;
+
+        return true;
+    }
+
+    /**
+     * Common function for R1 and R2
+     * Search the region after the first non-vowel following a vowel in $in, or the end of the word if there is no such non-vowel.
+     * R1 : $in = the word
+     * R2 : $in = R1
+     *
+     * @param string $in Text to scan.
+     *
+     * @return array Two elements: the character offset at which the region starts, then the region itself.
+     */
+    private static function rx($in)
+    {
+        $length = self::strlen($in);
+
+        // Only look at letters that actually have a successor. A vowel in last position has no
+        // following non-vowel, so it must not push the offset past the end of the word.
+        for ($i = 0; $i + 1 < $length; $i++) {
+            $isVowel = in_array(self::substr($in, $i, 1), self::$vowels, true);
+            $nextIsVowel = in_array(self::substr($in, $i + 1, 1), self::$vowels, true);
+
+            if ($isVowel && !$nextIsVowel) {
+                // The region starts right after the non-vowel.
+                return [$i + 2, self::substr($in, $i + 2)];
+            }
+        }
+
+        // Default: empty region starting at the end of the word.
+        return [$length, ''];
+    }
+
+    /**
+     * Used by spanish, italian, portuguese, etc (but not by french)
+     *
+     * If the second letter is a consonant, RV is the region after the next following vowel,
+     * or if the first two letters are vowels, RV is the region after the next consonant,
+     * and otherwise (consonant-vowel case) RV is the region after the third letter.
+     * But RV is the end of the word if these positions cannot be found.
+     */
+    private static function rv($word, &$rv, &$rvIndex)
+    {
+        $length = self::strlen($word);
+
+        // Default: RV is empty and begins at the end of the word. Assigning it here keeps
+        // $rvIndex an integer on every return path, whatever the caller passed in, so the
+        // later steps can always use it as a search offset.
+        $rv = '';
+        $rvIndex = $length;
+
+        if ($length < 3) {
+            return true;
+        }
+
+        $firstIsVowel = in_array(self::substr($word, 0, 1), self::$vowels, true);
+        $secondIsVowel = in_array(self::substr($word, 1, 1), self::$vowels, true);
+
+        // Consonant-vowel case: RV is the region after the third letter.
+        if (!$firstIsVowel && $secondIsVowel) {
+            $rv = self::substr($word, 3);
+            $rvIndex = 3;
+
+            return true;
+        }
+
+        // If the second letter is a consonant, RV is the region after the next following vowel;
+        // if the first two letters are vowels, RV is the region after the next consonant.
+        $lookingForVowel = !$secondIsVowel;
+        for ($i = 2; $i < $length; $i++) {
+            $isVowel = in_array(self::substr($word, $i, 1), self::$vowels, true);
+
+            if ($isVowel === $lookingForVowel) {
+                $rv = self::substr($word, ($i + 1));
+                $rvIndex = $i + 1;
+
+                return true;
+            }
+        }
+
+        // No such position: the defaults set above stay in effect.
+        return false;
+    }
+
+    /**
+     * Tells whether a character offset lies at or after the start of RV.
+     */
+    private static function inRv($position, $rvIndex)
+    {
+        return ($position >= $rvIndex);
+    }
+
+    /**
+     * Tells whether a character offset lies at or after the start of R1.
+     */
+    private static function inR1($position, $r1Index)
+    {
+        return ($position >= $r1Index);
+    }
+
+    /**
+     * Tells whether a character offset lies at or after the start of R2.
+     */
+    private static function inR2($position, $r2Index)
+    {
+        return ($position >= $r2Index);
+    }
+
+    /**
+     * Same as search(), with the start of RV as the offset.
+     */
+    private static function searchIfInRv($word, $suffixes, $rvIndex)
+    {
+        return self::search($word, $suffixes, $rvIndex);
+    }
+
+    /**
+     * Same as search(), with the start of R2 as the offset.
+     */
+    private static function searchIfInR2($word, $suffixes, $r2Index)
+    {
+        return self::search($word, $suffixes, $r2Index);
+    }
+
+    /**
+     * Finds which of the given suffixes ends the word.
+     *
+     * Suffixes are tried in the order given and the first hit wins, so a caller that wants the
+     * longest match has to list longer suffixes before shorter ones.
+     *
+     * @param string   $word     Word to inspect.
+     * @param string[] $suffixes Candidate endings.
+     * @param int      $offset   Offset handed to self::strrpos() as the place to start looking.
+     *
+     * @return int|false Character offset at which the matching suffix starts, or false when
+     *                   nothing matches or $offset lies beyond the end of the word.
+     */
+    private static function search($word, $suffixes, $offset = 0)
+    {
+        $length = self::strlen($word);
+
+        if ($offset > $length) {
+            return false;
+        }
+
+        foreach ($suffixes as $suffix) {
+            // A hit only counts when the occurrence runs up to the last character of the word.
+            if ((($position = self::strrpos($word, $suffix, $offset)) !== false) && ((self::strlen($suffix) + $position) == $length)) {
+                return $position;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Step 1: Standard suffix removal
+     *
+     * Applies the first rule whose suffix ends the word. A rule only alters the word when the
+     * suffix starts inside the region that rule requires.
+     *
+     * @param string $word    Word being stemmed; modified in place.
+     * @param int    $r1Index Offset at which R1 starts.
+     * @param int    $r2Index Offset at which R2 starts.
+     * @param int    $rvIndex Offset at which RV starts.
+     *
+     * @return bool True when one of the step 1 suffixes ends the word, whether or not the word was changed.
+     */
+    private static function step1(&$word, $r1Index, $r2Index, $rvIndex)
+    {
+        // delete if in R2
+        if (($position = self::search($word, ['amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância', 'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso', 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es', 'ante'])) !== false) {
+            if (self::inR2($position, $r2Index)) {
+                $word = self::substr($word, 0, $position);
+            }
+
+            return true;
+        }
+
+        // replace with log if in R2
+        // (Portuguese spelling has no accent here; the accented "logía" is the Spanish form)
+        if (($position = self::search($word, ['logias', 'logia'])) !== false) {
+            if (self::inR2($position, $r2Index)) {
+                $word = preg_replace('#(logias|logia)$#u', 'log', $word);
+            }
+
+            return true;
+        }
+
+        // replace with u if in R2
+        // (the Portuguese endings "ução"/"uções"; the tilde vowels are written a~ / o~ here,
+        // exactly like 'aça~o' and 'aço~es' in the first list)
+        if (($position = self::search($word, ['uço~es', 'uça~o'])) !== false) {
+            if (self::inR2($position, $r2Index)) {
+                $word = preg_replace('#(uço~es|uça~o)$#u', 'u', $word);
+            }
+
+            return true;
+        }
+
+        // replace with ente if in R2
+        if (($position = self::search($word, ['ências', 'ência'])) !== false) {
+            if (self::inR2($position, $r2Index)) {
+                $word = preg_replace('#(ências|ência)$#u', 'ente', $word);
+            }
+
+            return true;
+        }
+
+        // delete if in R1
+        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
+        // if preceded by os, ic or ad, delete if in R2
+        if (($position = self::search($word, ['amente'])) !== false) {
+            // delete if in R1
+            if (self::inR1($position, $r1Index)) {
+                $word = self::substr($word, 0, $position);
+            }
+
+            // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
+            if (($position2 = self::searchIfInR2($word, ['iv'], $r2Index)) !== false) {
+                $word = self::substr($word, 0, $position2);
+
+                if (($position3 = self::searchIfInR2($word, ['at'], $r2Index)) !== false) {
+                    $word = self::substr($word, 0, $position3);
+                }
+
+                // if preceded by os, ic or ad, delete if in R2
+            } elseif (($position4 = self::searchIfInR2($word, ['os', 'ic', 'ad'], $r2Index)) !== false) {
+                $word = self::substr($word, 0, $position4);
+            }
+
+            return true;
+        }
+
+        // delete if in R2
+        // if preceded by ante, avel or ível, delete if in R2
+        if (($position = self::search($word, ['mente'])) !== false) {
+            // delete if in R2
+            if (self::inR2($position, $r2Index)) {
+                $word = self::substr($word, 0, $position);
+            }
+
+            // if preceded by ante, avel or ível, delete if in R2
+            // (strict test, like every other rule: an offset of 0 is a hit, only false is a miss)
+            if (($position2 = self::searchIfInR2($word, ['ante', 'avel', 'ível'], $r2Index)) !== false) {
+                $word = self::substr($word, 0, $position2);
+            }
+
+            return true;
+        }
+
+        // delete if in R2
+        // if preceded by abil, ic or iv, delete if in R2
+        if (($position = self::search($word, ['idades', 'idade'])) !== false) {
+            // delete if in R2
+            if (self::inR2($position, $r2Index)) {
+                $word = self::substr($word, 0, $position);
+            }
+
+            // if preceded by abil, ic or iv, delete if in R2
+            if (($position2 = self::searchIfInR2($word, ['abil', 'ic', 'iv'], $r2Index)) !== false) {
+                $word = self::substr($word, 0, $position2);
+            }
+
+            return true;
+        }
+
+        // delete if in R2
+        // if preceded by at, delete if in R2
+        if (($position = self::search($word, ['ivas', 'ivos', 'iva', 'ivo'])) !== false) {
+            // delete if in R2
+            if (self::inR2($position, $r2Index)) {
+                $word = self::substr($word, 0, $position);
+            }
+
+            // if preceded by at, delete if in R2
+            if (($position2 = self::searchIfInR2($word, ['at'], $r2Index)) !== false) {
+                $word = self::substr($word, 0, $position2);
+            }
+
+            return true;
+        }
+
+        // replace with ir if in RV and preceded by e
+        if (($position = self::search($word, ['iras', 'ira'])) !== false) {
+            if (self::inRv($position, $rvIndex)) {
+                $before = $position - 1;
+                $letter = self::substr($word, $before, 1);
+
+                if ($letter == 'e') {
+                    $word = preg_replace('#(iras|ira)$#u', 'ir', $word);
+                }
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Step 2: Verb suffixes
+     * Search for the longest among the following suffixes in RV, and if found, delete.
+     */
+    private static function step2(&$word, $rvIndex)
+    {
+        // Longer endings are listed before the shorter endings they contain, because the
+        // first suffix that matches is the one that gets removed.
+        $suffixes = [
+            'aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos', 'aríeis', 'eríeis', 'iríeis',
+            'ásseis', 'ésseis', 'ísseis', 'áramos', 'éramos', 'íramos', 'ávamos', 'aremos', 'eremos', 'iremos',
+            'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes',
+            'irdes', 'asses', 'esses', 'isses', 'astes', 'estes', 'istes', 'áreis', 'areis', 'éreis', 'ereis',
+            'íreis', 'ireis', 'áveis', 'íamos', 'armos', 'ermos', 'irmos', 'aria', 'eria', 'iria', 'asse',
+            'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'adas', 'idas', 'aram', 'eram',
+            'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'ara~o', 'era~o', 'ira~o', 'arás',
+            'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires', 'íeis', 'ados', 'idos', 'ámos',
+            'amos', 'emos', 'imos', 'iras', 'ada', 'ida', 'ará', 'ara', 'erá', 'era', 'irá', 'ava', 'iam',
+            'ado', 'ido', 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es',
+            'is', 'eu', 'iu', 'ou',
+        ];
+
+        if (($position = self::searchIfInRv($word, $suffixes, $rvIndex)) !== false) {
+            $word = self::substr($word, 0, $position);
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Step 3: d-suffixes
+     *
+     * @param string $word    Word being stemmed; modified in place.
+     * @param int    $rvIndex Offset at which RV starts.
+     *
+     * @return bool True when the word ends with "i" in RV, whether or not the "i" was removed.
+     */
+    private static function step3(&$word, $rvIndex)
+    {
+        // Delete suffix i if in RV and preceded by c
+        if (self::searchIfInRv($word, ['i'], $rvIndex) !== false) {
+            $letter = self::substr($word, -2, 1);
+
+            if ($letter == 'c') {
+                $word = self::substr($word, 0, -1);
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Step 4
+     *
+     * @param string $word    Word being stemmed; modified in place.
+     * @param int    $rvIndex Offset at which RV starts.
+     *
+     * @return bool True when a suffix was removed.
+     */
+    private static function step4(&$word, $rvIndex)
+    {
+        // If the word ends with one of the suffixes "os a i o á í ó" in RV, delete it
+        if (($position = self::searchIfInRv($word, ['os', 'a', 'i', 'o', 'á', 'í', 'ó'], $rvIndex)) !== false) {
+            $word = self::substr($word, 0, $position);
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Step 5
+     *
+     * @param string $word    Word being stemmed; modified in place.
+     * @param int    $rvIndex Offset at which RV starts.
+     *
+     * @return bool True when the word ended with "e", "é" or "ê" in RV, or with "ç".
+     */
+    private static function step5(&$word, $rvIndex)
+    {
+        // If the word ends with one of "e é ê" in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, delete the u (or i).
+        if (self::searchIfInRv($word, ['e', 'é', 'ê'], $rvIndex) !== false) {
+            $word = self::substr($word, 0, -1);
+
+            if (($position2 = self::search($word, ['gu', 'ci'])) !== false) {
+                // $position2 points at the g (or c); the letter to test and drop is the one after it.
+                if (self::inRv(($position2 + 1), $rvIndex)) {
+                    $word = self::substr($word, 0, -1);
+                }
+            }
+
+            return true;
+        } elseif (self::search($word, ['ç']) !== false) {
+            // Otherwise, a final ç loses its cedilla.
+            $word = preg_replace('#(ç)$#u', 'c', $word);
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Final clean-up of the stem.
+     *
+     * @param string $word Word being stemmed; modified in place.
+     */
+    private static function finish(&$word)
+    {
+        // Turn the two-character markers a~ and o~ back into the letters ã and õ.
+        $word = self::str_replace(['a~', 'o~'], ['ã', 'õ'], $word);
+    }
+
+    /**
+     * Tells whether a string is well-formed UTF-8.
+     *
+     * @param string $str Bytes to validate.
+     *
+     * @return bool True when $str is valid UTF-8 (the empty string included).
+     */
+    private static function check($str)
+    {
+        // A scan that only inspects lead bytes and counts continuation bytes lets through
+        // sequences that are not UTF-8, such as 5- and 6-byte forms. mbstring validates the
+        // whole string instead.
+        return mb_check_encoding($str, 'UTF-8');
+    }
+
+    /**
+     * Unicode aware replacement for strlen()
+     *
+     * utf8_decode() converts characters that are not in ISO-8859-1
+     * to '?', which, for the purpose of counting, is alright - It's
+     * even faster than mb_strlen.
+ * + * @author + * + * @see strlen() + * @see utf8_decode() + */ + private static function strlen($string) + { + return mb_strlen($string, 'UTF-8'); + } + + /** + * Unicode aware replacement for substr() + * + * @author lmak at NOSPAM dot iti dot gr + * + * @link http://www.php.net/manual/en/function.substr.php + * @see substr() + */ + private static function substr($str, $start, $length = null) + { + $ar = []; + preg_match_all('/./u', $str, $ar); + + if ($length != null) { + return join('', array_slice($ar[0], $start, $length)); + } else { + return join('', array_slice($ar[0], $start)); + } + } + + /** + * Unicode aware replacement for strrepalce() + * + * @author Harry Fuecks + * + * @see strreplace(); + */ + private static function str_replace($s, $r, $str) + { + if (!is_array($s)) { + $s = '!' . preg_quote($s, '!') . '!u'; + } else { + foreach ($s as $k => $v) { + $s[$k] = '!' . preg_quote($v) . '!u'; + } + } + return preg_replace($s, $r, $str); + } + + /** + * This is a unicode aware replacement for strtolower() + * + * Uses mb_string extension if available + * + * @author Andreas Gohr + * + * @see strtolower() + * @see utf8_strtoupper() + */ + private static function strtolower($string) + { + if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower')) { + return mb_strtolower($string, 'utf-8'); + } + + //global $utf8_upper_to_lower; + $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper); + $uni = self::utf8_to_unicode($string); + $cnt = count($uni); + for ($i = 0; $i < $cnt; $i++) { + if ($utf8_upper_to_lower[$uni[$i]]) { + $uni[$i] = $utf8_upper_to_lower[$uni[$i]]; + } + } + return self::unicode_to_utf8($uni); + } + + /** + * This function returns any UTF-8 encoded text as a list of + * Unicode values: + * + * @author Scott Michael Reynen + * + * @link http://www.randomchaos.com/document.php?source=php_and_unicode + * @see unicode_to_utf8() + */ + private static function utf8_to_unicode(&$str) + { + $unicode = []; + $values = []; + 
$looking_for = 1; + + for ($i = 0; $i < strlen($str); $i++) { + $this_value = ord($str[$i]); + if ($this_value < 128) { + $unicode[] = $this_value; + } else { + if (count($values) == 0) { + $looking_for = ($this_value < 224) ? 2 : 3; + } + + $values[] = $this_value; + if (count($values) == $looking_for) { + $number = ($looking_for == 3) ? + (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64) : + (($values[0] % 32) * 64) + ($values[1] % 64); + $unicode[] = $number; + $values = []; + $looking_for = 1; + } + } + } + return $unicode; + } + + /** + * This function converts a Unicode array back to its UTF-8 representation + * + * @author Scott Michael Reynen + * + * @link http://www.randomchaos.com/document.php?source=php_and_unicode + * @see utf8_to_unicode() + */ + private static function unicode_to_utf8(&$str) + { + if (!is_array($str)) { + return ''; + } + + $utf8 = ''; + foreach ($str as $unicode) { + if ($unicode < 128) { + $utf8 .= chr($unicode); + } elseif ($unicode < 2048) { + $utf8 .= chr(192 + (($unicode - ($unicode % 64)) / 64)); + $utf8 .= chr(128 + ($unicode % 64)); + } else { + $utf8 .= chr(224 + (($unicode - ($unicode % 4096)) / 4096)); + $utf8 .= chr(128 + ((($unicode % 4096) - ($unicode % 64)) / 64)); + $utf8 .= chr(128 + ($unicode % 64)); + } + } + return $utf8; + } + + /** + * This is an Unicode aware replacement for strrpos + * + * Uses mb_string extension if available + * + * @author Harry Fuecks + * + * @see strpos() + */ + private static function strrpos($haystack, $needle, $offset = 0) + { + if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strrpos')) { + return mb_strrpos($haystack, $needle, $offset, 'utf-8'); + } + + if (!$offset) { + $ar = self::explode($needle, $haystack); + $count = count($ar); + if ($count > 1) { + return self::strlen($haystack) - self::strlen($ar[($count - 1)]) - self::strlen($needle); + } + return false; + } else { + if (!is_int($offset)) { + trigger_error('Offset must be an integer', 
E_USER_WARNING); + return false; + } + + $str = self::substr($haystack, $offset); + + if (false !== ($pos = self::strrpos($str, $needle))) { + return $pos + $offset; + } + return false; + } + } + + /** + * Unicode aware replacement for explode + * + * @author Harry Fuecks + * + * @see explode(); + */ + private static function explode($sep, $str) + { + if ($sep == '') { + trigger_error('Empty delimiter', E_USER_WARNING); + return false; + } + + return preg_split('!' . preg_quote($sep, '!') . '!u', $str); + } +} diff --git a/src/Stemmer/RussianStemmer.php b/src/Stemmer/RussianStemmer.php new file mode 100644 index 0000000..a1e174a --- /dev/null +++ b/src/Stemmer/RussianStemmer.php @@ -0,0 +1,112 @@ +processor = new WP_HTML_Tag_Processor($html); + } + + public function nextToken(): ?Token + { + // Could be text before the first tag? + + if (count($this->buffer) > 0) { + $token = array_shift($this->buffer); + return new Token($this->cursor, $token); + } elseif ($this->nextTag()) { + $this->processTag(); + $words = []; + + if ($this->chunk !== null) { + $words = Str::splitWords($this->chunk); + if (count($words) > 0) { + $this->buffer = $words; + } + } + + $tag = $this->processor->get_tag(); + + // This actually should never happen because of previous guards, the logic should be + // modified to reflect reality. + if ($tag === null) { + return null; + } + + return new Token($this->cursor, $tag, true); + } + + // Could be text after the last tag? 
+ + return null; + } + + private function nextTag(): bool + { + return $this->processor->next_tag(['tag_closers' => 'visit']); + } + + /** + * @return void + */ + private function processTag() + { + $tokenStartsAt = $this->processor->get_token_starts_at(); + $tokenEndsAt = $this->processor->get_token_ends_at(); + + // Need to check if there is a gap between tags; + + if ($this->lastTokenEndsAt !== 0) { + $maybeChunkStartsAt = $this->lastTokenEndsAt + 1; + + if ($tokenStartsAt !== $maybeChunkStartsAt) { + $length = $tokenStartsAt - $maybeChunkStartsAt; + $this->chunk = $this->processor->substr($maybeChunkStartsAt, $length); + } else { + $this->chunk = null; + } + } + + $this->lastTokenEndsAt = $tokenEndsAt; + } +} diff --git a/src/Tokenizer/Tokenizer.php b/src/Tokenizer/Tokenizer.php index a1ded39..d62f1ec 100644 --- a/src/Tokenizer/Tokenizer.php +++ b/src/Tokenizer/Tokenizer.php @@ -10,10 +10,10 @@ class Tokenizer private int $cursor = 0; - public function __construct($text) + public function __construct(string $text) { - Str::splitOnWhitespace($text); $this->text = $text; + Str::splitWords($this->text); } public function getToken(): Token