feat: add stemmers from tntsearch

wp-blocks · Dec 15, 2023 · 0340d19 · 0340d19
1 parent e39129f
commit 0340d19
Show file tree

Hide file tree

Showing 33 changed files with 6,890 additions and 19 deletions.
diff --git a/bin/local-wp-shell.sh b/bin/local-wp-shell.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+docker compose --file docker-compose.yml --env-file .env exec -w /var/www/html/wp-content/plugins/indexed-search wordpress bash -c "vendor/bin/wp --allow-root shell"
diff --git a/phpstan.neon.dist b/phpstan.neon.dist
@@ -13,3 +13,7 @@ parameters:
     scanDirectories:
         - tests
         - vendor-prod
+    excludePaths:
+        - src/Compat/WP_HTML_Tag_Processor.php
+        - src/Stemmer/*
+        - tests/*/_wordpress/*
diff --git a/src/Tokenizer/WP_HTML_Tag_Processor.php → src/Compat/WP_HTML_Tag_Processor.php b/src/Tokenizer/WP_HTML_Tag_Processor.php → src/Compat/WP_HTML_Tag_Processor.php
@@ -1,6 +1,6 @@
 <?php
 
-namespace WpBlocks\Search\Tokenizer;
+namespace WpBlocks\Search\Compat;
 
 use WP_HTML_Attribute_Token;
 use WP_HTML_Span;
@@ -53,7 +53,7 @@
  *    no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
  *    or may not terminate a character reference.
  *
- * @package WordPress
+ * @package    WordPress
  * @subpackage HTML-API
  *
  * @since 6.2.0
@@ -715,7 +715,9 @@ public function next_tag($query = null)
      */
     public function class_list()
     {
-        /** @var string $class contains the string value of the class attribute, with character references decoded. */
+        /**
+         * @var string $class contains the string value of the class attribute, with character references decoded.
+         */
         $class = $this->get_attribute('class');
 
         if (!is_string($class)) {
@@ -912,7 +914,7 @@ public function release_bookmark($name)
      * Skips contents of generic rawtext elements.
      *
      * @since 6.3.2
-     * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
+     * @see   https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
      *
      * @param string $tag_name The uppercase tag name which will close the RAWTEXT region.
      *
@@ -932,7 +934,7 @@ private function skip_rawtext($tag_name)
      * Skips contents of RCDATA elements, namely title and textarea tags.
      *
      * @since 6.2.0
-     * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
+     * @see   https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
      *
      * @param string $tag_name The uppercase tag name which will close the RCDATA region.
      *
@@ -1518,8 +1520,8 @@ private function after_tag()
      * (they are accumulated in different data formats for performance).
      *
      * @since 6.2.0
-     * @see WP_HTML_Tag_Processor::$lexical_updates
-     * @see WP_HTML_Tag_Processor::$classname_updates
+     * @see   WP_HTML_Tag_Processor::$lexical_updates
+     * @see   WP_HTML_Tag_Processor::$classname_updates
      */
     private function class_name_updates_to_attributes_updates()
     {
@@ -1551,7 +1553,6 @@ private function class_name_updates_to_attributes_updates()
          * attribute, skipping removed classes on the way, and then appending
          * added classes at the end. Only when finished processing will the
          * value contain the final new value.
-
          *
          * @var string $class
          */
@@ -1999,7 +2000,7 @@ public function get_attribute($name)
      *     $p->get_attribute_names_with_prefix( 'data-' ) === null;
      *
      * @since 6.2.0
-     * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
+     * @see   https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
      *
      * @param string $prefix Prefix of requested attribute names.
      *
@@ -2374,7 +2375,7 @@ public function remove_class($class_name)
      * Returns the string representation of the HTML Tag Processor.
      *
      * @since 6.2.0
-     * @see WP_HTML_Tag_Processor::get_updated_html()
+     * @see   WP_HTML_Tag_Processor::get_updated_html()
      *
      * @return string The processed HTML.
      */
@@ -2567,4 +2568,27 @@ private function matches()
 
         return true;
     }
+
+    /*
+     * Modifications by WpBlocks\Search
+     */
+
+    /**
+     * Exposes the processor's `token_starts_at` property to outside callers.
+     *
+     * NOTE(review): presumably the byte offset in the HTML document where the
+     * current token begins; confirm against the property's declaration.
+     *
+     * @return mixed The current value of `$this->token_starts_at`, unmodified.
+     */
+    public function get_token_starts_at()
+    {
+        $starts_at = $this->token_starts_at;
+
+        return $starts_at;
+    }
+
+    /**
+     * Returns the sum of `token_starts_at` and `token_length`.
+     *
+     * NOTE(review): presumably the offset just past the end of the current
+     * token; confirm against the declarations of both properties.
+     *
+     * @return mixed Result of `$this->token_starts_at + $this->token_length`.
+     */
+    public function get_token_ends_at()
+    {
+        $starts_at = $this->token_starts_at;
+        $length    = $this->token_length;
+
+        return $starts_at + $length;
+    }
+
+    /**
+     * Exposes the processor's `is_closing_tag` property to outside callers.
+     *
+     * NOTE(review): presumably true while the current token is a closing tag;
+     * confirm against the property's declaration.
+     *
+     * @return mixed The current value of `$this->is_closing_tag`, unmodified.
+     */
+    public function get_is_closing_tag()
+    {
+        $is_closing = $this->is_closing_tag;
+
+        return $is_closing;
+    }
+
+    /**
+     * Returns part of the HTML document held in `$this->html`.
+     *
+     * Thin wrapper around PHP's byte-based substr(). A null `$length` is
+     * handled by omitting the argument: only PHP 8.0+ treats an explicit null
+     * length as "up to the end of the string", while older versions coerce it
+     * to 0 and return an empty string.
+     *
+     * @param int      $offset Start offset, interpreted exactly as by substr().
+     * @param int|null $length Maximum length, or null for "up to the end of the string".
+     *
+     * @return string|false The extracted part. On PHP < 8.0 substr() returns
+     *                      false for an out-of-range offset; PHP 8.0+ returns ''.
+     */
+    public function substr(int $offset, ?int $length = null)
+    {
+        if ($length === null) {
+            return substr($this->html, $offset);
+        }
+
+        return substr($this->html, $offset, $length);
+    }
 }
diff --git a/src/Exceptions/Exception.php b/src/Exceptions/Exception.php
diff --git a/src/Exceptions/IndexNotFoundException.php b/src/Exceptions/IndexNotFoundException.php
@@ -0,0 +1,5 @@
+<?php
+
+namespace WpBlocks\Search\Exceptions;
+
+/**
+ * Specialisation of IndexedSearchException that adds no members of its own.
+ *
+ * NOTE(review): judging by the name it signals that a requested index does
+ * not exist; confirm against the code that throws it.
+ */
+class IndexNotFoundException extends IndexedSearchException
+{
+}
diff --git a/src/Exceptions/IndexedSearchException.php b/src/Exceptions/IndexedSearchException.php
@@ -0,0 +1,7 @@
+<?php
+
+namespace WpBlocks\Search\Exceptions;
+
+use Exception;
+
+/**
+ * Plugin-specific exception type built directly on PHP's Exception.
+ *
+ * It adds no members of its own; it exists so that plugin errors can be
+ * caught under a single type.
+ */
+class IndexedSearchException extends Exception
+{
+}
diff --git a/src/Exceptions/TokenizationException.php b/src/Exceptions/TokenizationException.php
@@ -0,0 +1,5 @@
+<?php
+
+namespace WpBlocks\Search\Exceptions;
+
+/**
+ * Specialisation of IndexedSearchException that adds no members of its own.
+ *
+ * NOTE(review): judging by the name it signals that text could not be split
+ * into tokens; confirm against the code that throws it.
+ */
+class TokenizationException extends IndexedSearchException
+{
+}
diff --git a/src/Helpers/Str.php b/src/Helpers/Str.php
@@ -5,10 +5,36 @@
 class Str
 {
     /**
-     * @return string[]
+     * Inspired by TNTSearch
+     * https://github.com/teamtnt/tntsearch/blob/c8863c626a47bcb73f860abfe8eed9fb3cde3be8/src/Support/Tokenizer.php
+     *
+     * Delimiter pattern used by splitWords(): a run of one or more characters
+     * that are not a letter (\p{L}), a number (\p{N}), connector punctuation
+     * (\p{Pc}), dash punctuation (\p{Pd}) or "@". The "u" modifier makes the
+     * pattern and the subject be treated as UTF-8.
+     */
+    protected static string $splitWordsPattern = '/[^\p{L}\p{N}\p{Pc}\p{Pd}@]+/u';
+
+    /**
+     * Splits the text on runs of whitespace (\s+) and drops empty pieces.
+     *
+     * NOTE(review): the exception is thrown without a message, so the PCRE
+     * failure reason (see preg_last_error()) is not reported to the caller.
+     *
+     * @param string $text Text to split.
+     *
+     * @throws \WpBlocks\Search\Exceptions\TokenizationException When preg_split() returns false.
+     *
+     * @return array<int,string>
      */
     public static function splitOnWhitespace(string $text): array
     {
-        return preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
+        $result = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
+
+        if ($result === false) {
+            throw new \WpBlocks\Search\Exceptions\TokenizationException();
+        }
+
+        return $result;
+    }
+
+    /**
+     * Splits the text into words, using self::$splitWordsPattern as the
+     * delimiter and dropping empty pieces.
+     *
+     * NOTE(review): the exception is thrown without a message, so the PCRE
+     * failure reason (see preg_last_error()) is not reported to the caller.
+     *
+     * @param string $text Text to split.
+     *
+     * @throws \WpBlocks\Search\Exceptions\TokenizationException When preg_split() returns false.
+     *
+     * @return array<int,string>
+     */
+    public static function splitWords(string $text): array
+    {
+        $result = preg_split(self::$splitWordsPattern, $text, -1, PREG_SPLIT_NO_EMPTY);
+
+        if ($result === false) {
+            throw new \WpBlocks\Search\Exceptions\TokenizationException();
+        }
+
+        return $result;
     }
 }
diff --git a/src/Index/IndexRepository.php b/src/Index/IndexRepository.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace WpBlocks\Search\Index\IndexRepository;
+
+/**
+ * Contract for creating search indexes; create() is the only operation declared.
+ *
+ * NOTE(review): this file is src/Index/IndexRepository.php, yet it declares the
+ * namespace WpBlocks\Search\Index\IndexRepository, which makes the fully
+ * qualified name WpBlocks\Search\Index\IndexRepository\IndexRepository. The
+ * other files in this change map src/<Dir>/ to WpBlocks\Search\<Dir>, so a
+ * PSR-4 autoloader would presumably not find this interface; confirm whether
+ * the namespace should be WpBlocks\Search\Index.
+ */
+interface IndexRepository
+{
+    /**
+     * Creates an index with the given name.
+     *
+     * @param string $name Name of the index to create.
+     *
+     * @return int Presumably the identifier of the created index; TODO confirm
+     *             against the implementations.
+     */
+    public function create(string $name): int;
+}
diff --git a/src/Stemmer/ArabicStemmer.php b/src/Stemmer/ArabicStemmer.php
@@ -0,0 +1,180 @@
+<?php
+
+/*
+ * This is a reimplementation of AR-PHP Arabic stemmer.
+ * The original author is Khaled Al-Sham'aa <[email protected]>
+ *
+ * Simple stemmer for the Arabic language, rewritten by Nenad Tičarić
+ *
+ * @link https://github.com/teamtnt/tntsearch
+ *
+ * Copyright (c) 2016 Nenad Tičarić [email protected]
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+namespace WpBlocks\Search\Stemmer;
+
+/**
+ * Rough affix-stripping stemmer for Arabic words.
+ *
+ * A word is stemmed twice, once with the noun affix tables and once with the
+ * verb affix tables, and the shorter of the two stems is returned.
+ *
+ * The affix tables are class constants, so the static stem() method behaves
+ * the same whether or not an instance has ever been constructed.
+ */
+class ArabicStemmer implements Stemmer
+{
+    /** Letters that may occur in a verb prefix. */
+    private const VERB_PRE = 'وأسفلي';
+
+    /** Letters that may occur in a verb suffix. */
+    private const VERB_POST = 'ومكانيه';
+
+    /** Every verb affix letter: the prefix letters followed by the suffix letters. */
+    private const VERB_MAY = self::VERB_PRE . self::VERB_POST;
+
+    /** Maximum verb prefix length, in characters. */
+    private const VERB_MAX_PRE = 4;
+
+    /** Maximum verb suffix length, in characters. */
+    private const VERB_MAX_POST = 6;
+
+    /** Minimum verb stem length passed to roughStem(). */
+    private const VERB_MIN_STEM = 2;
+
+    /** Letters that may occur in a noun prefix. */
+    private const NOUN_PRE = 'ابفكلوأ';
+
+    /** Letters that may occur in a noun suffix. */
+    private const NOUN_POST = 'اتةكمنهوي';
+
+    /** Every noun affix letter: the prefix letters followed by the suffix letters. */
+    private const NOUN_MAY = self::NOUN_PRE . self::NOUN_POST;
+
+    /** Maximum noun prefix length, in characters. */
+    private const NOUN_MAX_PRE = 4;
+
+    /** Maximum noun suffix length, in characters. */
+    private const NOUN_MAX_POST = 6;
+
+    /** Minimum noun stem length passed to roughStem(). */
+    private const NOUN_MIN_STEM = 2;
+
+    /**
+     * Kept so that existing `new ArabicStemmer()` calls keep working.
+     *
+     * The affix tables used to be assembled here, which left the static
+     * stem() method without them until an instance had been created. They are
+     * constants now, so there is nothing left to initialise.
+     *
+     * @ignore
+     */
+    public function __construct()
+    {
+    }
+
+    /**
+     * Get rough stem of the given Arabic word
+     *
+     * The word is stemmed with the noun rules and with the verb rules. When
+     * both succeed the shorter stem wins (the verb stem on a tie). When only
+     * one succeeds that stem is returned, and when neither does the word is
+     * returned unchanged, so the result is never null.
+     *
+     * @param string $word Arabic word you would like to get its stem
+     *
+     * @return string Arabic stem of the word
+     *
+     * @author Khaled Al-Sham'aa <[email protected]>
+     */
+    public static function stem($word)
+    {
+        $nounStem = self::roughStem(
+            $word,
+            self::NOUN_MAY,
+            self::NOUN_PRE,
+            self::NOUN_POST,
+            self::NOUN_MAX_PRE,
+            self::NOUN_MAX_POST,
+            self::NOUN_MIN_STEM
+        );
+        $verbStem = self::roughStem(
+            $word,
+            self::VERB_MAY,
+            self::VERB_PRE,
+            self::VERB_POST,
+            self::VERB_MAX_PRE,
+            self::VERB_MAX_POST,
+            self::VERB_MIN_STEM
+        );
+
+        // A failed attempt (null) must not win the "shorter stem" comparison.
+        if ($nounStem === null) {
+            return $verbStem ?? $word;
+        }
+
+        if ($verbStem === null) {
+            return $nounStem;
+        }
+
+        if (mb_strlen($nounStem, 'UTF-8') < mb_strlen($verbStem, 'UTF-8')) {
+            return $nounStem;
+        }
+
+        return $verbStem;
+    }
+
+    /**
+     * Get rough stem of the given Arabic word (under specific rules)
+     *
+     * $right is the index where the stem starts and $left the index where it
+     * ends, both counted in characters from the beginning of the string.
+     *
+     * @param string  $word      Arabic word you would like to get its stem
+     * @param string  $notChars  All letters that may belong to an affix; letters
+     *                           outside this set mark the core of the word
+     * @param string  $preChars  Arabic chars that may exist in the prefix
+     * @param string  $postChars Arabic chars that may exist in the postfix
+     * @param integer $maxPre    Max prefix length
+     * @param integer $maxPost   Max postfix length
+     * @param integer $minStem   Lower bound for ($left - $right)
+     *
+     * @return string|null Arabic stem of the word under the given rules, or
+     *                     null when $left - $right is smaller than $minStem
+     *
+     * @author Khaled Al-Sham'aa <[email protected]>
+     */
+    protected static function roughStem(
+        $word,
+        $notChars,
+        $preChars,
+        $postChars,
+        $maxPre,
+        $maxPost,
+        $minStem
+    ) {
+        $right = -1;
+        $left  = -1;
+        $max   = mb_strlen($word, 'UTF-8');
+
+        // Find the first and the last letter that cannot be part of an affix.
+        for ($i = 0; $i < $max; $i++) {
+            $needle = mb_substr($word, $i, 1, 'UTF-8');
+            if (mb_strpos($notChars, $needle, 0, 'UTF-8') === false) {
+                if ($right === -1) {
+                    $right = $i;
+                }
+                $left = $i;
+            }
+        }
+
+        // Never strip more than the allowed prefix and postfix lengths.
+        if ($right > $maxPre) {
+            $right = $maxPre;
+        }
+
+        if ($max - $left - 1 > $maxPost) {
+            $left = $max - $maxPost - 1;
+        }
+
+        // The prefix ends at the first letter that is not a prefix letter.
+        for ($i = 0; $i < $right; $i++) {
+            $needle = mb_substr($word, $i, 1, 'UTF-8');
+            if (mb_strpos($preChars, $needle, 0, 'UTF-8') === false) {
+                $right = $i;
+                break;
+            }
+        }
+
+        // The postfix starts after the last letter that is not a postfix letter.
+        for ($i = $max - 1; $i > $left; $i--) {
+            $needle = mb_substr($word, $i, 1, 'UTF-8');
+            if (mb_strpos($postChars, $needle, 0, 'UTF-8') === false) {
+                $left = $i;
+                break;
+            }
+        }
+
+        if ($left - $right < $minStem) {
+            return null;
+        }
+
+        return mb_substr($word, $right, $left - $right + 1, 'UTF-8');
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/usr/bin/env bash

		# Opens an interactive WP-CLI shell ("wp shell") inside the "wordpress" compose
		# service, using the plugin directory as the working directory. The compose
		# file and the env file are looked up relative to the current directory.
		docker compose \
		    --file docker-compose.yml \
		    --env-file .env \
		    exec -w /var/www/html/wp-content/plugins/indexed-search wordpress \
		    bash -c "vendor/bin/wp --allow-root shell"