Skip to content

Commit

Permalink
feat: add stemmers from tntsearch
Browse files Browse the repository at this point in the history
  • Loading branch information
johnhooks committed Dec 15, 2023
1 parent e39129f commit 0340d19
Show file tree
Hide file tree
Showing 33 changed files with 6,890 additions and 19 deletions.
3 changes: 3 additions & 0 deletions bin/local-wp-shell.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

# Opens an interactive WP-CLI shell inside the `wordpress` compose service.
# The command is executed with the plugin directory as working directory so
# that the relative `vendor/bin/wp` path resolves inside the container.
# docker-compose.yml and .env are resolved relative to the caller's current
# directory.
docker compose \
    --file docker-compose.yml \
    --env-file .env \
    exec \
    -w /var/www/html/wp-content/plugins/indexed-search \
    wordpress \
    bash -c "vendor/bin/wp --allow-root shell"
4 changes: 4 additions & 0 deletions phpstan.neon.dist
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ parameters:
scanDirectories:
- tests
- vendor-prod
excludePaths:
- src/Compat/WP_HTML_Tag_Processor.php
- src/Stemmer/*
- tests/*/_wordpress/*
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

namespace WpBlocks\Search\Tokenizer;
namespace WpBlocks\Search\Compat;

use WP_HTML_Attribute_Token;
use WP_HTML_Span;
Expand Down Expand Up @@ -53,7 +53,7 @@
* no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
* or may not terminate a character reference.
*
* @package WordPress
* @package WordPress
* @subpackage HTML-API
*
* @since 6.2.0
Expand Down Expand Up @@ -715,7 +715,9 @@ public function next_tag($query = null)
*/
public function class_list()
{
/** @var string $class contains the string value of the class attribute, with character references decoded. */
/**
* @var string $class contains the string value of the class attribute, with character references decoded.
*/
$class = $this->get_attribute('class');

if (!is_string($class)) {
Expand Down Expand Up @@ -912,7 +914,7 @@ public function release_bookmark($name)
* Skips contents of generic rawtext elements.
*
* @since 6.3.2
* @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
* @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
*
* @param string $tag_name The uppercase tag name which will close the RAWTEXT region.
*
Expand All @@ -932,7 +934,7 @@ private function skip_rawtext($tag_name)
* Skips contents of RCDATA elements, namely title and textarea tags.
*
* @since 6.2.0
* @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
* @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
*
* @param string $tag_name The uppercase tag name which will close the RCDATA region.
*
Expand Down Expand Up @@ -1518,8 +1520,8 @@ private function after_tag()
* (they are accumulated in different data formats for performance).
*
* @since 6.2.0
* @see WP_HTML_Tag_Processor::$lexical_updates
* @see WP_HTML_Tag_Processor::$classname_updates
* @see WP_HTML_Tag_Processor::$lexical_updates
* @see WP_HTML_Tag_Processor::$classname_updates
*/
private function class_name_updates_to_attributes_updates()
{
Expand Down Expand Up @@ -1551,7 +1553,6 @@ private function class_name_updates_to_attributes_updates()
* attribute, skipping removed classes on the way, and then appending
* added classes at the end. Only when finished processing will the
* value contain the final new value.
*
* @var string $class
*/
Expand Down Expand Up @@ -1999,7 +2000,7 @@ public function get_attribute($name)
* $p->get_attribute_names_with_prefix( 'data-' ) === null;
*
* @since 6.2.0
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*
* @param string $prefix Prefix of requested attribute names.
*
Expand Down Expand Up @@ -2374,7 +2375,7 @@ public function remove_class($class_name)
* Returns the string representation of the HTML Tag Processor.
*
* @since 6.2.0
* @see WP_HTML_Tag_Processor::get_updated_html()
* @see WP_HTML_Tag_Processor::get_updated_html()
*
* @return string The processed HTML.
*/
Expand Down Expand Up @@ -2567,4 +2568,27 @@ private function matches()

return true;
}

/**
 * Modifications by WpBlocks\Search
 *
 * Exposes the processor's `token_starts_at` property to callers.
 *
 * NOTE(review): presumably the byte offset in the HTML document at which the
 * current token begins — confirm against the property's declaration.
 *
 * @return mixed The current value of `$this->token_starts_at`.
 */
public function get_token_starts_at()
{
return $this->token_starts_at;
}

/**
 * Returns the sum of the `token_starts_at` and `token_length` properties.
 *
 * NOTE(review): presumably the offset of the first byte after the current
 * token (an exclusive end position) — confirm against how the two properties
 * are maintained by the parser.
 *
 * @return mixed `$this->token_starts_at + $this->token_length`.
 */
public function get_token_ends_at()
{
return $this->token_starts_at + $this->token_length;
}

/**
 * Exposes the processor's `is_closing_tag` property to callers.
 *
 * NOTE(review): presumably true while the matched token is a closing tag
 * such as `</div>` — confirm against the property's declaration.
 *
 * @return mixed The current value of `$this->is_closing_tag`.
 */
public function get_is_closing_tag()
{
return $this->is_closing_tag;
}

/**
 * Returns a slice of the HTML document held by the processor.
 *
 * The slice is taken with the byte-wise substr() function, so `$offset` and
 * `$length` are counted in bytes, not in characters.
 *
 * @param int      $offset Start position, passed to substr() unchanged.
 * @param int|null $length Number of bytes to return, or null to read through
 *                         the end of the document.
 *
 * @return string|false The requested slice. PHP versions before 8.0 return
 *                      false when `$offset` lies beyond the end of the string.
 */
public function substr(int $offset, ?int $length = null)
{
    // substr() treats a null length as "until the end" only since PHP 8.0;
    // earlier versions coerce null to 0 and return an empty string. Omitting
    // the argument gives the intended result on every version.
    if ($length === null) {
        return substr($this->html, $offset);
    }

    return substr($this->html, $offset, $length);
}
}
5 changes: 0 additions & 5 deletions src/Exceptions/Exception.php

This file was deleted.

5 changes: 5 additions & 0 deletions src/Exceptions/IndexNotFoundException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?php

namespace WpBlocks\Search\Exceptions;

/**
 * Exception type for a missing search index.
 *
 * NOTE(review): no throw site is visible in this file; presumably raised when
 * a requested index cannot be found — confirm against the callers.
 */
class IndexNotFoundException extends IndexedSearchException {}
7 changes: 7 additions & 0 deletions src/Exceptions/IndexedSearchException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?php

namespace WpBlocks\Search\Exceptions;

use Exception;

/**
 * Common parent of the exceptions declared in this namespace
 * (IndexNotFoundException and TokenizationException extend it), which lets
 * callers catch any of them with a single type.
 */
class IndexedSearchException extends Exception {}
5 changes: 5 additions & 0 deletions src/Exceptions/TokenizationException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?php

namespace WpBlocks\Search\Exceptions;

/**
 * Raised by the Str helper when preg_split() returns false while splitting
 * text into tokens.
 */
class TokenizationException extends IndexedSearchException {}
30 changes: 28 additions & 2 deletions src/Helpers/Str.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,36 @@
class Str
{
    /**
     * Pattern matching every run of characters that separates two words.
     *
     * Letters (\p{L}), numbers (\p{N}), connector punctuation (\p{Pc}), dash
     * punctuation (\p{Pd}) and "@" are treated as word characters; any other
     * character is a separator.
     *
     * Inspired by TNTSearch
     * https://github.com/teamtnt/tntsearch/blob/c8863c626a47bcb73f860abfe8eed9fb3cde3be8/src/Support/Tokenizer.php
     */
    protected static string $splitWordsPattern = '/[^\p{L}\p{N}\p{Pc}\p{Pd}@]+/u';

    /**
     * Splits text on runs of whitespace, dropping empty pieces.
     *
     * @return array<int,string>
     *
     * @throws \WpBlocks\Search\Exceptions\TokenizationException When preg_split() fails.
     */
    public static function splitOnWhitespace(string $text): array
    {
        return self::splitByPattern('/\s+/', $text);
    }

    /**
     * Splits text into words using the word-separator pattern, dropping
     * empty pieces.
     *
     * @return array<int,string>
     *
     * @throws \WpBlocks\Search\Exceptions\TokenizationException When preg_split() fails,
     *                                                           e.g. on malformed UTF-8 input.
     */
    public static function splitWords(string $text): array
    {
        return self::splitByPattern(self::$splitWordsPattern, $text);
    }

    /**
     * Runs preg_split() and converts its `false` failure value into an exception.
     *
     * @return array<int,string>
     *
     * @throws \WpBlocks\Search\Exceptions\TokenizationException When preg_split() fails.
     */
    private static function splitByPattern(string $pattern, string $text): array
    {
        $result = preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($result === false) {
            // Carry the PCRE error code so the failure can be diagnosed.
            $code = preg_last_error();

            throw new \WpBlocks\Search\Exceptions\TokenizationException(
                sprintf('preg_split() failed with PCRE error code %d.', $code),
                $code
            );
        }

        return $result;
    }
}
8 changes: 8 additions & 0 deletions src/Index/IndexRepository.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?php

namespace WpBlocks\Search\Index\IndexRepository;

/**
 * Contract for a store in which search indexes can be created.
 *
 * NOTE(review): the namespace declared for this file ends in `\IndexRepository`,
 * which makes the fully qualified name `...\Index\IndexRepository\IndexRepository`.
 * Confirm that this matches the autoload mapping for src/Index/IndexRepository.php.
 */
interface IndexRepository
{
/**
 * Creates an index with the given name.
 *
 * @param string $name Name of the index.
 *
 * @return int Presumably the identifier of the created index — TODO confirm
 *             against the implementations.
 */
public function create(string $name): int;
}
180 changes: 180 additions & 0 deletions src/Stemmer/ArabicStemmer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
<?php

/*
* This is a reimplementation of AR-PHP Arabic stemmer.
* The original author is Khaled Al-Sham'aa <[email protected]>
*
* Simple stemmer for arabic language rewritten by Nenad Tičarić
*
* @link https://github.com/teamtnt/tntsearch
*
* Copyright (c) 2016 Nenad Tičarić [email protected]
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

namespace WpBlocks\Search\Stemmer;

class ArabicStemmer implements Stemmer
{
    /** Characters that may appear in the prefix of a verb. */
    private const VERB_PRE = 'وأسفلي';

    /** Characters that may appear in the postfix of a verb. */
    private const VERB_POST = 'ومكانيه';

    /** Every character that may belong to a verb affix (prefix or postfix). */
    private const VERB_MAY = self::VERB_PRE . self::VERB_POST;

    private const VERB_MAX_PRE = 4;

    private const VERB_MAX_POST = 6;

    private const VERB_MIN_STEM = 2;

    /** Characters that may appear in the prefix of a noun. */
    private const NOUN_PRE = 'ابفكلوأ';

    /** Characters that may appear in the postfix of a noun. */
    private const NOUN_POST = 'اتةكمنهوي';

    /** Every character that may belong to a noun affix (prefix or postfix). */
    private const NOUN_MAY = self::NOUN_PRE . self::NOUN_POST;

    private const NOUN_MAX_PRE = 4;

    private const NOUN_MAX_POST = 6;

    private const NOUN_MIN_STEM = 2;

    /**
     * Kept so that existing `new ArabicStemmer()` and `parent::__construct()`
     * calls keep working. The affix tables are class constants, so the static
     * stem() method no longer depends on an instance having been created.
     *
     * @ignore
     */
    public function __construct()
    {
    }

    /**
     * Get rough stem of the given Arabic word
     *
     * The word is stemmed once with the noun rules and once with the verb
     * rules. When both succeed the shorter stem is returned (the verb stem on
     * a tie). When only one rule set yields a stem, that stem is returned, and
     * when neither does, the word is returned unchanged, so the result is
     * always a string.
     *
     * @param string $word Arabic word you would like to get its stem
     *
     * @return string Arabic stem of the word
     *
     * @author Khaled Al-Sham'aa
     */
    public static function stem($word)
    {
        $nounStem = self::roughStem(
            $word,
            self::NOUN_MAY,
            self::NOUN_PRE,
            self::NOUN_POST,
            self::NOUN_MAX_PRE,
            self::NOUN_MAX_POST,
            self::NOUN_MIN_STEM
        );
        $verbStem = self::roughStem(
            $word,
            self::VERB_MAY,
            self::VERB_PRE,
            self::VERB_POST,
            self::VERB_MAX_PRE,
            self::VERB_MAX_POST,
            self::VERB_MIN_STEM
        );

        // roughStem() returns null when no stem of sufficient length exists.
        // A null must not take part in the length comparison: it would count
        // as length 0, always win, and make this method return null.
        if ($nounStem === null || $verbStem === null) {
            return $nounStem ?? $verbStem ?? (string) $word;
        }

        if (mb_strlen($nounStem, 'UTF-8') < mb_strlen($verbStem, 'UTF-8')) {
            return $nounStem;
        }

        return $verbStem;
    }

    /**
     * Get rough stem of the given Arabic word (under specific rules)
     *
     * @param string $word      Arabic word you would like to get its stem
     * @param string $notChars  Arabic chars that may belong to a prefix or postfix;
     *                          any other char is treated as part of the stem
     * @param string $preChars  Arabic chars that may exist in the prefix
     * @param string $postChars Arabic chars that may exist in the postfix
     * @param int    $maxPre    Max prefix length
     * @param int    $maxPost   Max postfix length
     * @param int    $minStem   Min distance between the first and the last stem char
     *
     * @return string|null Arabic stem of the word under the given rules, or null
     *                     when the remaining stem would be too short
     *
     * @author Khaled Al-Sham'aa
     */
    protected static function roughStem(
        $word,
        $notChars,
        $preChars,
        $postChars,
        $maxPre,
        $maxPost,
        $minStem
    ) {
        $right = -1;
        $left = -1;
        $max = mb_strlen($word, 'UTF-8');

        // Locate the first ($right) and the last ($left) character that can
        // not be part of an affix; they bound the guaranteed core of the stem.
        for ($i = 0; $i < $max; $i++) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($notChars, $needle, 0, 'UTF-8') === false) {
                if ($right === -1) {
                    $right = $i;
                }
                $left = $i;
            }
        }

        // NOTE(review): when every character is an affix candidate, $right
        // stays -1 and can reach mb_substr() below as a negative start. This
        // is inherited from the upstream implementation — confirm the intent.

        // Limit the affixes to their maximum lengths.
        if ($right > $maxPre) {
            $right = $maxPre;
        }

        if ($max - $left - 1 > $maxPost) {
            $left = $max - $maxPost - 1;
        }

        // The prefix ends at the first character that is not a prefix char.
        for ($i = 0; $i < $right; $i++) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($preChars, $needle, 0, 'UTF-8') === false) {
                $right = $i;
                break;
            }
        }

        // The postfix starts after the last character that is not a postfix char.
        for ($i = $max - 1; $i > $left; $i--) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($postChars, $needle, 0, 'UTF-8') === false) {
                $left = $i;
                break;
            }
        }

        if ($left - $right >= $minStem) {
            return mb_substr($word, $right, $left - $right + 1, 'UTF-8');
        }

        return null;
    }
}
Loading

0 comments on commit 0340d19

Please sign in to comment.