-
Notifications
You must be signed in to change notification settings - Fork 5
/
analyzer.py
65 lines (48 loc) · 1.67 KB
/
analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
This file contains code samples for analyzers.
Run the following to check the available methods:
.. code-block:: shell
python analyzers.py --help
"""
import typer
import collections
from rich.console import Console
from rich.table import Table
from enum import Enum
from config import INDEX_NAME, client
from helpers import log_titles
from typing import List
# Typer application object; the commands defined below register themselves on
# it via @app.command(...). Created with rich_markup_mode="rich".
app = typer.Typer(rich_markup_mode="rich")
# Shared Rich console used by the commands below for rules and tables.
console = Console()
class BuiltInAnalyzer(str, Enum):
    """Names of the built-in analyzers that the commands in this file can use.

    Members mix in ``str``, so each one compares equal to its plain analyzer
    name (``BuiltInAnalyzer.stop == "stop"``).
    """

    standard = "standard"
    simple = "simple"
    whitespace = "whitespace"
    stop = "stop"
    keyword = "keyword"
    pattern = "pattern"
    fingerprint = "fingerprint"

    def __str__(self) -> str:
        """Return the plain analyzer name (the member's value)."""
        # Python 3.11 changed format()/f-strings on mixed-in enums to follow
        # Enum.__str__ ("BuiltInAnalyzer.standard"), whereas earlier versions
        # formatted the str value ("standard"). Returning the value here keeps
        # printed output identical on every Python version.
        return self.value
# Registered on `app` as the "test" command. Sends `text` to the analyze API
# with the chosen analyzer and prints the analyzer name followed by the list
# of returned tokens.
# NOTE: the docstring is deliberately a single line, because Typer shows the
# docstring as the command's help text.
@app.command("test")
def test_analyzer(
    text: str,
    analyzer: BuiltInAnalyzer = BuiltInAnalyzer.standard,
) -> None:
    """Tokenize your input with a built-in analyzer of your choice"""
    # Work with the plain analyzer name. Formatting a (str, Enum) member in an
    # f-string yields "BuiltInAnalyzer.standard" on Python 3.11+ but
    # "standard" on earlier versions; using the value makes the printed output
    # (and the request body) the same everywhere. A plain string passed by a
    # direct caller is accepted unchanged.
    analyzer_name = analyzer.value if isinstance(analyzer, Enum) else str(analyzer)
    res = client.indices.analyze(body={"analyzer": analyzer_name, "text": [text]})
    # Each entry of res["tokens"] is a mapping; only its "token" field is shown.
    tokens = [sample["token"] for sample in res["tokens"]]
    print(f"{analyzer_name} \n")
    print(f"Tokens: {tokens} \n")
# Registered on `app` as the "test_all" command. Runs `text` through every
# member of BuiltInAnalyzer and prints one table row per analyzer.
# NOTE: the docstring is deliberately a single line, because Typer shows the
# docstring as the command's help text.
@app.command("test_all")
def test_analyzers(text: str) -> None:
    """Check how your input is tokenized with all OpenSearch built-in analyzers"""
    # Local import: only this command needs it, and it comes from the `rich`
    # package the file already depends on.
    from rich.markup import escape

    console.rule("[bold red]OpenSearch® Built-in Analyzers")
    table = Table()
    table.add_column("Analyzer", justify="right", style="cyan", no_wrap=True)
    table.add_column("Tokens", style="magenta")
    for analyzer in BuiltInAnalyzer:
        # Use the plain value: an f-string of a (str, Enum) member renders as
        # "BuiltInAnalyzer.standard" on Python 3.11+.
        analyzer_name = analyzer.value
        res = client.indices.analyze(body={"analyzer": analyzer_name, "text": [text]})
        tokens = [sample["token"] for sample in res["tokens"]]
        # Tokens come from user input and may contain square brackets (e.g.
        # "[/b]" with the whitespace analyzer). Escape them so Rich prints
        # them literally instead of parsing them as markup tags.
        table.add_row(analyzer_name, escape(f"{tokens}"))
    console.print(table)
# Run the Typer app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    app()