Skip to content

Commit

Permalink
Add a new configuration setting synonym_analyzer for synonym_graph an…
Browse files Browse the repository at this point in the history
…d synonym (#16625)

Signed-off-by: Prudhvi Godithi <[email protected]>
  • Loading branch information
prudhvigodithi authored Nov 13, 2024
1 parent 0cacc98 commit 88c7b20
Show file tree
Hide file tree
Showing 10 changed files with 250 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add dynamic setting allowing size > 0 requests to be cached in the request cache ([#16483](https://github.com/opensearch-project/OpenSearch/pull/16483))
- Make IndexStoreListener a pluggable interface ([#16583](https://github.com/opensearch-project/OpenSearch/pull/16583))
- Add vertical scaling and SoftReference for snapshot repository data cache ([#16489](https://github.com/opensearch-project/OpenSearch/pull/16489))
- Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).

### Dependencies
- Bump `com.google.apis:google-api-services-compute` from v1-rev20240407-2.0.0 to v1-rev20241021-2.0.0 ([#16502](https://github.com/opensearch-project/OpenSearch/pull/16502), [#16548](https://github.com/opensearch-project/OpenSearch/pull/16548))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.opensearch.plugins.AnalysisPlugin;
Expand Down Expand Up @@ -247,7 +248,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
filters.put("apostrophe", ApostropheFilterFactory::new);
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
Expand Down Expand Up @@ -325,14 +326,36 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new));
filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new));
filters.put("trim", TrimTokenFilterFactory::new);
filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
filters.put("unique", UniqueTokenFilterFactory::new);
filters.put("uppercase", UpperCaseTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put(
"synonym",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
filters.put(
"synonym_graph",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymGraphTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
return filters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
Expand All @@ -49,8 +50,14 @@

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
SynonymGraphTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, env, name, settings, analysisRegistry);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.CustomAnalyzer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
Expand All @@ -64,8 +66,16 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
protected final Settings settings;
protected final Environment environment;
protected final AnalysisMode analysisMode;

SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
private final String synonymAnalyzerName;
private final AnalysisRegistry analysisRegistry;

SynonymTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, name, settings);
this.settings = settings;

Expand All @@ -83,6 +93,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
boolean updateable = settings.getAsBoolean("updateable", false);
this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL;
this.environment = env;
this.synonymAnalyzerName = settings.get("synonym_analyzer", null);
this.analysisRegistry = analysisRegistry;
}

@Override
Expand Down Expand Up @@ -137,6 +149,17 @@ Analyzer buildSynonymAnalyzer(
List<TokenFilterFactory> tokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
if (synonymAnalyzerName != null) {
Analyzer customSynonymAnalyzer;
try {
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
} catch (IOException e) {
throw new RuntimeException(e);
}
if (customSynonymAnalyzer != null) {
return customSynonymAnalyzer;
}
}
return new CustomAnalyzer(
tokenizer,
charFilters.toArray(new CharFilterFactory[0]),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,16 @@
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory;
import org.apache.lucene.analysis.te.TeluguStemFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.indices.analysis.AnalysisFactoryTestCase;
import org.opensearch.indices.analysis.AnalysisModule;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.mockito.Mock;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

Expand All @@ -53,6 +57,9 @@ public CommonAnalysisFactoryTests() {
super(new CommonAnalysisPlugin());
}

@Mock
private AnalysisModule analysisModule;

@Override
protected Map<String, Class<?>> getTokenizers() {
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
Expand Down Expand Up @@ -302,4 +309,19 @@ private void markedTestCase(String name, Map<String, Class<?>> map) {
unmarked
);
}

/**
* Tests the getTokenFilters(AnalysisModule) method to verify:
* 1. All token filters are properly loaded
* 2. Basic filters remain available
* 3. Synonym filters remain available when AnalysisModule is provided
*/
public void testGetTokenFiltersWithAnalysisModule() {
CommonAnalysisPlugin plugin = (CommonAnalysisPlugin) getAnalysisPlugin();
Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(analysisModule);
assertNotNull("Token filters should not be null", filters);
assertTrue("Should contain basic filters", filters.containsKey("lowercase"));
assertTrue("Should contain synonym filter", filters.containsKey("synonym"));
assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.env.TestEnvironment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.PreConfiguredTokenFilter;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.VersionUtils;
Expand All @@ -64,6 +67,7 @@
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class SynonymsAnalysisTests extends OpenSearchTestCase {
private IndexAnalyzers indexAnalyzers;
Expand Down Expand Up @@ -259,14 +263,17 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
.put("hyphenation_patterns_path", "foo")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
Environment environment = TestEnvironment.newEnvironment(settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();

String[] bypassingFactories = new String[] { "dictionary_decompounder" };

CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
for (String factory : bypassingFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);

try (TokenStream ts = analyzer.tokenStream("field", "text")) {
Expand Down Expand Up @@ -329,7 +336,10 @@ public void testDisallowedTokenFilters() throws IOException {
.putList("common_words", "a", "b")
.put("output_unigrams", "true")
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();

String[] disallowedFactories = new String[] {
Expand All @@ -343,9 +353,9 @@ public void testDisallowedTokenFilters() throws IOException {
"fingerprint" };

for (String factory : disallowedFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);

IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
Expand All @@ -372,4 +382,76 @@ private void match(String analyzerName, String source, String target) throws IOE
MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}

/**
* Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263.
* This test verifies the correct handling of:
* 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"])
* 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"])
* 3. Single word synonyms (e.g., "laptop" → ["notebook"])
*
* @see <a href="https://github.com/opensearch-project/OpenSearch/issues/16263">Issue #16263</a>
*/
public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.custom_word_delimiter.type", "word_delimiter_graph")
.put("index.analysis.filter.custom_word_delimiter.generate_word_parts", true)
.put("index.analysis.filter.custom_word_delimiter.catenate_all", true)
.put("index.analysis.filter.custom_word_delimiter.split_on_numerics", false)
.put("index.analysis.filter.custom_word_delimiter.split_on_case_change", false)
.put("index.analysis.filter.custom_pattern_replace_filter.type", "pattern_replace")
.put("index.analysis.filter.custom_pattern_replace_filter.pattern", "(-)")
.put("index.analysis.filter.custom_pattern_replace_filter.replacement", " ")
.put("index.analysis.filter.custom_pattern_replace_filter.all", true)
.put("index.analysis.filter.custom_synonym_graph_filter.type", "synonym_graph")
.putList(
"index.analysis.filter.custom_synonym_graph_filter.synonyms",
"laptop => notebook",
"smartphone, mobile phone, cell phone => smartphone",
"tv, television => television"
)
.put("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer", "standard")
.put("index.analysis.analyzer.text_en_index.type", "custom")
.put("index.analysis.analyzer.text_en_index.tokenizer", "whitespace")
.putList(
"index.analysis.analyzer.text_en_index.filter",
"lowercase",
"custom_word_delimiter",
"custom_synonym_graph_filter",
"custom_pattern_replace_filter",
"flatten_graph"
)
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(indexSettings);
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "note-book")) {
assertTokenStreamContents(
ts,
new String[] { "notebook", "note", "book" },
new int[] { 0, 0, 5 },
new int[] { 9, 4, 9 },
new String[] { "word", "word", "word" },
new int[] { 1, 0, 1 },
new int[] { 2, 1, 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "mobile phone")) {
assertTokenStreamContents(
ts,
new String[] { "smartphone" },
new int[] { 0 },
new int[] { 12 },
new String[] { "SYNONYM" },
new int[] { 1 },
new int[] { 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "laptop")) {
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,12 @@ public boolean requiresAnalysisSettings() {
)
);

tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
for (AnalysisPlugin plugin : plugins) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(this);
for (Map.Entry<String, AnalysisProvider<TokenFilterFactory>> entry : filters.entrySet()) {
tokenFilters.register(entry.getKey(), entry.getValue());
}
}
return tokenFilters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;

import java.io.IOException;
Expand Down Expand Up @@ -92,6 +93,14 @@ default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return emptyMap();
}

/**
* Override to add additional {@link TokenFilter}s that need access to the AnalysisModule.
* The default implementation for plugins that don't need AnalysisModule calls the existing getTokenFilters() method.
*/
default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
return getTokenFilters();
}

/**
* Override to add additional {@link Tokenizer}s. See {@link #requiresAnalysisSettings(AnalysisProvider)}
* how to on get the configuration from the index.
Expand Down
Loading

0 comments on commit 88c7b20

Please sign in to comment.