From 6f79131c8366e61098c6c043cf325c38517d6ef8 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Wed, 20 Sep 2023 16:33:22 +0200 Subject: [PATCH 1/9] Improve geocoder fuzziness --- .../ext/geocoder/LuceneIndexTest.java | 41 +++++++++++++++++-- .../ext/geocoder/LuceneIndex.java | 29 ++++++++----- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 97f151493e1..f85cdfae353 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -145,9 +145,24 @@ void stopLocationGroupsWithSpace() { @Nested class StopClusters { - @Test - void stopClusters() { - var result1 = index.queryStopClusters("alex").toList(); + @ParameterizedTest + @ValueSource( + strings = { + "Alexanderplatz", + "alex", + "Alexnderplatz", + "Alexnaderplatz", + "alexnaderplaz", + "Alexanderplat", + "alexanderplat", + "alexand", + "alexander platz", + "alexander-platz", + "alexander", + } + ) + void stopClustersWithTypos(String searchTerm) { + var result1 = index.queryStopClusters(searchTerm).toList(); assertEquals(List.of(mapper.map(ALEXANDERPLATZ_STATION)), result1); } @@ -167,7 +182,25 @@ void deduplicatedStopClusters() { @ParameterizedTest @ValueSource( strings = { - "five", "five ", "five p", "five po", "five poi", "five poin", "five point", "five points", + "five", + "five ", + "five p", + "five po", + "five poi", + "five poin", + "five point", + "five points", + "fife point", + "five poits", + "fife", + "points", + "the five points", + "five @ points", + "five@points", + "five at points", + "five&points", + "five & points", + "five and points", } ) void stopClustersWithSpace(String query) { diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java 
index 62eeb977c8d..2ddbbf89069 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -10,7 +10,7 @@ import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.SimpleAnalyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.Codec; @@ -24,9 +24,12 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.suggest.document.Completion90PostingsFormat; import org.apache.lucene.search.suggest.document.CompletionAnalyzer; @@ -70,7 +73,7 @@ public LuceneIndex(Graph graph, TransitService transitService) { this.analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer(), - Map.of(NAME, new SimpleAnalyzer(), SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())) + Map.of(NAME, new EnglishAnalyzer(), SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())) ); var directory = new ByteBuffersDirectory(); @@ -194,7 +197,7 @@ public Stream queryStreetVertices(String query, boolean autocomple * one of those is chosen at random and returned. 
*/ public Stream queryStopClusters(String query) { - return matchingDocuments(StopCluster.class, query, true).map(LuceneIndex::toStopCluster); + return matchingDocuments(StopCluster.class, query, false).map(LuceneIndex::toStopCluster); } private static StopCluster toStopCluster(Document document) { @@ -279,6 +282,7 @@ private Stream matchingDocuments( 3 ); var query = new ContextQuery(completionQuery); + query.addContext(type.getSimpleName()); var topDocs = searcher.suggest(query, 25, true); @@ -293,8 +297,12 @@ private Stream matchingDocuments( } }); } else { - var parser = new QueryParser(CODE, analyzer); - var nameQuery = parser.createPhraseQuery(NAME, searchTerms); + var parser = new QueryParser(NAME, analyzer); + var nameQuery = parser.parse(searchTerms); + var fuzzyNameQuery = new FuzzyQuery(new Term(NAME, analyzer.normalize(NAME, searchTerms))); + var prefixNameQuery = new PrefixQuery( + new Term(NAME, analyzer.normalize(NAME, searchTerms)) + ); var codeQuery = new TermQuery(new Term(CODE, analyzer.normalize(CODE, searchTerms))); var typeQuery = new TermQuery( new Term(TYPE, analyzer.normalize(TYPE, type.getSimpleName())) @@ -303,11 +311,10 @@ private Stream matchingDocuments( var builder = new BooleanQuery.Builder() .setMinimumNumberShouldMatch(1) .add(typeQuery, Occur.MUST) - .add(codeQuery, Occur.SHOULD); - - if (nameQuery != null) { - builder.add(nameQuery, Occur.SHOULD); - } + .add(codeQuery, Occur.SHOULD) + .add(nameQuery, Occur.SHOULD) + .add(fuzzyNameQuery, Occur.SHOULD) + .add(prefixNameQuery, Occur.SHOULD); var query = builder.build(); @@ -323,7 +330,7 @@ private Stream matchingDocuments( } }); } - } catch (IOException ex) { + } catch (IOException | ParseException ex) { throw new RuntimeException(ex); } } From 30754257a3b83360631f6cef209cc1c0136818aa Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Wed, 20 Sep 2023 17:43:00 +0200 Subject: [PATCH 2/9] Allow prefix queries for stop code --- .../ext/geocoder/LuceneIndexTest.java | 33 
+++++++++++++++++-- .../ext/geocoder/LuceneIndex.java | 31 ++++++++++++++++- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index f85cdfae353..76ad7b005c0 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -9,9 +9,15 @@ import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.Multimap; +import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -149,6 +155,7 @@ class StopClusters { @ValueSource( strings = { "Alexanderplatz", + "Alexa", "alex", "Alexnderplatz", "Alexnaderplatz", @@ -201,6 +208,8 @@ void deduplicatedStopClusters() { "five&points", "five & points", "five and points", + "points five", + "points fife", } ) void stopClustersWithSpace(String query) { @@ -208,13 +217,31 @@ void stopClustersWithSpace(String query) { assertEquals(List.of(mapper.map(FIVE_POINTS_STATION)), result); } - @Test - void stopCode() { - var result = index.queryStopClusters(ARTS_CENTER.getCode()).toList(); + @ParameterizedTest + @ValueSource(strings = { "4456", "445", "#445" }) + void fuzzyStopCode(String query) { + var result = index.queryStopClusters(query).toList(); assertEquals(1, result.size()); assertEquals(ARTS_CENTER.getName().toString(), result.get(0).name()); } + @Test + void analyzer() throws IOException { + var x = analyze("#444", new StandardAnalyzer()); + 
assertEquals(x, "444"); + } + + public List analyze(String text, Analyzer analyzer) throws IOException { + List result = new ArrayList(); + TokenStream tokenStream = analyzer.tokenStream("code", text); + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + result.add(attr.toString()); + } + return result; + } + @Test void modes() { var result = index.queryStopClusters("westh").toList(); diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index 2ddbbf89069..fe210b6db22 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -1,5 +1,7 @@ package org.opentripplanner.ext.geocoder; +import static java.util.Map.entry; + import java.io.IOException; import java.io.Serializable; import java.util.Arrays; @@ -10,9 +12,14 @@ import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene95.Lucene95Codec; @@ -67,13 +74,31 @@ public class LuceneIndex implements Serializable { private final Analyzer analyzer; private final SuggestIndexSearcher searcher; + public static class MyCustomAnalyzer extends Analyzer { + + static final CharArraySet CODE_STOP_WORDS = new CharArraySet(Set.of("#"), true); + + 
@Override + protected TokenStreamComponents createComponents(String fieldName) { + StandardTokenizer src = new StandardTokenizer(); + TokenStream result = new LowerCaseFilter(src); + result = new StopFilter(result, CODE_STOP_WORDS); + return new TokenStreamComponents(src, result); + } + } + public LuceneIndex(Graph graph, TransitService transitService) { this.graph = graph; this.transitService = transitService; + this.analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer(), - Map.of(NAME, new EnglishAnalyzer(), SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())) + Map.ofEntries( + entry(NAME, new EnglishAnalyzer()), + entry(SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())), + entry(CODE, new MyCustomAnalyzer()) + ) ); var directory = new ByteBuffersDirectory(); @@ -304,6 +329,9 @@ private Stream matchingDocuments( new Term(NAME, analyzer.normalize(NAME, searchTerms)) ); var codeQuery = new TermQuery(new Term(CODE, analyzer.normalize(CODE, searchTerms))); + var prefixCodeQuery = new PrefixQuery( + new Term(CODE, analyzer.normalize(CODE, searchTerms)) + ); var typeQuery = new TermQuery( new Term(TYPE, analyzer.normalize(TYPE, type.getSimpleName())) ); @@ -312,6 +340,7 @@ private Stream matchingDocuments( .setMinimumNumberShouldMatch(1) .add(typeQuery, Occur.MUST) .add(codeQuery, Occur.SHOULD) + .add(prefixCodeQuery, Occur.SHOULD) .add(nameQuery, Occur.SHOULD) .add(fuzzyNameQuery, Occur.SHOULD) .add(prefixNameQuery, Occur.SHOULD); From d87d9fb7f0f24a36f261cabe6871a622f5a68464 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Wed, 20 Sep 2023 23:20:07 +0200 Subject: [PATCH 3/9] Allow prefix queries for stop codes --- .../ext/geocoder/LuceneIndexTest.java | 32 ++++--------------- .../ext/geocoder/LuceneIndex.java | 31 ++++-------------- 2 files changed, 12 insertions(+), 51 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java 
index 76ad7b005c0..d6502318726 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -9,15 +9,9 @@ import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.Multimap; -import java.io.IOException; -import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -157,6 +151,8 @@ class StopClusters { "Alexanderplatz", "Alexa", "alex", + "aleyanderplazt", + "alexnderplazt", "Alexnderplatz", "Alexnaderplatz", "alexnaderplaz", @@ -203,13 +199,14 @@ void deduplicatedStopClusters() { "points", "the five points", "five @ points", + "five @ the points", "five@points", "five at points", "five&points", "five & points", - "five and points", + "five and the points", "points five", - "points fife", + "points fife" } ) void stopClustersWithSpace(String query) { @@ -218,30 +215,13 @@ void stopClustersWithSpace(String query) { } @ParameterizedTest - @ValueSource(strings = { "4456", "445", "#445" }) + @ValueSource(strings = { "4456", "445" }) void fuzzyStopCode(String query) { var result = index.queryStopClusters(query).toList(); assertEquals(1, result.size()); assertEquals(ARTS_CENTER.getName().toString(), result.get(0).name()); } - @Test - void analyzer() throws IOException { - var x = analyze("#444", new StandardAnalyzer()); - assertEquals(x, "444"); - } - - public List analyze(String text, Analyzer analyzer) throws IOException { - List result = new ArrayList(); - TokenStream tokenStream = analyzer.tokenStream("code", text); - CharTermAttribute attr = 
tokenStream.addAttribute(CharTermAttribute.class); - tokenStream.reset(); - while (tokenStream.incrementToken()) { - result.add(attr.toString()); - } - return result; - } - @Test void modes() { var result = index.queryStopClusters("westh").toList(); diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index fe210b6db22..3e2bb1c5289 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -12,14 +12,9 @@ import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene95.Lucene95Codec; @@ -74,19 +69,6 @@ public class LuceneIndex implements Serializable { private final Analyzer analyzer; private final SuggestIndexSearcher searcher; - public static class MyCustomAnalyzer extends Analyzer { - - static final CharArraySet CODE_STOP_WORDS = new CharArraySet(Set.of("#"), true); - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - StandardTokenizer src = new StandardTokenizer(); - TokenStream result = new LowerCaseFilter(src); - result = new StopFilter(result, CODE_STOP_WORDS); - return new TokenStreamComponents(src, result); - } - } - public LuceneIndex(Graph graph, TransitService transitService) { this.graph = graph; this.transitService = 
transitService; @@ -96,8 +78,7 @@ public LuceneIndex(Graph graph, TransitService transitService) { new StandardAnalyzer(), Map.ofEntries( entry(NAME, new EnglishAnalyzer()), - entry(SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())), - entry(CODE, new MyCustomAnalyzer()) + entry(SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())) ) ); @@ -322,16 +303,16 @@ private Stream matchingDocuments( } }); } else { - var parser = new QueryParser(NAME, analyzer); - var nameQuery = parser.parse(searchTerms); + var nameParser = new QueryParser(NAME, analyzer); + var nameQuery = nameParser.parse(searchTerms); var fuzzyNameQuery = new FuzzyQuery(new Term(NAME, analyzer.normalize(NAME, searchTerms))); var prefixNameQuery = new PrefixQuery( new Term(NAME, analyzer.normalize(NAME, searchTerms)) ); var codeQuery = new TermQuery(new Term(CODE, analyzer.normalize(CODE, searchTerms))); - var prefixCodeQuery = new PrefixQuery( - new Term(CODE, analyzer.normalize(CODE, searchTerms)) - ); + + var prefixCodeQuery = new PrefixQuery(new Term(CODE, analyzer.normalize(CODE, searchTerms))); + var typeQuery = new TermQuery( new Term(TYPE, analyzer.normalize(TYPE, type.getSimpleName())) ); From a84810b88bc6b685095440c8d02713f383443260 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Fri, 22 Sep 2023 13:30:37 +0200 Subject: [PATCH 4/9] Build NGram index for matching middle parts of names --- .../geocoder/EnglishNgramAnalyzerTest.java | 96 +++++++++++++++++++ .../ext/geocoder/LuceneIndexTest.java | 5 +- .../ext/geocoder/EnglishNGramAnalyzer.java | 34 +++++++ .../ext/geocoder/LuceneIndex.java | 15 ++- 4 files changed, 147 insertions(+), 3 deletions(-) create mode 100644 src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java create mode 100644 src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java 
b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java new file mode 100644 index 00000000000..9bf7ef73da5 --- /dev/null +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -0,0 +1,96 @@ +package org.opentripplanner.ext.geocoder; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.junit.jupiter.api.Test; + +class EnglishNgramAnalyzerTest { + + @Test + void ngram() throws IOException { + var analyzer = new EnglishNGramAnalyzer(); + List result = analyze("Alexanderplatz", analyzer); + + System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); + assertEquals( + List.of( + "Alex", + "Alexa", + "Alexan", + "Alexand", + "Alexande", + "Alexander", + "Alexanderp", + "lexa", + "lexan", + "lexand", + "lexande", + "lexander", + "lexanderp", + "lexanderpl", + "exan", + "exand", + "exande", + "exander", + "exanderp", + "exanderpl", + "exanderpla", + "xand", + "xande", + "xander", + "xanderp", + "xanderpl", + "xanderpla", + "xanderplat", + "ande", + "ander", + "anderp", + "anderpl", + "anderpla", + "anderplat", + "anderplatz", + "nder", + "nderp", + "nderpl", + "nderpla", + "nderplat", + "nderplatz", + "derp", + "derpl", + "derpla", + "derplat", + "derplatz", + "erpl", + "erpla", + "erplat", + "erplatz", + "rpla", + "rplat", + "rplatz", + "plat", + "platz", + "latz", + "Alexanderplatz" + ), + result + ); + } + + public List analyze(String text, Analyzer analyzer) throws IOException { + List result = new ArrayList<>(); + TokenStream tokenStream = analyzer.tokenStream("name", text); + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + 
while (tokenStream.incrementToken()) { + result.add(attr.toString()); + } + return result; + } +} diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index d6502318726..80cce649e04 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -155,6 +155,8 @@ class StopClusters { "alexnderplazt", "Alexnderplatz", "Alexnaderplatz", + "xande", + "xanderpla", "alexnaderplaz", "Alexanderplat", "alexanderplat", @@ -197,6 +199,7 @@ void deduplicatedStopClusters() { "five poits", "fife", "points", + "ife points", "the five points", "five @ points", "five @ the points", @@ -206,7 +209,7 @@ void deduplicatedStopClusters() { "five & points", "five and the points", "points five", - "points fife" + "points fife", } ) void stopClustersWithSpace(String query) { diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java new file mode 100644 index 00000000000..af2156fd40e --- /dev/null +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -0,0 +1,34 @@ +package org.opentripplanner.ext.geocoder; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.en.EnglishPossessiveFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * A custom analyzer for stop names. It removes english stop words (at,the...) 
and splits + * the input into NGrams (https://en.wikipedia.org/wiki/N-gram) so that the middle + * of a stop name can be matched efficiently. + *

+ * For example the query of "exanderpl" will match the stop name "Alexanderplatz". + */ +class EnglishNGramAnalyzer extends Analyzer { + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + StandardTokenizer src = new StandardTokenizer(); + TokenStream result = new EnglishPossessiveFilter(src); + result = new LowerCaseFilter(result); + result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); + result = new PorterStemFilter(result); + result = new CapitalizationFilter(result); + result = new NGramTokenFilter(result, 4, 10, true); + return new TokenStreamComponents(src, result); + } +} diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index 3e2bb1c5289..ca8a9d96df9 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -58,6 +58,7 @@ public class LuceneIndex implements Serializable { private static final String ID = "id"; private static final String SUGGEST = "suggest"; private static final String NAME = "name"; + private static final String NAME_NGRAM = "name_ngram"; private static final String CODE = "code"; private static final String LAT = "latitude"; private static final String LON = "longitude"; @@ -78,6 +79,7 @@ public LuceneIndex(Graph graph, TransitService transitService) { new StandardAnalyzer(), Map.ofEntries( entry(NAME, new EnglishAnalyzer()), + entry(NAME_NGRAM, new EnglishNGramAnalyzer()), entry(SUGGEST, new CompletionAnalyzer(new StandardAnalyzer())) ) ); @@ -249,6 +251,7 @@ private static void addToIndex( document.add(new StoredField(ID, id)); document.add(new TextField(TYPE, typeName, Store.YES)); document.add(new TextField(NAME, Objects.toString(name), Store.YES)); + document.add(new TextField(NAME_NGRAM, Objects.toString(name), Store.YES)); document.add(new ContextSuggestField(SUGGEST, Objects.toString(name), 1, 
typeName)); document.add(new StoredField(LAT, latitude)); document.add(new StoredField(LON, longitude)); @@ -305,13 +308,20 @@ private Stream matchingDocuments( } else { var nameParser = new QueryParser(NAME, analyzer); var nameQuery = nameParser.parse(searchTerms); + + var ngramNameQuery = new TermQuery( + new Term(NAME_NGRAM, analyzer.normalize(NAME_NGRAM, searchTerms)) + ); + var fuzzyNameQuery = new FuzzyQuery(new Term(NAME, analyzer.normalize(NAME, searchTerms))); var prefixNameQuery = new PrefixQuery( new Term(NAME, analyzer.normalize(NAME, searchTerms)) ); var codeQuery = new TermQuery(new Term(CODE, analyzer.normalize(CODE, searchTerms))); - var prefixCodeQuery = new PrefixQuery(new Term(CODE, analyzer.normalize(CODE, searchTerms))); + var prefixCodeQuery = new PrefixQuery( + new Term(CODE, analyzer.normalize(CODE, searchTerms)) + ); var typeQuery = new TermQuery( new Term(TYPE, analyzer.normalize(TYPE, type.getSimpleName())) @@ -324,7 +334,8 @@ private Stream matchingDocuments( .add(prefixCodeQuery, Occur.SHOULD) .add(nameQuery, Occur.SHOULD) .add(fuzzyNameQuery, Occur.SHOULD) - .add(prefixNameQuery, Occur.SHOULD); + .add(prefixNameQuery, Occur.SHOULD) + .add(ngramNameQuery, Occur.SHOULD); var query = builder.build(); From 63ab2a83b70c4b125603fff62bfa3c77cd67975c Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Fri, 22 Sep 2023 14:00:56 +0200 Subject: [PATCH 5/9] Remove street corners from Lucene indexing --- .../ext/geocoder/LuceneIndexTest.java | 2 +- .../ext/geocoder/GeocoderResource.java | 28 ++-------------- .../ext/geocoder/LuceneIndex.java | 33 ++----------------- 3 files changed, 6 insertions(+), 57 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 80cce649e04..881df8b65af 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ 
b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -113,7 +113,7 @@ public List getModesOfStopLocation(StopLocation stop) { } } }; - index = new LuceneIndex(graph, transitService); + index = new LuceneIndex(transitService); mapper = new StopClusterMapper(transitService); } diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/GeocoderResource.java b/src/ext/java/org/opentripplanner/ext/geocoder/GeocoderResource.java index 70e41d136c6..d2eafc91e5c 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/GeocoderResource.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/GeocoderResource.java @@ -16,7 +16,6 @@ import java.util.stream.Collectors; import org.opentripplanner.api.mapping.FeedScopedIdMapper; import org.opentripplanner.standalone.api.OtpServerRequestContext; -import org.opentripplanner.street.model.vertex.StreetVertex; import org.opentripplanner.transit.model.site.StopLocation; /** @@ -47,7 +46,6 @@ public GeocoderResource(@Context OtpServerRequestContext requestContext) { * @param autocomplete Whether we should use the query string to do a prefix match * @param stops Search for stops, either by name or stop code * @param clusters Search for clusters by their name - * @param corners Search for street corners using at least one of the street names * @return list of results in the format expected by GeocoderBuiltin.js in the OTP Leaflet * client */ @@ -56,12 +54,11 @@ public Response textSearch( @QueryParam("query") String query, @QueryParam("autocomplete") @DefaultValue("false") boolean autocomplete, @QueryParam("stops") @DefaultValue("true") boolean stops, - @QueryParam("clusters") @DefaultValue("false") boolean clusters, - @QueryParam("corners") @DefaultValue("true") boolean corners + @QueryParam("clusters") @DefaultValue("false") boolean clusters ) { return Response .status(Response.Status.OK) - .entity(query(query, autocomplete, stops, clusters, corners)) + .entity(query(query, autocomplete, stops, clusters)) 
.build(); } @@ -77,8 +74,7 @@ private List query( String query, boolean autocomplete, boolean stops, - boolean clusters, - boolean corners + boolean clusters ) { List results = new ArrayList<>(); @@ -90,10 +86,6 @@ private List query( results.addAll(queryStations(query, autocomplete)); } - if (corners) { - results.addAll(queryCorners(query, autocomplete)); - } - return results; } @@ -127,20 +119,6 @@ private Collection queryStations(String query, boolean a .collect(Collectors.toList()); } - private Collection queryCorners(String query, boolean autocomplete) { - return LuceneIndex - .forServer(serverContext) - .queryStreetVertices(query, autocomplete) - .map(v -> - new SearchResult(v.getLat(), v.getLon(), stringifyStreetVertex(v), v.getLabelString()) - ) - .collect(Collectors.toList()); - } - - private String stringifyStreetVertex(StreetVertex v) { - return String.format("%s (%s)", v.getIntersectionName(), v.getLabel()); - } - private String stringifyStopLocation(StopLocation sl) { return sl.getCode() != null ? 
String.format("%s (%s)", sl.getName(), sl.getCode()) diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index ca8a9d96df9..7037fd19361 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -43,10 +43,7 @@ import org.opentripplanner.ext.geocoder.StopCluster.Coordinate; import org.opentripplanner.framework.i18n.I18NString; import org.opentripplanner.framework.i18n.NonLocalizedString; -import org.opentripplanner.routing.graph.Graph; import org.opentripplanner.standalone.api.OtpServerRequestContext; -import org.opentripplanner.street.model.vertex.StreetVertex; -import org.opentripplanner.street.model.vertex.VertexLabel; import org.opentripplanner.transit.model.framework.FeedScopedId; import org.opentripplanner.transit.model.site.StopLocation; import org.opentripplanner.transit.model.site.StopLocationsGroup; @@ -64,14 +61,11 @@ public class LuceneIndex implements Serializable { private static final String LON = "longitude"; private static final String MODE = "mode"; - private final Graph graph; - private final TransitService transitService; private final Analyzer analyzer; private final SuggestIndexSearcher searcher; - public LuceneIndex(Graph graph, TransitService transitService) { - this.graph = graph; + public LuceneIndex(TransitService transitService) { this.transitService = transitService; this.analyzer = @@ -141,24 +135,6 @@ public LuceneIndex(Graph graph, TransitService transitService) { stopCluster.modes() ) ); - - graph - .getVertices() - .stream() - .filter(v -> v instanceof StreetVertex) - .map(v -> (StreetVertex) v) - .forEach(streetVertex -> - addToIndex( - directoryWriter, - StreetVertex.class, - streetVertex.getLabelString(), - streetVertex.getIntersectionName(), - streetVertex.getLabelString(), - streetVertex.getLat(), - streetVertex.getLon(), - Set.of() - ) - ); } 
DirectoryReader indexReader = DirectoryReader.open(directory); @@ -175,7 +151,7 @@ public static synchronized LuceneIndex forServer(OtpServerRequestContext serverC return existingIndex; } - var newIndex = new LuceneIndex(graph, serverContext.transitService()); + var newIndex = new LuceneIndex(serverContext.transitService()); graph.setLuceneIndex(newIndex); return newIndex; } @@ -190,11 +166,6 @@ public Stream queryStopLocationGroups(String query, boolean .map(document -> transitService.getStopLocationsGroup(FeedScopedId.parse(document.get(ID)))); } - public Stream queryStreetVertices(String query, boolean autocomplete) { - return matchingDocuments(StreetVertex.class, query, autocomplete) - .map(document -> (StreetVertex) graph.getVertex(VertexLabel.string(document.get(ID)))); - } - /** * Return all "stop clusters" for a given query. *

From ea0f449e723dd257b9a00dc89e68da6ed72342a2 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Fri, 29 Sep 2023 11:34:35 +0200 Subject: [PATCH 6/9] Round to 100m instead of 10m --- .../org/opentripplanner/ext/geocoder/LuceneIndexTest.java | 3 --- .../java/org/opentripplanner/ext/geocoder/LuceneIndex.java | 1 + .../org/opentripplanner/ext/geocoder/StopClusterMapper.java | 2 +- .../opentripplanner/framework/geometry/WgsCoordinate.java | 6 ++++++ .../opentripplanner/openstreetmap/model/OSMWithTags.java | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 881df8b65af..0ef9b17e2fd 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -17,7 +17,6 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; -import org.opentripplanner.routing.graph.Graph; import org.opentripplanner.transit.model.basic.TransitMode; import org.opentripplanner.transit.model.framework.Deduplicator; import org.opentripplanner.transit.model.site.RegularStop; @@ -29,8 +28,6 @@ class LuceneIndexTest { - static Graph graph = new Graph(); - // Berlin static Station BERLIN_HAUPTBAHNHOF_STATION = station("Hauptbahnhof") .withCoordinate(52.52495, 13.36952) diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index 7037fd19361..ad7fd151764 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -33,6 +33,7 @@ import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.TermQuery; +import 
org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.search.suggest.document.Completion90PostingsFormat; import org.apache.lucene.search.suggest.document.CompletionAnalyzer; import org.apache.lucene.search.suggest.document.ContextQuery; diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/StopClusterMapper.java b/src/ext/java/org/opentripplanner/ext/geocoder/StopClusterMapper.java index f375b24fefa..6ec3e650eb6 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/StopClusterMapper.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/StopClusterMapper.java @@ -42,7 +42,7 @@ Stream generateStopClusters( // if they are very close to each other and have the same name, only one is chosen (at random) .filter( PredicateUtils.distinctByKey(sl -> - new DeduplicationKey(sl.getName(), sl.getCoordinate().roundToApproximate10m()) + new DeduplicationKey(sl.getName(), sl.getCoordinate().roundToApproximate100m()) ) ) .flatMap(sl -> this.map(sl).stream()); diff --git a/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java b/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java index c1fb3619a04..0f2b5ebc160 100644 --- a/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java +++ b/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java @@ -146,6 +146,12 @@ public WgsCoordinate roundToApproximate10m() { return new WgsCoordinate(lat, lng); } + public WgsCoordinate roundToApproximate100m() { + var lat = DoubleUtils.roundTo3Decimals(latitude); + var lng = DoubleUtils.roundTo3Decimals(longitude); + return new WgsCoordinate(lat, lng); + } + /** * Return a string on the form: {@code "(60.12345, 11.12345)"}. Up to 5 digits are used after the * period(.), even if the coordinate is specified with a higher precision. 
diff --git a/src/main/java/org/opentripplanner/openstreetmap/model/OSMWithTags.java b/src/main/java/org/opentripplanner/openstreetmap/model/OSMWithTags.java index 4b4883544d4..04815d8820b 100644 --- a/src/main/java/org/opentripplanner/openstreetmap/model/OSMWithTags.java +++ b/src/main/java/org/opentripplanner/openstreetmap/model/OSMWithTags.java @@ -438,7 +438,7 @@ public boolean isParkAndRide() { } /** - * Is this a public transport boarding location where passengers wait for transti and that can be + * Is this a public transport boarding location where passengers wait for transit and that can be * linked to a transit stop vertex later on. *

* This intentionally excludes railway=stop and public_transport=stop because these are supposed From 302a49c7e8f6452322996f6b45c91d757f930785 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Fri, 6 Oct 2023 12:38:34 +0200 Subject: [PATCH 7/9] Add Javadoc --- docs/sandbox/GeocoderAPI.md | 1 - .../ext/geocoder/EnglishNgramAnalyzerTest.java | 3 +-- .../framework/geometry/WgsCoordinate.java | 11 +++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/sandbox/GeocoderAPI.md b/docs/sandbox/GeocoderAPI.md index 5a90f13573b..0405724fff6 100644 --- a/docs/sandbox/GeocoderAPI.md +++ b/docs/sandbox/GeocoderAPI.md @@ -36,7 +36,6 @@ It supports the following URL parameters: | `autocomplete` | Whether we should use the query string to do a prefix match | | `stops` | Search for stops, either by name or stop code | | `clusters` | Search for clusters by their name | -| `corners` | Search for street corners using at least one of the street names | #### Stop clusters diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index 9bf7ef73da5..615ef90cbbd 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -5,7 +5,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -18,7 +17,7 @@ void ngram() throws IOException { var analyzer = new EnglishNGramAnalyzer(); List result = analyze("Alexanderplatz", analyzer); - System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); + //System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); 
assertEquals( List.of( "Alex", diff --git a/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java b/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java index 0f2b5ebc160..e818d50c1f6 100644 --- a/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java +++ b/src/main/java/org/opentripplanner/framework/geometry/WgsCoordinate.java @@ -146,6 +146,17 @@ public WgsCoordinate roundToApproximate10m() { return new WgsCoordinate(lat, lng); } + /** + * Return a new version of this coordinate where latitude/longitude are rounded to 3 decimal + * places which at the equator has ~100 meter precision. + *

+ * See https://wiki.openstreetmap.org/wiki/Precision_of_coordinates + *

+ * This is useful when you want to cache coordinate-based computations but don't need absolute + * precision. + *

+ * DO NOT USE THIS IN ROUTING (USE AT LEAST 7 DECIMALS)! + */ public WgsCoordinate roundToApproximate100m() { var lat = DoubleUtils.roundTo3Decimals(latitude); var lng = DoubleUtils.roundTo3Decimals(longitude); From 3dcbc984e0c2e10325bd4d70741c6a95936ad286 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 10 Oct 2023 12:26:35 +0200 Subject: [PATCH 8/9] Add tests for rounding --- .../framework/geometry/WgsCoordinateTest.java | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/opentripplanner/framework/geometry/WgsCoordinateTest.java b/src/test/java/org/opentripplanner/framework/geometry/WgsCoordinateTest.java index 78c92cea174..05051ab7dff 100644 --- a/src/test/java/org/opentripplanner/framework/geometry/WgsCoordinateTest.java +++ b/src/test/java/org/opentripplanner/framework/geometry/WgsCoordinateTest.java @@ -9,25 +9,26 @@ import java.util.List; import org.junit.jupiter.api.Test; import org.locationtech.jts.geom.Coordinate; +import org.opentripplanner._support.geometry.Coordinates; public class WgsCoordinateTest { @Test - public void normalize() { + void normalize() { WgsCoordinate c = new WgsCoordinate(1.123456789, 2.987654321); assertEquals(1.1234568, c.latitude()); assertEquals(2.9876543, c.longitude()); } @Test - public void testToString() { + void testToString() { WgsCoordinate c = new WgsCoordinate(1.123456789, 2.987654321); assertEquals("(1.12346, 2.98765)", c.toString()); assertEquals("(1.123, 2.9)", new WgsCoordinate(1.123, 2.9).toString()); } @Test - public void testCoordinateEquals() { + void testCoordinateEquals() { WgsCoordinate a = new WgsCoordinate(5.000_000_3, 3.0); // Test latitude @@ -50,7 +51,7 @@ public void testCoordinateEquals() { } @Test - public void asJtsCoordinate() { + void asJtsCoordinate() { // Given a well known location in Oslo double latitude = 59.9110583; double longitude = 10.7502691; @@ -65,7 +66,7 @@ public void asJtsCoordinate() { } @Test - public void mean() { + 
void mean() { var c1 = new WgsCoordinate(10.0, 5.0); var c2 = new WgsCoordinate(20.0, -5.0); @@ -79,7 +80,7 @@ public void mean() { } @Test - public void validCoordinates() { + void validCoordinates() { // Edge cases should NOT throw exceptions new WgsCoordinate(90d, 1d); new WgsCoordinate(-90d, 1d); @@ -94,13 +95,29 @@ public void validCoordinates() { } @Test - public void add() { + void add() { assertEquals(new WgsCoordinate(12d, 5d), new WgsCoordinate(9d, 1d).add(3d, 4d)); } @Test - public void testGreenwich() { + void testGreenwich() { assertEquals(51.48d, WgsCoordinate.GREENWICH.latitude()); assertEquals(0d, WgsCoordinate.GREENWICH.longitude()); } + + @Test + void roundingTo10m() { + var hamburg = new WgsCoordinate(Coordinates.HAMBURG); + var rounded = hamburg.roundToApproximate10m(); + assertEquals(10.0003, rounded.latitude()); + assertEquals(53.5566, rounded.longitude()); + } + + @Test + void roundingTo100m() { + var hamburg = new WgsCoordinate(Coordinates.HAMBURG); + var rounded = hamburg.roundToApproximate100m(); + assertEquals(10, rounded.latitude()); + assertEquals(53.557, rounded.longitude()); + } } From 2714adc859e3fcd9121afeeb5febd609d35e726e Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Mon, 16 Oct 2023 12:23:27 +0200 Subject: [PATCH 9/9] Update docs Co-authored-by: Zsombor Welker --- .../org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java index af2156fd40e..ffe46604744 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -13,7 +13,7 @@ /** * A custom analyzer for stop names. It removes english stop words (at,the...) 
and splits - * the input into ing NGrams (https://en.wikipedia.org/wiki/N-gram) so that the middle + * the input into NGrams (https://en.wikipedia.org/wiki/N-gram) so that the middle * of a stop name can be matched efficiently. *

* For example the query of "exanderpl" will match the stop name "Alexanderplatz".