diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index 615ef90cbbd..2c352e0f760 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -5,17 +5,17 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; class EnglishNgramAnalyzerTest { @Test - void ngram() throws IOException { - var analyzer = new EnglishNGramAnalyzer(); - List result = analyze("Alexanderplatz", analyzer); + void ngram() { + List result = tokenize("Alexanderplatz"); //System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); assertEquals( @@ -82,14 +82,79 @@ void ngram() throws IOException { ); } - public List analyze(String text, Analyzer analyzer) throws IOException { - List result = new ArrayList<>(); - TokenStream tokenStream = analyzer.tokenStream("name", text); - CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); - tokenStream.reset(); - while (tokenStream.incrementToken()) { - result.add(attr.toString()); + @Test + void ampersand() { + List result = tokenize("Meridian Ave N & N 148th St"); + + assertEquals( + List.of( + "Meri", + "Merid", + "Meridi", + "Meridia", + "Meridian", + "erid", + "eridi", + "eridia", + "eridian", + "ridi", + "ridia", + "ridian", + "idia", + "idian", + "dian", + "Av", + "N", + "N", + "148", + "St" + ), + result + ); + } + + @ParameterizedTest + @CsvSource( + value = { + "1st:1", + "2nd:2", + "3rd:3", + "4th:4", + "6th:6", + "148th:148", + 
"102nd:102", + "1003rd:1003", + "St:St", + "S3:S3", + "Aard:Aard", + }, + delimiter = ':' + ) + void numberSuffixes(String input, String expected) { + var result = tokenize(input); + assertEquals(List.of(expected), result); + } + + @Test + void wordBoundary() { + var result = tokenize("1stst"); + assertEquals(List.of("1sts", "1stst", "stst"), result); + } + + private List tokenize(String text) { + try (var analyzer = new EnglishNGramAnalyzer()) { + List result; + TokenStream tokenStream; + result = new ArrayList<>(); + tokenStream = analyzer.tokenStream("name", text); + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + result.add(attr.toString()); + } + return result; + } catch (IOException e) { + throw new RuntimeException(e); } - return result; } } diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 3e6c2b15195..de6e600037c 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -12,7 +12,7 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; -import javax.annotation.Nonnull; +import java.util.stream.Stream; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -98,6 +98,10 @@ class LuceneIndexTest { .withCoordinate(52.52277, 13.41046) .build(); + static final RegularStop MERIDIAN_AVE = TEST_MODEL.stop("Meridian Ave N & N 148th St").build(); + static final RegularStop MERIDIAN_N1 = TEST_MODEL.stop("Meridian N & Spencer").build(); + static final RegularStop MERIDIAN_N2 = TEST_MODEL.stop("N 205th St & Meridian Ave N").build(); + static LuceneIndex index; static StopClusterMapper mapper; @@ -113,7 +117,10 @@ static void setup() { LICHTERFELDE_OST_2, 
WESTHAFEN, ARTS_CENTER, - ARTHUR + ARTHUR, + MERIDIAN_N1, + MERIDIAN_N2, + MERIDIAN_AVE ) .forEach(stopModel::withRegularStop); List @@ -295,9 +302,32 @@ void agenciesAndFeedPublisher() { assertEquals(List.of(StopClusterMapper.toAgency(BVG)), cluster.primary().agencies()); assertEquals("A Publisher", cluster.primary().feedPublisher().name()); } + + @ParameterizedTest + @ValueSource( + strings = { + "Meridian Ave N & N 148th", + "Meridian Ave N & N 148", + "Meridian Ave N N 148", + "Meridian Ave N 148", + "Meridian & 148 N", + "148 N & Meridian", + "Meridian & N 148", + "Meridian Ave 148", + "Meridian Av 148", + "meridian av 148", + } + ) + void numericAdjectives(String query) { + var names = index.queryStopClusters(query).map(c -> c.primary().name()).toList(); + assertEquals( + Stream.of(MERIDIAN_AVE, MERIDIAN_N2, MERIDIAN_N1).map(s -> s.getName().toString()).toList(), + names + ); + } } - private static @Nonnull Function primaryId() { + private static Function primaryId() { return c -> c.primary().id(); } } diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java index ffe46604744..17bf529a559 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -1,14 +1,16 @@ package org.opentripplanner.ext.geocoder; +import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; import 
org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.pattern.PatternReplaceFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; /** @@ -17,14 +19,21 @@ * of a stop name can be matched efficiently. *

* For example the query of "exanderpl" will match the stop name "Alexanderplatz". + *

+ * It also strips the ordinal suffixes from numbers in American street names, so that + * "147th Street" is treated like "147 Street". */ class EnglishNGramAnalyzer extends Analyzer { + // Matches one or more digits followed by an English ordinal suffix: "st", "nd", "rd" or "th" + private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)(st|nd|rd|th)\\b"); + @Override protected TokenStreamComponents createComponents(String fieldName) { StandardTokenizer src = new StandardTokenizer(); TokenStream result = new EnglishPossessiveFilter(src); result = new LowerCaseFilter(result); + result = new PatternReplaceFilter(result, NUMBER_SUFFIX_PATTERN, "$1", true); result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); result = new PorterStemFilter(result); result = new CapitalizationFilter(result); diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index fe7bef8ad13..71b80ac58a6 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -288,7 +288,7 @@ private Stream matchingDocuments( } }); } else { - var nameParser = new QueryParser(NAME, analyzer); + var nameParser = new QueryParser(NAME_NGRAM, analyzer); var nameQuery = nameParser.parse(searchTerms); var ngramNameQuery = new TermQuery(