Skip to content

Commit

Permalink
Merge pull request #5997 from ibi-group/th-geocoder
Browse files Browse the repository at this point in the history
Improve geocoder matches for numeric adjectives
  • Loading branch information
leonardehrenfried authored Aug 9, 2024
2 parents 2f4b2d4 + bf02dae commit ebdb572
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

class EnglishNgramAnalyzerTest {

@Test
void ngram() throws IOException {
var analyzer = new EnglishNGramAnalyzer();
List<String> result = analyze("Alexanderplatz", analyzer);
void ngram() {
List<String> result = tokenize("Alexanderplatz");

//System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\"")));
assertEquals(
Expand Down Expand Up @@ -82,14 +82,79 @@ void ngram() throws IOException {
);
}

public List<String> analyze(String text, Analyzer analyzer) throws IOException {
List<String> result = new ArrayList<>();
TokenStream tokenStream = analyzer.tokenStream("name", text);
CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
result.add(attr.toString());
@Test
void ampersand() {
  // Stop words ("Ave" stemmed to "Av" is kept, "&" dropped) are filtered out and
  // the remaining long words are expanded into n-grams; "148th" loses its ordinal suffix.
  var expected = List.of(
    "Meri",
    "Merid",
    "Meridi",
    "Meridia",
    "Meridian",
    "erid",
    "eridi",
    "eridia",
    "eridian",
    "ridi",
    "ridia",
    "ridian",
    "idia",
    "idian",
    "dian",
    "Av",
    "N",
    "N",
    "148",
    "St"
  );
  assertEquals(expected, tokenize("Meridian Ave N & N 148th St"));
}

/**
 * Ordinal suffixes (st/nd/rd/th) directly after digits are stripped, while
 * tokens that are not digit+suffix combinations pass through unchanged.
 */
@ParameterizedTest
@CsvSource(
  delimiter = ':',
  value = {
    "1st:1",
    "2nd:2",
    "3rd:3",
    "4th:4",
    "6th:6",
    "148th:148",
    "102nd:102",
    "1003rd:1003",
    "St:St",
    "S3:S3",
    "Aard:Aard",
  }
)
void numberSuffixes(String input, String expected) {
  assertEquals(List.of(expected), tokenize(input));
}

@Test
void wordBoundary() {
  // The ordinal-suffix pattern only matches at a word boundary, so the "st"
  // embedded in the middle of "1stst" must survive tokenization.
  assertEquals(List.of("1sts", "1stst", "stst"), tokenize("1stst"));
}

/**
 * Runs {@code text} through the {@link EnglishNGramAnalyzer} and collects the
 * resulting tokens in order.
 *
 * @param text the raw input to analyze
 * @return the token strings produced by the analyzer
 * @throws RuntimeException wrapping any {@link IOException} from the token stream
 */
private List<String> tokenize(String text) {
  // The analyzer is AutoCloseable; try-with-resources releases it after use.
  try (var analyzer = new EnglishNGramAnalyzer()) {
    List<String> result = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream("name", text);
    CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
    // Lucene's consumption contract: reset() before the first incrementToken().
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      result.add(attr.toString());
    }
    return result;
  } catch (IOException e) {
    // A test helper has no meaningful recovery; fail the test instead.
    throw new RuntimeException(e);
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import java.util.stream.Stream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -98,6 +98,10 @@ class LuceneIndexTest {
.withCoordinate(52.52277, 13.41046)
.build();

static final RegularStop MERIDIAN_AVE = TEST_MODEL.stop("Meridian Ave N & N 148th St").build();
static final RegularStop MERIDIAN_N1 = TEST_MODEL.stop("Meridian N & Spencer").build();
static final RegularStop MERIDIAN_N2 = TEST_MODEL.stop("N 205th St & Meridian Ave N").build();

static LuceneIndex index;

static StopClusterMapper mapper;
Expand All @@ -113,7 +117,10 @@ static void setup() {
LICHTERFELDE_OST_2,
WESTHAFEN,
ARTS_CENTER,
ARTHUR
ARTHUR,
MERIDIAN_N1,
MERIDIAN_N2,
MERIDIAN_AVE
)
.forEach(stopModel::withRegularStop);
List
Expand Down Expand Up @@ -295,9 +302,32 @@ void agenciesAndFeedPublisher() {
assertEquals(List.of(StopClusterMapper.toAgency(BVG)), cluster.primary().agencies());
assertEquals("A Publisher", cluster.primary().feedPublisher().name());
}

/**
 * All of these query spellings should rank the three "Meridian" stops in the
 * same relevance order, regardless of how the numeric adjective is written.
 */
@ParameterizedTest
@ValueSource(
  strings = {
    "Meridian Ave N & N 148th",
    "Meridian Ave N & N 148",
    "Meridian Ave N N 148",
    "Meridian Ave N 148",
    "Meridian & 148 N",
    "148 N & Meridian",
    "Meridian & N 148",
    "Meridian Ave 148",
    "Meridian Av 148",
    "meridian av 148",
  }
)
void numericAdjectives(String query) {
  var expectedNames = Stream
    .of(MERIDIAN_AVE, MERIDIAN_N2, MERIDIAN_N1)
    .map(stop -> stop.getName().toString())
    .toList();
  var actualNames = index.queryStopClusters(query).map(cluster -> cluster.primary().name()).toList();
  assertEquals(expectedNames, actualNames);
}
}

private static @Nonnull Function<StopCluster, FeedScopedId> primaryId() {
// Extractor for the id of a stop cluster's primary stop, used as a mapping key.
private static Function<StopCluster, FeedScopedId> primaryId() {
  return cluster -> cluster.primary().id();
}
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package org.opentripplanner.ext.geocoder;

import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
Expand All @@ -17,14 +19,21 @@
* of a stop name can be matched efficiently.
* <p>
* For example the query of "exanderpl" will match the stop name "Alexanderplatz".
* <p>
 * It also removes the ordinal suffixes of American street names, so that, for example,
 * "147th Street" is tokenized to "147 Street".
*/
class EnglishNGramAnalyzer extends Analyzer {

// Matches one or more numbers followed by the English suffixes "st", "nd", "rd", "th"
private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)(st|nd|rd|th)\\b");

@Override
protected TokenStreamComponents createComponents(String fieldName) {
StandardTokenizer src = new StandardTokenizer();
TokenStream result = new EnglishPossessiveFilter(src);
result = new LowerCaseFilter(result);
result = new PatternReplaceFilter(result, NUMBER_SUFFIX_PATTERN, "$1", true);
result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
result = new PorterStemFilter(result);
result = new CapitalizationFilter(result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ private Stream<Document> matchingDocuments(
}
});
} else {
var nameParser = new QueryParser(NAME, analyzer);
var nameParser = new QueryParser(NAME_NGRAM, analyzer);
var nameQuery = nameParser.parse(searchTerms);

var ngramNameQuery = new TermQuery(
Expand Down

0 comments on commit ebdb572

Please sign in to comment.