Skip to content

Commit

Permalink
Merge pull request #5401 from ibi-group/geocoder-fuzziness
Browse files Browse the repository at this point in the history
Improve geocoding fuzziness, remove street corners
  • Loading branch information
leonardehrenfried authored Oct 20, 2023
2 parents 003d535 + 2714adc commit 0c97ab9
Show file tree
Hide file tree
Showing 10 changed files with 269 additions and 89 deletions.
1 change: 0 additions & 1 deletion docs/sandbox/GeocoderAPI.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ It supports the following URL parameters:
| `autocomplete` | Whether we should use the query string to do a prefix match |
| `stops` | Search for stops, either by name or stop code |
| `clusters` | Search for clusters by their name |
| `corners` | Search for street corners using at least one of the street names |

#### Stop clusters

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package org.opentripplanner.ext.geocoder;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.jupiter.api.Test;

/**
 * Pins the exact token sequence that {@link EnglishNGramAnalyzer} emits for a single word, so
 * that accidental changes to the filter chain (gram sizes, casing, preservation of the original
 * token) are detected.
 */
class EnglishNgramAnalyzerTest {

  /**
   * Analyzes "Alexanderplatz" and checks that the n-grams of length 4 to 10 are produced for
   * every start offset, followed by the complete original token.
   */
  @Test
  void ngram() throws IOException {
    List<String> result;
    // Analyzer is Closeable; release its reusable components once the tokens are collected.
    try (var analyzer = new EnglishNGramAnalyzer()) {
      result = analyze("Alexanderplatz", analyzer);
    }

    assertEquals(
      List.of(
        "Alex",
        "Alexa",
        "Alexan",
        "Alexand",
        "Alexande",
        "Alexander",
        "Alexanderp",
        "lexa",
        "lexan",
        "lexand",
        "lexande",
        "lexander",
        "lexanderp",
        "lexanderpl",
        "exan",
        "exand",
        "exande",
        "exander",
        "exanderp",
        "exanderpl",
        "exanderpla",
        "xand",
        "xande",
        "xander",
        "xanderp",
        "xanderpl",
        "xanderpla",
        "xanderplat",
        "ande",
        "ander",
        "anderp",
        "anderpl",
        "anderpla",
        "anderplat",
        "anderplatz",
        "nder",
        "nderp",
        "nderpl",
        "nderpla",
        "nderplat",
        "nderplatz",
        "derp",
        "derpl",
        "derpla",
        "derplat",
        "derplatz",
        "erpl",
        "erpla",
        "erplat",
        "erplatz",
        "rpla",
        "rplat",
        "rplatz",
        "plat",
        "platz",
        "latz",
        "Alexanderplatz"
      ),
      result
    );
  }

  /**
   * Runs {@code text} through {@code analyzer} (using the field name "name") and collects the
   * term text of every emitted token, in emission order.
   *
   * @param text     the raw input to tokenize
   * @param analyzer the analyzer under test; it is not closed by this method
   * @return the emitted terms, in order
   * @throws IOException if the token stream fails while being consumed
   */
  public List<String> analyze(String text, Analyzer analyzer) throws IOException {
    List<String> result = new ArrayList<>();
    // Follow the full TokenStream consumer workflow: reset -> incrementToken* -> end -> close.
    // Without close() the analyzer's reusable stream stays open and cannot be reused.
    try (TokenStream tokenStream = analyzer.tokenStream("name", text)) {
      CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        result.add(attr.toString());
      }
      tokenStream.end();
    }
    return result;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import org.opentripplanner.routing.graph.Graph;
import org.opentripplanner.transit.model.basic.TransitMode;
import org.opentripplanner.transit.model.framework.Deduplicator;
import org.opentripplanner.transit.model.site.RegularStop;
Expand All @@ -29,8 +28,6 @@

class LuceneIndexTest {

static Graph graph = new Graph();

// Berlin
static Station BERLIN_HAUPTBAHNHOF_STATION = station("Hauptbahnhof")
.withCoordinate(52.52495, 13.36952)
Expand Down Expand Up @@ -113,7 +110,7 @@ public List<TransitMode> getModesOfStopLocation(StopLocation stop) {
}
}
};
index = new LuceneIndex(graph, transitService);
index = new LuceneIndex(transitService);
mapper = new StopClusterMapper(transitService);
}

Expand Down Expand Up @@ -145,9 +142,29 @@ void stopLocationGroupsWithSpace() {
@Nested
class StopClusters {

@Test
void stopClusters() {
var result1 = index.queryStopClusters("alex").toList();
@ParameterizedTest
@ValueSource(
strings = {
"Alexanderplatz",
"Alexa",
"alex",
"aleyanderplazt",
"alexnderplazt",
"Alexnderplatz",
"Alexnaderplatz",
"xande",
"xanderpla",
"alexnaderplaz",
"Alexanderplat",
"alexanderplat",
"alexand",
"alexander platz",
"alexander-platz",
"alexander",
}
)
void stopClustersWithTypos(String searchTerm) {
var result1 = index.queryStopClusters(searchTerm).toList();
assertEquals(List.of(mapper.map(ALEXANDERPLATZ_STATION)), result1);
}

Expand All @@ -167,17 +184,40 @@ void deduplicatedStopClusters() {
@ParameterizedTest
@ValueSource(
strings = {
"five", "five ", "five p", "five po", "five poi", "five poin", "five point", "five points",
"five",
"five ",
"five p",
"five po",
"five poi",
"five poin",
"five point",
"five points",
"fife point",
"five poits",
"fife",
"points",
"ife points",
"the five points",
"five @ points",
"five @ the points",
"five@points",
"five at points",
"five&points",
"five & points",
"five and the points",
"points five",
"points fife",
}
)
void stopClustersWithSpace(String query) {
var result = index.queryStopClusters(query).toList();
assertEquals(List.of(mapper.map(FIVE_POINTS_STATION)), result);
}

@Test
void stopCode() {
var result = index.queryStopClusters(ARTS_CENTER.getCode()).toList();
@ParameterizedTest
@ValueSource(strings = { "4456", "445" })
void fuzzyStopCode(String query) {
var result = index.queryStopClusters(query).toList();
assertEquals(1, result.size());
assertEquals(ARTS_CENTER.getName().toString(), result.get(0).name());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.opentripplanner.ext.geocoder;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Lucene analyzer tailored to stop names.
 * <p>
 * Input is tokenized with the {@link StandardTokenizer}; each token then has English possessives
 * stripped, is lower-cased, dropped if it is an English stop word (at, the...), Porter-stemmed,
 * passed through a {@link CapitalizationFilter} and finally split into NGrams
 * (https://en.wikipedia.org/wiki/N-gram) while the unsplit token is kept as well.
 * <p>
 * Indexing NGrams lets the middle of a stop name be matched efficiently: for example the query
 * of "exanderpl" will match the stop name "Alexanderplatz".
 */
class EnglishNGramAnalyzer extends Analyzer {

  /** Shortest NGram that is emitted. */
  private static final int MIN_GRAM_SIZE = 4;
  /** Longest NGram that is emitted. */
  private static final int MAX_GRAM_SIZE = 10;
  /** Also emit the complete token next to its NGrams. */
  private static final boolean PRESERVE_ORIGINAL = true;

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer tokenizer = new StandardTokenizer();

    // The order of the stages matters: each filter consumes the output of the previous one.
    TokenStream withoutPossessives = new EnglishPossessiveFilter(tokenizer);
    TokenStream lowerCased = new LowerCaseFilter(withoutPossessives);
    TokenStream withoutStopWords = new StopFilter(lowerCased, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
    TokenStream stemmed = new PorterStemFilter(withoutStopWords);
    TokenStream capitalized = new CapitalizationFilter(stemmed);
    TokenStream ngrams = new NGramTokenFilter(
      capitalized,
      MIN_GRAM_SIZE,
      MAX_GRAM_SIZE,
      PRESERVE_ORIGINAL
    );

    return new TokenStreamComponents(tokenizer, ngrams);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import java.util.stream.Collectors;
import org.opentripplanner.api.mapping.FeedScopedIdMapper;
import org.opentripplanner.standalone.api.OtpServerRequestContext;
import org.opentripplanner.street.model.vertex.StreetVertex;
import org.opentripplanner.transit.model.site.StopLocation;

/**
Expand Down Expand Up @@ -47,7 +46,6 @@ public GeocoderResource(@Context OtpServerRequestContext requestContext) {
* @param autocomplete Whether we should use the query string to do a prefix match
* @param stops Search for stops, either by name or stop code
* @param clusters Search for clusters by their name
* @param corners Search for street corners using at least one of the street names
* @return list of results in the format expected by GeocoderBuiltin.js in the OTP Leaflet
* client
*/
Expand All @@ -56,12 +54,11 @@ public Response textSearch(
@QueryParam("query") String query,
@QueryParam("autocomplete") @DefaultValue("false") boolean autocomplete,
@QueryParam("stops") @DefaultValue("true") boolean stops,
@QueryParam("clusters") @DefaultValue("false") boolean clusters,
@QueryParam("corners") @DefaultValue("true") boolean corners
@QueryParam("clusters") @DefaultValue("false") boolean clusters
) {
return Response
.status(Response.Status.OK)
.entity(query(query, autocomplete, stops, clusters, corners))
.entity(query(query, autocomplete, stops, clusters))
.build();
}

Expand All @@ -77,8 +74,7 @@ private List<SearchResult> query(
String query,
boolean autocomplete,
boolean stops,
boolean clusters,
boolean corners
boolean clusters
) {
List<SearchResult> results = new ArrayList<>();

Expand All @@ -90,10 +86,6 @@ private List<SearchResult> query(
results.addAll(queryStations(query, autocomplete));
}

if (corners) {
results.addAll(queryCorners(query, autocomplete));
}

return results;
}

Expand Down Expand Up @@ -127,20 +119,6 @@ private Collection<? extends SearchResult> queryStations(String query, boolean a
.collect(Collectors.toList());
}

private Collection<? extends SearchResult> queryCorners(String query, boolean autocomplete) {
return LuceneIndex
.forServer(serverContext)
.queryStreetVertices(query, autocomplete)
.map(v ->
new SearchResult(v.getLat(), v.getLon(), stringifyStreetVertex(v), v.getLabelString())
)
.collect(Collectors.toList());
}

private String stringifyStreetVertex(StreetVertex v) {
return String.format("%s (%s)", v.getIntersectionName(), v.getLabel());
}

private String stringifyStopLocation(StopLocation sl) {
return sl.getCode() != null
? String.format("%s (%s)", sl.getName(), sl.getCode())
Expand Down
Loading

0 comments on commit 0c97ab9

Please sign in to comment.