diff --git a/pom.xml b/pom.xml index f41f621..8fdebfc 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.elasticsearch elasticsearch-analysis-url - 2.3.3.2 + 2.3.3.3-SNAPSHOT jar Elasticsearch URL token filter plugin diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java index 598dcb5..6fabe1d 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java @@ -1,8 +1,10 @@ package org.elasticsearch.index.analysis.url; +import com.google.common.base.Objects; import com.google.common.base.Optional; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; import com.google.common.net.InetAddresses; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.path.PathHierarchyTokenizer; @@ -20,9 +22,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import java.util.*; import java.util.regex.Pattern; import static org.elasticsearch.index.analysis.url.URLUtils.getPart; @@ -169,12 +169,12 @@ private List<Token> tokenize(String urlString) throws IOException { return tokenize(url, part); } // No part is specified. Tokenize all parts. - List<Token> tokens = new ArrayList<>(); + Set<Token> tokens = new LinkedHashSet<>(); for (URLPart urlPart : URLPart.values()) { tokens.addAll(tokenize(url, urlPart)); } tokens.addAll(tokenizeSpecial(url)); - return tokens; + return Lists.newArrayList(tokens); } catch (MalformedURLException e) { if (allowMalformed) { return tokenizeMalformed(urlString, tokenizeMalformed ? 
part : URLPart.WHOLE); @@ -192,6 +192,14 @@ private List<Token> tokenize(String urlString) throws IOException { * @throws IOException */ private List<Token> tokenizeMalformed(String url, URLPart part) throws IOException { + if (part == null) { + // No part is specified. Tokenize all parts. + List<Token> tokens = new ArrayList<>(); + for (URLPart urlPart : URLPart.values()) { + tokens.addAll(tokenizeMalformed(url, urlPart)); + } + return tokens; + } Optional<String> partOptional = getPart(url, part); if (!partOptional.isPresent() || partOptional.get().equals("")) { // desired part was not found @@ -486,5 +494,27 @@ public Token(String token, URLPart part, int start, int end) { public int getStart() { return start; } public int getEnd() { return end; } + + + @Override + public boolean equals(Object obj) { + if (obj == null || !(obj instanceof Token)) { + return false; + } + Token that = (Token) obj; + return this.start == that.start + && this.end == that.end + && Objects.equal(this.token, that.token) + && Objects.equal(this.part, that.part); + } + + @Override + public int hashCode() { + int result = token != null ? 
token.hashCode() : 0; + result = 31 * result + (part != null ? part.hashCode() : 0); + result = 31 * result + start; + result = 31 * result + end; + return result; + } } } diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java index d971830..3dc6cb1 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java @@ -8,6 +8,7 @@ import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.hasItem; +import static org.hamcrest.CoreMatchers.notNullValue; import static org.hamcrest.collection.IsCollectionWithSize.hasSize; /** @@ -25,6 +26,16 @@ public void testAnalyze() { assertThat(hostTokens, hasSize(1)); assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com"); + + assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query"); + } + + + @Test + public void testAnalyzeWhole() throws Exception { + List tokens = analyzeURL("http://foo.bar.com", "tokenizer_url_all_malformed"); + assertThat(tokens, notNullValue()); + assertThat(tokens, hasSize(7)); } diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java index cb0c1f4..f2b784d 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java @@ -203,6 +203,16 @@ public void testMalformedGetRef() throws Exception { } + @Test + public void testMalformedWhole() throws Exception { + String url = "foo.bar.com/baz.html/query?a=1"; + URLTokenizer tokenizer = createTokenizer(url, URLPart.WHOLE); + tokenizer.setAllowMalformed(true); + 
tokenizer.setTokenizeMalformed(true); + assertTokenStreamContents(tokenizer, stringArray("foo.bar.com/baz.html/query?a=1")); + } + + private URLTokenizer createTokenizer(String input, URLPart part) throws IOException { URLTokenizer tokenizer = new URLTokenizer(part); tokenizer.setReader(new StringReader(input)); diff --git a/src/test/resources/test-settings.json b/src/test/resources/test-settings.json index e6a0268..b0b6607 100644 --- a/src/test/resources/test-settings.json +++ b/src/test/resources/test-settings.json @@ -16,6 +16,11 @@ }, "url_all": { "type": "url" + }, + "url_all_malformed": { + "type": "url", + "allow_malformed": true, + "tokenize_malformed": true } }, "filter": { @@ -97,6 +102,9 @@ }, "tokenizer_url_all": { "tokenizer": "url_all" + }, + "tokenizer_url_all_malformed": { + "tokenizer": "url_all_malformed" } } }