diff --git a/pom.xml b/pom.xml
index f41f621..8fdebfc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
org.elasticsearch
elasticsearch-analysis-url
- 2.3.3.2
+ 2.3.3.3-SNAPSHOT
jar
Elasticsearch URL token filter plugin
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
index 598dcb5..6fabe1d 100644
--- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -1,8 +1,10 @@
package org.elasticsearch.index.analysis.url;
+import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
import com.google.common.net.InetAddresses;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
@@ -20,9 +22,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
import java.util.regex.Pattern;
import static org.elasticsearch.index.analysis.url.URLUtils.getPart;
@@ -169,12 +169,12 @@ private List tokenize(String urlString) throws IOException {
return tokenize(url, part);
}
// No part is specified. Tokenize all parts.
- List tokens = new ArrayList<>();
+ Set<Token> tokens = new LinkedHashSet<>();
for (URLPart urlPart : URLPart.values()) {
tokens.addAll(tokenize(url, urlPart));
}
tokens.addAll(tokenizeSpecial(url));
- return tokens;
+ return Lists.newArrayList(tokens);
} catch (MalformedURLException e) {
if (allowMalformed) {
return tokenizeMalformed(urlString, tokenizeMalformed ? part : URLPart.WHOLE);
@@ -192,6 +192,14 @@ private List tokenize(String urlString) throws IOException {
* @throws IOException
*/
private List tokenizeMalformed(String url, URLPart part) throws IOException {
+ if (part == null) {
+ // No part is specified. Tokenize all parts.
+ List tokens = new ArrayList<>();
+ for (URLPart urlPart : URLPart.values()) {
+ tokens.addAll(tokenizeMalformed(url, urlPart));
+ }
+ return tokens;
+ }
Optional partOptional = getPart(url, part);
if (!partOptional.isPresent() || partOptional.get().equals("")) {
// desired part was not found
@@ -486,5 +494,27 @@ public Token(String token, URLPart part, int start, int end) {
public int getStart() { return start; }
public int getEnd() { return end; }
+
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || !(obj instanceof Token)) {
+ return false;
+ }
+ Token that = (Token) obj;
+ return this.start == that.start
+ && this.end == that.end
+ && Objects.equal(this.token, that.token)
+ && Objects.equal(this.part, that.part);
+ }
+
+ @Override
+ public int hashCode() {
+ int result = token != null ? token.hashCode() : 0;
+ result = 31 * result + (part != null ? part.hashCode() : 0);
+ result = 31 * result + start;
+ result = 31 * result + end;
+ return result;
+ }
}
}
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java
index d971830..3dc6cb1 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java
@@ -8,6 +8,7 @@
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.hasItem;
+import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
/**
@@ -25,6 +26,16 @@ public void testAnalyze() {
assertThat(hostTokens, hasSize(1));
assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");
+
+ assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
+ }
+
+
+ @Test
+ public void testAnalyzeWhole() throws Exception {
+ List tokens = analyzeURL("http://foo.bar.com", "tokenizer_url_all_malformed");
+ assertThat(tokens, notNullValue());
+ assertThat(tokens, hasSize(7));
}
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
index cb0c1f4..f2b784d 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -203,6 +203,16 @@ public void testMalformedGetRef() throws Exception {
}
+ @Test
+ public void testMalformedWhole() throws Exception {
+ String url = "foo.bar.com/baz.html/query?a=1";
+ URLTokenizer tokenizer = createTokenizer(url, URLPart.WHOLE);
+ tokenizer.setAllowMalformed(true);
+ tokenizer.setTokenizeMalformed(true);
+ assertTokenStreamContents(tokenizer, stringArray("foo.bar.com/baz.html/query?a=1"));
+ }
+
+
private URLTokenizer createTokenizer(String input, URLPart part) throws IOException {
URLTokenizer tokenizer = new URLTokenizer(part);
tokenizer.setReader(new StringReader(input));
diff --git a/src/test/resources/test-settings.json b/src/test/resources/test-settings.json
index e6a0268..b0b6607 100644
--- a/src/test/resources/test-settings.json
+++ b/src/test/resources/test-settings.json
@@ -16,6 +16,11 @@
},
"url_all": {
"type": "url"
+ },
+ "url_all_malformed": {
+ "type": "url",
+ "allow_malformed": true,
+ "tokenize_malformed": true
}
},
"filter": {
@@ -97,6 +102,9 @@
},
"tokenizer_url_all": {
"tokenizer": "url_all"
+ },
+ "tokenizer_url_all_malformed": {
+ "tokenizer": "url_all_malformed"
}
}
}