Allow multiple URL parts to be specified for tokenization
jlinn committed Jul 15, 2016
1 parent 7f4bc6a commit 12469d1
Showing 9 changed files with 117 additions and 39 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.3.3 | 2.3.3.2 |
| 2.3.3 | 2.3.3.3 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
| 2.3.0 | 2.3.0.1 |
@@ -26,13 +26,13 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.3.2/elasticsearch-analysis-url-2.3.3.2.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.3.3/elasticsearch-analysis-url-2.3.3.3.zip
```

## Usage
### URL Tokenizer
#### Options:
* `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
* `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Can be either a string (single URL part) or an array of multiple URL parts; see the example after this list. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
* `url_decode`: Defaults to `false`. If `true`, URL tokens will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, malformed URLs will not be rejected, but will be passed through without being tokenized.
* `tokenize_malformed`: Defaults to `false`. Has no effect if `allow_malformed` is `false`. If both are `true`, an attempt will be made to tokenize malformed URLs using regular expressions.
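
To request several parts at once, list them in the `part` array. Below is a minimal settings sketch: the analyzer wiring is illustrative, but the tokenizer definition mirrors the `url_protocol_and_host` entry added to this commit's test settings.

```json
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "url_protocol_and_host": {
          "type": "url",
          "part": ["protocol", "host"]
        }
      },
      "analyzer": {
        "tokenizer_url_protocol_and_host": {
          "tokenizer": "url_protocol_and_host"
        }
      }
    }
  }
}
```

Analyzing `http://www.foo.bar.com:9200` with this analyzer should include `http` along with the host tokens `www.foo.bar.com`, `foo.bar.com`, `bar.com`, and `com`, matching the assertions in the new `tokenizer_url_protocol_and_host` test.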
@@ -92,9 +92,9 @@ curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'ht
### URL Token Filter
#### Options:
* `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
`protocol`, `host`, `port`, `path`, `query`, and `ref`.
`protocol`, `host`, `port`, `path`, `query`, and `ref`. Can be either a single URL part (string) or an array of URL parts; see the sketch after this list.
* `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
If the desired part cannot be found, no value will be indexed for that field.
* `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-url tokens will be passed through the filter. Valid URLs will be tokenized according to the filter's other settings.
* `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
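
The array form works the same way for the filter. A hedged sketch (the filter and analyzer names here are illustrative, not taken from this commit; `tokenize_host` is disabled so the host is emitted whole):

```json
{
  "settings": {
    "analysis": {
      "filter": {
        "url_host_and_port": {
          "type": "url",
          "part": ["host", "port"],
          "tokenize_host": false
        }
      },
      "analyzer": {
        "url_host_and_port_analyzer": {
          "tokenizer": "whitespace",
          "filter": ["url_host_and_port"]
        }
      }
    }
  }
}
```

Given the token `http://www.foo.bar.com:9200`, this filter should produce `www.foo.bar.com` and `9200`.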
2 changes: 1 addition & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.3.3.3-SNAPSHOT</version>
<version>2.3.3.3</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java
@@ -1,5 +1,7 @@
package org.elasticsearch.index.analysis;

import com.google.common.base.Function;
import com.google.common.collect.FluentIterable;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -8,13 +10,15 @@
import org.elasticsearch.index.analysis.url.URLTokenFilter;
import org.elasticsearch.index.settings.IndexSettingsService;

import java.util.List;

/**
* Joe Linn
* 1/17/2015
*/
@AnalysisSettingsRequired
public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
private final URLPart part;
private final List<URLPart> parts;
private final boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
@@ -27,7 +31,14 @@ public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);

this.part = URLPart.fromString(settings.get("part", "whole"));
this.parts = FluentIterable.of(settings.getAsArray("part", new String[]{"whole"}))
.transform(new Function<String, URLPart>() {
@Override
public URLPart apply(String input) {
return URLPart.fromString(input);
}
}).toList();

this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
@@ -39,7 +50,8 @@ public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @A

@Override
public TokenStream create(TokenStream tokenStream) {
return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed, passthrough)
return new URLTokenFilter(tokenStream, null, urlDecode, allowMalformed, passthrough)
.setParts(parts)
.setTokenizeMalformed(tokenizeMalformed)
.setTokenizeHost(tokenizeHost)
.setTokenizePath(tokenizePath)
src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java
@@ -1,6 +1,7 @@
package org.elasticsearch.index.analysis;

import com.google.common.base.Strings;
import com.google.common.base.Function;
import com.google.common.collect.FluentIterable;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -9,13 +10,15 @@
import org.elasticsearch.index.analysis.url.URLTokenizer;
import org.elasticsearch.index.settings.IndexSettingsService;

import java.util.List;

/**
* Joe Linn
* 8/1/2015
*/
@AnalysisSettingsRequired
public class URLTokenizerFactory extends AbstractTokenizerFactory {
private URLPart part;
private List<URLPart> parts;
private boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
@@ -28,9 +31,14 @@ public class URLTokenizerFactory extends AbstractTokenizerFactory {
public URLTokenizerFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);

String partString = settings.get("part");
if (!Strings.isNullOrEmpty(partString)) {
this.part = URLPart.fromString(partString);
String[] parts = settings.getAsArray("part");
if (parts != null && parts.length > 0) {
this.parts = FluentIterable.of(parts).transform(new Function<String, URLPart>() {
@Override
public URLPart apply(String input) {
return URLPart.fromString(input);
}
}).toList();
}
this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
@@ -44,7 +52,7 @@ public URLTokenizerFactory(Index index, IndexSettingsService indexSettings, @Ass
@Override
public Tokenizer create() {
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setPart(part);
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDecode);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -25,7 +25,7 @@
public final class URLTokenFilter extends TokenFilter {
public static final String NAME = "url";

private final URLPart part;
private List<URLPart> parts;

private final boolean urlDecode;

@@ -69,13 +69,22 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolea

public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
super(input);
this.part = part;
if (part != null) {
this.parts = ImmutableList.of(part);
} else {
this.parts = null;
}
this.urlDecode = urlDecode;
this.allowMalformed = allowMalformed;
this.passthrough = passthrough;
}


public URLTokenFilter setParts(List<URLPart> parts) {
this.parts = parts;
return this;
}

public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
this.tokenizeHost = tokenizeHost;
return this;
@@ -99,7 +108,7 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {

@Override
public boolean incrementToken() throws IOException {
if(iterator == null || !iterator.hasNext()){
if (iterator == null || !iterator.hasNext()) {
if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
return false;
}
@@ -157,7 +166,8 @@ private boolean advance() throws IOException {
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer(part);
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDecode);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
@@ -190,18 +200,31 @@ public void reset() throws IOException {
* @return the url part if it can be parsed, null otherwise
*/
private String parseMalformed(String urlString) {
switch (part) {
case PROTOCOL:
return applyPattern(REGEX_PROTOCOL, urlString);
case PORT:
return applyPattern(REGEX_PORT, urlString);
case QUERY:
return applyPattern(REGEX_QUERY, urlString);
case WHOLE:
return urlString;
default:
return urlString;
if (parts != null && !parts.isEmpty()) {
String ret;
for (URLPart part : parts) {
switch (part) {
case PROTOCOL:
ret = applyPattern(REGEX_PROTOCOL, urlString);
break;
case PORT:
ret = applyPattern(REGEX_PORT, urlString);
break;
case QUERY:
ret = applyPattern(REGEX_QUERY, urlString);
break;
case WHOLE:
ret = urlString;
break;
default:
ret = urlString;
}
if (!Strings.isNullOrEmpty(ret)) {
return ret;
}
}
}
return urlString;
}

/**
src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -37,7 +37,7 @@ public final class URLTokenizer extends Tokenizer {
/**
* If set, only the given part of the url will be tokenized.
*/
private URLPart part;
private List<URLPart> parts;

/**
* If true, url parts will be url decoded prior to tokenization.
@@ -84,16 +84,21 @@ public URLTokenizer() {
}

public URLTokenizer(URLPart part) {
this.part = part;
setPart(part);
}


public URLTokenizer(AttributeFactory factory) {
super(factory);
}

public void setParts(List<URLPart> parts) { this.parts = parts; }

public void setPart(URLPart part) { this.part = part; }
public void setPart(URLPart part) {
if (part != null) {
this.parts = ImmutableList.of(part);
}
}

public void setUrlDecode(boolean urlDecode) { this.urlDecode = urlDecode; }

@@ -164,9 +169,12 @@ private String readerToString(Reader reader) throws IOException {
private List<Token> tokenize(String urlString) throws IOException {
try {
URL url = new URL(urlString);
if (part != null) {
// single URL part
return tokenize(url, part);
if (parts != null && !parts.isEmpty()) {
List<Token> tokens = new ArrayList<>();
for (URLPart part : parts) {
tokens.addAll(tokenize(url, part));
}
return tokens;
}
// No part is specified. Tokenize all parts.
Set<Token> tokens = new HashSet<>();
@@ -177,7 +185,14 @@
return Lists.newArrayList(tokens);
} catch (MalformedURLException e) {
if (allowMalformed) {
return tokenizeMalformed(urlString, tokenizeMalformed ? part : URLPart.WHOLE);
if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
List<Token> tokens = new ArrayList<>();
for (URLPart part : parts) {
tokens.addAll(tokenizeMalformed(urlString, part));
}
return tokens;
}
return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE);
}
throw new IOException("Malformed URL: " + urlString, e);
}
URLAnalysisTest.java
@@ -27,6 +27,8 @@ public void testAnalyze() {

assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");

assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol_and_host", "http", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");

assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
}

src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -1,5 +1,6 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -213,8 +214,18 @@ public void testMalformedWhole() throws Exception {
}


private URLTokenizer createTokenizer(String input, URLPart part) throws IOException {
URLTokenizer tokenizer = new URLTokenizer(part);
@Test
public void testProtocolAndPort() throws Exception {
URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL, URLPart.PORT);
assertTokenStreamContents(tokenizer, stringArray("http", "9200"));
}


private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
URLTokenizer tokenizer = new URLTokenizer();
if (parts != null) {
tokenizer.setParts(ImmutableList.copyOf(parts));
}
tokenizer.setReader(new StringReader(input));
return tokenizer;
}
7 changes: 7 additions & 0 deletions src/test/resources/test-settings.json
@@ -14,6 +14,10 @@
"part": "host",
"tokenize_host": false
},
"url_protocol_and_host": {
"type": "url",
"part": ["protocol", "host"]
},
"url_all": {
"type": "url"
},
@@ -100,6 +104,9 @@
"tokenizer_url_host_single": {
"tokenizer": "url_host_single"
},
"tokenizer_url_protocol_and_host": {
"tokenizer": "url_protocol_and_host"
},
"tokenizer_url_all": {
"tokenizer": "url_all"
},
