Allow multiple URL parts to be specified for tokenization
jlinn committed Jul 15, 2016
1 parent 7f4bc6a commit 12469d1
Showing 9 changed files with 117 additions and 39 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.3.3 | 2.3.3.2 |
| 2.3.3 | 2.3.3.3 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
| 2.3.0 | 2.3.0.1 |
@@ -26,13 +26,13 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.3.2/elasticsearch-analysis-url-2.3.3.2.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.3.3/elasticsearch-analysis-url-2.3.3.3.zip
```

## Usage
### URL Tokenizer
#### Options:
* `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
* `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Can be either a string (single URL part) or an array of multiple URL parts; see the example after this list. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
* `url_decode`: Defaults to `false`. If `true`, URL tokens will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, malformed URLs will not be rejected, but will be passed through without being tokenized.
* `tokenize_malformed`: Defaults to `false`. Has no effect if `allow_malformed` is `false`. If both are `true`, an attempt will be made to tokenize malformed URLs using regular expressions.
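
To request several parts at once, list them in the `part` array. Below is a minimal settings sketch: the analyzer wiring is illustrative, but the tokenizer definition mirrors the `url_protocol_and_host` entry added to this commit's test settings.

```json
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "url_protocol_and_host": {
          "type": "url",
          "part": ["protocol", "host"]
        }
      },
      "analyzer": {
        "tokenizer_url_protocol_and_host": {
          "tokenizer": "url_protocol_and_host"
        }
      }
    }
  }
}
```

Analyzing `http://www.foo.bar.com:9200` with this analyzer should include `http` along with the host tokens `www.foo.bar.com`, `foo.bar.com`, `bar.com`, and `com`, matching the assertions in the new `tokenizer_url_protocol_and_host` test.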
@@ -92,9 +92,9 @@ curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'ht
### URL Token Filter
#### Options:
* `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
`protocol`, `host`, `port`, `path`, `query`, and `ref`.
`protocol`, `host`, `port`, `path`, `query`, and `ref`. Can be either a single URL part (string) or an array of URL parts; see the sketch after this list.
* `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
If the desired part cannot be found, no value will be indexed for that field.
* `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-url tokens will be passed through the filter. Valid URLs will be tokenized according to the filter's other settings.
* `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
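
The array form works the same way for the filter. A hedged sketch (the filter and analyzer names here are illustrative, not taken from this commit; `tokenize_host` is disabled so the host is emitted whole):

```json
{
  "settings": {
    "analysis": {
      "filter": {
        "url_host_and_port": {
          "type": "url",
          "part": ["host", "port"],
          "tokenize_host": false
        }
      },
      "analyzer": {
        "url_host_and_port_analyzer": {
          "tokenizer": "whitespace",
          "filter": ["url_host_and_port"]
        }
      }
    }
  }
}
```

Given the token `http://www.foo.bar.com:9200`, this filter should produce `www.foo.bar.com` and `9200`.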
2 changes: 1 addition & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.3.3.3-SNAPSHOT</version>
<version>2.3.3.3</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java
@@ -1,5 +1,7 @@
package org.elasticsearch.index.analysis;

import com.google.common.base.Function;
import com.google.common.collect.FluentIterable;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -8,13 +10,15 @@
import org.elasticsearch.index.analysis.url.URLTokenFilter;
import org.elasticsearch.index.settings.IndexSettingsService;

import java.util.List;

/**
* Joe Linn
* 1/17/2015
*/
@AnalysisSettingsRequired
public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
private final URLPart part;
private final List<URLPart> parts;
private final boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
@@ -27,7 +31,14 @@ public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);

this.part = URLPart.fromString(settings.get("part", "whole"));
this.parts = FluentIterable.of(settings.getAsArray("part", new String[]{"whole"}))
.transform(new Function<String, URLPart>() {
@Override
public URLPart apply(String input) {
return URLPart.fromString(input);
}
}).toList();

this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
@@ -39,7 +50,8 @@ public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @A

@Override
public TokenStream create(TokenStream tokenStream) {
return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed, passthrough)
return new URLTokenFilter(tokenStream, null, urlDecode, allowMalformed, passthrough)
.setParts(parts)
.setTokenizeMalformed(tokenizeMalformed)
.setTokenizeHost(tokenizeHost)
.setTokenizePath(tokenizePath)
src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java
@@ -1,6 +1,7 @@
package org.elasticsearch.index.analysis;

import com.google.common.base.Strings;
import com.google.common.base.Function;
import com.google.common.collect.FluentIterable;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -9,13 +10,15 @@
import org.elasticsearch.index.analysis.url.URLTokenizer;
import org.elasticsearch.index.settings.IndexSettingsService;

import java.util.List;

/**
* Joe Linn
* 8/1/2015
*/
@AnalysisSettingsRequired
public class URLTokenizerFactory extends AbstractTokenizerFactory {
private URLPart part;
private List<URLPart> parts;
private boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
@@ -28,9 +31,14 @@ public class URLTokenizerFactory extends AbstractTokenizerFactory {
public URLTokenizerFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);

String partString = settings.get("part");
if (!Strings.isNullOrEmpty(partString)) {
this.part = URLPart.fromString(partString);
String[] parts = settings.getAsArray("part");
if (parts != null && parts.length > 0) {
this.parts = FluentIterable.of(parts).transform(new Function<String, URLPart>() {
@Override
public URLPart apply(String input) {
return URLPart.fromString(input);
}
}).toList();
}
this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
@@ -44,7 +52,7 @@ public URLTokenizerFactory(Index index, IndexSettingsService indexSettings, @Ass
@Override
public Tokenizer create() {
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setPart(part);
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDecode);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -25,7 +25,7 @@
public final class URLTokenFilter extends TokenFilter {
public static final String NAME = "url";

private final URLPart part;
private List<URLPart> parts;

private final boolean urlDecode;

@@ -69,13 +69,22 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolea

public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
super(input);
this.part = part;
if (part != null) {
this.parts = ImmutableList.of(part);
} else {
this.parts = null;
}
this.urlDecode = urlDecode;
this.allowMalformed = allowMalformed;
this.passthrough = passthrough;
}


public URLTokenFilter setParts(List<URLPart> parts) {
this.parts = parts;
return this;
}

public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
this.tokenizeHost = tokenizeHost;
return this;
@@ -99,7 +108,7 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {

@Override
public boolean incrementToken() throws IOException {
if(iterator == null || !iterator.hasNext()){
if (iterator == null || !iterator.hasNext()) {
if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
return false;
}
@@ -157,7 +166,8 @@ private boolean advance() throws IOException {
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer(part);
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDecode);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
@@ -190,18 +200,31 @@ public void reset() throws IOException {
* @return the url part if it can be parsed, null otherwise
*/
private String parseMalformed(String urlString) {
switch (part) {
case PROTOCOL:
return applyPattern(REGEX_PROTOCOL, urlString);
case PORT:
return applyPattern(REGEX_PORT, urlString);
case QUERY:
return applyPattern(REGEX_QUERY, urlString);
case WHOLE:
return urlString;
default:
return urlString;
if (parts != null && !parts.isEmpty()) {
String ret;
for (URLPart part : parts) {
switch (part) {
case PROTOCOL:
ret = applyPattern(REGEX_PROTOCOL, urlString);
break;
case PORT:
ret = applyPattern(REGEX_PORT, urlString);
break;
case QUERY:
ret = applyPattern(REGEX_QUERY, urlString);
break;
case WHOLE:
ret = urlString;
break;
default:
ret = urlString;
}
if (!Strings.isNullOrEmpty(ret)) {
return ret;
}
}
}
return urlString;
}

/**
src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -37,7 +37,7 @@ public final class URLTokenizer extends Tokenizer {
/**
* If set, only the given part of the url will be tokenized.
*/
private URLPart part;
private List<URLPart> parts;

/**
* If true, url parts will be url decoded prior to tokenization.
@@ -84,16 +84,21 @@ public URLTokenizer() {
}

public URLTokenizer(URLPart part) {
this.part = part;
setPart(part);
}


public URLTokenizer(AttributeFactory factory) {
super(factory);
}

public void setParts(List<URLPart> parts) { this.parts = parts; }

public void setPart(URLPart part) { this.part = part; }
public void setPart(URLPart part) {
if (part != null) {
this.parts = ImmutableList.of(part);
}
}

public void setUrlDecode(boolean urlDecode) { this.urlDecode = urlDecode; }

@@ -164,9 +169,12 @@ private String readerToString(Reader reader) throws IOException {
private List<Token> tokenize(String urlString) throws IOException {
try {
URL url = new URL(urlString);
if (part != null) {
// single URL part
return tokenize(url, part);
if (parts != null && !parts.isEmpty()) {
List<Token> tokens = new ArrayList<>();
for (URLPart part : parts) {
tokens.addAll(tokenize(url, part));
}
return tokens;
}
// No part is specified. Tokenize all parts.
Set<Token> tokens = new HashSet<>();
@@ -177,7 +185,14 @@
return Lists.newArrayList(tokens);
} catch (MalformedURLException e) {
if (allowMalformed) {
return tokenizeMalformed(urlString, tokenizeMalformed ? part : URLPart.WHOLE);
if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
List<Token> tokens = new ArrayList<>();
for (URLPart part : parts) {
tokens.addAll(tokenizeMalformed(urlString, part));
}
return tokens;
}
return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE);
}
throw new IOException("Malformed URL: " + urlString, e);
}
URLAnalysisTest.java
@@ -27,6 +27,8 @@ public void testAnalyze() {

assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");

assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol_and_host", "http", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");

assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
}

src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -1,5 +1,6 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -213,8 +214,18 @@ public void testMalformedWhole() throws Exception {
}


private URLTokenizer createTokenizer(String input, URLPart part) throws IOException {
URLTokenizer tokenizer = new URLTokenizer(part);
@Test
public void testProtocolAndPort() throws Exception {
URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL, URLPart.PORT);
assertTokenStreamContents(tokenizer, stringArray("http", "9200"));
}


private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
URLTokenizer tokenizer = new URLTokenizer();
if (parts != null) {
tokenizer.setParts(ImmutableList.copyOf(parts));
}
tokenizer.setReader(new StringReader(input));
return tokenizer;
}
7 changes: 7 additions & 0 deletions src/test/resources/test-settings.json
@@ -14,6 +14,10 @@
"part": "host",
"tokenize_host": false
},
"url_protocol_and_host": {
"type": "url",
"part": ["protocol", "host"]
},
"url_all": {
"type": "url"
},
@@ -100,6 +104,9 @@
"tokenizer_url_host_single": {
"tokenizer": "url_host_single"
},
"tokenizer_url_protocol_and_host": {
"tokenizer": "url_protocol_and_host"
},
"tokenizer_url_all": {
"tokenizer": "url_all"
},
