Skip to content

Commit

Permalink
Fix NPE when no URL part is specified when tokenizing a malformed URL
Browse files Browse the repository at this point in the history
  • Loading branch information
jlinn committed Jun 27, 2016
1 parent d4369c1 commit 7f4bc6a
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.3.3.2</version>
<version>2.3.3.3-SNAPSHOT</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.net.InetAddresses;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
Expand All @@ -20,9 +22,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.*;
import java.util.regex.Pattern;

import static org.elasticsearch.index.analysis.url.URLUtils.getPart;
Expand Down Expand Up @@ -169,12 +169,12 @@ private List<Token> tokenize(String urlString) throws IOException {
return tokenize(url, part);
}
// No part is specified. Tokenize all parts.
List<Token> tokens = new ArrayList<>();
Set<Token> tokens = new HashSet<>();
for (URLPart urlPart : URLPart.values()) {
tokens.addAll(tokenize(url, urlPart));
}
tokens.addAll(tokenizeSpecial(url));
return tokens;
return Lists.newArrayList(tokens);
} catch (MalformedURLException e) {
if (allowMalformed) {
return tokenizeMalformed(urlString, tokenizeMalformed ? part : URLPart.WHOLE);
Expand All @@ -192,6 +192,14 @@ private List<Token> tokenize(String urlString) throws IOException {
* @throws IOException
*/
private List<Token> tokenizeMalformed(String url, URLPart part) throws IOException {
if (part == null) {
// No part is specified. Tokenize all parts.
List<Token> tokens = new ArrayList<>();
for (URLPart urlPart : URLPart.values()) {
tokens.addAll(tokenizeMalformed(url, urlPart));
}
return tokens;
}
Optional<String> partOptional = getPart(url, part);
if (!partOptional.isPresent() || partOptional.get().equals("")) {
// desired part was not found
Expand Down Expand Up @@ -486,5 +494,27 @@ public Token(String token, URLPart part, int start, int end) {
public int getStart() { return start; }

public int getEnd() { return end; }


@Override
public boolean equals(Object obj) {
    // Identity fast path: a token is always equal to itself.
    if (this == obj) {
        return true;
    }
    // instanceof evaluates to false for null, so a separate null check is redundant.
    if (!(obj instanceof Token)) {
        return false;
    }
    Token that = (Token) obj;
    // Guava Objects.equal is null-safe for token text and part; the int
    // offsets are compared directly. Must stay consistent with hashCode().
    return this.start == that.start
            && this.end == that.end
            && Objects.equal(this.token, that.token)
            && Objects.equal(this.part, that.part);
}

@Override
public int hashCode() {
    // Must be consistent with equals(): equals() tolerates a null part via
    // Objects.equal, so hashCode() must also be null-safe for part —
    // otherwise a Token with a null part throws NPE when stored in a HashSet,
    // which is exactly how tokenize() now deduplicates tokens.
    int result = token != null ? token.hashCode() : 0;
    result = 31 * result + (part != null ? part.hashCode() : 0);
    result = 31 * result + start;
    result = 31 * result + end;
    return result;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.hasItem;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;

/**
Expand All @@ -25,6 +26,16 @@ public void testAnalyze() {
assertThat(hostTokens, hasSize(1));

assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");

assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
}


@Test
public void testAnalyzeWhole() throws Exception {
    // A well-formed URL passed through the malformed-tolerant analyzer should
    // still produce one token per URL part (7 in total) rather than failing.
    final List<AnalyzeResponse.AnalyzeToken> analyzed =
            analyzeURL("http://foo.bar.com", "tokenizer_url_all_malformed");
    assertThat(analyzed, notNullValue());
    assertThat(analyzed, hasSize(7));
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,16 @@ public void testMalformedGetRef() throws Exception {
}


@Test
public void testMalformedWhole() throws Exception {
    // Tokenizing a malformed URL with part == WHOLE must emit the raw input
    // string as a single token instead of throwing.
    final String malformed = "foo.bar.com/baz.html/query?a=1";
    final URLTokenizer tokenizer = createTokenizer(malformed, URLPart.WHOLE);
    tokenizer.setAllowMalformed(true);
    tokenizer.setTokenizeMalformed(true);
    assertTokenStreamContents(tokenizer, stringArray(malformed));
}


private URLTokenizer createTokenizer(String input, URLPart part) throws IOException {
URLTokenizer tokenizer = new URLTokenizer(part);
tokenizer.setReader(new StringReader(input));
Expand Down
8 changes: 8 additions & 0 deletions src/test/resources/test-settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
},
"url_all": {
"type": "url"
},
"url_all_malformed": {
"type": "url",
"allow_malformed": true,
"tokenize_malformed": true
}
},
"filter": {
Expand Down Expand Up @@ -97,6 +102,9 @@
},
"tokenizer_url_all": {
"tokenizer": "url_all"
},
"tokenizer_url_all_malformed": {
"tokenizer": "url_all_malformed"
}
}
}
Expand Down

0 comments on commit 7f4bc6a

Please sign in to comment.