Add passthrough option to URLTokenFilter. Delegate to URLTokenizer for tokenization in URLTokenFilter.
jlinn committed Jan 25, 2016
1 parent 8ce9054 commit e1e41a9
Showing 8 changed files with 163 additions and 36 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -9,6 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.1.1 | 2.2.0 |
| 2.1.1 | 2.1.1 |
| 2.0.0 | 2.1.0 |
| 1.6.x, 1.7.x | 2.0.0 |
@@ -18,7 +19,7 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.1.1/elasticsearch-analysis-url-2.1.1.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.2.0/elasticsearch-analysis-url-2.2.0.zip
```

## Usage
@@ -87,6 +88,10 @@ curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'ht
* `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
If the desired part cannot be found, no value will be indexed for that field.
* `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-URL tokens will be passed through the filter rather than being dropped. Valid URLs will be tokenized according to the filter's other settings (see the sketch after this list).
* `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
* `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
* `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
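
For instance, a filter and analyzer pair using the new `passthrough` option could be declared as follows; this is a minimal sketch mirroring the `url_host_passthrough` entries added to `test-settings.json` in this commit, and the names are arbitrary:

```json
{
  "settings": {
    "analysis": {
      "filter": {
        "url_host_passthrough": {
          "type": "url",
          "part": "host",
          "passthrough": true
        }
      },
      "analyzer": {
        "url_host_passthrough": {
          "tokenizer": "whitespace",
          "filter": ["url_host_passthrough"]
        }
      }
    }
  }
}
```

Analyzing `http://foo.com:9200/foo.bar baz bat.blah` with this analyzer yields `foo.com`, `com`, `baz`, and `bat.blah`, as exercised by the passthrough test in this commit.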

#### Example:
Set up your index like so:
2 changes: 1 addition & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.1.1</version>
<version>2.2.0</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

URLTokenFilterFactory.java
@@ -16,19 +16,30 @@
public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
private final URLPart part;
private final boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
private boolean tokenizeQuery;
private final boolean allowMalformed;
private final boolean passthrough;

@Inject
public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);

this.part = URLPart.fromString(settings.get("part", "whole"));
this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
this.passthrough = settings.getAsBoolean("passthrough", false);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed);
return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed, passthrough)
.setTokenizeHost(tokenizeHost)
.setTokenizePath(tokenizePath)
.setTokenizeQuery(tokenizeQuery);
}
}
139 changes: 112 additions & 27 deletions src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -1,15 +1,20 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.analysis.URLPart;

import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -24,11 +29,29 @@ public final class URLTokenFilter extends TokenFilter {

private final boolean urlDeocde;

/**
* If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
*/
private boolean tokenizeHost = true;

/**
* If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
*/
private boolean tokenizePath = true;

/**
* If true, the url's query string will be split on &
*/
private boolean tokenizeQuery = true;

private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);

private final boolean allowMalformed;

private boolean parsed;
private boolean passthrough;

private List<String> tokens;
private Iterator<String> iterator;

public URLTokenFilter(TokenStream input, URLPart part) {
this(input, part, false);
@@ -39,49 +62,111 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
}

public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed) {
this(input, part, urlDecode, allowMalformed, false);
}

public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
super(input);
this.part = part;
this.urlDeocde = urlDecode;
this.allowMalformed = allowMalformed;
this.passthrough = passthrough;
}


public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
this.tokenizeHost = tokenizeHost;
return this;
}

public URLTokenFilter setTokenizePath(boolean tokenizePath) {
this.tokenizePath = tokenizePath;
return this;
}

public URLTokenFilter setTokenizeQuery(boolean tokenizeQuery) {
this.tokenizeQuery = tokenizeQuery;
return this;
}


@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken() && !parsed) {
final String urlString = termAttribute.toString();
termAttribute.setEmpty();
if (Strings.isNullOrEmpty(urlString) || urlString.equals("null")) {
if(iterator == null || !iterator.hasNext()){
if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
return false;
}
}
clearAttributes();
String next = iterator.next();
if (allowMalformed) {
next = parseMalformed(next);
}
termAttribute.append(next);
return true;
}


/**
* Advance to the next token, if any
* @return true if more tokens are forthcoming, false otherwise
* @throws IOException
*/
private boolean advance() throws IOException {
if (input.incrementToken()) {
String urlString = termAttribute.toString();
if ((Strings.isNullOrEmpty(urlString) || "null".equals(urlString)) && !allowMalformed && !passthrough) {
return false;
}
String partString;
try {
URL url = new URL(urlString);
partString = URLUtils.getPart(url, part);
parsed = !Strings.isNullOrEmpty(partString);
} catch (MalformedURLException e) {
if (allowMalformed) {
partString = parseMalformed(urlString);
if (Strings.isNullOrEmpty(partString)) {
return false;
tokens = tokenize(urlString);
} catch (IOException e) {
if (e.getMessage().contains("Malformed URL")) {
if (allowMalformed) {
tokens = ImmutableList.of(urlString);
} else {
throw new MalformedURLException("Malformed URL: " + urlString);
}
parsed = true;
} else {
throw e;
}
throw e;
}
if (urlDeocde) {
partString = URLDecoder.decode(partString, "UTF-8");
}
termAttribute.append(partString);
iterator = tokens.iterator();
return true;
} else {
return false;
}
return false;
}


/**
* Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
* will be passed along to the tokenizer.
* @param input a string to be tokenized
* @return a list of tokens extracted from the input string
* @throws IOException
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer(part);
tokenizer.setUrlDecode(urlDeocde);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
tokenizer.setTokenizeQuery(tokenizeQuery);
tokenizer.setAllowMalformed(allowMalformed || passthrough);
tokenizer.setReader(new StringReader(input));
tokenizer.reset();
while (tokenizer.incrementToken()) {
tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
}
return tokens;
}


@Override
public void reset() throws IOException {
super.reset();
parsed = false;
tokens = null;
iterator = null;
}

private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)");
@@ -104,7 +189,7 @@ private String parseMalformed(String urlString) {
case WHOLE:
return urlString;
default:
return null;
return urlString;
}
}

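As a rough usage sketch (not part of this commit; it assumes Lucene 5.x's `WhitespaceTokenizer` on the classpath, as Elasticsearch 2.x provides), the rewritten filter can also be driven directly. The input and expected terms mirror the passthrough integration test further down:

```java
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

import java.io.StringReader;

public class PassthroughSketch {
    public static void main(String[] args) throws Exception {
        // Split on whitespace first, as the test analyzers do; "baz" is not a URL.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("http://foo.com:9200/foo.bar baz"));

        // passthrough = true: non-URL tokens are emitted unchanged instead of
        // failing the stream (allow_malformed is implied).
        URLTokenFilter filter = new URLTokenFilter(tokenizer, URLPart.HOST, false, false, true);
        CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);

        filter.reset();
        while (filter.incrementToken()) {
            // With tokenize_host left at its default of true, this prints:
            // foo.com, com, baz
            System.out.println(term.toString());
        }
        filter.end();
        filter.close();
    }
}
```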
URLTokenizer.java
@@ -287,7 +287,7 @@ private int getEndIndex(int start, String partStringRaw) {
* @return a list of tokens
* @throws IOException
*/
List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
tokenizer.reset();
List<Token> tokens = new ArrayList<>();
OffsetAttribute offset;
URLTokenFilterIntegrationTest.java
@@ -12,6 +12,7 @@

import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTPS_URL;
import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTP_URL;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;

/**
@@ -65,6 +66,18 @@ public void testMalformed() {
assertEquals("found a doc missing http_malformed.port", 1, hits.getTotalHits());
}


@Test
public void testPassthrough() {
List<AnalyzeResponse.AnalyzeToken> tokens = analyzeURL("http://foo.com:9200/foo.bar baz bat.blah", "url_host_passthrough");
assertThat(tokens, hasSize(4));
assertThat(tokens.get(0).getTerm(), equalTo("foo.com"));
assertThat(tokens.get(1).getTerm(), equalTo("com"));
assertThat(tokens.get(2).getTerm(), equalTo("baz"));
assertThat(tokens.get(3).getTerm(), equalTo("bat.blah"));
}
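
Assuming an index named `index_name` (hypothetical) exists with the `url_host_passthrough` analyzer from `test-settings.json` below, the same behavior can be reproduced with the analyze API, in the style of the README examples:

```bash
curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host_passthrough&pretty' -d 'http://foo.com:9200/foo.bar baz bat.blah'
# Expected terms: foo.com, com, baz, bat.blah
```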


@Test
public void testIndex() {
Map<String, Object> doc = new HashMap<>();
URLTokenFilterTest.java
@@ -25,7 +25,7 @@ public void testFilterProtocol() throws IOException {

@Test
public void testFilterHost() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST), "www.foo.bar.com");
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");
}

@Test
@@ -35,7 +35,7 @@ public void testFilterPort() throws IOException {

@Test
public void testFilterPath() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH), "/index_name/type_name/_search.html");
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH).setTokenizePath(false), "/index_name/type_name/_search.html");
}

@Test
@@ -45,7 +45,7 @@ public void testFilterRef() throws IOException {

@Test
public void testFilterQuery() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY), "foo=bar&baz=bat");
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY).setTokenizeQuery(false), "foo=bar&baz=bat");
}

@Test(expected = MalformedURLException.class)
17 changes: 15 additions & 2 deletions src/test/resources/test-settings.json
@@ -25,7 +25,8 @@
},
"url_host": {
"type": "url",
"part": "host"
"part": "host",
"tokenize_host": false
},
"url_port": {
"type": "url",
@@ -34,12 +35,18 @@
"url_query": {
"type": "url",
"part": "query",
"url_decode": true
"url_decode": true,
"tokenize_query": false
},
"url_port_malformed": {
"type": "url",
"part": "port",
"allow_malformed": true
},
"url_host_passthrough": {
"type": "url",
"part": "host",
"passthrough": "true"
}
},
"analyzer": {
@@ -73,6 +80,12 @@
],
"tokenizer": "whitespace"
},
"url_host_passthrough": {
"filter": [
"url_host_passthrough"
],
"tokenizer": "whitespace"
},
"tokenizer_url_protocol": {
"tokenizer": "url_protocol"
},
