Add passthrough option to URLTokenFilter. Delegate to URLTokenizer for tokenization in URLTokenFilter.
jlinn committed Jan 25, 2016
1 parent 8ce9054 commit e1e41a9
Showing 8 changed files with 163 additions and 36 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -9,6 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.1.1 | 2.2.0 |
| 2.1.1 | 2.1.1 |
| 2.0.0 | 2.1.0 |
| 1.6.x, 1.7.x | 2.0.0 |
@@ -18,7 +19,7 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.1.1/elasticsearch-analysis-url-2.1.1.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.2.0/elasticsearch-analysis-url-2.2.0.zip
```

## Usage
@@ -87,6 +88,10 @@ curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'ht
* `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
If the desired part cannot be found, no value will be indexed for that field.
* `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-URL tokens will be passed through the filter rather than being dropped. Valid URLs will be tokenized according to the filter's other settings (see the sketch after this list).
* `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
* `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
* `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
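
For instance, a filter and analyzer pair using the new `passthrough` option could be declared as follows; this is a minimal sketch mirroring the `url_host_passthrough` entries added to `test-settings.json` in this commit, and the names are arbitrary:

```json
{
  "settings": {
    "analysis": {
      "filter": {
        "url_host_passthrough": {
          "type": "url",
          "part": "host",
          "passthrough": true
        }
      },
      "analyzer": {
        "url_host_passthrough": {
          "tokenizer": "whitespace",
          "filter": ["url_host_passthrough"]
        }
      }
    }
  }
}
```

Analyzing `http://foo.com:9200/foo.bar baz bat.blah` with this analyzer yields `foo.com`, `com`, `baz`, and `bat.blah`, as exercised by the passthrough test in this commit.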

#### Example:
Set up your index like so:
2 changes: 1 addition & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.1.1</version>
<version>2.2.0</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

URLTokenFilterFactory.java
@@ -16,19 +16,30 @@
public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
private final URLPart part;
private final boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
private boolean tokenizeQuery;
private final boolean allowMalformed;
private final boolean passthrough;

@Inject
public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);

this.part = URLPart.fromString(settings.get("part", "whole"));
this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
this.passthrough = settings.getAsBoolean("passthrough", false);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed);
return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed, passthrough)
.setTokenizeHost(tokenizeHost)
.setTokenizePath(tokenizePath)
.setTokenizeQuery(tokenizeQuery);
}
}
139 changes: 112 additions & 27 deletions src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -1,15 +1,20 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.analysis.URLPart;

import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -24,11 +29,29 @@ public final class URLTokenFilter extends TokenFilter {

private final boolean urlDeocde;

/**
* If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
*/
private boolean tokenizeHost = true;

/**
* If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
*/
private boolean tokenizePath = true;

/**
* If true, the url's query string will be split on &
*/
private boolean tokenizeQuery = true;

private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);

private final boolean allowMalformed;

private boolean parsed;
private boolean passthrough;

private List<String> tokens;
private Iterator<String> iterator;

public URLTokenFilter(TokenStream input, URLPart part) {
this(input, part, false);
@@ -39,49 +62,111 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
}

public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed) {
this(input, part, urlDecode, allowMalformed, false);
}

public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
super(input);
this.part = part;
this.urlDeocde = urlDecode;
this.allowMalformed = allowMalformed;
this.passthrough = passthrough;
}


public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
this.tokenizeHost = tokenizeHost;
return this;
}

public URLTokenFilter setTokenizePath(boolean tokenizePath) {
this.tokenizePath = tokenizePath;
return this;
}

public URLTokenFilter setTokenizeQuery(boolean tokenizeQuery) {
this.tokenizeQuery = tokenizeQuery;
return this;
}


@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken() && !parsed) {
final String urlString = termAttribute.toString();
termAttribute.setEmpty();
if (Strings.isNullOrEmpty(urlString) || urlString.equals("null")) {
if(iterator == null || !iterator.hasNext()){
if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
return false;
}
}
clearAttributes();
String next = iterator.next();
if (allowMalformed) {
next = parseMalformed(next);
}
termAttribute.append(next);
return true;
}


/**
* Advance to the next token, if any
* @return true if more tokens are forthcoming, false otherwise
* @throws IOException
*/
private boolean advance() throws IOException {
if (input.incrementToken()) {
String urlString = termAttribute.toString();
if ((Strings.isNullOrEmpty(urlString) || "null".equals(urlString)) && !allowMalformed && !passthrough) {
return false;
}
String partString;
try {
URL url = new URL(urlString);
partString = URLUtils.getPart(url, part);
parsed = !Strings.isNullOrEmpty(partString);
} catch (MalformedURLException e) {
if (allowMalformed) {
partString = parseMalformed(urlString);
if (Strings.isNullOrEmpty(partString)) {
return false;
tokens = tokenize(urlString);
} catch (IOException e) {
if (e.getMessage().contains("Malformed URL")) {
if (allowMalformed) {
tokens = ImmutableList.of(urlString);
} else {
throw new MalformedURLException("Malformed URL: " + urlString);
}
parsed = true;
} else {
throw e;
}
throw e;
}
if (urlDeocde) {
partString = URLDecoder.decode(partString, "UTF-8");
}
termAttribute.append(partString);
iterator = tokens.iterator();
return true;
} else {
return false;
}
return false;
}


/**
* Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
* will be passed along to the tokenizer.
* @param input a string to be tokenized
* @return a list of tokens extracted from the input string
* @throws IOException
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer(part);
tokenizer.setUrlDecode(urlDeocde);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
tokenizer.setTokenizeQuery(tokenizeQuery);
tokenizer.setAllowMalformed(allowMalformed || passthrough);
tokenizer.setReader(new StringReader(input));
tokenizer.reset();
while (tokenizer.incrementToken()) {
tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
}
return tokens;
}


@Override
public void reset() throws IOException {
super.reset();
parsed = false;
tokens = null;
iterator = null;
}

private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)");
@@ -104,7 +189,7 @@ private String parseMalformed(String urlString) {
case WHOLE:
return urlString;
default:
return null;
return urlString;
}
}

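As a rough usage sketch (not part of this commit; it assumes Lucene 5.x's `WhitespaceTokenizer` on the classpath, as Elasticsearch 2.x provides), the rewritten filter can also be driven directly. The input and expected terms mirror the passthrough integration test further down:

```java
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

import java.io.StringReader;

public class PassthroughSketch {
    public static void main(String[] args) throws Exception {
        // Split on whitespace first, as the test analyzers do; "baz" is not a URL.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("http://foo.com:9200/foo.bar baz"));

        // passthrough = true: non-URL tokens are emitted unchanged instead of
        // failing the stream (allow_malformed is implied).
        URLTokenFilter filter = new URLTokenFilter(tokenizer, URLPart.HOST, false, false, true);
        CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);

        filter.reset();
        while (filter.incrementToken()) {
            // With tokenize_host left at its default of true, this prints:
            // foo.com, com, baz
            System.out.println(term.toString());
        }
        filter.end();
        filter.close();
    }
}
```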
URLTokenizer.java
@@ -287,7 +287,7 @@ private int getEndIndex(int start, String partStringRaw) {
* @return a list of tokens
* @throws IOException
*/
List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
tokenizer.reset();
List<Token> tokens = new ArrayList<>();
OffsetAttribute offset;
URLTokenFilterIntegrationTest.java
@@ -12,6 +12,7 @@

import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTPS_URL;
import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTP_URL;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;

/**
@@ -65,6 +66,18 @@ public void testMalformed() {
assertEquals("found a doc missing http_malformed.port", 1, hits.getTotalHits());
}


@Test
public void testPassthrough() {
List<AnalyzeResponse.AnalyzeToken> tokens = analyzeURL("http://foo.com:9200/foo.bar baz bat.blah", "url_host_passthrough");
assertThat(tokens, hasSize(4));
assertThat(tokens.get(0).getTerm(), equalTo("foo.com"));
assertThat(tokens.get(1).getTerm(), equalTo("com"));
assertThat(tokens.get(2).getTerm(), equalTo("baz"));
assertThat(tokens.get(3).getTerm(), equalTo("bat.blah"));
}
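
Assuming an index named `index_name` (hypothetical) exists with the `url_host_passthrough` analyzer from `test-settings.json` below, the same behavior can be reproduced with the analyze API, in the style of the README examples:

```bash
curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host_passthrough&pretty' -d 'http://foo.com:9200/foo.bar baz bat.blah'
# Expected terms: foo.com, com, baz, bat.blah
```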


@Test
public void testIndex() {
Map<String, Object> doc = new HashMap<>();
URLTokenFilterTest.java
@@ -25,7 +25,7 @@ public void testFilterProtocol() throws IOException {

@Test
public void testFilterHost() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST), "www.foo.bar.com");
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");
}

@Test
@@ -35,7 +35,7 @@ public void testFilterPort() throws IOException {

@Test
public void testFilterPath() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH), "/index_name/type_name/_search.html");
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH).setTokenizePath(false), "/index_name/type_name/_search.html");
}

@Test
@@ -45,7 +45,7 @@ public void testFilterRef() throws IOException {

@Test
public void testFilterQuery() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY), "foo=bar&baz=bat");
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY).setTokenizeQuery(false), "foo=bar&baz=bat");
}

@Test(expected = MalformedURLException.class)
17 changes: 15 additions & 2 deletions src/test/resources/test-settings.json
@@ -25,7 +25,8 @@
},
"url_host": {
"type": "url",
"part": "host"
"part": "host",
"tokenize_host": false
},
"url_port": {
"type": "url",
@@ -34,12 +35,18 @@
"url_query": {
"type": "url",
"part": "query",
"url_decode": true
"url_decode": true,
"tokenize_query": false
},
"url_port_malformed": {
"type": "url",
"part": "port",
"allow_malformed": true
},
"url_host_passthrough": {
"type": "url",
"part": "host",
"passthrough": "true"
}
},
"analyzer": {
@@ -73,6 +80,12 @@
],
"tokenizer": "whitespace"
},
"url_host_passthrough": {
"filter": [
"url_host_passthrough"
],
"tokenizer": "whitespace"
},
"tokenizer_url_protocol": {
"tokenizer": "url_protocol"
},
