enhancement
hankcs committed Apr 9, 2015
1 parent bcd79c9 commit 00f59e6
Showing 4 changed files with 48 additions and 43 deletions.
28 changes: 14 additions & 14 deletions README.md
@@ -10,9 +10,9 @@ You may have heard that the Aho-Corasick algorithm is fast for parsing text with a huge dictionary
* adding semantics to plain text
* checking against a dictionary to see if syntactic errors were made

-But most implementations use a `TreeMap<Character, State>` to store the *goto* structure, which costs `O(ln(t))` time per transition, where `t` is the largest number of common suffixes among the words. The final complexity is `O(n * ln(t))`; since `t > 2`, `n * ln(t) > n`. Other implementations use a `HashMap`, which wastes too much memory and is still slow.
+But most implementations use a `TreeMap<Character, State>` to store the *goto* structure, which costs `O(ln(t))` time per transition, where `t` is the largest number of common prefixes among the words. The final complexity is `O(n * ln(t))`; since `t > 2`, `n * ln(t) > n`. Other implementations use a `HashMap`, which wastes too much memory and is still slow.

-I improved it by replacing the `XXXMap` with a Double Array Trie, whose transition cost is just `O(1)`, giving a total complexity of exactly `O(n)` and a good balance of time and memory. Its speed does not depend on the length, language, or common suffixes of the words in the dictionary.
+I improved it by replacing the `XXXMap` with a Double Array Trie, whose transition cost is just `O(1)`, giving a total complexity of exactly `O(n)` and a good balance of time and memory. Its speed does not depend on the length, language, or common prefixes of the words in the dictionary.
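
To make the `O(1)` claim concrete: a *goto* transition in a double-array trie is just two array reads. Below is a minimal illustrative sketch; the `base`/`check` array names mirror the data structure described in this README, but the helper method and the toy arrays are hypothetical, not the library's API:

```java
// Minimal sketch of an O(1) double-array trie transition.
// A transition from state s on input code c succeeds iff
// check[base[s] + c] == s -- two array reads, no map lookup.
public class DoubleArrayTransition
{
    static int transition(int[] base, int[] check, int state, int code)
    {
        int next = base[state] + code;
        if (next < check.length && check[next] == state)
            return next;  // valid child state
        return -1;        // no such child
    }

    public static void main(String[] args)
    {
        // Tiny hand-built arrays: state 0 has one child reachable via code 2.
        int[] base = {1, 0, 0, 0};
        int[] check = {-1, -1, -1, 0};
        System.out.println(transition(base, check, 0, 2));  // prints 3
        System.out.println(transition(base, check, 0, 1));  // prints -1
    }
}
```

Because each character of the input costs exactly one such transition (plus possibly following *fail* links), the total parse time is linear in the text length.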

Usage
-----
@@ -32,22 +32,22 @@ Setting up the `AhoCorasickDoubleArrayTrie` is a piece of cake:
map.put(key, key);
}
// Build an AhoCorasickDoubleArrayTrie
-AhoCorasickDoubleArrayTrie<String> act = new AhoCorasickDoubleArrayTrie<String>();
-act.build(map);
+AhoCorasickDoubleArrayTrie<String> acdat = new AhoCorasickDoubleArrayTrie<String>();
+acdat.build(map);
// Test it
final String text = "uhers";
-List<AhoCorasickDoubleArrayTrie<String>.Hit<String>> segmentList = act.parseText(text);
+List<AhoCorasickDoubleArrayTrie<String>.Hit<String>> wordList = acdat.parseText(text);
```

Of course, many more useful methods remain to be discovered; feel free to try:
* Use a `Map<String, Object>` to assign an `Object` as the value of a keyword.
* Store the `AhoCorasickDoubleArrayTrie` to disk by calling `save` method.
* Restore the `AhoCorasickDoubleArrayTrie` from disk by calling `load` method.
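
A sketch of the save/load round trip mentioned above. The real calls would be `acdat.save(out)` and `acdat.load(in)`; here a serializable `TreeMap` stands in for the trie so the snippet runs without the library on the classpath, and the object-stream types are an assumption based on the `java.io` imports visible in the source diff:

```java
import java.io.*;
import java.util.TreeMap;

// Hypothetical save/load round trip; a TreeMap stands in for the trie.
public class SaveLoadSketch
{
    static TreeMap<String, String> roundTrip(TreeMap<String, String> dictionary) throws Exception
    {
        File file = File.createTempFile("acdat", ".bin");
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(file)))
        {
            out.writeObject(dictionary);          // acdat.save(out) in the real API
        }
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(file)))
        {
            @SuppressWarnings("unchecked")
            TreeMap<String, String> restored = (TreeMap<String, String>) in.readObject();
            return restored;                      // acdat.load(in) in the real API
        }
        finally
        {
            file.delete();
        }
    }

    public static void main(String[] args) throws Exception
    {
        TreeMap<String, String> dictionary = new TreeMap<String, String>();
        dictionary.put("hers", "hers");
        System.out.println(roundTrip(dictionary).get("hers"));   // prints hers
    }
}
```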

-In other situations you probably do not need a huge segmentList; in that case, try this:
+In other situations you probably do not need a huge wordList; in that case, try this:

```java
-act.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<String>()
+acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<String>()
{
@Override
public void hit(int begin, int end, String value)
@@ -59,7 +59,7 @@ In other situations you probably do not need a huge segmentList, then please try

or a lambda expression:
```java
-act.parseText(text, (begin, end, value) -> {
+acdat.parseText(text, (begin, end, value) -> {
System.out.printf("[%d:%d]=%s\n", begin, end, value);
});
```
@@ -70,15 +70,15 @@ I compared my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick, ACDAT r
```
Parsing English document which contains 3409283 characters, with a dictionary of 127142 words.
Naive ACDAT
-time 571 306
-char/s 5970723.29 11141447.71
-rate 1.00 1.87
+time 554 290
+char/s 6153940.43 11756148.28
+rate 1.00 1.91
===========================================================================
Parsing Chinese document which contains 1290573 characters, with a dictionary of 146047 words.
Naive ACDAT
-time 270 62
-char/s 4779900.00 20815693.55
-rate 1.00 4.35
+time 269 56
+char/s 4797669.14 23045946.43
+rate 1.00 4.80
===========================================================================
```
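
The table columns follow directly from the measured times: `char/s` is characters divided by elapsed seconds, and `rate` is the ratio of the two throughputs. A quick check of the English row (arithmetic only, reproducing the numbers above):

```java
// Reproduces the arithmetic behind the English row of the benchmark table:
// char/s = characters * 1000 / time(ms); rate is the throughput ratio.
public class BenchmarkMath
{
    static double charsPerSecond(long chars, long millis)
    {
        return chars * 1000.0 / millis;
    }

    public static void main(String[] args)
    {
        long chars = 3409283;                       // English document size
        double naive = charsPerSecond(chars, 554);  // ≈ 6153940.43
        double acdat = charsPerSecond(chars, 290);  // ≈ 11756148.28
        System.out.printf("rate = %.2f%n", acdat / naive);   // prints rate = 1.91
    }
}
```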

2 changes: 1 addition & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>AhoCorasickDoubleArrayTrie</groupId>
<artifactId>aho-corasick-double-array-trie</artifactId>
-<version>1.0.0</version>
+<version>1.0.1</version>

<build>
<plugins>
46 changes: 23 additions & 23 deletions src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java
@@ -1,6 +1,5 @@
package com.hankcs.algorithm;

-import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
@@ -15,7 +14,7 @@
public class AhoCorasickDoubleArrayTrie<V>
{
/**
-     * 双数组值check (the check array of the double array)
+     * 双数组之check (the check array of the double array)
*/
protected int check[];
/**
@@ -350,27 +349,6 @@ public void build(Map<String, V> map)
new Builder().build(map);
}

-    /**
-     * Fetch the directly connected child nodes
-     *
-     * @param parent   the parent node
-     * @param siblings the (child) sibling nodes
-     * @return the number of sibling nodes
-     */
-    private int fetch(State parent, List<Map.Entry<Integer, State>> siblings)
-    {
-        if (parent.isAcceptable())
-        {
-            State fakeNode = new State(-(parent.getDepth() + 1)); // this node is a child of parent and also carries parent's output
-            fakeNode.addEmit(parent.getLargestValueId());
-            siblings.add(new AbstractMap.SimpleEntry<Integer, State>(0, fakeNode));
-        }
-        for (Map.Entry<Character, State> entry : parent.getSuccess().entrySet())
-        {
-            siblings.add(new AbstractMap.SimpleEntry<Integer, State>(entry.getKey() + 1, entry.getValue()));
-        }
-        return siblings.size();
-    }

/**
* Exact matching
@@ -624,6 +602,28 @@ public void build(Map<String, V> map)
loseWeight();
}

+    /**
+     * Fetch the directly connected child nodes
+     *
+     * @param parent   the parent node
+     * @param siblings the (child) sibling nodes
+     * @return the number of sibling nodes
+     */
+    private int fetch(State parent, List<Map.Entry<Integer, State>> siblings)
+    {
+        if (parent.isAcceptable())
+        {
+            State fakeNode = new State(-(parent.getDepth() + 1)); // this node is a child of parent and also carries parent's output
+            fakeNode.addEmit(parent.getLargestValueId());
+            siblings.add(new AbstractMap.SimpleEntry<Integer, State>(0, fakeNode));
+        }
+        for (Map.Entry<Character, State> entry : parent.getSuccess().entrySet())
+        {
+            siblings.add(new AbstractMap.SimpleEntry<Integer, State>(entry.getKey() + 1, entry.getValue()));
+        }
+        return siblings.size();
+    }

/**
* Add a key
*
15 changes: 10 additions & 5 deletions src/test/java/TestAhoCorasickDoubleArrayTrie.java
@@ -40,11 +40,11 @@ public void testBuildAndParseSimply() throws Exception
map.put(key, key);
}
// Build an AhoCorasickDoubleArrayTrie
-        AhoCorasickDoubleArrayTrie<String> act = new AhoCorasickDoubleArrayTrie<String>();
-        act.build(map);
+        AhoCorasickDoubleArrayTrie<String> acdat = new AhoCorasickDoubleArrayTrie<String>();
+        acdat.build(map);
// Test it
final String text = "uhers";
-        act.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<String>()
+        acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<String>()
{
@Override
public void hit(int begin, int end, String value)
@@ -53,8 +53,8 @@ public void hit(int begin, int end, String value)
assertEquals(text.substring(begin, end), value);
}
});
-        List<AhoCorasickDoubleArrayTrie<String>.Hit<String>> segmentList = act.parseText(text);
-        System.out.println(segmentList);
+        List<AhoCorasickDoubleArrayTrie<String>.Hit<String>> wordList = acdat.parseText(text);
+        System.out.println(wordList);
}

private String loadText(String path) throws IOException
@@ -119,6 +119,11 @@ private void runTest(String dictionaryPath, String textPath) throws IOException
System.out.println("===========================================================================");
}

+    /**
+     * Compare my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick; note that robert-bor's aho-corasick is
+     * compiled under JDK 1.8, so you will need JDK 1.8 to run this test
+     * @throws Exception
+     */
public void testBenchmark() throws Exception
{
runTest("en/dictionary.txt", "en/text.txt");
