From 00f59e678799a74b2b8df6c58c84d7281cb88545 Mon Sep 17 00:00:00 2001 From: hankcs Date: Thu, 9 Apr 2015 10:35:16 +0800 Subject: [PATCH] enhancement --- README.md | 28 +++++------ pom.xml | 2 +- .../algorithm/AhoCorasickDoubleArrayTrie.java | 46 +++++++++---------- .../java/TestAhoCorasickDoubleArrayTrie.java | 15 ++++-- 4 files changed, 48 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 3ee0d1f..ac1b70a 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ You may heard that Aho-Corasick algorithm is fast for parsing text with a huge d * adding semantics to plain text * checking against a dictionary to see if syntactic errors were made -But most implementation use a `TreeMap` to store the *goto* structure, which costs `O(ln(t))` time, `t` is the largest amount of a word's common suffixes. The final complexity is `O(n * ln(t))`, absolutely `t > 2`, so `n * ln(t) > n `. The others used a `HashMap`, which wasted too much memory, and still remained slowly. +But most implementation use a `TreeMap` to store the *goto* structure, which costs `O(ln(t))` time, `t` is the largest amount of a word's common prefixes. The final complexity is `O(n * ln(t))`, absolutely `t > 2`, so `n * ln(t) > n `. The others used a `HashMap`, which wasted too much memory, and still remained slowly. -I improve it by replace the `XXXMap` to a Double Array Trie, whose time complexity is just `O(1)`, thus we get a total complexity of exactly `O(n)`, and take a perfect balance of time and memory. Yes, its speed is not related to the length or language or common suffix of the words of a dictionary. +I improve it by replace the `XXXMap` to a Double Array Trie, whose time complexity is just `O(1)`, thus we get a total complexity of exactly `O(n)`, and take a perfect balance of time and memory. Yes, its speed is not related to the length or language or common prefix of the words of a dictionary. Usage ----- @@ -32,11 +32,11 @@ Setting up the `AhoCorasickDoubleArrayTrie` is a piece of cake: map.put(key, key); } // Build an AhoCorasickDoubleArrayTrie - AhoCorasickDoubleArrayTrie act = new AhoCorasickDoubleArrayTrie(); - act.build(map); + AhoCorasickDoubleArrayTrie acdat = new AhoCorasickDoubleArrayTrie(); + acdat.build(map); // Test it final String text = "uhers"; - List.Hit> segmentList = act.parseText(text); + List.Hit> wordList = acdat.parseText(text); ``` Of course, there remains many useful methods to be discovered, feel free to try: @@ -44,10 +44,10 @@ Of course, there remains many useful methods to be discovered, feel free to try: * Store the `AhoCorasickDoubleArrayTrie` to disk by calling `save` method. * Restore the `AhoCorasickDoubleArrayTrie` from disk by calling `load` method. -In other situations you probably do not need a huge segmentList, then please try this: +In other situations you probably do not need a huge wordList, then please try this: ```java - act.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() + acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, String value) @@ -59,7 +59,7 @@ In other situations you probably do not need a huge segmentList, then please try or a lambda function ``` - act.parseText(text, (begin, end, value) -> { + acdat.parseText(text, (begin, end, value) -> { System.out.printf("[%d:%d]=%s\n", begin, end, value); }); ``` @@ -70,15 +70,15 @@ I compared my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick, ACDAT r ``` Parsing English document which contains 3409283 characters, with a dictionary of 127142 words. Naive ACDAT -time 571 306 -char/s 5970723.29 11141447.71 -rate 1.00 1.87 +time 554 290 +char/s 6153940.43 11756148.28 +rate 1.00 1.91 =========================================================================== Parsing Chinese document which contains 1290573 characters, with a dictionary of 146047 words. Naive ACDAT -time 270 62 -char/s 4779900.00 20815693.55 -rate 1.00 4.35 +time 269 56 +char/s 4797669.14 23045946.43 +rate 1.00 4.80 =========================================================================== ``` diff --git a/pom.xml b/pom.xml index 9b037fa..a0fd816 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ AhoCorasickDoubleArrayTrie aho-corasick-double-array-trie - 1.0.0 + 1.0.1 diff --git a/src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java b/src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java index f86e576..7c7fdac 100644 --- a/src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java +++ b/src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java @@ -1,6 +1,5 @@ package com.hankcs.algorithm; -import java.io.DataOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; @@ -15,7 +14,7 @@ public class AhoCorasickDoubleArrayTrie { /** - * 双数组值check + * 双数组之check */ protected int check[]; /** @@ -350,27 +349,6 @@ public void build(Map map) new Builder().build(map); } - /** - * 获取直接相连的子节点 - * - * @param parent 父节点 - * @param siblings (子)兄弟节点 - * @return 兄弟节点个数 - */ - private int fetch(State parent, List> siblings) - { - if (parent.isAcceptable()) - { - State fakeNode = new State(-(parent.getDepth() + 1)); // 此节点是parent的子节点,同时具备parent的输出 - fakeNode.addEmit(parent.getLargestValueId()); - siblings.add(new AbstractMap.SimpleEntry(0, fakeNode)); - } - for (Map.Entry entry : parent.getSuccess().entrySet()) - { - siblings.add(new AbstractMap.SimpleEntry(entry.getKey() + 1, entry.getValue())); - } - return siblings.size(); - } /** * 精确匹配 @@ -624,6 +602,28 @@ public void build(Map map) loseWeight(); } + /** + * 获取直接相连的子节点 + * + * @param parent 父节点 + * @param siblings (子)兄弟节点 + * @return 兄弟节点个数 + */ + private int fetch(State parent, List> siblings) + { + if (parent.isAcceptable()) + { + State fakeNode = new State(-(parent.getDepth() + 1)); // 此节点是parent的子节点,同时具备parent的输出 + fakeNode.addEmit(parent.getLargestValueId()); + siblings.add(new AbstractMap.SimpleEntry(0, fakeNode)); + } + for (Map.Entry entry : parent.getSuccess().entrySet()) + { + siblings.add(new AbstractMap.SimpleEntry(entry.getKey() + 1, entry.getValue())); + } + return siblings.size(); + } + /** * 添加一个键 * diff --git a/src/test/java/TestAhoCorasickDoubleArrayTrie.java b/src/test/java/TestAhoCorasickDoubleArrayTrie.java index 7776e63..bc73903 100644 --- a/src/test/java/TestAhoCorasickDoubleArrayTrie.java +++ b/src/test/java/TestAhoCorasickDoubleArrayTrie.java @@ -40,11 +40,11 @@ public void testBuildAndParseSimply() throws Exception map.put(key, key); } // Build an AhoCorasickDoubleArrayTrie - AhoCorasickDoubleArrayTrie act = new AhoCorasickDoubleArrayTrie(); - act.build(map); + AhoCorasickDoubleArrayTrie acdat = new AhoCorasickDoubleArrayTrie(); + acdat.build(map); // Test it final String text = "uhers"; - act.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() + acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, String value) @@ -53,8 +53,8 @@ public void hit(int begin, int end, String value) assertEquals(text.substring(begin, end), value); } }); - List.Hit> segmentList = act.parseText(text); - System.out.println(segmentList); + List.Hit> wordList = acdat.parseText(text); + System.out.println(wordList); } private String loadText(String path) throws IOException @@ -119,6 +119,11 @@ private void runTest(String dictionaryPath, String textPath) throws IOException System.out.println("==========================================================================="); } + /** + * Compare my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick, notice that robert-bor's aho-corasick is + * compiled under jdk1.8, so you will need jdk1.8 to run this test + * @throws Exception + */ public void testBenchmark() throws Exception { runTest("en/dictionary.txt", "en/text.txt");