Merge branch 'release/v5.2.1'
duydo committed Feb 20, 2017
2 parents 8a1c7eb + 4b1144b commit 5ab50da
Showing 17 changed files with 271 additions and 412 deletions.
56 changes: 43 additions & 13 deletions README.md
@@ -3,24 +3,63 @@ Vietnamese Analysis Plugin for Elasticsearch

Vietnamese Analysis plugin integrates Vietnamese language analysis into Elasticsearch.

The plugin provides the `vi_analyzer` analyzer and the `vi_tokenizer` tokenizer. The `vi_analyzer` is composed of the `vi_tokenizer` tokenizer and the `lowercase` and `stop` filters.
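
If you want to adjust that chain (for example, to change the stopword handling), the same composition can be rebuilt by hand as a custom analyzer in the index settings. The following is a minimal sketch, assuming the plugin is already installed; the index name `my_index` and the analyzer name `my_vi_analyzer` are placeholders:

```sh
curl -XPUT "http://localhost:9200/my_index" -d'
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_vi_analyzer": {
          "type": "custom",
          "tokenizer": "vi_tokenizer",
          "filter": ["lowercase", "stop"]
        }
      }
    }
  }
}'
```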


## Installation on Elasticsearch 5.x

To install the plugin, choose a version on the [releases](https://github.com/duydo/elasticsearch-analysis-vietnamese/releases) page, then run:

```sh
bin/plugin install link/to/binary/version
bin/elasticsearch-plugin install link/to/binary/version
```

Or, to build from source, build the plugin with Maven:

```bash
mvn clean package
bin/plugin install file:target/releases/elasticsearch-analysis-vietnamese-2.4.1.zip
bin/elasticsearch-plugin install file:target/releases/elasticsearch-analysis-vietnamese-5.2.1.zip
```

*Notes*: To build the plugin you need to clone and build the [vn-nlp-libraries](https://github.com/duydo/vn-nlp-libraries). The plugin uses [Lê Hồng Phương](http://mim.hus.vnu.edu.vn/phuonglh/)'s vnTokenizer library. Thanks thầy Lê Hồng Phương for the great contribution.
*In order to build the plugin you need to build the [vn-nlp-libraries](https://github.com/duydo/vn-nlp-libraries) first. Thanks to thầy [Lê Hồng Phương](http://mim.hus.vnu.edu.vn/phuonglh/) for his vnTokenizer library.*
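
Either way, once the plugin is installed you can confirm that Elasticsearch picked it up (a quick check, assuming a standard 5.x layout; restart the node after installing so the plugin is loaded):

```sh
bin/elasticsearch-plugin list
# the Vietnamese analysis plugin should appear in the printed list
```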



## Example
```sh
curl "http://localhost:9200/_analyze?pretty" -d'
{
"analyzer": "vi_analyzer",
"text": "Công nghệ thông tin Việt Nam"
}'
```

Result:
```json
{
  "tokens" : [
    {
      "token" : "công nghệ thông tin",
      "start_offset" : 0,
      "end_offset" : 19,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "việt nam",
      "start_offset" : 20,
      "end_offset" : 28,
      "type" : "name2",
      "position" : 1
    }
  ]
}
```
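
To use the analyzer at index and search time, reference it in a field mapping. A minimal sketch, assuming the plugin is installed; the index name `vi_demo`, type `doc`, and field `title` are placeholders:

```sh
curl -XPUT "http://localhost:9200/vi_demo" -d'
{
  "mappings": {
    "doc": {
      "properties": {
        "title": {
          "type": "text",
          "analyzer": "vi_analyzer"
        }
      }
    }
  }
}'

# index a document, then search it with a match query,
# which analyzes the query string with the same analyzer
curl -XPUT "http://localhost:9200/vi_demo/doc/1" -d'
{"title": "Công nghệ thông tin Việt Nam"}'

curl "http://localhost:9200/vi_demo/_search?pretty" -d'
{
  "query": {
    "match": {
      "title": "công nghệ thông tin"
    }
  }
}'
```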

|Vietnamese Analysis Plugin|Elasticsearch|
|---|---|
| master|2.4.1|
| master|5.2.1|
| 5.2.1|5.2.1|
| 2.4.1|2.4.1|
| 2.4.0|2.4.0|
| 2.3.5|2.3.5|
@@ -39,15 +78,6 @@ bin/plugin install file:target/releases/elasticsearch-analysis-vietnamese-2.4.1.
| 0.1.1|1.4+|
| 0.1|1.3|


## User guide

The plugin provides the `vi_analyzer` analyzer and `vi_tokenizer` tokenizer.

The `vi_analyzer` is built using the `vi_tokenizer` tokenizer and the `lowercase` and `stop` filters.

The analyzer analyzes `"công nghệ thông tin Việt Nam"` into `"công nghệ thông tin"` and `"việt nam"` tokens.

License
-------

149 changes: 58 additions & 91 deletions pom.xml
@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-vietnamese</artifactId>
<version>2.4.1</version>
<version>5.2.1</version>
<packaging>jar</packaging>
<name>elasticsearch-analysis-vietnamese</name>
<url>https://github.com/duydo/elasticsearch-analysis-vietnamese/</url>
@@ -16,52 +16,32 @@
<distribution>repo</distribution>
</license>
</licenses>
<developers>
<developer>
<id>duydo</id>
<name>Duy Do</name>
<url>http://duydo.me</url>
</developer>
</developers>
<scm>
<connection>scm:git:[email protected]:duydo/elasticsearch-analysis-vietnamese.git</connection>
<developerConnection>scm:git:[email protected]:duydo/elasticsearch-analysis-vietnamese.git</developerConnection>
<url>http://github.com/duydo/elasticsearch-analysis-vietnamese</url>
</scm>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.build.java.version>1.7</project.build.java.version>
<elasticsearch.version>2.4.1</elasticsearch.version>
<lucene.version>5.5.2</lucene.version>
<tests.jvms>1</tests.jvms>
<tests.shuffle>true</tests.shuffle>
<tests.output>onerror</tests.output>
<es.logger.level>INFO</es.logger.level>
<project.build.java.version>1.8</project.build.java.version>
<elasticsearch.version>5.2.1</elasticsearch.version>
<lucene.version>6.4.1</lucene.version>
<jna.version>4.1.0</jna.version>
<log4j.version>2.7</log4j.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<version>${lucene.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>randomizedtesting-runner</artifactId>
<version>2.1.14</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
<scope>compile</scope>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-io</groupId>
@@ -79,15 +59,28 @@
<version>4.1.1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<scope>runtime</scope>
<groupId>org.elasticsearch.test</groupId>
<artifactId>framework</artifactId>
<version>${elasticsearch.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
<version>${jna.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
@@ -106,10 +99,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<version>3.3</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
@@ -127,7 +120,7 @@
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.3</version>
<version>2.6</version>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<outputDirectory>${project.build.directory}/releases/</outputDirectory>
@@ -144,61 +137,35 @@
</execution>
</executions>
</plugin>

<plugin>
<!-- we skip surefire to work with randomized testing above -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.9</version>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
<plugin>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>junit4-maven-plugin</artifactId>
<version>2.0.12</version>
<version>2.3.3</version>
<configuration>
<assertions enableSystemAssertions="false">
<enable/>
</assertions>

<listeners>
<report-text/>
</listeners>
</configuration>
<executions>
<execution>
<id>tests</id>
<id>unit-tests</id>
<phase>test</phase>
<goals>
<goal>junit4</goal>
</goals>
<configuration>
<heartbeat>20</heartbeat>
<jvmOutputAction>pipe,warn</jvmOutputAction>
<leaveTemporary>true</leaveTemporary>
<listeners>
<report-ant-xml mavenExtensions="true"
dir="${project.build.directory}/surefire-reports"/>
<report-text
showThrowable="true"
showStackTraces="true"
showOutput="${tests.output}"
showStatusOk="false"
showStatusError="true"
showStatusFailure="true"
showStatusIgnored="true"
showSuiteSummary="true"
timestamps="false"/>
<report-execution-times file="${basedir}/.local-execution-hints.log"/>
</listeners>
<assertions>
<enable/>
</assertions>
<parallelism>${tests.jvms}</parallelism>
<balancers>
<execution-times>
<fileset dir="${basedir}" includes=".local-execution-hints.log"/>
</execution-times>
</balancers>
<includes>
<include>**/*Tests.class</include>
<include>**/*Test.class</include>
</includes>
<excludes>
<exclude>**/Abstract*.class</exclude>
<exclude>**/*StressTest.class</exclude>
</excludes>
<jvmArgs>
<param>-Xmx512m</param>
<param>-XX:MaxDirectMemorySize=512m</param>
<param>-Des.logger.prefix=</param>
</jvmArgs>
<shuffleOnSlave>${tests.shuffle}</shuffleOnSlave>
</configuration>
</execution>
</executions>
</plugin>
12 changes: 5 additions & 7 deletions src/main/assemblies/plugin.xml
@@ -8,26 +8,24 @@
<files>
<file>
<source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
<outputDirectory></outputDirectory>
<outputDirectory>elasticsearch</outputDirectory>
<filtered>true</filtered>
</file>
<file>
<source>${project.basedir}/src/main/resources/plugin-security.policy</source>
<outputDirectory></outputDirectory>
<outputDirectory>elasticsearch</outputDirectory>
<filtered>true</filtered>
</file>

</files>
<dependencySets>
<dependencySet>
<outputDirectory>/</outputDirectory>
<outputDirectory>elasticsearch</outputDirectory>
<useProjectArtifact>true</useProjectArtifact>
<useTransitiveFiltering>true</useTransitiveFiltering>
<excludes>
<exclude>org.elasticsearch:elasticsearch</exclude>
</excludes>
</dependencySet>
<dependencySet>
<outputDirectory>/</outputDirectory>
<outputDirectory>elasticsearch</outputDirectory>
<useProjectArtifact>true</useProjectArtifact>
<useTransitiveFiltering>true</useTransitiveFiltering>
<includes>
@@ -14,15 +14,8 @@

package org.apache.lucene.analysis.vi;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.*;

import java.io.Reader;
import java.util.Arrays;
import java.util.List;

@@ -74,10 +74,11 @@ public VietnameseTokenizer(boolean sentenceDetectorEnabled, boolean ambiguitiesR
tokenizer = AccessController.doPrivileged(new PrivilegedAction<vn.hus.nlp.tokenizer.Tokenizer>() {
@Override
public vn.hus.nlp.tokenizer.Tokenizer run() {
return TokenizerProvider.getInstance().getTokenizer();
vn.hus.nlp.tokenizer.Tokenizer vnTokenizer = TokenizerProvider.getInstance().getTokenizer();
vnTokenizer.setAmbiguitiesResolved(ambiguitiesResolved);
return vnTokenizer;
}
});
tokenizer.setAmbiguitiesResolved(ambiguitiesResolved);
}

private void tokenize(Reader input) throws IOException {

This file was deleted.
