quickwit-oss · cjrh · Jan 21, 2024 · Jan 29, 2024 · Feb 17, 2024 · Feb 18, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,7 +22,15 @@ futures = "0.3.26"
 pythonize = "0.20.0"
 serde = "1.0"
 serde_json = "1.0.91"
+# Lindera: optional Japanese tokenizer support, enabled by the "lindera" feature
+lindera-core = { version = "0.27.2", optional = true }
+lindera-dictionary = { version = "0.27.2", optional = true }
+lindera-tantivy = { version = "0.27.1", optional = true, features = ["ipadic"] }
 
 [dependencies.pyo3]
 version = "0.20.0"
 features = ["chrono", "extension-module"]
+
+[features]
+lindera = ["lindera-core", "lindera-dictionary", "lindera-tantivy"]
+
diff --git a/noxfile.py b/noxfile.py
@@ -5,4 +5,17 @@
 def test(session):
     session.install("-rrequirements-dev.txt")
     session.install("-e", ".", "--no-build-isolation")
+    session.run("pytest", "-m", "not lindera", *session.posargs)  # default build lacks lindera
+
+
+@nox.session(python=["3.8", "3.9", "3.10", "3.11", "3.12"])
+def test_lindera(session):
+    # Runs the whole suite, lindera tests included, against a lindera build.
+    session.install("-rrequirements-dev.txt")
+
+    # pip hands build-args to the build backend via --config-settings.
+    build_args = 'build-args="--features=lindera"'
+    session.install("--no-build-isolation", "--config-settings", build_args, "-e", ".")
+
+    # No "-m" filter: tests marked lindera are collected too.
     session.run("pytest", *session.posargs)
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,6 +15,9 @@ dev = [
 bindings = "pyo3"
 
 [tool.pytest.ini_options]
+markers = [
+    "lindera: mark a test as requiring lindera",
+]
 # Set the durations option and doctest modules
 # See https://docs.pytest.org/en/latest/usage.html#durations
 addopts = "--doctest-modules --durations=10"

diff --git a/src/index.rs b/src/index.rs
@@ -244,6 +244,18 @@ impl Index {
         Ok(Index { index, reader })
     }
 
+    /// Register the lindera tokenizer under the name `lang_ja`.
+    ///
+    /// The tokenizer is created in `Mode::Normal`. This method only exists
+    /// if tantivy-py was built with the "lindera" feature.
+    #[cfg(feature = "lindera")]
+    fn register_lindera_tokenizer(&self) {
+        use lindera_core::mode::Mode;
+
+        let lindera = crate::lindera_tokenizer::create_tokenizer(Mode::Normal);
+        self.index.tokenizers().register("lang_ja", lindera);
+    }
+
     /// Create a `IndexWriter` for the index.
     ///
     /// The writer will be multithreaded and the provided heap size will be

diff --git a/src/lib.rs b/src/lib.rs
@@ -11,6 +11,8 @@ mod schema;
 mod schemabuilder;
 mod searcher;
 mod snippet;
+#[cfg(feature = "lindera")]
+mod lindera_tokenizer;
 
 use document::Document;
 use facet::Facet;

diff --git a/src/lindera_tokenizer.rs b/src/lindera_tokenizer.rs
@@ -0,0 +1,20 @@
+use lindera_core::mode::Mode;
+use lindera_dictionary::{
+    load_dictionary_from_config, DictionaryConfig, DictionaryKind,
+};
+use lindera_tantivy::tokenizer::LinderaTokenizer;
+
+/// Build a lindera tokenizer for `mode` backed by the IPADIC dictionary.
+///
+/// # Panics
+///
+/// Panics if the IPADIC dictionary cannot be loaded.
+pub fn create_tokenizer(mode: Mode) -> LinderaTokenizer {
+    let dictionary_config = DictionaryConfig {
+        kind: Some(DictionaryKind::IPADIC),
+        path: None,
+    };
+    let dictionary = load_dictionary_from_config(dictionary_config)
+        .expect("failed to load the IPADIC dictionary");
+    LinderaTokenizer::new(dictionary, None, mode)
+}
diff --git a/tests/test_lindera.py b/tests/test_lindera.py
@@ -0,0 +1,21 @@
+import pytest
+pytestmark = pytest.mark.lindera
+
+from tantivy import SchemaBuilder, Index, Document
+
+
+def test_basic():
+    """Index one Japanese title with the `lang_ja` (lindera) tokenizer."""
+    sb = SchemaBuilder()
+    sb.add_text_field("title", stored=True, tokenizer_name="lang_ja")
+    schema = sb.build()
+    index = Index(schema)
+    index.register_lindera_tokenizer()
+    writer = index.writer(50_000_000)
+    doc = Document()
+    doc.add_text("title", "成田国際空港")
+    writer.add_document(doc)
+    writer.commit()
+    index.reload()
+    # Without an assertion the test only proves that nothing raised.
+    assert index.searcher().num_docs == 1