diff --git a/README.md b/README.md
index 92567f6..047f10f 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,10 @@ The library is thread safe and can be used in a concurrent environment
 ## Disclaimer
 This project is still in development and might not be ready for production use. The API is subject to change, and the project may contain bugs. Please use with caution.
 
+## Features
+### Jieba Tokenizer
+This library includes the Jieba feature by default, which provides Chinese text segmentation. If you do not need this functionality, you can build the library without it and save approximately 5 MB of dictionary data.
+
 ## Installation
 
 ```bash
diff --git a/bindings.h b/bindings.h
index 2bc402a..20b7cae 100644
--- a/bindings.h
+++ b/bindings.h
@@ -1,4 +1,4 @@
-#include <binding_typedefs.h>
+#include "binding_typedefs.h"
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -108,4 +108,7 @@ void document_free(struct Document *doc_ptr);
 
 void string_free(char *s);
 
-void init_lib(const char *log_level_ptr, char **error_buffer, bool clear_on_panic);
+void init_lib(const char *log_level_ptr,
+              char **error_buffer,
+              bool clear_on_panic,
+              bool utf8_lenient);
diff --git a/example/main.go b/example/main.go
index 2debea7..c7b5c25 100644
--- a/example/main.go
+++ b/example/main.go
@@ -13,7 +13,7 @@ const NameTitle = "title"
 
 func main() {
 	// Initialize the library
-	err := tantivy_go.LibInit(true, "debug")
+	err := tantivy_go.LibInit(true, true, "debug")
 	if err != nil {
 		fmt.Println("Failed to initialize library:", err)
 		return
diff --git a/internal/libinit.go b/internal/libinit.go
new file mode 100644
index 0000000..af4bd1e
--- /dev/null
+++ b/internal/libinit.go
@@ -0,0 +1,30 @@
+package internal
+
+//#include "../bindings.h"
+import "C"
+import "fmt"
+
+// LibInit initializes the native library; it lives in internal so tests can invoke it directly.
+func LibInit(cleanOnPanic, utf8Lenient bool, directive ...string) error {
+	var initVal string
+	if len(directive) == 0 {
+		initVal = "info"
+	} else {
+		initVal = directive[0]
+	}
+
+	cInitVal := C.CString(initVal)
+	defer C.string_free(cInitVal)
+	cCleanOnPanic := C.bool(cleanOnPanic)
+	cUtf8Lenient := C.bool(utf8Lenient)
+	var errBuffer *C.char
+	C.init_lib(cInitVal, &errBuffer, cCleanOnPanic, cUtf8Lenient)
+
+	errorMessage := C.GoString(errBuffer)
+	defer C.string_free(errBuffer)
+
+	if errorMessage != "" {
+		return fmt.Errorf(errorMessage)
+	}
+	return nil
+}
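For consumers of the Go API, the practical effect of this change is one extra boolean in the initializer. A minimal usage sketch of the new signature, mirroring `example/main.go` above (the log-level directive stays optional):

```go
package main

import (
	"fmt"

	tantivy_go "github.com/anyproto/tantivy-go"
)

func main() {
	// First argument clears the FTS directory if the native side panics.
	// Second argument enables lenient UTF-8 handling: invalid byte
	// sequences are replaced with U+FFFD instead of failing the call.
	if err := tantivy_go.LibInit(true, true, "debug"); err != nil {
		fmt.Println("Failed to initialize library:", err)
		return
	}
}
```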
diff --git a/rust/src/build.rs b/rust/src/build.rs
index ceddc16..dcb2c3f 100644
--- a/rust/src/build.rs
+++ b/rust/src/build.rs
@@ -8,8 +8,7 @@ use std::io::{self, Write};
 const FILE_PATH: &str = "../bindings.h";
 
 fn add_typedefs() -> io::Result<()> {
-
-    let include = "#include <binding_typedefs.h>\n";
+    let include = "#include \"binding_typedefs.h\"\n";
 
     let mut existing_content = fs::read_to_string(FILE_PATH)?;
diff --git a/rust/src/c_util/mod.rs b/rust/src/c_util/mod.rs
index 764b8e1..e21cd66 100644
--- a/rust/src/c_util/mod.rs
+++ b/rust/src/c_util/mod.rs
@@ -1,4 +1,5 @@
 mod util;
+mod string_processor;
 
 pub use self::util::set_error;
 pub use self::util::assert_string;
diff --git a/rust/src/c_util/string_processor.rs b/rust/src/c_util/string_processor.rs
new file mode 100644
index 0000000..e69de29
diff --git a/rust/src/c_util/util.rs b/rust/src/c_util/util.rs
index d9749ad..b02337c 100644
--- a/rust/src/c_util/util.rs
+++ b/rust/src/c_util/util.rs
@@ -1,23 +1,19 @@
 use std::{fs, panic, slice};
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::ffi::{CStr, CString};
 use std::os::raw::{c_char, c_float};
+use std::panic::PanicInfo;
 use std::path::Path;
-use std::sync::Mutex;
 
-use lazy_static::lazy_static;
 use log::debug;
 use serde_json::json;
 use tantivy::{Index, IndexWriter, Score, TantivyDocument, TantivyError, Term};
 use tantivy::directory::MmapDirectory;
 use tantivy::query::{QueryParser};
 use tantivy::schema::{Field, Schema};
-
+use crate::config;
 use crate::tantivy_util::{convert_document_to_json, Document, TantivyContext, DOCUMENT_BUDGET_BYTES, find_highlights, get_string_field_entry, SearchResult};
 
-lazy_static! {
-    static ref FTS_PATH: Mutex<String> = Mutex::new(String::from(""));
-}
-
 pub fn set_error(err: &str, error_buffer: *mut *mut c_char) {
     let err_str = match CString::new(err) {
         Ok(s) => s,
@@ -35,38 +31,56 @@ fn write_buffer(error_buffer: *mut *mut c_char, err_str: CString) {
     }
 }
 
-fn process_c_str<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Result<&'a str, String> {
+
+fn process_c_str<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Result<Cow<'a, str>, String> {
     unsafe {
         if str_ptr.is_null() {
             set_error(POINTER_IS_NULL, error_buffer);
             return Err(POINTER_IS_NULL.to_owned());
         }
-        match CStr::from_ptr(str_ptr).to_str() {
-            Ok(valid_str) => Ok(valid_str),
+        let is_lenient = match config::CONFIG.read() {
+            Ok(config) => {
+                config.utf8_lenient
+            }
             Err(err) => {
                 let error_message = err.to_string();
                 set_error(&error_message, error_buffer);
-                Err(error_message)
+                return Err(error_message);
+            }
+        };
+        let cstr = CStr::from_ptr(str_ptr);
+        if is_lenient {
+            Ok(cstr.to_string_lossy())
+        } else {
+            match cstr.to_str() {
+                Ok(valid_str) => Ok(Cow::Borrowed(valid_str)),
+                Err(err) => {
+                    let error_message = err.to_string();
+                    set_error(&error_message, error_buffer);
+                    Err(error_message)
+                }
             }
         }
     }
 }
 
+// Always copy long-lived strings, for safety
 pub fn assert_string(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<String> {
     match process_c_str(str_ptr, error_buffer) {
-        Ok(valid_str) => Some(valid_str.to_owned()),
+        Ok(Cow::Borrowed(original_str)) => Some(original_str.to_owned()),
+        Ok(Cow::Owned(fixed_str)) => Some(fixed_str),
         Err(_) => None,
     }
 }
 
-pub fn assert_str<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<&'a str> {
+// Avoid copying short-lived strings when possible
+pub fn assert_str<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<Cow<'a, str>> {
     match process_c_str(str_ptr, error_buffer) {
-        Ok(valid_str) => Some(valid_str),
         Err(_) => None,
+        Ok(res) => Some(res)
     }
 }
-
 pub fn assert_pointer<'a, T>(ptr: *mut T, error_buffer: *mut *mut c_char) -> Option<&'a mut T> {
     let result = unsafe {
         if ptr.is_null() {
@@ -112,7 +126,7 @@ pub fn process_string_slice<'a, F>(
     mut func: F,
 ) -> Result<(), ()>
 where
-    F: FnMut(&'a str) -> Result<(), ()>,
+    F: FnMut(Cow<'a, str>) -> Result<(), ()>,
 {
     let slice = match assert_pointer(ptr, error_buffer) {
         Some(ptr) => unsafe { slice::from_raw_parts(ptr, len) },
@@ -157,14 +171,14 @@ where
     Ok(())
 }
 
-pub fn schema_apply_for_field<'a, T, K, F: FnMut(Field, &'a str) -> Result<T, K>>(
+pub fn schema_apply_for_field<'a, T, K, F: FnMut(Field, Cow<'a, str>) -> Result<T, K>>(
     error_buffer: *mut *mut c_char,
     schema: Schema,
-    field_name: &'a str,
+    field_name: Cow<'a, str>,
     mut func: F,
 ) -> Result<T, K> {
-    match schema.get_field(field_name) {
+    match schema.get_field(&field_name) {
         Ok(field) => func(field, field_name),
         Err(err) => {
             set_error(&err.to_string(), error_buffer);
@@ -177,7 +191,7 @@ pub fn convert_document_as_json(
     include_fields_ptr: *mut *const c_char,
     include_fields_len: usize,
     error_buffer: *mut *mut c_char,
-    doc: &&mut Document,
+    doc: &mut Document,
     schema: Schema,
 ) -> Result<String, ()> {
     let mut field_to_name = HashMap::new();
@@ -192,47 +206,69 @@ pub fn convert_document_as_json(
         return Err(());
     }
 
-    let doc_json = convert_document_to_json(&doc, field_to_name);
+    let doc_json = convert_document_to_json(doc, &field_to_name);
 
     Ok(json!(doc_json).to_string())
 }
 
-pub fn start_lib_init(log_level: &str, clear_on_panic: bool) {
+pub fn start_lib_init(
+    log_level: &str,
+    clear_on_panic: bool,
+    utf8_lenient: bool,
+) {
     let old_hook = panic::take_hook();
     if clear_on_panic {
-        panic::set_hook(Box::new(move |panic_info| {
-            let _ = match FTS_PATH.lock() {
-                Ok(fts_path) => {
-                    let fts_path = fts_path.as_str();
-                    if fts_path.is_empty() {
-                        debug!("fts path is empty");
-                    } else {
-                        let _ = fs::remove_dir_all(Path::new(fts_path));
-                    }
-                }
-                Err(e) => {
-                    debug!("Set hook err: {}", e);
-                }
-            };
-            old_hook(panic_info)
-        }));
+        handle_panic(old_hook);
     }
 
+    set_utf8_lenient(utf8_lenient);
+
     let _ = env_logger::Builder::from_env(
         env_logger::Env::default().default_filter_or(log_level)
     ).try_init();
 }
 
+fn set_utf8_lenient(utf8_lenient: bool) {
+    match config::CONFIG.write() {
+        Ok(mut config) => {
+            config.update_utf8_lenient(utf8_lenient);
+        }
+        Err(e) => {
+            debug!("Set utf8_lenient err: {}", e);
+        }
+    }
+}
+
+fn handle_panic(old_hook: Box<dyn Fn(&PanicInfo) + Sync + Send + 'static>) {
+    panic::set_hook(Box::new(move |panic_info| {
+        match config::CONFIG.read() {
+            Ok(config) => {
+                let fts_path = config.fts_path.as_str();
+                if fts_path.is_empty() {
+                    debug!("fts path is empty");
+                } else {
+                    let _ = fs::remove_dir_all(Path::new(fts_path));
+                }
+            }
+            Err(e) => {
+                debug!("Set hook err: {}", e);
+            }
+        }
+        old_hook(panic_info)
+    }));
+}
+
 pub fn create_context_with_schema(
     error_buffer: *mut *mut c_char,
     schema: Schema,
     path: String,
 ) -> Result<*mut TantivyContext, ()> {
-    match FTS_PATH.lock() {
-        Ok(mut fts_path) => *fts_path = path.clone(),
-        Err(e) => debug!("Failed to set path: {}", e),
-    };
-
+    match config::CONFIG.write() {
+        Ok(mut config) => {
+            config.update_fts_path(path.clone());
+        }
+        Err(e) => { debug!("Failed to set path: {}", e) }
+    }
     match fs::create_dir_all(Path::new(path.as_str())) {
         Err(e) => {
             debug!("Failed to create directories: {}", e);
@@ -296,12 +332,12 @@ fn commit(writer: &mut IndexWriter, message: &str, error_buffer: *mut *mut c_char)
     }
 }
 
-pub fn delete_docs(
+pub fn delete_docs<'a>(
    delete_ids_ptr: *mut *const c_char,
    delete_ids_len: usize,
    error_buffer: *mut *mut c_char,
    context: &mut TantivyContext,
-    field_name: &str,
+    field_name: Cow<'a, str>,
 ) {
     let schema = context.index.schema();
 
@@ -320,7 +356,7 @@ pub fn delete_docs(
     };
 
     if process_string_slice(delete_ids_ptr, error_buffer, delete_ids_len, |id_value| {
-        let _ = context.writer.delete_term(Term::from_field_text(field, id_value));
+        let _ = context.writer.delete_term(Term::from_field_text(field, &id_value));
         Ok(())
     }).is_err() {
         rollback(error_buffer, &mut context.writer, "Failed to process string slice");
@@ -353,11 +389,11 @@ pub fn get_doc<'a>(
     Ok(Box::into_raw(Box::new(doc)))
 }
 
-pub fn add_field(
+pub fn add_field<'a>(
     error_buffer: *mut *mut c_char,
     doc: &mut Document,
     index: &Index,
-    field_name: &str,
+    field_name: Cow<'a, str>,
     field_value: &str,
 ) {
     let schema = index.schema();
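On the Go side, the strict and lenient paths implemented above behave as follows. A sketch only, assuming `tc` is an initialized `*tantivy_go.TantivyContext` whose schema has a `body` text field (as in the test fixtures at the end of this diff):

```go
// "hello" followed by an invalid UTF-8 byte.
invalidUtf8Hello := string([]byte{0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xff})

doc := tantivy_go.NewDocument()
err := doc.AddField("body", invalidUtf8Hello, tc)

// With LibInit(..., utf8Lenient=false, ...): err is non-nil, e.g.
// "invalid utf-8 sequence of 1 bytes from index 5" (strict CStr::to_str).
// With LibInit(..., utf8Lenient=true, ...): err is nil and the value is
// stored as "hello�", since to_string_lossy maps invalid bytes to U+FFFD.
```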
diff --git a/rust/src/config/config.rs b/rust/src/config/config.rs
new file mode 100644
index 0000000..21e52bf
--- /dev/null
+++ b/rust/src/config/config.rs
@@ -0,0 +1,28 @@
+use lazy_static::lazy_static;
+use std::sync::RwLock;
+
+lazy_static! {
+    pub static ref CONFIG: RwLock<Config> = RwLock::new(Config::default());
+}
+
+pub struct Config {
+    pub utf8_lenient: bool,
+    pub fts_path: String,
+}
+
+impl Config {
+    fn default() -> Config {
+        Config {
+            utf8_lenient: false,
+            fts_path: String::new(),
+        }
+    }
+
+    pub fn update_utf8_lenient(&mut self, utf8_lenient: bool) {
+        self.utf8_lenient = utf8_lenient;
+    }
+
+    pub fn update_fts_path(&mut self, fts_path: String) {
+        self.fts_path = fts_path;
+    }
+}
\ No newline at end of file
diff --git a/rust/src/config/mod.rs b/rust/src/config/mod.rs
new file mode 100644
index 0000000..c245e27
--- /dev/null
+++ b/rust/src/config/mod.rs
@@ -0,0 +1,3 @@
+mod config;
+
+pub use self::config::CONFIG;
\ No newline at end of file
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 71c7070..970f1bd 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -9,7 +9,7 @@ use crate::tantivy_util::{add_text_field, Document, register_edge_ngram_tokenize
 
 mod tantivy_util;
 mod c_util;
-
+mod config;
 
 #[logcall]
 #[no_mangle]
 pub extern "C" fn schema_builder_new() -> *mut SchemaBuilder {
@@ -178,12 +178,12 @@ pub extern "C" fn context_register_text_analyzer_simple(
         None => return
     };
 
-    let lang = match assert_str(lang_str_ptr, error_buffer) {
+    let lang = match assert_string(lang_str_ptr, error_buffer) {
         Some(value) => value,
         None => return
     };
 
-    register_simple_tokenizer(text_limit, &context.index, tokenizer_name.as_str(), lang);
+    register_simple_tokenizer(text_limit, &context.index, tokenizer_name.as_str(), &lang);
 }
 
 #[logcall]
@@ -394,7 +394,7 @@ pub extern "C" fn document_add_field(
         None => return
     };
 
-    add_field(error_buffer, doc, &context.index, field_name, field_value);
+    add_field(error_buffer, doc, &context.index, field_name, &field_value);
 }
 
 #[logcall]
@@ -420,7 +420,7 @@ pub extern "C" fn document_as_json(
         include_fields_ptr,
         include_fields_len,
         error_buffer,
-        &doc,
+        doc,
         schema,
     ) {
         Ok(value) => value,
@@ -459,10 +459,11 @@ pub unsafe extern "C" fn init_lib(
     log_level_ptr: *const c_char,
     error_buffer: *mut *mut c_char,
     clear_on_panic: bool,
+    utf8_lenient: bool,
 ) {
     let log_level = match assert_string(log_level_ptr, error_buffer) {
         Some(value) => value,
         None => return
     };
-    start_lib_init(log_level.as_str(), clear_on_panic);
+    start_lib_init(log_level.as_str(), clear_on_panic, utf8_lenient);
 }
\ No newline at end of file
diff --git a/rust/src/tantivy_util/document.rs b/rust/src/tantivy_util/document.rs
index 2f65153..bf77426 100644
--- a/rust/src/tantivy_util/document.rs
+++ b/rust/src/tantivy_util/document.rs
@@ -1,19 +1,20 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
 use tantivy::schema::Field;
 use crate::tantivy_util::{Document, extract_text_from_owned_value};
 
 pub fn convert_document_to_json<'a>(
-    doc: &&mut Document,
-    field_to_name: HashMap<Field, &'a str>,
-) -> HashMap<&'a str, serde_json::Value> {
-    let mut result_json: HashMap<&str, serde_json::Value> = HashMap::new();
+    doc: &mut Document,
+    field_to_name: &'a HashMap<Field, Cow<'a, str>>,
+) -> HashMap<Cow<'a, str>, serde_json::Value> {
+    let mut result_json: HashMap<Cow<'a, str>, serde_json::Value> = HashMap::new();
 
     let _ = serde_json::to_value(doc.score).is_ok_and(
-        |score| result_json.insert("score", score).is_some()
+        |score| result_json.insert(Cow::from("score"), score).is_some()
     );
 
     let _ = serde_json::to_value(&doc.highlights).is_ok_and(
-        |highlights| result_json.insert("highlights", highlights).is_some()
+        |highlights| result_json.insert(Cow::from("highlights"), highlights).is_some()
     );
 
     let doc = &doc.tantivy_doc;
@@ -22,7 +23,7 @@ pub fn convert_document_to_json<'a>(
             Some(key) => {
                 let _ = extract_text_from_owned_value(&field_value.value).is_some_and(
                     |value| serde_json::to_value(value).is_ok_and(
-                        |value| result_json.insert(key, value).is_some())
+                        |value| result_json.insert(Cow::Borrowed(key), value).is_some())
                 );
             }
             None => {}
diff --git a/rust/src/tantivy_util/util.rs b/rust/src/tantivy_util/util.rs
index 60e0604..af22bd6 100644
--- a/rust/src/tantivy_util/util.rs
+++ b/rust/src/tantivy_util/util.rs
@@ -1,8 +1,9 @@
+use std::borrow::Cow;
 use tantivy::schema::OwnedValue;
 
-pub fn extract_text_from_owned_value(value: &OwnedValue) -> Option<&str> {
+pub fn extract_text_from_owned_value<'a>(value: &'a OwnedValue) -> Option<Cow<'a, str>> {
     match value {
-        OwnedValue::Str(text) => Some(text),
+        OwnedValue::Str(text) => Some(Cow::Borrowed(text)),
         _ => { None }
     }
 }
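The `Cow` keys above serialize exactly like the previous `&str` keys, so the JSON reaching Go callers keeps the same shape: `score`, `highlights`, plus the requested fields. A decoding sketch in the style of the tests below, with `DocSample` as a hypothetical target struct and field names taken from the test fixtures:

```go
// Hypothetical struct mirroring the JSON emitted by convert_document_to_json.
type DocSample struct {
	Title      string          `json:"title"`
	Id         string          `json:"id"`
	Body       string          `json:"body"`
	Highlights json.RawMessage `json:"highlights"`
}

results, err := tantivy_go.GetSearchResults(result, schema,
	func(jsonStr string) (interface{}, error) {
		var doc DocSample
		return doc, json.Unmarshal([]byte(jsonStr), &doc)
	}, "id", "title", "body")
```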
 	})
 	return err
 }
diff --git a/schemabuilder.go b/schemabuilder.go
index f79322e..41e9574 100644
--- a/schemabuilder.go
+++ b/schemabuilder.go
@@ -7,8 +7,11 @@ import (
 )
 
 type (
-	SchemaBuilder struct{ ptr *C.SchemaBuilder }
-	Schema        struct{ ptr *C.Schema }
+	SchemaBuilder struct {
+		ptr        *C.SchemaBuilder
+		fieldNames map[string]struct{}
+	}
+	Schema struct{ ptr *C.Schema }
 )
 
 const (
@@ -50,7 +53,7 @@ func NewSchemaBuilder() (*SchemaBuilder, error) {
 	if ptr == nil {
 		return nil, errors.New("failed to create schema builder")
 	}
-	return &SchemaBuilder{ptr: ptr}, nil
+	return &SchemaBuilder{ptr: ptr, fieldNames: make(map[string]struct{})}, nil
 }
 
 // AddTextField adds a text field to the schema being built.
@@ -72,6 +75,10 @@ func (b *SchemaBuilder) AddTextField(
 	indexRecordOption int,
 	tokenizer string,
 ) error {
+	if _, contains := b.fieldNames[name]; contains {
+		return errors.New("field already defined: " + name)
+	}
+	b.fieldNames[name] = struct{}{}
 	cName := C.CString(name)
 	cTokenizer := C.CString(tokenizer)
 	defer C.string_free(cName)
diff --git a/tantivy.go b/tantivy.go
index ee53b90..8a1c835 100644
--- a/tantivy.go
+++ b/tantivy.go
@@ -16,7 +16,7 @@ package tantivy_go
 */
 import "C"
 import (
-	"fmt"
+	"github.com/anyproto/tantivy-go/internal"
 	"sync"
 )
@@ -36,28 +36,10 @@ var doOnce sync.Once
 //
 // Returns:
 //   - An error if the initialization fails.
-func LibInit(cleanOnPanic bool, directive ...string) error {
-	var initVal string
+func LibInit(cleanOnPanic, utf8Lenient bool, directive ...string) error {
 	var err error
 	doOnce.Do(func() {
-		if len(directive) == 0 {
-			initVal = "info"
-		} else {
-			initVal = directive[0]
-		}
-
-		cInitVal := C.CString(initVal)
-		defer C.string_free(cInitVal)
-		cCleanOnPanic := C.bool(cleanOnPanic)
-		var errBuffer *C.char
-		C.init_lib(cInitVal, &errBuffer, cCleanOnPanic)
-
-		errorMessage := C.GoString(errBuffer)
-		defer C.string_free(errBuffer)
-
-		if errorMessage != "" {
-			err = fmt.Errorf(errorMessage)
-		}
+		err = internal.LibInit(cleanOnPanic, utf8Lenient, directive...)
 	})
 	return err
 }
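The `fieldNames` set gives schema construction an early, Go-side failure mode for duplicate fields. A short sketch of the behaviour (the field name and tokenizer values mirror the test added below; `LibInit` is assumed to have been called already):

```go
builder, err := tantivy_go.NewSchemaBuilder()
if err != nil {
	// handle construction failure
}

// First definition of "title" succeeds.
err = builder.AddTextField("title", true, true, false,
	tantivy_go.IndexRecordOptionWithFreqsAndPositions,
	tantivy_go.TokenizerEdgeNgram)
// err == nil

// A second field with the same name is rejected before reaching Rust.
err = builder.AddTextField("title", true, true, false,
	tantivy_go.IndexRecordOptionWithFreqsAndPositions,
	tantivy_go.TokenizerEdgeNgram)
// err: "field already defined: title"
```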
diff --git a/tantivy_test.go b/tantivy_test.go
index f98bc17..433b01b 100644
--- a/tantivy_test.go
+++ b/tantivy_test.go
@@ -2,6 +2,7 @@ package tantivy_go_test
 
 import (
 	"encoding/json"
+	"github.com/anyproto/tantivy-go/internal"
 	"os"
 	"testing"
@@ -39,8 +40,8 @@ type Highlight struct {
 
 func Test(t *testing.T) {
 
-	t.Run("docs search and remove - when by raw Id", func(t *testing.T) {
-		schema, tc := fx(t, limit, minGram, false)
+	t.Run("docs search and remove - when by part of body", func(t *testing.T) {
+		schema, tc := fx(t, limit, minGram, false, false)
 
 		defer tc.Free()
@@ -101,7 +102,7 @@
 	})
 
 	t.Run("docs remove - when by body token", func(t *testing.T) {
-		_, tc := fx(t, limit, minGram, false)
+		_, tc := fx(t, limit, minGram, false, false)
 
 		defer tc.Free()
@@ -124,7 +125,7 @@
 	})
 
 	t.Run("docs remove - when by wrong body token", func(t *testing.T) {
-		_, tc := fx(t, limit, minGram, false)
+		_, tc := fx(t, limit, minGram, false, false)
 
 		defer tc.Free()
@@ -147,7 +148,7 @@
 	})
 
 	t.Run("docs remove - when by proper token and wrong field length", func(t *testing.T) {
-		_, tc := fx(t, 1, minGram, false)
+		_, tc := fx(t, 1, minGram, false, false)
 
 		defer tc.Free()
@@ -169,7 +170,7 @@
 	})
 
 	t.Run("docs search and remove - when thai", func(t *testing.T) {
-		_, tc := fx(t, limit, 1, false)
+		_, tc := fx(t, limit, 1, false, false)
 
 		defer tc.Free()
@@ -221,7 +222,7 @@
 	})
 
 	t.Run("docs search - when ascii folding", func(t *testing.T) {
-		_, tc := fx(t, limit, 1, false)
+		_, tc := fx(t, limit, 1, false, false)
 
 		defer tc.Free()
@@ -289,7 +290,7 @@
 	})
 
 	t.Run("docs search and remove - when fast", func(t *testing.T) {
-		_, tc := fx(t, limit, minGram, false)
+		_, tc := fx(t, limit, minGram, false, false)
 
 		defer tc.Free()
@@ -315,15 +316,97 @@
 		size, err := result.GetSize()
 		defer result.Free()
 		require.Equal(t, 1, int(size))
+	})
 
-		err = tc.DeleteDocuments(NameId, "1")
-		docs, err = tc.NumDocs()
+	t.Run("err - when add field twice", func(t *testing.T) {
+		err := internal.LibInit(true, false, "debug")
+		assert.NoError(t, err)
+		builder, err := tantivy_go.NewSchemaBuilder()
+		require.NoError(t, err)
+
+		err = builder.AddTextField(
+			NameTitle,
+			true,
+			true,
+			false,
+			tantivy_go.IndexRecordOptionWithFreqsAndPositions,
+			tantivy_go.TokenizerEdgeNgram,
+		)
+		require.NoError(t, err)
+
+		err = builder.AddTextField(
+			NameTitle,
+			true,
+			true,
+			false,
+			tantivy_go.IndexRecordOptionWithFreqsAndPositions,
+			tantivy_go.TokenizerEdgeNgram,
+		)
+		require.Error(t, err)
+	})
+
+	t.Run("docs fix utf8 - wrong utf8 - when lenient", func(t *testing.T) {
+		schema, tc := fx(t, limit, minGram, false, true)
+
+		defer tc.Free()
+
+		invalidUtf8Hello := string([]byte{0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xff})
+		doc, err := addDoc(t, "some", invalidUtf8Hello, "1", tc)
 		require.NoError(t, err)
-		require.Equal(t, uint64(0), docs)
+
+		err = tc.AddAndConsumeDocuments(doc)
+		require.NoError(t, err)
+
+		docs, err := tc.NumDocs()
+		require.NoError(t, err)
+		require.Equal(t, uint64(1), docs)
+
+		sCtx := tantivy_go.NewSearchContextBuilder().
+			SetQuery("1").
+			SetDocsLimit(100).
+			SetWithHighlights(false).
+			AddFieldDefaultWeight(NameId).
+			Build()
+		result, err := tc.Search(sCtx)
+		require.NoError(t, err)
+
+		size, err := result.GetSize()
+		require.Equal(t, 1, int(size))
+
+		results, err := tantivy_go.GetSearchResults(result, schema, func(jsonStr string) (interface{}, error) {
+			var doc DocSample
+			return doc, json.Unmarshal([]byte(jsonStr), &doc)
+		}, NameId, NameTitle, NameBody)
+		require.NoError(t, err)
+
+		require.Equal(t, len(results), int(size))
+
+		for next := range results {
+			model := results[next].(DocSample)
+			require.Equal(t, DocSample{
+				"some",
+				"1",
+				"hello�",
+				[]Highlight{},
+			},
+				model)
+		}
+	})
+
+	t.Run("docs fix utf8 - wrong utf8 - when not lenient", func(t *testing.T) {
+		_, tc := fx(t, limit, minGram, false, false)
+
+		defer tc.Free()
+
+		invalidUtf8Hello := string([]byte{0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xff})
+		doc := tantivy_go.NewDocument()
+		err := doc.AddField(NameBody, invalidUtf8Hello, tc)
+
+		require.Error(t, err, "invalid utf-8 sequence of 1 bytes from index 5")
 	})
 
 	t.Run("docs search and remove - when title", func(t *testing.T) {
-		schema, tc := fx(t, limit, minGram, false)
+		schema, tc := fx(t, limit, minGram, false, false)
 
 		defer tc.Free()
@@ -383,7 +466,7 @@
 	})
 
 	t.Run("docs search - when jieba", func(t *testing.T) {
-		_, tc := fx(t, limit, 1, false)
+		_, tc := fx(t, limit, 1, false, false)
 
 		defer tc.Free()
@@ -416,7 +499,7 @@
 	})
 
 	t.Run("docs search - when weights apply", func(t *testing.T) {
-		schema, tc := fx(t, limit, 1, false)
+		schema, tc := fx(t, limit, 1, false, false)
 
 		defer tc.Free()
@@ -503,8 +586,9 @@ func fx(
 	limit uintptr,
 	minGram uintptr,
 	isFastId bool,
+	utf8Lenient bool,
 ) (*tantivy_go.Schema, *tantivy_go.TantivyContext) {
-	err := tantivy_go.LibInit(true, "debug")
+	err := internal.LibInit(true, utf8Lenient, "debug")
 	assert.NoError(t, err)
 	builder, err := tantivy_go.NewSchemaBuilder()
 	require.NoError(t, err)