From 4ff7ebb7a5d65b1d63f5047a03f7dda400a569fb Mon Sep 17 00:00:00 2001 From: Dylan Bowker Date: Sat, 4 May 2024 11:55:28 -0600 Subject: [PATCH] Corrected panic on UTF-8 searches. --- CHANGELOG.md | 2 ++ Cargo.toml | 4 ++-- .../internal/eddie/eddie_context_autocomplete.rs | 16 +++++++++++++--- .../internal/eddie/eddie_global_autocomplete.rs | 16 +++++++++++++--- .../internal/eddie/eddie_global_keyword.rs | 16 +++++++++++++--- .../strsim/strsim_context_autocomplete.rs | 16 +++++++++++++--- .../strsim/strsim_global_autocomplete.rs | 16 +++++++++++++--- .../internal/strsim/strsim_global_keyword.rs | 16 +++++++++++++--- src/simple/tests.rs | 5 +++++ 9 files changed, 87 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2e9430..1b74098 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ * Release notes are available on [GitHub](https://github.com/leontoeides/indicium/releases). +* `0.6.2`: Corrected [panic on UTF-8 searches](https://github.com/leontoeides/indicium/issues/2). + * `0.6.1`: Removed `eddie` as the default string similarity crate, for now, due to a potential `panic`. diff --git a/Cargo.toml b/Cargo.toml index ab298e8..362b0b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "indicium" -version = "0.6.1" +version = "0.6.2" authors = ["Dylan Bowker "] edition = "2021" categories = [ "database-implementations" ] @@ -17,7 +17,7 @@ rust-version = "1.62.1" default = [ "simple", "strsim", "ahash" ] simple = [] select2 = [ "simple", "serde" ] -fuzzy = [ "eddie" ] # Deprecated feature. Redirects to `eddie` feature. +fuzzy = [ "strsim" ] # Deprecated feature. Redirects to `strsim` feature. 
ahash = [ "dep:ahash" ] eddie = [ "dep:eddie" ] gxhash = [ "dep:gxhash" ] diff --git a/src/simple/internal/eddie/eddie_context_autocomplete.rs b/src/simple/internal/eddie/eddie_context_autocomplete.rs index 789716c..5e77fef 100644 --- a/src/simple/internal/eddie/eddie_context_autocomplete.rs +++ b/src/simple/internal/eddie/eddie_context_autocomplete.rs @@ -46,9 +46,19 @@ impl SearchIndex { // The user keyword must be longer than the match length to be // evaluated for fuzzy-matches: if user_keyword.len() >= self.fuzzy_length { - // Use the first _n_ characters of the user's keyword to find - // search index keywords to compare against: - &user_keyword[0..self.fuzzy_length] + // Get the byte index of the _n_th character: + let byte_index: Option<usize> = user_keyword + .char_indices() + .take(self.fuzzy_length) + .map(|(idx, _ch)| idx) + .max(); + // Use the first _n_ characters of the user's keyword. These + // first characters are used to find search index keywords to + // fuzzy match against: + match byte_index { + Some(byte_index) => &user_keyword[0..byte_index], + None => return vec![], + } // match } else { // The user's keyword is too short. 
Do not perform any fuzzy // matching: diff --git a/src/simple/internal/eddie/eddie_global_autocomplete.rs b/src/simple/internal/eddie/eddie_global_autocomplete.rs index 3350332..cbe34ab 100644 --- a/src/simple/internal/eddie/eddie_global_autocomplete.rs +++ b/src/simple/internal/eddie/eddie_global_autocomplete.rs @@ -45,9 +45,19 @@ impl SearchIndex { // The user keyword must be longer than the match length to be // evaluated for fuzzy-matches: if user_keyword.len() >= self.fuzzy_length { - // Use the first _n_ characters of the user's keyword to find - // search index keywords to compare against: - &user_keyword[0..self.fuzzy_length] + // Get the byte index of the _n_th character: + let byte_index: Option<usize> = user_keyword + .char_indices() + .take(self.fuzzy_length) + .map(|(idx, _ch)| idx) + .max(); + // Use the first _n_ characters of the user's keyword. These + // first characters are used to find search index keywords to + // fuzzy match against: + match byte_index { + Some(byte_index) => &user_keyword[0..byte_index], + None => return vec![], + } // match } else { // The user's keyword is too short. Do not perform any fuzzy // matching: diff --git a/src/simple/internal/eddie/eddie_global_keyword.rs b/src/simple/internal/eddie/eddie_global_keyword.rs index 86c46e1..f0e24c4 100644 --- a/src/simple/internal/eddie/eddie_global_keyword.rs +++ b/src/simple/internal/eddie/eddie_global_keyword.rs @@ -41,9 +41,19 @@ impl SearchIndex { // The user keyword must be longer than the match length to be // evaluated for fuzzy-matches: if user_keyword.len() >= self.fuzzy_length { - // Use the first _n_ characters of the user's keyword to find - // search index keywords to compare against: - &user_keyword[0..self.fuzzy_length] + // Get the byte index of the _n_th character: + let byte_index: Option<usize> = user_keyword + .char_indices() + .take(self.fuzzy_length) + .map(|(idx, _ch)| idx) + .max(); + // Use the first _n_ characters of the user's keyword. 
These + // first characters are used to find search index keywords to + // fuzzy match against: + match byte_index { + Some(byte_index) => &user_keyword[0..byte_index], + None => return vec![], + } // match } else { // The user's keyword is too short. Do not perform any fuzzy // matching: diff --git a/src/simple/internal/strsim/strsim_context_autocomplete.rs b/src/simple/internal/strsim/strsim_context_autocomplete.rs index f99d90a..7004cfd 100644 --- a/src/simple/internal/strsim/strsim_context_autocomplete.rs +++ b/src/simple/internal/strsim/strsim_context_autocomplete.rs @@ -46,9 +46,19 @@ impl SearchIndex { // The user keyword must be longer than the match length to be // evaluated for fuzzy-matches: if user_keyword.len() >= self.fuzzy_length { - // Use the first _n_ characters of the user's keyword to find - // search index keywords to compare against: - &user_keyword[0..self.fuzzy_length] + // Get the byte index of the _n_th character: + let byte_index: Option<usize> = user_keyword + .char_indices() + .take(self.fuzzy_length) + .map(|(idx, _ch)| idx) + .max(); + // Use the first _n_ characters of the user's keyword. These + // first characters are used to find search index keywords to + // fuzzy match against: + match byte_index { + Some(byte_index) => &user_keyword[0..byte_index], + None => return vec![], + } // match } else { // The user's keyword is too short. 
Do not perform any fuzzy // matching: diff --git a/src/simple/internal/strsim/strsim_global_autocomplete.rs b/src/simple/internal/strsim/strsim_global_autocomplete.rs index 329ea1e..3797820 100644 --- a/src/simple/internal/strsim/strsim_global_autocomplete.rs +++ b/src/simple/internal/strsim/strsim_global_autocomplete.rs @@ -45,9 +45,19 @@ impl SearchIndex { // The user keyword must be longer than the match length to be // evaluated for fuzzy-matches: if user_keyword.len() >= self.fuzzy_length { - // Use the first _n_ characters of the user's keyword to find - // search index keywords to compare against: - &user_keyword[0..self.fuzzy_length] + // Get the byte index of the _n_th character: + let byte_index: Option<usize> = user_keyword + .char_indices() + .take(self.fuzzy_length) + .map(|(idx, _ch)| idx) + .max(); + // Use the first _n_ characters of the user's keyword. These + // first characters are used to find search index keywords to + // fuzzy match against: + match byte_index { + Some(byte_index) => &user_keyword[0..byte_index], + None => return vec![], + } // match } else { // The user's keyword is too short. Do not perform any fuzzy // matching: diff --git a/src/simple/internal/strsim/strsim_global_keyword.rs b/src/simple/internal/strsim/strsim_global_keyword.rs index ac33d41..e2f5e5d 100644 --- a/src/simple/internal/strsim/strsim_global_keyword.rs +++ b/src/simple/internal/strsim/strsim_global_keyword.rs @@ -41,9 +41,19 @@ impl SearchIndex { // The user keyword must be longer than the match length to be // evaluated for fuzzy-matches: if user_keyword.len() >= self.fuzzy_length { - // Use the first _n_ characters of the user's keyword to find - // search index keywords to compare against: - &user_keyword[0..self.fuzzy_length] + // Get the byte index of the _n_th character: + let byte_index: Option<usize> = user_keyword + .char_indices() + .take(self.fuzzy_length) + .map(|(idx, _ch)| idx) + .max(); + // Use the first _n_ characters of the user's keyword. 
These + // first characters are used to find search index keywords to + // fuzzy match against: + match byte_index { + Some(byte_index) => &user_keyword[0..byte_index], + None => return None, + } // match } else { // The user's keyword is too short. Do not perform any fuzzy // matching: diff --git a/src/simple/tests.rs b/src/simple/tests.rs index a8be770..e1e5040 100644 --- a/src/simple/tests.rs +++ b/src/simple/tests.rs @@ -291,4 +291,9 @@ fn simple() { search_index.autocomplete_type(&AutocompleteType::Context, "stars are dancers"); #[cfg(any(feature = "eddie", feature = "strsim"))] assert_eq!(autocomplete_options, vec!["stars are dancing".to_string()]); + + // Test UTF-8: + let index = crate::simple::SearchIndex::<usize>::default(); + index.search("лол"); // lol in Cyrillic + } // fn