Skip to content

Commit

Permalink
use phf and lazy_static for lang map
Browse files Browse the repository at this point in the history
  • Loading branch information
zarkone committed Dec 10, 2024
1 parent ed01394 commit 8f8134d
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 24 deletions.
3 changes: 2 additions & 1 deletion rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ unicode-segmentation = "1.11.0"
logcall = "0.1"
lazy_static = "1.5.0"
tantivy-jieba = { version = "0.11.0", optional = true }
phf = { version = "0.11.2", features = ["macros"] }

[features]
default = ["jieba"]
jieba = ["dep:tantivy-jieba"]
jieba = ["dep:tantivy-jieba"]
53 changes: 30 additions & 23 deletions rust/src/tantivy_util/stemmer.rs
Original file line number Diff line number Diff line change
@@ -1,28 +1,35 @@
use tantivy::tokenizer::{Language, Stemmer};
use crate::tantivy_util::TantivyGoError;
use lazy_static::lazy_static;
use phf::phf_map;
use tantivy::tokenizer::{Language, Stemmer};

lazy_static! {
/// Lookup table from a two-letter language code to the tantivy
/// [`Language`] variant used to build a stemmer for that language.
///
/// The keys look like ISO 639-1 codes (e.g. `"en"`, `"de"`, `"sv"`);
/// codes that are not listed here yield `None` from `LANGUAGES.get(..)`.
///
/// The table is generated by `phf_map!` and exposed through
/// `lazy_static!`, so it is accessed as `LANGUAGES.get(code)`.
// NOTE(review): `phf_map!` is documented as usable directly in a plain
// `static`, so the `lazy_static!` wrapper may be unnecessary; confirm no
// caller depends on the lazy wrapper type before removing it.
pub static ref LANGUAGES: phf::Map<&'static str, Language> = phf_map! {
"ar" => Language::Arabic,
"da" => Language::Danish,
"nl" => Language::Dutch,
"en" => Language::English,
"fi" => Language::Finnish,
"fr" => Language::French,
"de" => Language::German,
"el" => Language::Greek,
"hu" => Language::Hungarian,
"it" => Language::Italian,
"no" => Language::Norwegian,
"pt" => Language::Portuguese,
"ro" => Language::Romanian,
"ru" => Language::Russian,
"es" => Language::Spanish,
"sv" => Language::Swedish,
"ta" => Language::Tamil,
"tr" => Language::Turkish,
};
}

/// Builds a tantivy [`Stemmer`] for the language identified by `lang`.
///
/// `lang` is used as a key into the [`LANGUAGES`] table, which is the single
/// source of truth for the supported language codes; no second copy of the
/// code-to-language mapping is kept in this function.
///
/// # Errors
///
/// Returns a [`TantivyGoError`] whose message is
/// `"<lang> is an unsupported language"` when `lang` is not a key of
/// [`LANGUAGES`].
pub fn create_stemmer(lang: &str) -> Result<Stemmer, TantivyGoError> {
    let stemmer_language = LANGUAGES
        .get(lang)
        // The map hands back `&Language`; copy the value out so the stemmer
        // owns its language instead of borrowing from the table.
        .copied()
        .ok_or_else(|| TantivyGoError(format!("{lang} is an unsupported language")))?;

    Ok(Stemmer::new(stemmer_language))
}

0 comments on commit 8f8134d

Please sign in to comment.