diff --git a/Cargo.lock b/Cargo.lock
index 74326b5..d1bf3ad 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -101,11 +101,11 @@ name = "app"
 version = "0.1.0"
 dependencies = [
  "dirs",
- "pyo3",
  "reqwest",
  "serde",
  "serde_json",
  "shared",
+ "spacy-parsing",
  "stardict",
  "tauri",
  "tauri-build",
@@ -3844,6 +3844,14 @@ dependencies = [
  "system-deps 5.0.0",
 ]
 
+[[package]]
+name = "spacy-parsing"
+version = "0.1.0"
+dependencies = [
+ "pyo3",
+ "shared",
+]
+
 [[package]]
 name = "src-ui"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 752ec69..e7dc0ea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,4 +1,4 @@
 [workspace]
 resolver = "2"
-members = ["src-ui", "src-tauri", "shared"]
+members = ["src-ui", "src-tauri", "shared", "spacy-parsing"]
 
diff --git a/shared/src/lib.rs b/shared/src/lib.rs
index 32f6575..82cc132 100644
--- a/shared/src/lib.rs
+++ b/shared/src/lib.rs
@@ -1,4 +1,4 @@
-use std::error::Error;
+use std::collections::HashMap;
 
 use serde::{Deserialize, Serialize};
 
@@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
 pub struct Word {
     pub text: String,
     pub lemma: String,
-    pub morph: Option<String>,
+    pub morph: HashMap<String, String>,
     pub clickable: bool,
 }
 
@@ -30,8 +30,11 @@ pub enum SakinyjeResult<T> {
     Err(String),
 }
 
-impl<T> From<Result<T, Box<dyn Error>>> for SakinyjeResult<T> {
-    fn from(value: Result<T, Box<dyn Error>>) -> Self {
+impl<T, E> From<Result<T, E>> for SakinyjeResult<T>
+where
+    E: ToString,
+{
+    fn from(value: Result<T, E>) -> Self {
         match value {
             Ok(v) => Self::Ok(v),
             Err(e) => Self::Err(e.to_string()),
diff --git a/spacy-parsing/Cargo.toml b/spacy-parsing/Cargo.toml
new file mode 100644
index 0000000..a69a8ca
--- /dev/null
+++ b/spacy-parsing/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "spacy-parsing"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+pyo3 = { version = "0.20.0", features = ["auto-initialize"] }
+shared = { path = "../shared" }
diff --git a/spacy-parsing/src/lib.rs b/spacy-parsing/src/lib.rs
new file mode 100644
index 0000000..849f8ed
--- /dev/null
+++ b/spacy-parsing/src/lib.rs
@@ -0,0 +1,91 @@
+use pyo3::{exceptions::PyEnvironmentError, prelude::*};
+use std::{collections::HashMap, str::FromStr};
+
+pub struct Token {
+    pub text: String,
+    pub lemma: String,
+    pub pos: PartOfSpeech,
+    pub morph: HashMap<String, String>,
+}
+
+pub enum PartOfSpeech {
+    Adjective,
+    Adposition,
+    Adverb,
+    Auxiliary,
+    CoordinatingConjunction,
+    Determiner,
+    Interjection,
+    Noun,
+    Numeral,
+    Particle,
+    Pronoun,
+    ProperNoun,
+    Punctuation,
+    SubordinatingConjunction,
+    Symbol,
+    Verb,
+    Other,
+}
+
+impl FromStr for PartOfSpeech {
+    type Err = ();
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "ADJ" => Ok(Self::Adjective),
+            "ADP" => Ok(Self::Adposition),
+            "ADV" => Ok(Self::Adverb),
+            "AUX" => Ok(Self::Auxiliary),
+            "CCONJ" => Ok(Self::CoordinatingConjunction),
+            "DET" => Ok(Self::Determiner),
+            "INTJ" => Ok(Self::Interjection),
+            "NOUN" => Ok(Self::Noun),
+            "NUM" => Ok(Self::Numeral),
+            "PART" => Ok(Self::Particle),
+            "PRON" => Ok(Self::Pronoun),
+            "PROPN" => Ok(Self::ProperNoun),
+            "PUNCT" => Ok(Self::Punctuation),
+            "SCONJ" => Ok(Self::SubordinatingConjunction),
+            "SYM" => Ok(Self::Symbol),
+            "VERB" => Ok(Self::Verb),
+            "X" => Ok(Self::Other),
+            _ => Err(()),
+        }
+    }
+}
+
+pub fn get_spacy_info(sent: &str, model: &str) -> Result<Vec<Token>, String> {
+    Python::with_gil(|py| -> PyResult<Vec<Token>> {
+        let mut words = Vec::new();
+        let spacy = PyModule::import(py, "spacy")?;
+        let morphologizer = match spacy.getattr("load")?.call1((model,)) {
+            Ok(v) => v,
+            Err(_) => {
+                return Err(PyEnvironmentError::new_err(format!(
+                    "Unable to load {model}"
+                )))
+            }
+        };
+        let total: Vec<PyObject> = morphologizer.call1((sent,))?.extract()?;
+        for token in total {
+            let text: String = token.getattr(py, "text")?.extract(py)?;
+            let pos_str: String = token.getattr(py, "pos_")?.extract(py)?;
+            let pos = PartOfSpeech::from_str(&pos_str).unwrap();
+            let lemma: String = token.getattr(py, "lemma_")?.extract(py)?;
+            let morph: HashMap<String, String> = token
+                .getattr(py, "morph")?
+                .getattr(py, "to_dict")?
+                .call0(py)?
+                .extract(py)?;
+
+            words.push(Token {
+                text,
+                lemma,
+                pos,
+                morph,
+            })
+        }
+        Ok(words)
+    })
+    .map_err(|e| e.to_string())
+}
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index fd1cca6..8393803 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -18,10 +18,10 @@ tauri-build = { version = "1.5.0", features = [] }
 serde_json = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 tauri = { version = "1.5.2", features = ["dialog-all"] }
-pyo3 = { version = "0.20.0", features = ["auto-initialize"] }
 stardict = "0.2.0"
 reqwest = { version = "0.11.22", features = ["json"] }
 shared = { path = "../shared" }
+spacy-parsing = { path = "../spacy-parsing" }
 toml = "0.8.2"
 dirs = "5.0.1"
 
diff --git a/src-tauri/src/language_parsing.rs b/src-tauri/src/language_parsing.rs
index b9f1620..c00cffd 100644
--- a/src-tauri/src/language_parsing.rs
+++ b/src-tauri/src/language_parsing.rs
@@ -1,44 +1,21 @@
-use pyo3::{exceptions::PyEnvironmentError, prelude::*};
 use shared::*;
-use tauri::Window;
+use spacy_parsing::{get_spacy_info, PartOfSpeech};
 
 #[tauri::command]
-pub async fn parse_text(_window: Window, sent: &str, model: &str) -> Result<Vec<Word>, String> {
-    Python::with_gil(|py| -> PyResult<Vec<Word>> {
-        let mut words = Vec::new();
-        let spacy = PyModule::import(py, "spacy")?;
-        let morphologizer = match spacy.getattr("load")?.call1((model,)) {
-            Ok(v) => v,
-            Err(_) => {
-                return Err(PyEnvironmentError::new_err(format!(
-                    "Unable to load {model}"
-                )))
-            }
-        };
-        let total: Vec<PyObject> = morphologizer.call1((sent,))?.extract()?;
-        for i in total {
-            let text: String = i.getattr(py, "text")?.extract(py)?;
-            let pos: String = i.getattr(py, "pos_")?.extract(py)?;
-            let clickable = pos != "PUNCT";
-            let lemma: String = i.getattr(py, "lemma_")?.extract(py)?;
-            let morph: Option<String> = match i
-                .getattr(py, "morph")
-                .and_then(|v| v.getattr(py, "get")?.call1(py, ("Case",)))
-                .and_then(|v| v.extract::<Vec<String>>(py))
-            {
-                Ok(mut s) if !s.is_empty() => Some(s.remove(0)),
-                _ => None,
-            };
-
-            println!("{:?}", morph);
-            words.push(Word {
-                text,
-                lemma,
-                morph,
-                clickable,
-            })
-        }
-        Ok(words)
-    })
-    .map_err(|e| e.to_string())
+pub async fn parse_text(sent: &str, model: &str) -> Result<Vec<Word>, String> {
+    let mut words = Vec::new();
+    let parsed_words = get_spacy_info(sent, model)?;
+    for word in parsed_words {
+        let clickable = !matches!(
+            word.pos,
+            PartOfSpeech::Punctuation | PartOfSpeech::Symbol | PartOfSpeech::Numeral
+        );
+        words.push(Word {
+            text: word.text,
+            clickable,
+            lemma: word.lemma,
+            morph: word.morph,
+        });
+    }
+    Ok(words)
 }
diff --git a/src-ui/src/reader.rs b/src-ui/src/reader.rs
index dda1a6a..1f4bb27 100644
--- a/src-ui/src/reader.rs
+++ b/src-ui/src/reader.rs
@@ -62,7 +62,7 @@ async fn send_sentence(sent: String) -> Vec<Word> {
         Err(e) => vec![Word {
             text: e.to_string(),
             lemma: e.to_string(),
-            morph: None,
+            morph: HashMap::new(),
             clickable: false,
         }],
     }
@@ -231,17 +231,18 @@ pub fn ReaderView(settings: Resource<(), Settings>) -> impl IntoView {
 fn Word(word: Word, i: usize, word_selector: WriteSignal<Option<usize>>) -> impl IntoView {
     let mut class = String::from("word");
     if !word.clickable {
-        class.push_str(" punctuation");
+        class.push_str(" punctuation ");
     }
-    if let Some(morph) = word.morph {
-        class.push(' ');
-        class.push_str(&morph);
+    for (feat, value) in &word.morph {
+        class.push_str(&format!(" {feat}-{value}"));
     }
 
     view! {