Skip to content

Commit

Permalink
Merge pull request #13 from BrewingWeasel/spacy
Browse files Browse the repository at this point in the history
refactor: move spacy parsing into its own crate
  • Loading branch information
BrewingWeasel authored Nov 5, 2023
2 parents bf55629 + 963e684 commit 1322508
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 53 deletions.
10 changes: 9 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[workspace]
resolver = "2"

members = ["src-ui", "src-tauri", "shared"]
members = ["src-ui", "src-tauri", "shared", "spacy-parsing"]
11 changes: 7 additions & 4 deletions shared/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
use std::error::Error;
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct Word {
pub text: String,
pub lemma: String,
pub morph: Option<String>,
pub morph: HashMap<String, String>,
pub clickable: bool,
}

Expand All @@ -30,8 +30,11 @@ pub enum SakinyjeResult<T> {
Err(String),
}

impl<T> From<Result<T, Box<dyn Error>>> for SakinyjeResult<T> {
fn from(value: Result<T, Box<dyn Error>>) -> Self {
impl<T, E> From<Result<T, E>> for SakinyjeResult<T>
where
E: ToString,
{
fn from(value: Result<T, E>) -> Self {
match value {
Ok(v) => Self::Ok(v),
Err(e) => Self::Err(e.to_string()),
Expand Down
10 changes: 10 additions & 0 deletions spacy-parsing/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "spacy-parsing"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
pyo3 = { version = "0.20.0", features = ["auto-initialize"] }
shared = { path = "../shared" }
91 changes: 91 additions & 0 deletions spacy-parsing/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
use pyo3::{exceptions::PyEnvironmentError, prelude::*};
use std::{collections::HashMap, str::FromStr};

/// A single parsed token as produced by a spaCy pipeline.
pub struct Token {
    // The raw token text exactly as it appears in the input sentence.
    pub text: String,
    // The lemmatized (dictionary) form reported by spaCy's `lemma_`.
    pub lemma: String,
    // Universal POS category mapped from spaCy's `pos_` string.
    pub pos: PartOfSpeech,
    // Morphological features from `morph.to_dict()`, e.g. "Case" -> "Nom".
    pub morph: HashMap<String, String>,
}

/// Universal POS categories emitted by spaCy (the UPOS tag set).
///
/// Variants correspond one-to-one to spaCy's `token.pos_` strings; see
/// `FromStr` below for the exact mapping.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PartOfSpeech {
    Adjective,
    Adposition,
    Adverb,
    Auxiliary,
    CoordinatingConjunction,
    Determiner,
    Interjection,
    Noun,
    Numeral,
    Particle,
    Pronoun,
    ProperNoun,
    Punctuation,
    SubordinatingConjunction,
    Symbol,
    Verb,
    /// spaCy's `X` tag: words that cannot be assigned a real category.
    Other,
}

impl FromStr for PartOfSpeech {
    type Err = ();

    /// Parses a spaCy UPOS tag (e.g. `"NOUN"`, `"VERB"`) into a variant.
    ///
    /// # Errors
    /// Returns `Err(())` for any string that is not a known UPOS tag.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "ADJ" => Ok(Self::Adjective),
            "ADP" => Ok(Self::Adposition),
            "ADV" => Ok(Self::Adverb),
            "AUX" => Ok(Self::Auxiliary),
            "CCONJ" => Ok(Self::CoordinatingConjunction),
            "DET" => Ok(Self::Determiner),
            "INTJ" => Ok(Self::Interjection),
            "NOUN" => Ok(Self::Noun),
            "NUM" => Ok(Self::Numeral),
            "PART" => Ok(Self::Particle),
            "PRON" => Ok(Self::Pronoun),
            "PROPN" => Ok(Self::ProperNoun),
            "PUNCT" => Ok(Self::Punctuation),
            "SCONJ" => Ok(Self::SubordinatingConjunction),
            "SYM" => Ok(Self::Symbol),
            "VERB" => Ok(Self::Verb),
            // Bug fix: spaCy emits "X" (no colon) for the catch-all category;
            // the previous "X:" arm could never match, so every X-tagged token
            // hit the Err(()) arm and panicked at the downstream .unwrap().
            "X" => Ok(Self::Other),
            _ => Err(()),
        }
    }
}

/// Runs the spaCy pipeline `model` over the sentence `sent` and returns one
/// `Token` per parsed token (text, lemma, POS, and morphological features).
///
/// # Errors
/// Returns a `String` describing the failure when the `spacy` Python module
/// cannot be imported, the requested model cannot be loaded, or any Python
/// attribute access / call fails mid-parse.
pub fn get_spacy_info(sent: &str, model: &str) -> Result<Vec<Token>, String> {
    Python::with_gil(|py| -> PyResult<Vec<Token>> {
        let mut words = Vec::new();
        let spacy = PyModule::import(py, "spacy")?;
        // spacy.load(model) fails when the model is not installed locally;
        // surface a clearer message than the raw Python traceback.
        let morphologizer = match spacy.getattr("load")?.call1((model,)) {
            Ok(v) => v,
            Err(_) => {
                return Err(PyEnvironmentError::new_err(format!(
                    "Unable to load {model}"
                )))
            }
        };
        let total: Vec<PyObject> = morphologizer.call1((sent,))?.extract()?;
        for token in total {
            let text: String = token.getattr(py, "text")?.extract(py)?;
            let pos_str: String = token.getattr(py, "pos_")?.extract(py)?;
            // Fall back to Other rather than panicking when spaCy reports a
            // tag our FromStr mapping does not recognize.
            let pos = PartOfSpeech::from_str(&pos_str).unwrap_or(PartOfSpeech::Other);
            let lemma: String = token.getattr(py, "lemma_")?.extract(py)?;
            // morph.to_dict() yields feature -> value pairs, e.g. "Case" -> "Nom".
            let morph: HashMap<String, String> = token
                .getattr(py, "morph")?
                .getattr(py, "to_dict")?
                .call0(py)?
                .extract(py)?;

            words.push(Token {
                text,
                lemma,
                pos,
                morph,
            })
        }
        Ok(words)
    })
    .map_err(|e| e.to_string())
}
2 changes: 1 addition & 1 deletion src-tauri/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ tauri-build = { version = "1.5.0", features = [] }
serde_json = "1.0"
serde = { version = "1.0", features = ["derive"] }
tauri = { version = "1.5.2", features = ["dialog-all"] }
pyo3 = { version = "0.20.0", features = ["auto-initialize"] }
stardict = "0.2.0"
reqwest = { version = "0.11.22", features = ["json"] }
shared = { path = "../shared" }
spacy-parsing = { path = "../spacy-parsing" }
toml = "0.8.2"
dirs = "5.0.1"

Expand Down
57 changes: 17 additions & 40 deletions src-tauri/src/language_parsing.rs
Original file line number Diff line number Diff line change
@@ -1,44 +1,21 @@
use pyo3::{exceptions::PyEnvironmentError, prelude::*};
use shared::*;
use tauri::Window;
use spacy_parsing::{get_spacy_info, PartOfSpeech};

#[tauri::command]
pub async fn parse_text(_window: Window, sent: &str, model: &str) -> Result<Vec<Word>, String> {
Python::with_gil(|py| -> PyResult<Vec<Word>> {
let mut words = Vec::new();
let spacy = PyModule::import(py, "spacy")?;
let morphologizer = match spacy.getattr("load")?.call1((model,)) {
Ok(v) => v,
Err(_) => {
return Err(PyEnvironmentError::new_err(format!(
"Unable to load {model}"
)))
}
};
let total: Vec<PyObject> = morphologizer.call1((sent,))?.extract()?;
for i in total {
let text: String = i.getattr(py, "text")?.extract(py)?;
let pos: String = i.getattr(py, "pos_")?.extract(py)?;
let clickable = pos != "PUNCT";
let lemma: String = i.getattr(py, "lemma_")?.extract(py)?;
let morph: Option<String> = match i
.getattr(py, "morph")
.and_then(|v| v.getattr(py, "get")?.call1(py, ("Case",)))
.and_then(|v| v.extract::<Vec<String>>(py))
{
Ok(mut s) if !s.is_empty() => Some(s.remove(0)),
_ => None,
};

println!("{:?}", morph);
words.push(Word {
text,
lemma,
morph,
clickable,
})
}
Ok(words)
})
.map_err(|e| e.to_string())
/// Parses `sent` with the spaCy model `model` and converts each spaCy token
/// into a `Word` for the UI. Punctuation, symbols, and numerals are marked
/// non-clickable; everything else is clickable.
pub async fn parse_text(sent: &str, model: &str) -> Result<Vec<Word>, String> {
    let tokens = get_spacy_info(sent, model)?;
    // Map every spaCy token straight into the shared Word representation.
    let words = tokens
        .into_iter()
        .map(|token| {
            // Only "real" words should respond to clicks in the reader view.
            let clickable = !matches!(
                token.pos,
                PartOfSpeech::Punctuation | PartOfSpeech::Symbol | PartOfSpeech::Numeral
            );
            Word {
                text: token.text,
                clickable,
                lemma: token.lemma,
                morph: token.morph,
            }
        })
        .collect();
    Ok(words)
}
13 changes: 7 additions & 6 deletions src-ui/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ async fn send_sentence(sent: String) -> Vec<Word> {
Err(e) => vec![Word {
text: e.to_string(),
lemma: e.to_string(),
morph: None,
morph: HashMap::new(),
clickable: false,
}],
}
Expand Down Expand Up @@ -231,17 +231,18 @@ pub fn ReaderView(settings: Resource<(), Settings>) -> impl IntoView {
fn Word(word: Word, i: usize, word_selector: WriteSignal<Option<usize>>) -> impl IntoView {
let mut class = String::from("word");
if !word.clickable {
class.push_str(" punctuation");
class.push_str(" punctuation ");
}
if let Some(morph) = word.morph {
class.push(' ');
class.push_str(&morph);
for (feat, value) in &word.morph {
class.push_str(&format!(" {feat}-{value}"));
}
view! {
<span
class=class
on:click=move |_| {
word_selector.set(Some(i));
if word.clickable {
word_selector.set(Some(i));
}
}
>

Expand Down

0 comments on commit 1322508

Please sign in to comment.