From dd2ece823b619e6d93dadf87df5fd19398b0359f Mon Sep 17 00:00:00 2001
From: Ryo Yamashita
Date: Mon, 4 Nov 2024 14:56:16 +0900
Subject: [PATCH] fix: resolve async-related TODOs and FIXMEs (#868)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes two changes to the async API:

* In `load_voice_model`, run the creation of the ONNX Runtime `Session`
  (which is fairly heavy) on the thread pool.
* In the Python API's `Synthesizer`, make the `Closable` mechanism fulfill
  its role properly.
---
 crates/voicevox_core/src/synthesizer.rs       |  8 ++-
 crates/voicevox_core/src/voice_model.rs       |  9 +--
 .../voicevox_core_python_api/src/convert.rs   |  5 +-
 crates/voicevox_core_python_api/src/lib.rs    | 64 ++++++++++++-------
 4 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
index b30d7c3ca..b68d1ab2c 100644
--- a/crates/voicevox_core/src/synthesizer.rs
+++ b/crates/voicevox_core/src/synthesizer.rs
@@ -248,9 +248,11 @@ mod inner {
             &self,
             model: &voice_model::Inner,
         ) -> crate::Result<()> {
-            let model_bytes = &model.read_inference_models().await?;
-            // TODO: 重い操作なので、asyncにする
-            self.status.insert_model(model.header(), model_bytes)
+            let model_bytes = model.read_inference_models().await?;
+
+            let status = self.status.clone();
+            let header = model.header().clone();
+            A::unblock(move || status.insert_model(&header, &model_bytes)).await
         }
 
         pub(super) fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> {
diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs
index c2920398b..9914409eb 100644
--- a/crates/voicevox_core/src/voice_model.rs
+++ b/crates/voicevox_core/src/voice_model.rs
@@ -63,7 +63,7 @@ impl VoiceModelId {
 
 #[self_referencing]
 pub(crate) struct Inner {
-    header: VoiceModelHeader,
+    header: Arc<VoiceModelHeader>,
 
     #[borrows(header)]
     #[not_covariant]
@@ -126,11 +126,12 @@ impl Inner {
             )
         })?;
 
-        let header = VoiceModelHeader::new(manifest, metas, path)?;
+        let header = VoiceModelHeader::new(manifest, metas, path)?.into();
 
         InnerTryBuilder {
             header,
-            inference_model_entries_builder: |VoiceModelHeader { manifest, .. }| {
+            inference_model_entries_builder: |header| {
+                let VoiceModelHeader { manifest, .. } = &**header;
                 manifest
                     .domains()
                     .each_ref()
@@ -182,7 +183,7 @@ impl Inner {
         &self.borrow_header().metas
     }
 
-    pub(crate) fn header(&self) -> &VoiceModelHeader {
+    pub(crate) fn header(&self) -> &Arc<VoiceModelHeader> {
         self.borrow_header()
     }
 
diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs
index d4a867606..711da5fe4 100644
--- a/crates/voicevox_core_python_api/src/convert.rs
+++ b/crates/voicevox_core_python_api/src/convert.rs
@@ -111,7 +111,7 @@ pub(crate) fn async_modify_accent_phrases<'py, Fun, Fut>(
 ) -> PyResult<&'py PyAny>
 where
     Fun: FnOnce(Vec<AccentPhrase>, StyleId) -> Fut + Send + 'static,
-    Fut: Future<Output = voicevox_core::Result<Vec<AccentPhrase>>> + Send + 'static,
+    Fut: Future<Output = PyResult<Vec<AccentPhrase>>> + Send + 'static,
 {
     let rust_accent_phrases = accent_phrases
         .iter()
@@ -121,10 +121,9 @@ where
         py,
         pyo3_asyncio::tokio::get_current_locals(py)?,
        async move {
-            let replaced_accent_phrases = method(rust_accent_phrases, speaker_id).await;
+            let replaced_accent_phrases = method(rust_accent_phrases, speaker_id).await?;
             Python::with_gil(|py| {
                 let replaced_accent_phrases = replaced_accent_phrases
-                    .into_py_result(py)?
                     .iter()
                     .map(move |accent_phrase| {
                         to_pydantic_dataclass(
diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs
index a2d1c2475..52ba87c2c 100644
--- a/crates/voicevox_core_python_api/src/lib.rs
+++ b/crates/voicevox_core_python_api/src/lib.rs
@@ -1055,11 +1055,9 @@ mod asyncio {
 
     #[pyclass]
     pub(crate) struct Synthesizer {
-        // FIXME: `Arc`ではなく、`Arc>`を
-        // `clone`する
         synthesizer: Arc<
             Closable<
-                Arc<voicevox_core::nonblocking::Synthesizer<voicevox_core::nonblocking::OpenJtalk>>,
+                voicevox_core::nonblocking::Synthesizer<voicevox_core::nonblocking::OpenJtalk>,
                 Self,
                 Tokio,
             >,
@@ -1090,7 +1088,7 @@ mod asyncio {
                     cpu_num_threads,
                 },
             );
-            let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?.into();
+            let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?;
             let synthesizer = Closable::new(synthesizer).into();
             Ok(Self { synthesizer })
         }
@@ -1139,9 +1137,12 @@ mod asyncio {
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
             let model: VoiceModelFile = model.extract()?;
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             pyo3_asyncio::tokio::future_into_py(py, async move {
-                let result = synthesizer.load_voice_model(&*model.model.read()?).await;
+                let result = synthesizer
+                    .read()?
+                    .load_voice_model(&*model.model.read()?)
+                    .await;
                 Python::with_gil(|py| result.into_py_result(py))
             })
         }
@@ -1173,13 +1174,14 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             let kana = kana.to_owned();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
                     let audio_query = synthesizer
+                        .read()?
                         .audio_query_from_kana(&kana, StyleId::new(style_id))
                         .await;
 
@@ -1201,13 +1203,16 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             let text = text.to_owned();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
-                    let audio_query = synthesizer.audio_query(&text, StyleId::new(style_id)).await;
+                    let audio_query = synthesizer
+                        .read()?
+                        .audio_query(&text, StyleId::new(style_id))
+                        .await;
 
                     Python::with_gil(|py| {
                         let audio_query = audio_query.into_py_result(py)?;
@@ -1225,13 +1230,14 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             let kana = kana.to_owned();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
                     let accent_phrases = synthesizer
+                        .read()?
                         .create_accent_phrases_from_kana(&kana, StyleId::new(style_id))
                         .await;
                     Python::with_gil(|py| {
@@ -1254,13 +1260,14 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             let text = text.to_owned();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
                     let accent_phrases = synthesizer
+                        .read()?
                         .create_accent_phrases(&text, StyleId::new(style_id))
                         .await;
                     Python::with_gil(|py| {
@@ -1283,12 +1290,15 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             crate::convert::async_modify_accent_phrases(
                 accent_phrases,
                 StyleId::new(style_id),
                 py,
-                |a, s| async move { synthesizer.replace_mora_data(&a, s).await },
+                |a, s| async move {
+                    let result = synthesizer.read()?.replace_mora_data(&a, s).await;
+                    Python::with_gil(|py| result.into_py_result(py))
+                },
             )
         }
 
@@ -1298,12 +1308,15 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             crate::convert::async_modify_accent_phrases(
                 accent_phrases,
                 StyleId::new(style_id),
                 py,
-                |a, s| async move { synthesizer.replace_phoneme_length(&a, s).await },
+                |a, s| async move {
+                    let result = synthesizer.read()?.replace_phoneme_length(&a, s).await;
+                    Python::with_gil(|py| result.into_py_result(py))
+                },
             )
         }
 
@@ -1313,12 +1326,15 @@ mod asyncio {
             style_id: u32,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             crate::convert::async_modify_accent_phrases(
                 accent_phrases,
                 StyleId::new(style_id),
                 py,
-                |a, s| async move { synthesizer.replace_mora_pitch(&a, s).await },
+                |a, s| async move {
+                    let result = synthesizer.read()?.replace_mora_pitch(&a, s).await;
+                    Python::with_gil(|py| result.into_py_result(py))
+                },
             )
         }
 
@@ -1330,12 +1346,13 @@ mod asyncio {
             enable_interrogative_upspeak: bool,
             py: Python<'py>,
         ) -> PyResult<&'py PyAny> {
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
                     let wav = synthesizer
+                        .read()?
                         .synthesis(
                             &audio_query,
                             StyleId::new(style_id),
@@ -1368,13 +1385,16 @@ mod asyncio {
             let options = TtsOptions {
                 enable_interrogative_upspeak,
             };
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             let kana = kana.to_owned();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
-                    let wav = synthesizer.tts_from_kana(&kana, style_id, &options).await;
+                    let wav = synthesizer
+                        .read()?
+                        .tts_from_kana(&kana, style_id, &options)
+                        .await;
 
                     Python::with_gil(|py| {
                         let wav = wav.into_py_result(py)?;
@@ -1400,13 +1420,13 @@ mod asyncio {
             let options = TtsOptions {
                 enable_interrogative_upspeak,
             };
-            let synthesizer = self.synthesizer.read()?.clone();
+            let synthesizer = self.synthesizer.clone();
             let text = text.to_owned();
             pyo3_asyncio::tokio::future_into_py_with_locals(
                 py,
                 pyo3_asyncio::tokio::get_current_locals(py)?,
                 async move {
-                    let wav = synthesizer.tts(&text, style_id, &options).await;
+                    let wav = synthesizer.read()?.tts(&text, style_id, &options).await;
 
                     Python::with_gil(|py| {
                         let wav = wav.into_py_result(py)?;
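Note on the `load_voice_model` change above: the point is to move the blocking ONNX Runtime `Session` construction off the async executor and onto the thread pool. Below is a minimal, self-contained sketch of that offloading pattern using `tokio::task::spawn_blocking` directly (presumably what `A::unblock` resolves to for the Tokio-backed variant); the `Status` type and `insert_model_blocking` method are illustrative stand-ins, not the crate's actual API.

// Requires the `tokio` crate with the "rt-multi-thread" and "macros" features.
use std::sync::Arc;

// Hypothetical stand-in for the crate's `Status`; not the real API.
struct Status;

impl Status {
    // Pretend this builds an expensive ONNX Runtime `Session` from the bytes.
    fn insert_model_blocking(&self, model_bytes: &[u8]) -> usize {
        model_bytes.len()
    }
}

// Run the heavy, blocking work on the blocking thread pool so the async task
// driving this future is not stalled. This mirrors the shape of the patched
// `load_voice_model`: clone the shared state, move owned values into a
// `'static` closure, then await the handle.
async fn load(status: Arc<Status>, model_bytes: Vec<u8>) -> usize {
    let status = Arc::clone(&status);
    tokio::task::spawn_blocking(move || status.insert_model_blocking(&model_bytes))
        .await
        .expect("the blocking task panicked")
}

#[tokio::main]
async fn main() {
    let n = load(Arc::new(Status), vec![0u8; 1024]).await;
    println!("inserted {n} bytes");
}

As in the patch, the shared state is cloned into an owned `Arc` and moved into the closure, because the closure handed to the thread pool must be `'static` and cannot borrow from the async task.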