
Isolate ONNX Runtime from the model signatures #675

Merged: 48 commits from split-onnxruntime-and-model-signatures were merged into main on Nov 16, 2023.
48 commits (the file diffs below reflect the changes from 9 of them):
5ff2b59  Isolate ONNX Runtime from the model signatures  (qryxip, Nov 5, 2023)
33245ff  Carry `R: InferenceCore` all the way to `SynthesisEngine`  (qryxip, Nov 5, 2023)
4bd2281  Move the implementation of `SupportedDevices::create`  (qryxip, Nov 5, 2023)
f959911  Remove unneeded re-exports  (qryxip, Nov 5, 2023)
55b04d3  Move the definition of `InferenceModels` into `signatures`  (qryxip, Nov 5, 2023)
c38ad99  Make the payload of `ErrorRepr::GetSupportedDevices` an `anyhow::Error`  (qryxip, Nov 6, 2023)
192417f  Introduce enum-map v3.0.0-beta.1 and make things `EnumMap`-driven  (qryxip, Nov 6, 2023)
20db67a  Minor refactor  (qryxip, Nov 6, 2023)
cc84068  Minor refactor  (qryxip, Nov 6, 2023)
e0f29c6  Restructure various parts  (qryxip, Nov 8, 2023)
cb1db34  Fix up  (qryxip, Nov 8, 2023)
c3e08dd  `OnnxruntimeInferenceBuilder` → `OnnxruntimeRunContext`  (qryxip, Nov 8, 2023)
e4b91ab  Extend `impl SupportsInferenceOutput<_> for Onnxruntime`  (qryxip, Nov 8, 2023)
8584d27  `SignatureKind` → `InferenceSignatureKind`  (qryxip, Nov 8, 2023)
4795309  Consistently route access to `LoadedModels` through methods  (qryxip, Nov 8, 2023)
a5dbbdd  Minor refactor  (qryxip, Nov 8, 2023)
525f4b1  `InferenceInput` → `InferenceInputSignature`  (qryxip, Nov 8, 2023)
26476f5  Add cross-references  (qryxip, Nov 8, 2023)
fbd7d1c  Clarify the code around `fn input`  (qryxip, Nov 9, 2023)
8b4f3b6  Treat the kind as belonging to the "model", not the "signature"  (qryxip, Nov 9, 2023)
c40afd5  Say "inference" instead of "model"  (qryxip, Nov 11, 2023)
81b5804  Assume the runtime supports inputs and outputs of arbitrary rank and count  (qryxip, Nov 11, 2023)
120106b  Create voicevox_core_macros and turn the "signatures" implementation into a macro  (qryxip, Nov 11, 2023)
590ce48  `AnyTensor` → `OutputTensor`  (qryxip, Nov 11, 2023)
c4d5ebe  `INFERENCE` → `KIND`  (qryxip, Nov 11, 2023)
1b1b7bf  Move `status` under `infer`  (qryxip, Nov 11, 2023)
c39f48c  Remove `trait RunContext`  (qryxip, Nov 11, 2023)
c316209  Call the "kind" a "group" directly  (qryxip, Nov 11, 2023)
2274a34  Add a runtime signature-checking mechanism  (qryxip, Nov 12, 2023)
b7d48f3  Finish turning the signatures into macros  (qryxip, Nov 13, 2023)
b6db1c0  Minor refactor  (qryxip, Nov 13, 2023)
96a93e9  `InferenceGroup` → `InferenceDomain`  (qryxip, Nov 14, 2023)
59d8779  Minor refactor  (qryxip, Nov 14, 2023)
d0dc56f  Merge `InferenceDomain::{INPUT,OUTPUT}_PARAM_INFOS`  (qryxip, Nov 14, 2023)
c654cd1  Add a docstring to `InferenceDomain::PARAM_INFOS`  (qryxip, Nov 14, 2023)
868d3f6  Add docstrings to voicevox_core_macros  (qryxip, Nov 14, 2023)
0998793  Add a FIXME to `sealed::InputScalar`  (qryxip, Nov 14, 2023)
75fd7ac  Split into "Domain" and "Operation"  (qryxip, Nov 14, 2023)
ad222c9  `InferenceOperationKind` → `InferenceOperationImpl`  (qryxip, Nov 15, 2023)
7005c96  Fix docs  (qryxip, Nov 15, 2023)
9417992  Merge branch 'main' into split-onnxruntime-and-model-signatures  (qryxip, Nov 15, 2023)
1655719  Reword docs: "within voicevox_core" → "within the Rust API crate"  (qryxip, Nov 15, 2023)
a73f22c  Add to the docs  (qryxip, Nov 15, 2023)
f17919b  Add more docs  (qryxip, Nov 15, 2023)
48bdb1b  Write docs for `InferenceDomain`  (qryxip, Nov 16, 2023)
9d7d001  Remove an unneeded sentence  (qryxip, Nov 16, 2023)
af828eb  Minor refactor  (qryxip, Nov 16, 2023)
b6b7975  Confine `ArrayExt` to the macro  (qryxip, Nov 16, 2023)
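Read in sequence, the commits converge on one design: the inference signatures (`InferenceInputSignature`, `OutputTensor`, `InferenceDomain`, `InferenceOperation`) are defined with no reference to any runtime, and ONNX Runtime is reached only through an `InferenceRuntime` trait, with `InferenceRuntimeImpl` selecting the backend. The sketch below is a reconstruction for orientation only: the names come from the commit messages above, and every signature in it is an assumption, not the crate's actual API.

```rust
use std::marker::PhantomData;

/// Stand-in for the crate's device-availability struct.
pub struct SupportedDevices {
    pub cpu: bool,
    pub cuda: bool,
    pub dml: bool,
}

/// The isolation boundary: generic code depends on this trait,
/// never on `onnxruntime` types directly.
pub trait InferenceRuntime: 'static {
    /// Backend-specific state for one inference run (the ONNX Runtime
    /// one is renamed to `OnnxruntimeRunContext` in this PR).
    type RunContext: Default;

    fn supported_devices() -> anyhow::Result<SupportedDevices>;
}

/// A typed input signature for one inference operation; the PR also adds
/// a runtime check that each loaded model actually matches its signature.
pub trait InferenceInputSignature {
    /// The output signature this input produces.
    type Output;
}

/// Engine code is generic over the runtime; a concrete alias
/// (`InferenceRuntimeImpl` in the diffs below) picks the backend.
pub struct SynthesisEngine<R: InferenceRuntime> {
    _backend: PhantomData<R>,
}
```

The payoff is that the engine and the signature definitions compile against the trait alone, so the runtime can be swapped or mocked without touching either side.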
48 changes: 48 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -19,6 +19,7 @@ easy-ext = "1.0.1"
 fs-err = { version = "2.9.0", features = ["tokio"] }
 futures = "0.3.26"
 itertools = "0.10.5"
+ndarray = "0.15.6"
 once_cell = "1.18.0"
 regex = "1.10.0"
 rstest = "0.15.0"
3 changes: 3 additions & 0 deletions crates/voicevox_core/Cargo.toml
@@ -17,11 +17,14 @@ derive-new = "0.5.9"
 derive_more.workspace = true
 duplicate = "1.0.0"
 easy-ext.workspace = true
+educe = "0.4.23"
+enum-map = "3.0.0-beta.1"
 fs-err.workspace = true
 futures.workspace = true
 indexmap = { version = "2.0.0", features = ["serde"] }
 itertools.workspace = true
 nanoid = "0.4.0"
+ndarray.workspace = true
 once_cell.workspace = true
 regex.workspace = true
 serde.workspace = true
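The new dependencies line up with the commit list: enum-map backs the `EnumMap`-driven bookkeeping from commit 192417f, and ndarray supplies runtime-agnostic tensor types. A minimal sketch of the `EnumMap` idea follows; the operation names are assumptions based on VOICEVOX's usual three models, and the file names are purely illustrative.

```rust
use enum_map::{enum_map, Enum, EnumMap};

// Hypothetical operation enum; the real one lives behind the PR's macros.
#[derive(Debug, Enum)]
enum InferenceOperationImpl {
    PredictDuration,
    PredictIntonation,
    Decode,
}

fn model_file_names() -> EnumMap<InferenceOperationImpl, &'static str> {
    // `enum_map!` must cover every variant, so "exactly one entry per
    // inference operation" becomes a compile-time guarantee rather than a
    // runtime invariant.
    enum_map! {
        InferenceOperationImpl::PredictDuration => "predict_duration.onnx",
        InferenceOperationImpl::PredictIntonation => "predict_intonation.onnx",
        InferenceOperationImpl::Decode => "decode.onnx",
    }
}
```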
26 changes: 5 additions & 21 deletions crates/voicevox_core/src/devices.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 
 use super::*;
+use crate::{infer::InferenceRuntime, synthesizer::InferenceRuntimeImpl};
 
 /// Information about the devices available to this library.
 ///
@@ -11,21 +12,21 @@ pub struct SupportedDevices {
     /// Whether the CPU is available.
     ///
     /// Always `true`.
-    cpu: bool,
+    pub cpu: bool,
     /// Whether CUDA is available.
     ///
     /// Corresponds to ONNX Runtime's [CUDA Execution Provider] (`CUDAExecutionProvider`). See
     /// there for the required environment.
     ///
     /// [CUDA Execution Provider]: https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html
-    cuda: bool,
+    pub cuda: bool,
     /// Whether DirectML is available.
     ///
     /// Corresponds to ONNX Runtime's [DirectML Execution Provider] (`DmlExecutionProvider`). See
     /// there for the required environment.
     ///
     /// [DirectML Execution Provider]: https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html
-    dml: bool,
+    pub dml: bool,
 }
 
 impl SupportedDevices {
@@ -42,24 +43,7 @@ impl SupportedDevices {
     /// # Result::<_, anyhow::Error>::Ok(())
     /// ```
     pub fn create() -> Result<Self> {
-        let mut cuda_support = false;
-        let mut dml_support = false;
-        for provider in onnxruntime::session::get_available_providers()
-            .map_err(ErrorRepr::GetSupportedDevices)?
-            .iter()
-        {
-            match provider.as_str() {
-                "CUDAExecutionProvider" => cuda_support = true,
-                "DmlExecutionProvider" => dml_support = true,
-                _ => {}
-            }
-        }
-
-        Ok(SupportedDevices {
-            cpu: true,
-            cuda: cuda_support,
-            dml: dml_support,
-        })
+        <InferenceRuntimeImpl as InferenceRuntime>::supported_devices()
     }
 
     pub fn to_json(&self) -> serde_json::Value {
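The deleted provider scan does not disappear; presumably it moves behind the trait boundary, into the ONNX Runtime backend. A sketch of what that might look like, reusing the deleted logic; the `impl` target and the exact trait signature are assumptions, since the trait itself is not part of this diff.

```rust
// Hypothetical relocation of the provider scan deleted above. `Onnxruntime`,
// `ErrorRepr`, and the crate's `Result` alias are used as in the pre-PR code.
impl InferenceRuntime for Onnxruntime {
    fn supported_devices() -> Result<SupportedDevices> {
        let mut cuda_support = false;
        let mut dml_support = false;
        for provider in onnxruntime::session::get_available_providers()
            // `ErrorRepr::GetSupportedDevices` now stores an `anyhow::Error`
            // (see the error.rs diff below), hence the `.into()`.
            .map_err(|e| ErrorRepr::GetSupportedDevices(e.into()))?
        {
            match provider.as_str() {
                "CUDAExecutionProvider" => cuda_support = true,
                "DmlExecutionProvider" => dml_support = true,
                _ => {}
            }
        }
        Ok(SupportedDevices {
            cpu: true,
            cuda: cuda_support,
            dml: dml_support,
        })
    }
}
```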
52 changes: 19 additions & 33 deletions crates/voicevox_core/src/engine/synthesis_engine.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 use super::full_context_label::Utterance;
 use super::open_jtalk::OpenJtalk;
 use super::*;
+use crate::infer::{InferenceRuntime, Output};
 use crate::numerics::F32Ext as _;
 use crate::InferenceCore;
 
@@ -15,18 +16,19 @@ const MORA_PHONEME_LIST: &[&str] = &[
 ];
 
 #[derive(new)]
-pub struct SynthesisEngine {
-    inference_core: InferenceCore,
+pub(crate) struct SynthesisEngine<R: InferenceRuntime> {
+    inference_core: InferenceCore<R>,
     open_jtalk: Arc<OpenJtalk>,
 }
 
-#[allow(unsafe_code)]
-unsafe impl Send for SynthesisEngine {}
-
-impl SynthesisEngine {
+impl<R> SynthesisEngine<R>
+where
+    R: InferenceRuntime,
+    (Vec<f32>,): Output<R>,
+{
     pub const DEFAULT_SAMPLING_RATE: u32 = 24000;
 
-    pub fn inference_core(&self) -> &InferenceCore {
+    pub fn inference_core(&self) -> &InferenceCore<R> {
         &self.inference_core
     }
 
@@ -123,7 +125,7 @@
         accent_phrases: &[AccentPhraseModel],
         style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
-        let (_, phoneme_data_list) = SynthesisEngine::initial_process(accent_phrases);
+        let (_, phoneme_data_list) = Self::initial_process(accent_phrases);
 
         let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list);
 
@@ -185,36 +187,20 @@
         accent_phrases: &[AccentPhraseModel],
         style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
-        let (_, phoneme_data_list) = SynthesisEngine::initial_process(accent_phrases);
+        let (_, phoneme_data_list) = Self::initial_process(accent_phrases);
 
         let mut base_start_accent_list = vec![0];
         let mut base_end_accent_list = vec![0];
         let mut base_start_accent_phrase_list = vec![0];
         let mut base_end_accent_phrase_list = vec![0];
         for accent_phrase in accent_phrases {
             let mut accent = usize::from(*accent_phrase.accent() != 1);
-            SynthesisEngine::create_one_accent_list(
-                &mut base_start_accent_list,
-                accent_phrase,
-                accent as i32,
-            );
+            Self::create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32);
 
             accent = *accent_phrase.accent() - 1;
-            SynthesisEngine::create_one_accent_list(
-                &mut base_end_accent_list,
-                accent_phrase,
-                accent as i32,
-            );
-            SynthesisEngine::create_one_accent_list(
-                &mut base_start_accent_phrase_list,
-                accent_phrase,
-                0,
-            );
-            SynthesisEngine::create_one_accent_list(
-                &mut base_end_accent_phrase_list,
-                accent_phrase,
-                -1,
-            );
+            Self::create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32);
+            Self::create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0);
+            Self::create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1);
         }
         base_start_accent_list.push(0);
         base_end_accent_list.push(0);
@@ -328,7 +314,7 @@
             query.accent_phrases().clone()
         };
 
-        let (flatten_moras, phoneme_data_list) = SynthesisEngine::initial_process(&accent_phrases);
+        let (flatten_moras, phoneme_data_list) = Self::initial_process(&accent_phrases);
 
         let mut phoneme_length_list = vec![pre_phoneme_length];
         let mut f0_list = vec![0.];
@@ -647,12 +633,12 @@ mod tests {
     use ::test_util::OPEN_JTALK_DIC_DIR;
     use pretty_assertions::assert_eq;
 
-    use crate::*;
+    use crate::{synthesizer::InferenceRuntimeImpl, *};
 
     #[rstest]
     #[tokio::test]
     async fn is_openjtalk_dict_loaded_works() {
-        let core = InferenceCore::new(false, 0).unwrap();
+        let core = InferenceCore::<InferenceRuntimeImpl>::new(false, 0).unwrap();
         let synthesis_engine =
             SynthesisEngine::new(core, OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap().into());
 
@@ -662,7 +648,7 @@ mod tests {
     #[rstest]
     #[tokio::test]
     async fn create_accent_phrases_works() {
-        let core = InferenceCore::new(false, 0).unwrap();
+        let core = InferenceCore::<InferenceRuntimeImpl>::new(false, 0).unwrap();
 
         let model = &VoiceModel::sample().await.unwrap();
         core.load_model(model).await.unwrap();
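The new `where` clause is the interesting part: `SynthesisEngine<R>` does not merely require a runtime, it requires `(Vec<f32>,): Output<R>`, that is, a runtime whose raw outputs can be decoded into the one-element tuple of `f32` samples the engine returns. The `Output` trait itself is not in this diff, so here is a self-contained sketch of how such a bound can work, with all trait shapes assumed.

```rust
// Only the bound `(Vec<f32>,): Output<R>` appears in the diff above;
// everything else here is an assumed shape for illustration.
trait InferenceRuntime {
    /// Raw, type-erased tensor the backend hands back.
    type RawTensor;
}

/// Implemented by output tuples that runtime `R` can decode its results into.
trait Output<R: InferenceRuntime>: Sized {
    fn from_raw(raw: Vec<R::RawTensor>) -> Result<Self, String>;
}

/// A toy runtime whose raw tensors are plain `f32` buffers.
struct VecRuntime;

impl InferenceRuntime for VecRuntime {
    type RawTensor = Vec<f32>;
}

impl Output<VecRuntime> for (Vec<f32>,) {
    fn from_raw(mut raw: Vec<Vec<f32>>) -> Result<Self, String> {
        // Exactly one output tensor is expected for this signature.
        match (raw.pop(), raw.is_empty()) {
            (Some(t), true) => Ok((t,)),
            _ => Err("expected exactly one output tensor".to_owned()),
        }
    }
}
```

Under this reading, the bound simply says: only accept runtimes whose outputs can be decoded into the `(Vec<f32>,)` waveform tuple the engine needs, which also explains why the hand-written `unsafe impl Send` could be dropped in favor of ordinary auto traits on `R`.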
3 changes: 1 addition & 2 deletions crates/voicevox_core/src/error.rs
@@ -2,7 +2,6 @@ use self::engine::{FullContextLabelError, KanaParseError};
 use super::*;
 //use engine::
 use duplicate::duplicate_item;
-use onnxruntime::OrtError;
 use std::path::PathBuf;
 use thiserror::Error;
 use uuid::Uuid;
@@ -65,7 +64,7 @@ pub(crate) enum ErrorRepr {
     LoadModel(#[from] LoadModelError),
 
     #[error("サポートされているデバイス情報取得中にエラーが発生しました")]
-    GetSupportedDevices(#[source] OrtError),
+    GetSupportedDevices(#[source] anyhow::Error),
 
     #[error(
         "`{style_id}`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読\
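Dropping `use onnxruntime::OrtError` is the point of this change: with the payload widened to `anyhow::Error`, the error type no longer names any backend. A small self-contained illustration, with the enum reduced to the one variant and `std::io::Error` standing in for whatever error a backend actually produces.

```rust
use thiserror::Error;

// Reduced illustration of the widened variant; the message is a placeholder.
#[derive(Error, Debug)]
enum ErrorRepr {
    #[error("failed to get supported device information")]
    GetSupportedDevices(#[source] anyhow::Error),
}

fn wrap(backend_err: std::io::Error) -> ErrorRepr {
    // Any `std::error::Error + Send + Sync + 'static` converts into
    // `anyhow::Error`, so error.rs no longer depends on the backend's types.
    ErrorRepr::GetSupportedDevices(backend_err.into())
}
```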