diff --git a/crates/voicevox_core/src/engine/synthesis_engine.rs b/crates/voicevox_core/src/engine/synthesis_engine.rs
index e3fae420d..3c3c02bc7 100644
--- a/crates/voicevox_core/src/engine/synthesis_engine.rs
+++ b/crates/voicevox_core/src/engine/synthesis_engine.rs
@@ -38,7 +38,7 @@ impl SynthesisEngine {
     pub fn create_accent_phrases(
         &mut self,
         text: impl AsRef<str>,
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<AccentPhraseModel>> {
         if text.as_ref().is_empty() {
             return Ok(Vec::new());
@@ -115,7 +115,7 @@ impl SynthesisEngine {
     pub fn replace_mora_data(
         &mut self,
         accent_phrases: &[AccentPhraseModel],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<AccentPhraseModel>> {
         let accent_phrases = self.replace_phoneme_length(accent_phrases, speaker_id)?;
         self.replace_mora_pitch(&accent_phrases, speaker_id)
@@ -124,7 +124,7 @@ impl SynthesisEngine {
     pub fn replace_phoneme_length(
         &mut self,
         accent_phrases: &[AccentPhraseModel],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<AccentPhraseModel>> {
         let (_, phoneme_data_list) = SynthesisEngine::initial_process(accent_phrases);
@@ -136,7 +136,7 @@ impl SynthesisEngine {
             .collect();
         let phoneme_length = self
             .inference_core_mut()
-            .yukarin_s_forward(&phoneme_list_s, speaker_id)?;
+            .predict_duration(&phoneme_list_s, speaker_id)?;

         let mut index = 0;
         let new_accent_phrases = accent_phrases
@@ -188,7 +188,7 @@ impl SynthesisEngine {
     pub fn replace_mora_pitch(
         &mut self,
         accent_phrases: &[AccentPhraseModel],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<AccentPhraseModel>> {
         let (_, phoneme_data_list) = SynthesisEngine::initial_process(accent_phrases);
@@ -250,8 +250,8 @@ impl SynthesisEngine {
             end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]);
         }

-        let mut f0_list = self.inference_core_mut().yukarin_sa_forward(
-            vowel_phoneme_list.len() as i64,
+        let mut f0_list = self.inference_core_mut().predict_intonation(
+            vowel_phoneme_list.len(),
            &vowel_phoneme_list,
            &consonant_phoneme_list,
            &start_accent_list,
@@ -315,7 +315,7 @@ impl SynthesisEngine {
     pub fn synthesis(
         &mut self,
         query: &AudioQueryModel,
-        speaker_id: usize,
+        speaker_id: u32,
         enable_interrogative_upspeak: bool,
     ) -> Result<Vec<f32>> {
         let speed_scale = *query.speed_scale();
@@ -410,7 +410,7 @@ impl SynthesisEngine {
         // Flatten the 2D vector into 1D so the data sits at contiguous addresses
         let flatten_phoneme = phoneme.into_iter().flatten().collect::<Vec<f32>>();

-        self.inference_core_mut().decode_forward(
+        self.inference_core_mut().decode(
             f0.len(),
             OjtPhoneme::num_phoneme(),
             &f0,
@@ -422,7 +422,7 @@ impl SynthesisEngine {
     pub fn synthesis_wave_format(
         &mut self,
         query: &AudioQueryModel,
-        speaker_id: usize,
+        speaker_id: u32,
         enable_interrogative_upspeak: bool,
     ) -> Result<Vec<u8>> {
         let wave = self.synthesis(query, speaker_id, enable_interrogative_upspeak)?;
diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs
index 357127120..c81f4570e 100644
--- a/crates/voicevox_core/src/error.rs
+++ b/crates/voicevox_core/src/error.rs
@@ -39,7 +39,7 @@ pub enum Error {
     UninitializedStatus,

     #[error("{},{0}", base_error_message(VOICEVOX_RESULT_INVALID_SPEAKER_ID))]
-    InvalidSpeakerId { speaker_id: usize },
+    InvalidSpeakerId { speaker_id: u32 },

     #[error("{},{0}", base_error_message(VOICEVOX_RESULT_INVALID_MODEL_INDEX))]
     InvalidModelIndex { model_index: usize },
@@ -58,7 +58,7 @@ pub enum Error {
 }

 fn base_error_message(result_code: VoicevoxResultCode) -> &'static str {
-    let c_message: &'static str = crate::voicevox_error_result_to_message(result_code);
+    let c_message: &'static str = crate::error_result_to_message(result_code);
     &c_message[..(c_message.len() - 1)]
 }
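Note on the error.rs hunk above: the messages returned by the renamed `error_result_to_message` are NUL-terminated for the C API, so `base_error_message` slices off the trailing byte before the text is reused in the thiserror-generated `Display` output. A minimal sketch of that contract (the message text here is hypothetical, not one of the crate's actual messages):

```rust
// Every C-facing message carries a trailing NUL; the Rust-side error text drops it.
const MSG: &str = "invalid speaker id\0"; // hypothetical message

fn base_error_message(c_message: &'static str) -> &'static str {
    &c_message[..c_message.len() - 1] // strip the '\0'
}

fn main() {
    assert_eq!("invalid speaker id", base_error_message(MSG));
}
```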
diff --git a/crates/voicevox_core/src/include_models.rs b/crates/voicevox_core/src/include_models.rs
index 7dbb8f01c..f9c06ecc8 100644
--- a/crates/voicevox_core/src/include_models.rs
+++ b/crates/voicevox_core/src/include_models.rs
@@ -1,12 +1,12 @@
 [
     Model{
-        yukarin_s_model: include_bytes!(concat!(
+        predict_duration_model: include_bytes!(concat!(
             env!("CARGO_WORKSPACE_DIR"),
-            "/model/yukarin_s.onnx"
+            "/model/predict_duration.onnx"
         )),
-        yukarin_sa_model: include_bytes!(concat!(
+        predict_intonation_model: include_bytes!(concat!(
             env!("CARGO_WORKSPACE_DIR"),
-            "/model/yukarin_sa.onnx"
+            "/model/predict_intonation.onnx"
         )),
         decode_model: include_bytes!(concat!(
             env!("CARGO_WORKSPACE_DIR"),
diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs
index b11c8e294..42514d15d 100644
--- a/crates/voicevox_core/src/publish.rs
+++ b/crates/voicevox_core/src/publish.rs
@@ -6,20 +6,21 @@ use onnxruntime::{
     session::{AnyArray, NdArray},
 };
 use result_code::VoicevoxResultCode;
-use std::collections::BTreeMap;
 use std::ffi::CStr;
 use std::sync::Mutex;
+use std::{collections::BTreeMap, path::PathBuf};

 use status::*;
 use std::ffi::CString;

 const PHONEME_LENGTH_MINIMAL: f32 = 0.01;

-static SPEAKER_ID_MAP: Lazy<BTreeMap<usize, (usize, usize)>> =
+static SPEAKER_ID_MAP: Lazy<BTreeMap<u32, (usize, u32)>> =
     Lazy::new(|| include!("include_speaker_id_map.rs").into_iter().collect());

 pub struct VoicevoxCore {
     synthesis_engine: SynthesisEngine,
+    use_gpu: bool,
 }

 impl VoicevoxCore {
@@ -29,29 +30,51 @@ impl VoicevoxCore {
                 InferenceCore::new(false, None),
                 OpenJtalk::initialize(),
             ),
+            use_gpu: false,
         })
     }

-    pub fn initialize(
-        &mut self,
-        use_gpu: bool,
-        cpu_num_threads: usize,
-        load_all_models: bool,
-    ) -> Result<()> {
+    pub fn initialize(&mut self, options: InitializeOptions) -> Result<()> {
+        let use_gpu = match options.acceleration_mode {
+            AccelerationMode::Auto => {
+                let supported_devices = SupportedDevices::get_supported_devices()?;
+
+                cfg_if! {
+                    if #[cfg(feature = "directml")] {
+                        *supported_devices.dml()
+                    } else {
+                        *supported_devices.cuda()
+                    }
+                }
+            }
+            AccelerationMode::Cpu => false,
+            AccelerationMode::Gpu => true,
+        };
+        self.use_gpu = use_gpu;
         self.synthesis_engine.inference_core_mut().initialize(
             use_gpu,
-            cpu_num_threads,
-            load_all_models,
-        )
+            options.cpu_num_threads,
+            options.load_all_models,
+        )?;
+        if let Some(open_jtalk_dict_dir) = options.open_jtalk_dict_dir {
+            self.synthesis_engine
+                .load_openjtalk_dict(open_jtalk_dict_dir)?;
+        }
+        Ok(())
     }

-    pub fn load_model(&mut self, speaker_id: usize) -> Result<()> {
+    pub fn is_gpu_mode(&self) -> bool {
+        self.use_gpu
+    }
+
+    pub fn load_model(&mut self, speaker_id: u32) -> Result<()> {
         self.synthesis_engine
             .inference_core_mut()
             .load_model(speaker_id)
     }

-    pub fn is_model_loaded(&self, speaker_id: usize) -> bool {
+    pub fn is_model_loaded(&self, speaker_id: u32) -> bool {
         self.synthesis_engine
             .inference_core()
             .is_model_loaded(speaker_id)
@@ -61,39 +84,35 @@ impl VoicevoxCore {
         self.synthesis_engine.inference_core_mut().finalize()
     }

-    pub fn metas(&self) -> &'static CStr {
+    pub fn get_metas_json(&self) -> &'static CStr {
         &METAS_CSTRING
     }

-    pub fn supported_devices(&self) -> &'static CStr {
+    pub fn get_supported_devices_json(&self) -> &'static CStr {
         &SUPPORTED_DEVICES_CSTRING
     }

-    pub fn yukarin_s_forward(
-        &mut self,
-        phoneme_list: &[i64],
-        speaker_id: usize,
-    ) -> Result<Vec<f32>> {
+    pub fn predict_duration(&mut self, phoneme_list: &[i64], speaker_id: u32) -> Result<Vec<f32>> {
         self.synthesis_engine
             .inference_core_mut()
-            .yukarin_s_forward(phoneme_list, speaker_id)
+            .predict_duration(phoneme_list, speaker_id)
     }

     #[allow(clippy::too_many_arguments)]
-    pub fn yukarin_sa_forward(
+    pub fn predict_intonation(
         &mut self,
-        length: i64,
+        length: usize,
         vowel_phoneme_list: &[i64],
         consonant_phoneme_list: &[i64],
         start_accent_list: &[i64],
         end_accent_list: &[i64],
         start_accent_phrase_list: &[i64],
         end_accent_phrase_list: &[i64],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<f32>> {
         self.synthesis_engine
             .inference_core_mut()
-            .yukarin_sa_forward(
+            .predict_intonation(
                 length,
                 vowel_phoneme_list,
                 consonant_phoneme_list,
@@ -105,15 +124,15 @@ impl VoicevoxCore {
             )
     }

-    pub fn decode_forward(
+    pub fn decode(
         &mut self,
         length: usize,
         phoneme_size: usize,
         f0: &[f32],
         phoneme: &[f32],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<f32>> {
-        self.synthesis_engine.inference_core_mut().decode_forward(
+        self.synthesis_engine.inference_core_mut().decode(
             length,
             phoneme_size,
             f0,
@@ -122,21 +141,21 @@ impl VoicevoxCore {
         )
     }

-    pub fn voicevox_load_openjtalk_dict(&mut self, dict_path: &str) -> Result<()> {
-        self.synthesis_engine.load_openjtalk_dict(dict_path)
-    }
-
-    pub fn voicevox_audio_query(
+    pub fn audio_query(
         &mut self,
         text: &str,
-        speaker_id: usize,
+        speaker_id: u32,
+        options: AudioQueryOptions,
     ) -> Result<AudioQueryModel> {
         if !self.synthesis_engine.is_openjtalk_dict_loaded() {
             return Err(Error::NotLoadedOpenjtalkDict);
         }
-        let accent_phrases = self
-            .synthesis_engine
-            .create_accent_phrases(text, speaker_id)?;
+        let accent_phrases = if options.kana {
+            parse_kana(text)?
+        } else {
+            self.synthesis_engine
+                .create_accent_phrases(text, speaker_id)?
+        };

         Ok(AudioQueryModel::new(
             accent_phrases,
@@ -152,47 +171,75 @@ impl VoicevoxCore {
         ))
     }

-    pub fn voicevox_audio_query_from_kana(
+    pub fn synthesis(
         &mut self,
-        text: &str,
-        speaker_id: usize,
-    ) -> Result<AudioQueryModel> {
-        let accent_phrases = parse_kana(text)?;
-        let accent_phrases = self
-            .synthesis_engine
-            .replace_mora_data(&accent_phrases, speaker_id)?;
+        audio_query: &AudioQueryModel,
+        speaker_id: u32,
+        options: SynthesisOptions,
+    ) -> Result<Vec<u8>> {
+        self.synthesis_engine.synthesis_wave_format(
+            audio_query,
+            speaker_id,
+            options.enable_interrogative_upspeak,
+        )
+    }

-        Ok(AudioQueryModel::new(
-            accent_phrases,
-            1.,
-            0.,
-            1.,
-            1.,
-            0.1,
-            0.1,
-            SynthesisEngine::DEFAULT_SAMPLING_RATE,
-            false,
-            "".into(),
-        ))
+    pub fn tts(&mut self, text: &str, speaker_id: u32, options: TtsOptions) -> Result<Vec<u8>> {
+        let audio_query = &self.audio_query(text, speaker_id, AudioQueryOptions::from(&options))?;
+        self.synthesis(audio_query, speaker_id, SynthesisOptions::from(&options))
     }
+}

-    pub fn voicevox_synthesis(
-        &mut self,
-        audio_query: &AudioQueryModel,
-        speaker_id: usize,
-    ) -> Result<Vec<u8>> {
-        self.synthesis_engine
-            .synthesis_wave_format(audio_query, speaker_id, true) // TODO: make interrogative upspeak configurable
+#[derive(Default)]
+pub struct AudioQueryOptions {
+    pub kana: bool,
+}
+
+impl From<&TtsOptions> for AudioQueryOptions {
+    fn from(options: &TtsOptions) -> Self {
+        Self { kana: options.kana }
     }
+}
+
+#[derive(Default, Debug, PartialEq, Eq)]
+pub enum AccelerationMode {
+    #[default]
+    Auto,
+    Cpu,
+    Gpu,
+}
+
+#[derive(Default)]
+pub struct InitializeOptions {
+    pub acceleration_mode: AccelerationMode,
+    pub cpu_num_threads: u16,
+    pub load_all_models: bool,
+    pub open_jtalk_dict_dir: Option<PathBuf>,
+}

-    pub fn voicevox_tts(&mut self, text: &str, speaker_id: usize) -> Result<Vec<u8>> {
-        let audio_query = &self.voicevox_audio_query(text, speaker_id)?;
-        self.voicevox_synthesis(audio_query, speaker_id)
+pub struct SynthesisOptions {
+    pub enable_interrogative_upspeak: bool,
+}
+
+impl From<&TtsOptions> for SynthesisOptions {
+    fn from(options: &TtsOptions) -> Self {
+        Self {
+            enable_interrogative_upspeak: options.enable_interrogative_upspeak,
+        }
     }
+}

-    pub fn voicevox_tts_from_kana(&mut self, text: &str, speaker_id: usize) -> Result<Vec<u8>> {
-        let audio_query = &self.voicevox_audio_query_from_kana(text, speaker_id)?;
-        self.voicevox_synthesis(audio_query, speaker_id)
+pub struct TtsOptions {
+    pub kana: bool,
+    pub enable_interrogative_upspeak: bool,
+}
+
+impl Default for TtsOptions {
+    fn default() -> Self {
+        Self {
+            enable_interrogative_upspeak: true,
+            kana: Default::default(),
+        }
     }
 }

@@ -206,7 +253,7 @@ impl InferenceCore {
     pub fn initialize(
         &mut self,
         use_gpu: bool,
-        cpu_num_threads: usize,
+        cpu_num_threads: u16,
         load_all_models: bool,
     ) -> Result<()> {
         self.initialized = false;
@@ -239,7 +286,7 @@ impl InferenceCore {
             }
         }
     }
-    pub fn load_model(&mut self, speaker_id: usize) -> Result<()> {
+    pub fn load_model(&mut self, speaker_id: u32) -> Result<()> {
         if self.initialized {
             let status = self
                 .status_option
@@ -254,7 +301,7 @@ impl InferenceCore {
             Err(Error::UninitializedStatus)
         }
     }
-    pub fn is_model_loaded(&self, speaker_id: usize) -> bool {
+    pub fn is_model_loaded(&self, speaker_id: u32) -> bool {
         if let Some(status) = self.status_option.as_ref() {
             if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
                 status.is_model_loaded(model_index)
@@ -270,11 +317,7 @@ impl InferenceCore {
         self.status_option = None;
     }

-    pub fn yukarin_s_forward(
-        &mut self,
-        phoneme_list: &[i64],
-        speaker_id: usize,
-    ) -> Result<Vec<f32>> {
+    pub fn predict_duration(&mut self, phoneme_list: &[i64], speaker_id: u32) -> Result<Vec<f32>> {
         if !self.initialized {
             return Err(Error::UninitializedStatus);
         }
@@ -305,7 +348,7 @@ impl InferenceCore {
         let input_tensors: Vec<&mut dyn AnyArray> =
             vec![&mut phoneme_list_array, &mut speaker_id_array];

-        let mut output = status.yukarin_s_session_run(model_index, input_tensors)?;
+        let mut output = status.predict_duration_session_run(model_index, input_tensors)?;

         for output_item in output.iter_mut() {
             if *output_item < PHONEME_LENGTH_MINIMAL {
@@ -317,16 +360,16 @@ impl InferenceCore {
     }

     #[allow(clippy::too_many_arguments)]
-    pub fn yukarin_sa_forward(
+    pub fn predict_intonation(
         &mut self,
-        length: i64,
+        length: usize,
         vowel_phoneme_list: &[i64],
         consonant_phoneme_list: &[i64],
         start_accent_list: &[i64],
         end_accent_list: &[i64],
         start_accent_phrase_list: &[i64],
         end_accent_phrase_list: &[i64],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<f32>> {
         if !self.initialized {
             return Err(Error::UninitializedStatus);
@@ -352,7 +395,7 @@ impl InferenceCore {
             return Err(Error::InvalidModelIndex { model_index });
         }

-        let mut length_array = NdArray::new(ndarray::arr0(length));
+        let mut length_array = NdArray::new(ndarray::arr0(length as i64));
         let mut vowel_phoneme_list_array = NdArray::new(ndarray::arr1(vowel_phoneme_list));
         let mut consonant_phoneme_list_array = NdArray::new(ndarray::arr1(consonant_phoneme_list));
         let mut start_accent_list_array = NdArray::new(ndarray::arr1(start_accent_list));
@@ -373,16 +416,16 @@ impl InferenceCore {
             &mut speaker_id_array,
         ];

-        status.yukarin_sa_session_run(model_index, input_tensors)
+        status.predict_intonation_session_run(model_index, input_tensors)
     }

-    pub fn decode_forward(
+    pub fn decode(
         &mut self,
         length: usize,
         phoneme_size: usize,
         f0: &[f32],
         phoneme: &[f32],
-        speaker_id: usize,
+        speaker_id: u32,
     ) -> Result<Vec<f32>> {
         if !self.initialized {
             return Err(Error::UninitializedStatus);
@@ -502,11 +545,11 @@ static SUPPORTED_DEVICES_CSTRING: Lazy<CString> = Lazy::new(|| {
         .unwrap()
 });

-fn get_model_index_and_speaker_id(speaker_id: usize) -> Option<(usize, usize)> {
+fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
     SPEAKER_ID_MAP.get(&speaker_id).copied()
 }

-pub const fn voicevox_error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
+pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
     // Because these are for the C API, every message must end with a NUL character
     use VoicevoxResultCode::*;
     match result_code {
@@ -547,7 +590,10 @@ mod tests {
     #[rstest]
     fn finalize_works() {
         let internal = VoicevoxCore::new_with_mutex();
-        let result = internal.lock().unwrap().initialize(false, 0, false);
+        let result = internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions::default());
         assert_eq!(Ok(()), result);
         internal.lock().unwrap().finalize();
         assert_eq!(
@@ -576,7 +622,7 @@ mod tests {
     #[case(1, Err(Error::UninitializedStatus), Ok(()))]
     #[case(999, Err(Error::UninitializedStatus), Err(Error::InvalidSpeakerId{speaker_id:999}))]
     fn load_model_works(
-        #[case] speaker_id: usize,
+        #[case] speaker_id: u32,
         #[case] expected_result_at_uninitialized: Result<()>,
         #[case] expected_result_at_initialized: Result<()>,
     ) {
@@ -587,7 +633,10 @@ mod tests {
         internal
             .lock()
             .unwrap()
-            .initialize(false, 0, false)
+            .initialize(InitializeOptions {
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
             .unwrap();
         let result = internal.lock().unwrap().load_model(speaker_id);
         assert_eq!(
@@ -596,11 +645,26 @@ mod tests {
         );
     }

+    #[rstest]
+    fn is_use_gpu_works() {
+        let internal = VoicevoxCore::new_with_mutex();
+        assert_eq!(false, internal.lock().unwrap().is_gpu_mode());
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
+            .unwrap();
+        assert_eq!(false, internal.lock().unwrap().is_gpu_mode());
+    }
+
     #[rstest]
     #[case(0, true)]
     #[case(1, true)]
     #[case(999, false)]
-    fn is_model_loaded_works(#[case] speaker_id: usize, #[case] expected: bool) {
+    fn is_model_loaded_works(#[case] speaker_id: u32, #[case] expected: bool) {
         let internal = VoicevoxCore::new_with_mutex();
         assert!(
             !internal.lock().unwrap().is_model_loaded(speaker_id),
@@ -610,7 +674,10 @@ mod tests {
         internal
             .lock()
             .unwrap()
-            .initialize(false, 0, false)
+            .initialize(InitializeOptions {
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
             .unwrap();
         assert!(
             !internal.lock().unwrap().is_model_loaded(speaker_id),
@@ -635,7 +702,7 @@ mod tests {
     #[rstest]
     fn supported_devices_works() {
         let internal = VoicevoxCore::new_with_mutex();
-        let cstr_result = internal.lock().unwrap().supported_devices();
+        let cstr_result = internal.lock().unwrap().get_supported_devices_json();
         assert!(cstr_result.to_str().is_ok(), "{:?}", cstr_result);

         let json_result: std::result::Result<serde_json::Value, _> =
@@ -648,17 +715,25 @@ mod tests {
     #[case(1, Some((0,1)))]
     #[case(999, None)]
     fn get_model_index_and_speaker_id_works(
-        #[case] speaker_id: usize,
-        #[case] expected: Option<(usize, usize)>,
+        #[case] speaker_id: u32,
+        #[case] expected: Option<(usize, u32)>,
     ) {
         let actual = get_model_index_and_speaker_id(speaker_id);
         assert_eq!(expected, actual);
     }

     #[rstest]
-    fn yukarin_s_forward_works() {
+    fn predict_duration_works() {
         let internal = VoicevoxCore::new_with_mutex();
-        internal.lock().unwrap().initialize(false, 0, true).unwrap();
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                load_all_models: true,
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
+            .unwrap();

         // phoneme_list obtained by converting the sentence 「こんにちは、音声合成の世界へようこそ」
         let phoneme_list = [
             0, 23, 30, 4, 28, 21, 10, 21, 42, 7, 0, 30, 4, 35, 14, 14, 16, 30, 30, 35, 14, 14, 28,
             30, 35, 14, 23, 7, 21, 14, 43, 30, 30, 23, 30, 35, 30, 0,
         ];

-        let result = internal.lock().unwrap().yukarin_s_forward(&phoneme_list, 0);
+        let result = internal.lock().unwrap().predict_duration(&phoneme_list, 0);

         assert!(result.is_ok(), "{:?}", result);
         assert_eq!(result.unwrap().len(), phoneme_list.len());
     }

     #[rstest]
-    fn yukarin_sa_forward_works() {
+    fn predict_intonation_works() {
         let internal = VoicevoxCore::new_with_mutex();
-        internal.lock().unwrap().initialize(false, 0, true).unwrap();
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                load_all_models: true,
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
+            .unwrap();

         // Input corresponding to the sentence 「テスト」
         let vowel_phoneme_list = [0, 14, 6, 30, 0];
         let consonant_phoneme_list = [-1, 26, 35, 26, -1];
         let start_accent_list = [0, 1, 0, 0, 0];
         let end_accent_list = [0, 1, 0, 0, 0];
         let start_accent_phrase_list = [0, 1, 0, 0, 0];
         let end_accent_phrase_list = [0, 0, 0, 1, 0];

-        let result = internal.lock().unwrap().yukarin_sa_forward(
-            vowel_phoneme_list.len() as i64,
+        let result = internal.lock().unwrap().predict_intonation(
+            vowel_phoneme_list.len(),
             &vowel_phoneme_list,
             &consonant_phoneme_list,
             &start_accent_list,
@@ -701,9 +784,17 @@ mod tests {
     }

     #[rstest]
-    fn decode_forward_works() {
+    fn decode_works() {
         let internal = VoicevoxCore::new_with_mutex();
-        internal.lock().unwrap().initialize(false, 0, true).unwrap();
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                acceleration_mode: AccelerationMode::Cpu,
+                load_all_models: true,
+                ..Default::default()
+            })
+            .unwrap();

         // Input corresponding to the sentence 「テスト」
         const F0_LENGTH: usize = 69;
@@ -727,25 +818,12 @@ mod tests {
         set_one(30, 45..60);
         set_one(0, 60..69);

-        let result =
-            internal
-                .lock()
-                .unwrap()
-                .decode_forward(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, 0);
-
-        assert!(result.is_ok(), "{:?}", result);
-        assert_eq!(result.unwrap().len(), F0_LENGTH * 256);
-    }
-
-    #[rstest]
-    #[async_std::test]
-    async fn voicevox_load_openjtalk_dict_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        let open_jtalk_dic_dir = download_open_jtalk_dict_if_no_exists().await;
         let result = internal
             .lock()
             .unwrap()
-            .voicevox_load_openjtalk_dict(open_jtalk_dic_dir.to_str().unwrap());
-        assert_eq!(result, Ok(()));
+            .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, 0);
+
+        assert!(result.is_ok(), "{:?}", result);
+        assert_eq!(result.unwrap().len(), F0_LENGTH * 256);
     }
 }
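Taken together, the publish.rs changes replace the positional `initialize(use_gpu, cpu_num_threads, load_all_models)` flags with an `InitializeOptions` struct and fold the Open JTalk dictionary load into initialization. A sketch of the resulting call pattern from the Rust side, assuming the crate exposes `VoicevoxCore`, `InitializeOptions`, `AccelerationMode`, and `TtsOptions` as the tests above use them (the dictionary path is a placeholder):

```rust
use std::path::PathBuf;
use voicevox_core::{AccelerationMode, InitializeOptions, TtsOptions, VoicevoxCore};

fn main() {
    let internal = VoicevoxCore::new_with_mutex();
    internal
        .lock()
        .unwrap()
        .initialize(InitializeOptions {
            acceleration_mode: AccelerationMode::Cpu,
            load_all_models: true,
            // Passing the dictionary dir here replaces voicevox_load_openjtalk_dict.
            open_jtalk_dict_dir: Some(PathBuf::from("./open_jtalk_dic")), // placeholder path
            ..Default::default()
        })
        .expect("initialize failed");

    // TtsOptions::default() keeps kana = false and enable_interrogative_upspeak = true,
    // matching the behavior the old voicevox_synthesis hard-coded.
    let wav = internal
        .lock()
        .unwrap()
        .tts("こんにちは", 0, TtsOptions::default())
        .expect("tts failed");
    assert!(!wav.is_empty());
}
```

With `AccelerationMode::Auto` (the default), `initialize` now probes `SupportedDevices` and enables the GPU only when CUDA (or DirectML, under the `directml` feature) is reported as available.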
diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
index f87327efa..2c2865d79 100644
--- a/crates/voicevox_core/src/status.rs
+++ b/crates/voicevox_core/src/status.rs
@@ -18,24 +18,24 @@ pub struct Status {
     models: StatusModels,
     light_session_options: SessionOptions, // lightweight models use these options
     heavy_session_options: SessionOptions, // heavy models use these options
-    supported_styles: BTreeSet<usize>,
+    supported_styles: BTreeSet<u32>,
 }

 struct StatusModels {
-    yukarin_s: BTreeMap<usize, Session<'static>>,
-    yukarin_sa: BTreeMap<usize, Session<'static>>,
+    predict_duration: BTreeMap<usize, Session<'static>>,
+    predict_intonation: BTreeMap<usize, Session<'static>>,
     decode: BTreeMap<usize, Session<'static>>,
 }

 #[derive(new, Getters)]
 struct SessionOptions {
-    cpu_num_threads: usize,
+    cpu_num_threads: u16,
     use_gpu: bool,
 }

 struct Model {
-    yukarin_s_model: &'static [u8],
-    yukarin_sa_model: &'static [u8],
+    predict_duration_model: &'static [u8],
+    predict_intonation_model: &'static [u8],
     decode_model: &'static [u8],
 }

@@ -106,11 +106,11 @@ impl Status {
     pub const MODELS_COUNT: usize = Self::MODELS.len();

-    pub fn new(use_gpu: bool, cpu_num_threads: usize) -> Self {
+    pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self {
         Self {
             models: StatusModels {
-                yukarin_s: BTreeMap::new(),
-                yukarin_sa: BTreeMap::new(),
+                predict_duration: BTreeMap::new(),
+                predict_intonation: BTreeMap::new(),
                 decode: BTreeMap::new(),
             },
             light_session_options: SessionOptions::new(cpu_num_threads, false),
@@ -125,7 +125,7 @@ impl Status {

         for meta in metas.iter() {
             for style in meta.styles().iter() {
-                self.supported_styles.insert(*style.id() as usize);
+                self.supported_styles.insert(*style.id() as u32);
             }
         }

@@ -135,20 +135,22 @@ impl Status {
     pub fn load_model(&mut self, model_index: usize) -> Result<()> {
         if model_index < Self::MODELS.len() {
             let model = &Self::MODELS[model_index];
-            let yukarin_s_session = self
-                .new_session(model.yukarin_s_model, &self.light_session_options)
+            let predict_duration_session = self
+                .new_session(model.predict_duration_model, &self.light_session_options)
                 .map_err(Error::LoadModel)?;
-            let yukarin_sa_session = self
-                .new_session(model.yukarin_sa_model, &self.light_session_options)
+            let predict_intonation_session = self
+                .new_session(model.predict_intonation_model, &self.light_session_options)
                 .map_err(Error::LoadModel)?;
             let decode_model = self
                 .new_session(model.decode_model, &self.heavy_session_options)
                 .map_err(Error::LoadModel)?;

-            self.models.yukarin_s.insert(model_index, yukarin_s_session);
             self.models
-                .yukarin_sa
-                .insert(model_index, yukarin_sa_session);
+                .predict_duration
+                .insert(model_index, predict_duration_session);
+            self.models
+                .predict_intonation
+                .insert(model_index, predict_intonation_session);

             self.models.decode.insert(model_index, decode_model);

@@ -159,8 +161,8 @@ impl Status {
     }

     pub fn is_model_loaded(&self, model_index: usize) -> bool {
-        self.models.yukarin_sa.contains_key(&model_index)
-            && self.models.yukarin_s.contains_key(&model_index)
+        self.models.predict_intonation.contains_key(&model_index)
+            && self.models.predict_duration.contains_key(&model_index)
             && self.models.decode.contains_key(&model_index)
     }

@@ -194,16 +196,16 @@ impl Status {
         Ok(session_builder.with_model_from_memory(model_bytes)?)
     }

-    pub fn validate_speaker_id(&self, speaker_id: usize) -> bool {
+    pub fn validate_speaker_id(&self, speaker_id: u32) -> bool {
         self.supported_styles.contains(&speaker_id)
     }

-    pub fn yukarin_s_session_run(
+    pub fn predict_duration_session_run(
         &mut self,
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.yukarin_s.get_mut(&model_index) {
+        if let Some(model) = self.models.predict_duration.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
@@ -214,12 +216,12 @@ impl Status {
         }
     }

-    pub fn yukarin_sa_session_run(
+    pub fn predict_intonation_session_run(
         &mut self,
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.yukarin_sa.get_mut(&model_index) {
+        if let Some(model) = self.models.predict_intonation.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
@@ -261,7 +263,7 @@ mod tests {
     #[case(false, 4)]
     #[case(false, 8)]
     #[case(false, 0)]
-    fn status_new_works(#[case] use_gpu: bool, #[case] cpu_num_threads: usize) {
+    fn status_new_works(#[case] use_gpu: bool, #[case] cpu_num_threads: u16) {
         let status = Status::new(use_gpu, cpu_num_threads);
         assert_eq!(false, status.light_session_options.use_gpu);
         assert_eq!(use_gpu, status.heavy_session_options.use_gpu);
@@ -273,8 +275,8 @@ mod tests {
             cpu_num_threads,
             status.heavy_session_options.cpu_num_threads
         );
-        assert!(status.models.yukarin_s.is_empty());
-        assert!(status.models.yukarin_sa.is_empty());
+        assert!(status.models.predict_duration.is_empty());
+        assert!(status.models.predict_intonation.is_empty());
         assert!(status.models.decode.is_empty());
         assert!(status.supported_styles.is_empty());
     }

@@ -302,8 +304,8 @@ mod tests {
         let mut status = Status::new(false, 0);
         let result = status.load_model(0);
         assert_eq!(Ok(()), result);
-        assert_eq!(1, status.models.yukarin_s.len());
-        assert_eq!(1, status.models.yukarin_sa.len());
+        assert_eq!(1, status.models.predict_duration.len());
+        assert_eq!(1, status.models.predict_intonation.len());
         assert_eq!(1, status.models.decode.len());
     }
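The status.rs rename also preserves an existing design point worth noting: the `predict_duration` and `predict_intonation` sessions are built with `light_session_options` (GPU always off), while only the decode session follows the caller's `use_gpu`. A condensed sketch of just that split, reduced to the two fields involved (an assumption-free mirror of `Status::new` above, nothing more):

```rust
// Light models (duration/intonation) always run on CPU; only the heavy
// decode model is placed on the GPU when use_gpu is set.
struct SessionOptions {
    cpu_num_threads: u16,
    use_gpu: bool,
}

fn session_options(use_gpu: bool, cpu_num_threads: u16) -> (SessionOptions, SessionOptions) {
    let light = SessionOptions { cpu_num_threads, use_gpu: false };
    let heavy = SessionOptions { cpu_num_threads, use_gpu };
    (light, heavy)
}

fn main() {
    let (light, heavy) = session_options(true, 4);
    assert!(!light.use_gpu);
    assert!(heavy.use_gpu);
    assert_eq!(light.cpu_num_threads, heavy.cpu_num_threads);
}
```

This is also exactly what `status_new_works` asserts: `light_session_options.use_gpu` is unconditionally `false`.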
diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs
new file mode 100644
index 000000000..f8f045cf7
--- /dev/null
+++ b/crates/voicevox_core_c_api/src/helpers.rs
@@ -0,0 +1,195 @@
+use super::*;
+
+pub(crate) fn convert_result<T>(result: Result<T>) -> (Option<T>, VoicevoxResultCode) {
+    match result {
+        Ok(target) => (Some(target), VoicevoxResultCode::VOICEVOX_RESULT_SUCCEED),
+        Err(err) => {
+            eprintln!("{}", err);
+            dbg!(&err);
+            match err {
+                Error::NotLoadedOpenjtalkDict => (
+                    None,
+                    VoicevoxResultCode::VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT,
+                ),
+                Error::CantGpuSupport => {
+                    (None, VoicevoxResultCode::VOICEVOX_RESULT_CANT_GPU_SUPPORT)
+                }
+                Error::LoadModel(_) => {
+                    (None, VoicevoxResultCode::VOICEVOX_RESULT_FAILED_LOAD_MODEL)
+                }
+                Error::LoadMetas(_) => {
+                    (None, VoicevoxResultCode::VOICEVOX_RESULT_FAILED_LOAD_METAS)
+                }
+                Error::GetSupportedDevices(_) => (
+                    None,
+                    VoicevoxResultCode::VOICEVOX_RESULT_FAILED_GET_SUPPORTED_DEVICES,
+                ),
+                Error::UninitializedStatus => (
+                    None,
+                    VoicevoxResultCode::VOICEVOX_RESULT_UNINITIALIZED_STATUS,
+                ),
+                Error::InvalidSpeakerId { .. } => {
+                    (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_SPEAKER_ID)
+                }
+                Error::InvalidModelIndex { .. } => (
+                    None,
+                    VoicevoxResultCode::VOICEVOX_RESULT_INVALID_MODEL_INDEX,
+                ),
+                Error::InferenceFailed => {
+                    (None, VoicevoxResultCode::VOICEVOX_RESULT_INFERENCE_FAILED)
+                }
+                Error::FailedExtractFullContextLabel(_) => (
+                    None,
+                    VoicevoxResultCode::VOICEVOX_RESULT_FAILED_EXTRACT_FULL_CONTEXT_LABEL,
+                ),
+                Error::FailedParseKana(_) => {
+                    (None, VoicevoxResultCode::VOICEVOX_RESULT_FAILED_PARSE_KANA)
+                }
+            }
+        }
+    }
+}
+
+pub(crate) fn create_audio_query(
+    japanese_or_kana: &CStr,
+    speaker_id: u32,
+    method: fn(
+        &mut Internal,
+        &str,
+        u32,
+        voicevox_core::AudioQueryOptions,
+    ) -> Result<AudioQueryModel>,
+    options: VoicevoxAudioQueryOptions,
+) -> std::result::Result<CString, VoicevoxResultCode> {
+    let japanese_or_kana = ensure_utf8(japanese_or_kana)?;
+
+    let (audio_query, result_code) = convert_result(method(
+        &mut lock_internal(),
+        japanese_or_kana,
+        speaker_id,
+        options.into(),
+    ));
+    let audio_query = audio_query.ok_or(result_code)?;
+    Ok(CString::new(audio_query_model_to_json(&audio_query)).expect("should not contain '\\0'"))
+}
+
+fn audio_query_model_to_json(audio_query_model: &AudioQueryModel) -> String {
+    serde_json::to_string(audio_query_model).expect("should be always valid")
+}
+
+pub(crate) unsafe fn write_json_to_ptr(output_ptr: *mut *mut c_char, json: &CStr) {
+    let n = json.to_bytes_with_nul().len();
+    let json_heap = libc::malloc(n);
+    libc::memcpy(json_heap, json.as_ptr() as *const c_void, n);
+    output_ptr.write(json_heap as *mut c_char);
+}
+
+pub(crate) unsafe fn write_wav_to_ptr(
+    output_wav_ptr: *mut *mut u8,
+    output_size_ptr: *mut usize,
+    data: &[u8],
+) {
+    output_size_ptr.write(data.len());
+    let wav_heap = libc::malloc(data.len());
+    libc::memcpy(wav_heap, data.as_ptr() as *const c_void, data.len());
+    output_wav_ptr.write(wav_heap as *mut u8);
+}
+
+pub(crate) fn ensure_utf8(s: &CStr) -> std::result::Result<&str, VoicevoxResultCode> {
+    s.to_str()
+        .map_err(|_| VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
+}
+
+impl From<voicevox_core::AudioQueryOptions> for VoicevoxAudioQueryOptions {
+    fn from(options: voicevox_core::AudioQueryOptions) -> Self {
+        Self { kana: options.kana }
+    }
+}
+impl From<VoicevoxAudioQueryOptions> for voicevox_core::AudioQueryOptions {
+    fn from(options: VoicevoxAudioQueryOptions) -> Self {
+        Self { kana: options.kana }
+    }
+}
+
+impl From<VoicevoxSynthesisOptions> for voicevox_core::SynthesisOptions {
+    fn from(options: VoicevoxSynthesisOptions) -> Self {
+        Self {
+            enable_interrogative_upspeak: options.enable_interrogative_upspeak,
+        }
+    }
+}
+
+impl From<voicevox_core::AccelerationMode> for VoicevoxAccelerationMode {
+    fn from(mode: voicevox_core::AccelerationMode) -> Self {
+        use voicevox_core::AccelerationMode::*;
+        match mode {
+            Auto => Self::VOICEVOX_ACCELERATION_MODE_AUTO,
+            Cpu => Self::VOICEVOX_ACCELERATION_MODE_CPU,
+            Gpu => Self::VOICEVOX_ACCELERATION_MODE_GPU,
+        }
+    }
+}
+
+impl From<VoicevoxAccelerationMode> for voicevox_core::AccelerationMode {
+    fn from(mode: VoicevoxAccelerationMode) -> Self {
+        use VoicevoxAccelerationMode::*;
+        match mode {
+            VOICEVOX_ACCELERATION_MODE_AUTO => Self::Auto,
+            VOICEVOX_ACCELERATION_MODE_CPU => Self::Cpu,
+            VOICEVOX_ACCELERATION_MODE_GPU => Self::Gpu,
+        }
+    }
+}
+
+impl Default for VoicevoxInitializeOptions {
+    fn default() -> Self {
+        let options = voicevox_core::InitializeOptions::default();
+        Self {
+            acceleration_mode: options.acceleration_mode.into(),
+            cpu_num_threads: options.cpu_num_threads,
+            load_all_models: options.load_all_models,
+            open_jtalk_dict_dir: null(),
+        }
+    }
+}
+
+impl VoicevoxInitializeOptions {
+    pub(crate) unsafe fn try_into_options(
+        self,
+    ) -> std::result::Result<voicevox_core::InitializeOptions, VoicevoxResultCode> {
+        let open_jtalk_dict_dir = ensure_utf8(CStr::from_ptr(self.open_jtalk_dict_dir))?;
+        Ok(voicevox_core::InitializeOptions {
+            acceleration_mode: self.acceleration_mode.into(),
+            cpu_num_threads: self.cpu_num_threads,
+            load_all_models: self.load_all_models,
+            open_jtalk_dict_dir: Some(PathBuf::from(open_jtalk_dict_dir)),
+        })
+    }
+}
+
+impl From<voicevox_core::TtsOptions> for VoicevoxTtsOptions {
+    fn from(options: voicevox_core::TtsOptions) -> Self {
+        Self {
+            kana: options.kana,
+            enable_interrogative_upspeak: options.enable_interrogative_upspeak,
+        }
+    }
+}
+
+impl From<VoicevoxTtsOptions> for voicevox_core::TtsOptions {
+    fn from(options: VoicevoxTtsOptions) -> Self {
+        Self {
+            kana: options.kana,
+            enable_interrogative_upspeak: options.enable_interrogative_upspeak,
+        }
+    }
+}
+
+impl Default for VoicevoxSynthesisOptions {
+    fn default() -> Self {
+        let options = voicevox_core::TtsOptions::default();
+        Self {
+            enable_interrogative_upspeak: options.enable_interrogative_upspeak,
+        }
+    }
+}
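One caveat in helpers.rs worth flagging: `VoicevoxInitializeOptions::default()` sets `open_jtalk_dict_dir` to `null()`, yet `try_into_options` passes the pointer straight to `CStr::from_ptr`, so calling `voicevox_initialize` with unmodified default options would dereference a null pointer. A null-tolerant variant, written as a drop-in replacement for the function above (only the null check is new; everything else is the code from this diff):

```rust
impl VoicevoxInitializeOptions {
    pub(crate) unsafe fn try_into_options(
        self,
    ) -> std::result::Result<voicevox_core::InitializeOptions, VoicevoxResultCode> {
        // Hypothetical null check: default options carry a null dictionary pointer,
        // which should mean "skip loading the Open JTalk dictionary".
        let open_jtalk_dict_dir = if self.open_jtalk_dict_dir.is_null() {
            None
        } else {
            Some(PathBuf::from(ensure_utf8(CStr::from_ptr(
                self.open_jtalk_dict_dir,
            ))?))
        };
        Ok(voicevox_core::InitializeOptions {
            acceleration_mode: self.acceleration_mode.into(),
            cpu_num_threads: self.cpu_num_threads,
            load_all_models: self.load_all_models,
            open_jtalk_dict_dir,
        })
    }
}
```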
diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs
index 37300bbde..b8a6de8c3 100644
--- a/crates/voicevox_core_c_api/src/lib.rs
+++ b/crates/voicevox_core_c_api/src/lib.rs
@@ -1,10 +1,14 @@
 // TODO: once we get to writing documentation, remove this allow and mark every function that uses pointers as unsafe, with a Safety doc
 #![allow(clippy::not_unsafe_ptr_arg_deref)]

+mod helpers;
+use helpers::*;
 use libc::c_void;
 use once_cell::sync::Lazy;
 use std::ffi::{CStr, CString};
-use std::os::raw::{c_char, c_int};
+use std::os::raw::c_char;
+use std::path::PathBuf;
+use std::ptr::null;
 use std::sync::{Mutex, MutexGuard};
 use voicevox_core::AudioQueryModel;
 use voicevox_core::VoicevoxCore;
@@ -22,237 +26,178 @@ fn lock_internal() -> MutexGuard<'static, Internal> {
 }

 /*
- * Defines the types and functions exposed as C functions; their implementations live in the same-named functions defined in internal.rs
- * Each function here must only call the same-named function in internal.rs and convert its return value into the C representation
+ * Defines the types and functions exposed as C functions; their implementations live in the corresponding functions defined in voicevox_core/publish.rs
+ * Each function here must only call the corresponding function in voicevox_core/publish.rs and convert its return value into the C representation
  * This keeps C-facing concerns separate from the implementation, and makes internal changes less likely to affect the API
+ * The corresponding function in voicevox_core/publish.rs is the one whose name is the public function defined in this file with the voicevox prefix removed
  */

 pub use voicevox_core::result_code::VoicevoxResultCode;

-fn convert_result<T>(result: Result<T>) -> (Option<T>, VoicevoxResultCode) {
-    match result {
-        Ok(target) => (Some(target), VoicevoxResultCode::VOICEVOX_RESULT_SUCCEED),
-        Err(err) => {
-            eprintln!("{}", err);
-            dbg!(&err);
-            match err {
-                Error::NotLoadedOpenjtalkDict => (
-                    None,
-                    VoicevoxResultCode::VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT,
-                ),
-                Error::CantGpuSupport => {
-                    (None, VoicevoxResultCode::VOICEVOX_RESULT_CANT_GPU_SUPPORT)
-                }
-                Error::LoadModel(_) => {
-                    (None, VoicevoxResultCode::VOICEVOX_RESULT_FAILED_LOAD_MODEL)
-                }
-                Error::LoadMetas(_) => {
-                    (None, VoicevoxResultCode::VOICEVOX_RESULT_FAILED_LOAD_METAS)
-                }
-                Error::GetSupportedDevices(_) => (
-                    None,
-                    VoicevoxResultCode::VOICEVOX_RESULT_FAILED_GET_SUPPORTED_DEVICES,
-                ),
-                Error::UninitializedStatus => (
-                    None,
-                    VoicevoxResultCode::VOICEVOX_RESULT_UNINITIALIZED_STATUS,
-                ),
-                Error::InvalidSpeakerId { .. } => {
-                    (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_SPEAKER_ID)
-                }
-                Error::InvalidModelIndex { .. } => (
-                    None,
-                    VoicevoxResultCode::VOICEVOX_RESULT_INVALID_MODEL_INDEX,
-                ),
-                Error::InferenceFailed => {
-                    (None, VoicevoxResultCode::VOICEVOX_RESULT_INFERENCE_FAILED)
-                }
-                Error::FailedExtractFullContextLabel(_) => (
-                    None,
-                    VoicevoxResultCode::VOICEVOX_RESULT_FAILED_EXTRACT_FULL_CONTEXT_LABEL,
-                ),
-                Error::FailedParseKana(_) => {
-                    (None, VoicevoxResultCode::VOICEVOX_RESULT_FAILED_PARSE_KANA)
-                }
-            }
-        }
-    }
+#[repr(i32)]
+#[derive(Debug, PartialEq, Eq)]
+#[allow(non_camel_case_types)]
+pub enum VoicevoxAccelerationMode {
+    VOICEVOX_ACCELERATION_MODE_AUTO = 0,
+    VOICEVOX_ACCELERATION_MODE_CPU = 1,
+    VOICEVOX_ACCELERATION_MODE_GPU = 2,
 }

-// FIXME: change each function's return value from bool to VoicevoxResultCode and delete this static variable
-static ERROR_MESSAGE: Lazy<Mutex<String>> = Lazy::new(|| Mutex::new(String::new()));
+#[repr(C)]
+pub struct VoicevoxInitializeOptions {
+    acceleration_mode: VoicevoxAccelerationMode,
+    cpu_num_threads: u16,
+    load_all_models: bool,
+    open_jtalk_dict_dir: *const c_char,
+}

-fn set_message(message: &str) {
-    ERROR_MESSAGE
-        .lock()
-        .unwrap()
-        .replace_range(.., &format!("{}\0", message));
+#[no_mangle]
+pub extern "C" fn voicevox_make_default_initialize_options() -> VoicevoxInitializeOptions {
+    VoicevoxInitializeOptions::default()
 }

 #[no_mangle]
-pub extern "C" fn initialize(use_gpu: bool, cpu_num_threads: c_int, load_all_models: bool) -> bool {
-    let result = lock_internal().initialize(use_gpu, cpu_num_threads as usize, load_all_models);
-    // TODO: return a VoicevoxResultCode
-    if let Some(err) = result.err() {
-        set_message(&format!("{}", err));
-        false
-    } else {
-        true
+pub extern "C" fn voicevox_initialize(options: VoicevoxInitializeOptions) -> VoicevoxResultCode {
+    match unsafe { options.try_into_options() } {
+        Ok(options) => {
+            let result = lock_internal().initialize(options);
+            let (_, result_code) = convert_result(result);
+            result_code
+        }
+        Err(result_code) => result_code,
     }
 }

 #[no_mangle]
-pub extern "C" fn load_model(speaker_id: i64) -> bool {
-    let result = lock_internal().load_model(speaker_id as usize);
-    // TODO: return a VoicevoxResultCode
-    if let Some(err) = result.err() {
-        set_message(&format!("{}", err));
-        false
-    } else {
-        true
-    }
+pub extern "C" fn voicevox_load_model(speaker_id: u32) -> VoicevoxResultCode {
+    let result = lock_internal().load_model(speaker_id);
+    let (_, result_code) = convert_result(result);
+    result_code
 }

 #[no_mangle]
-pub extern "C" fn is_model_loaded(speaker_id: i64) -> bool {
-    lock_internal().is_model_loaded(speaker_id as usize)
+pub extern "C" fn voicevox_is_gpu_mode() -> bool {
+    lock_internal().is_gpu_mode()
 }

 #[no_mangle]
-pub extern "C" fn finalize() {
-    lock_internal().finalize()
+pub extern "C" fn voicevox_is_model_loaded(speaker_id: u32) -> bool {
+    lock_internal().is_model_loaded(speaker_id)
 }

 #[no_mangle]
-pub extern "C" fn metas() -> *const c_char {
-    lock_internal().metas().as_ptr()
+pub extern "C" fn voicevox_finalize() {
+    lock_internal().finalize()
 }

 #[no_mangle]
-pub extern "C" fn last_error_message() -> *const c_char {
-    ERROR_MESSAGE.lock().unwrap().as_ptr() as *const c_char
+pub extern "C" fn voicevox_get_metas_json() -> *const c_char {
+    lock_internal().get_metas_json().as_ptr()
 }

 #[no_mangle]
-pub extern "C" fn supported_devices() -> *const c_char {
-    lock_internal().supported_devices().as_ptr()
+pub extern "C" fn voicevox_get_supported_devices_json() -> *const c_char {
+    lock_internal().get_supported_devices_json().as_ptr()
 }

 #[no_mangle]
-pub extern "C" fn yukarin_s_forward(
-    length: i64,
+pub extern "C" fn voicevox_predict_duration(
+    length: usize,
     phoneme_list: *mut i64,
-    speaker_id: *mut i64,
+    speaker_id: u32,
     output: *mut f32,
-) -> bool {
-    let result = lock_internal().yukarin_s_forward(
-        unsafe { std::slice::from_raw_parts_mut(phoneme_list, length as usize) },
-        unsafe { *speaker_id as usize },
+) -> VoicevoxResultCode {
+    let result = lock_internal().predict_duration(
+        unsafe { std::slice::from_raw_parts_mut(phoneme_list, length) },
+        speaker_id,
     );
-    // TODO: return a VoicevoxResultCode
-    match result {
-        Ok(output_vec) => {
-            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) };
+
+    let (output_vec, result_code) = convert_result(result);
+    if result_code == VoicevoxResultCode::VOICEVOX_RESULT_SUCCEED {
+        if let Some(output_vec) = output_vec {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
             output_slice.clone_from_slice(&output_vec);
-            true
-        }
-        Err(err) => {
-            set_message(&format!("{}", err));
-            false
         }
     }
+    result_code
 }

 #[no_mangle]
-pub extern "C" fn yukarin_sa_forward(
-    length: i64,
+pub extern "C" fn voicevox_predict_intonation(
+    length: usize,
     vowel_phoneme_list: *mut i64,
     consonant_phoneme_list: *mut i64,
     start_accent_list: *mut i64,
     end_accent_list: *mut i64,
     start_accent_phrase_list: *mut i64,
     end_accent_phrase_list: *mut i64,
-    speaker_id: *mut i64,
+    speaker_id: u32,
     output: *mut f32,
-) -> bool {
-    let result = lock_internal().yukarin_sa_forward(
+) -> VoicevoxResultCode {
+    let result = lock_internal().predict_intonation(
         length,
-        unsafe { std::slice::from_raw_parts(vowel_phoneme_list, length as usize) },
-        unsafe { std::slice::from_raw_parts(consonant_phoneme_list, length as usize) },
-        unsafe { std::slice::from_raw_parts(start_accent_list, length as usize) },
-        unsafe { std::slice::from_raw_parts(end_accent_list, length as usize) },
-        unsafe { std::slice::from_raw_parts(start_accent_phrase_list, length as usize) },
-        unsafe { std::slice::from_raw_parts(end_accent_phrase_list, length as usize) },
-        unsafe { *speaker_id as usize },
+        unsafe { std::slice::from_raw_parts(vowel_phoneme_list, length) },
+        unsafe { std::slice::from_raw_parts(consonant_phoneme_list, length) },
+        unsafe { std::slice::from_raw_parts(start_accent_list, length) },
+        unsafe { std::slice::from_raw_parts(end_accent_list, length) },
+        unsafe { std::slice::from_raw_parts(start_accent_phrase_list, length) },
+        unsafe { std::slice::from_raw_parts(end_accent_phrase_list, length) },
+        speaker_id,
     );
-    // TODO: return a VoicevoxResultCode
-    match result {
-        Ok(output_vec) => {
-            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) };
-            output_slice.clone_from_slice(&output_vec);
-            true
-        }
-        Err(err) => {
-            set_message(&format!("{}", err));
-            false
-        }
+    let (output_vec, result_code) = convert_result(result);
+    if let Some(output_vec) = output_vec {
+        let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
+        output_slice.clone_from_slice(&output_vec);
     }
+    result_code
 }

 #[no_mangle]
-pub extern "C" fn decode_forward(
-    length: i64,
+pub extern "C" fn voicevox_decode(
    length: usize,
     phoneme_size: i64,
     f0: *mut f32,
     phoneme: *mut f32,
-    speaker_id: *mut i64,
+    speaker_id: u32,
     output: *mut f32,
-) -> bool {
+) -> VoicevoxResultCode {
     let length = length as usize;
     let phoneme_size = phoneme_size as usize;
-    let result = lock_internal().decode_forward(
+    let result = lock_internal().decode(
         length,
         phoneme_size,
         unsafe { std::slice::from_raw_parts(f0, length) },
         unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
-        unsafe { *speaker_id as usize },
+        speaker_id,
     );
-    // TODO: return a VoicevoxResultCode
-    match result {
-        Ok(output_vec) => {
-            let output_slice =
-                unsafe { std::slice::from_raw_parts_mut(output, (length as usize) * 256) };
-            output_slice.clone_from_slice(&output_vec);
-            true
-        }
-        Err(err) => {
-            set_message(&format!("{}", err));
-            false
-        }
+    let (output_vec, result_code) = convert_result(result);
+    if let Some(output_vec) = output_vec {
+        let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) };
+        output_slice.clone_from_slice(&output_vec);
     }
+    result_code
+}
+
+#[repr(C)]
+pub struct VoicevoxAudioQueryOptions {
+    kana: bool,
 }

 #[no_mangle]
-pub extern "C" fn voicevox_load_openjtalk_dict(dict_path: *const c_char) -> VoicevoxResultCode {
-    let (_, result_code) = {
-        if let Ok(dict_path) = unsafe { CStr::from_ptr(dict_path) }.to_str() {
-            convert_result(lock_internal().voicevox_load_openjtalk_dict(dict_path))
-        } else {
-            (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
-        }
-    };
-    result_code
+pub extern "C" fn voicevox_make_default_audio_query_options() -> VoicevoxAudioQueryOptions {
+    voicevox_core::AudioQueryOptions::default().into()
 }

 #[no_mangle]
 pub extern "C" fn voicevox_audio_query(
     text: *const c_char,
-    speaker_id: i64,
+    speaker_id: u32,
+    options: VoicevoxAudioQueryOptions,
     output_audio_query_json: *mut *mut c_char,
 ) -> VoicevoxResultCode {
     let text = unsafe { CStr::from_ptr(text) };

-    let audio_query = &match create_audio_query(text, speaker_id, Internal::voicevox_audio_query) {
+    let audio_query = &match create_audio_query(text, speaker_id, Internal::audio_query, options) {
         Ok(audio_query) => audio_query,
         Err(result_code) => return result_code,
     };
@@ -263,63 +208,21 @@ pub extern "C" fn voicevox_audio_query(
     VoicevoxResultCode::VOICEVOX_RESULT_SUCCEED
 }

-#[no_mangle]
-pub extern "C" fn voicevox_audio_query_from_kana(
-    text: *const c_char,
-    speaker_id: i64,
-    output_audio_query_json: *mut *mut c_char,
-) -> VoicevoxResultCode {
-    let text = unsafe { CStr::from_ptr(text) };
-
-    let audio_query =
-        &match create_audio_query(text, speaker_id, Internal::voicevox_audio_query_from_kana) {
-            Ok(audio_query) => audio_query,
-            Err(result_code) => return result_code,
-        };
-
-    unsafe {
-        write_json_to_ptr(output_audio_query_json, audio_query);
-    }
-    VoicevoxResultCode::VOICEVOX_RESULT_SUCCEED
-}
-
-fn create_audio_query(
-    japanese_or_kana: &CStr,
-    speaker_id: i64,
-    method: fn(&mut Internal, &str, usize) -> Result<AudioQueryModel>,
-) -> std::result::Result<CString, VoicevoxResultCode> {
-    let japanese_or_kana = ensure_utf8(japanese_or_kana)?;
-    let speaker_id = speaker_id as usize;
-
-    let (audio_query, result_code) =
-        convert_result(method(&mut lock_internal(), japanese_or_kana, speaker_id));
-    let audio_query = audio_query.ok_or(result_code)?;
-    Ok(CString::new(audio_query_model_to_json(&audio_query)).expect("should not contain '\\0'"))
-}
-
-fn audio_query_model_to_json(audio_query_model: &AudioQueryModel) -> String {
-    serde_json::to_string(audio_query_model).expect("should be always valid")
-}
-
-unsafe fn write_json_to_ptr(output_ptr: *mut *mut c_char, json: &CStr) {
-    let n = json.to_bytes_with_nul().len();
-    let json_heap = libc::malloc(n);
-    libc::memcpy(json_heap, json.as_ptr() as *const c_void, n);
-    output_ptr.write(json_heap as *mut c_char);
+#[repr(C)]
+pub struct VoicevoxSynthesisOptions {
+    enable_interrogative_upspeak: bool,
 }

-unsafe fn write_wav_to_ptr(output_wav_ptr: *mut *mut u8, output_size_ptr: *mut c_int, data: &[u8]) {
-    output_size_ptr.write(data.len() as c_int);
-    let wav_heap = libc::malloc(data.len());
-    libc::memcpy(wav_heap, data.as_ptr() as *const c_void, data.len());
-    output_wav_ptr.write(wav_heap as *mut u8);
+#[no_mangle]
+pub extern "C" fn voicevox_make_default_synthesis_options() -> VoicevoxSynthesisOptions {
+    VoicevoxSynthesisOptions::default()
 }

 #[no_mangle]
 pub extern "C" fn voicevox_synthesis(
     audio_query_json: *const c_char,
-    speaker_id: i64,
-    output_binary_size: *mut c_int,
+    speaker_id: u32,
+    options: VoicevoxSynthesisOptions,
+    output_wav_size: *mut usize,
     output_wav: *mut *mut u8,
 ) -> VoicevoxResultCode {
     let audio_query_json = unsafe { CStr::from_ptr(audio_query_json) };
@@ -334,10 +237,8 @@ pub extern "C" fn voicevox_synthesis(
         return VoicevoxResultCode::VOICEVOX_RESULT_INVALID_AUDIO_QUERY;
     };

-    let speaker_id = speaker_id as usize;
-
     let (wav, result_code) =
-        convert_result(lock_internal().voicevox_synthesis(audio_query, speaker_id));
+        convert_result(lock_internal().synthesis(audio_query, speaker_id, options.into()));
     let wav = &if let Some(wav) = wav {
         wav
     } else {
@@ -345,55 +246,40 @@ pub extern "C" fn voicevox_synthesis(

     unsafe {
-        write_wav_to_ptr(output_wav, output_binary_size, wav);
+        write_wav_to_ptr(output_wav, output_wav_size, wav);
     }
     VoicevoxResultCode::VOICEVOX_RESULT_SUCCEED
 }

-fn ensure_utf8(s: &CStr) -> std::result::Result<&str, VoicevoxResultCode> {
-    s.to_str()
-        .map_err(|_| VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
+#[repr(C)]
+pub struct VoicevoxTtsOptions {
+    kana: bool,
+    enable_interrogative_upspeak: bool,
 }

 #[no_mangle]
-pub extern "C" fn voicevox_tts(
-    text: *const c_char,
-    speaker_id: i64,
-    output_binary_size: *mut c_int,
-    output_wav: *mut *mut u8,
-) -> VoicevoxResultCode {
-    let (output_opt, result_code) = {
-        if let Ok(text) = unsafe { CStr::from_ptr(text) }.to_str() {
-            convert_result(lock_internal().voicevox_tts(text, speaker_id as usize))
-        } else {
-            (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
-        }
-    };
-    if let Some(output) = output_opt {
-        unsafe {
-            write_wav_to_ptr(output_wav, output_binary_size, output.as_slice());
-        }
-    }
-    result_code
+pub extern "C" fn voicevox_make_default_tts_options() -> VoicevoxTtsOptions {
+    voicevox_core::TtsOptions::default().into()
 }

 #[no_mangle]
-pub extern "C" fn voicevox_tts_from_kana(
+pub extern "C" fn voicevox_tts(
     text: *const c_char,
-    speaker_id: i64,
-    output_binary_size: *mut c_int,
+    speaker_id: u32,
+    options: VoicevoxTtsOptions,
+    output_wav_size: *mut usize,
     output_wav: *mut *mut u8,
 ) -> VoicevoxResultCode {
     let (output_opt, result_code) = {
         if let Ok(text) = unsafe { CStr::from_ptr(text) }.to_str() {
-            convert_result(lock_internal().voicevox_tts_from_kana(text, speaker_id as usize))
+            convert_result(lock_internal().tts(text, speaker_id, options.into()))
         } else {
             (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
         }
     };
     if let Some(output) = output_opt {
         unsafe {
-            write_wav_to_ptr(output_wav, output_binary_size, output.as_slice());
+            write_wav_to_ptr(output_wav, output_wav_size, output.as_slice());
         }
     }
     result_code
@@ -417,7 +303,7 @@ pub extern "C" fn voicevox_wav_free(wav: *mut u8) {
 pub extern "C" fn voicevox_error_result_to_message(
     result_code: VoicevoxResultCode,
 ) -> *const c_char {
-    voicevox_core::voicevox_error_result_to_message(result_code).as_ptr() as *const c_char
+    voicevox_core::error_result_to_message(result_code).as_ptr() as *const c_char
 }

 #[cfg(test)]
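The lib.rs surface now follows one allocation convention throughout: functions that return a buffer (`voicevox_audio_query`, `voicevox_synthesis`, `voicevox_tts`) `malloc` it inside `write_json_to_ptr`/`write_wav_to_ptr` and hand ownership to the caller, who releases it via `voicevox_wav_free` (a thin wrapper over `libc::free`). A self-contained sketch of that contract from the caller's point of view (the writer function mirrors helpers.rs above; the `main` is illustrative):

```rust
use libc::c_void;

// Same shape as helpers.rs: write the size and a malloc'd copy through out-params.
unsafe fn write_wav_to_ptr(out_wav: *mut *mut u8, out_size: *mut usize, data: &[u8]) {
    out_size.write(data.len());
    let heap = libc::malloc(data.len());
    libc::memcpy(heap, data.as_ptr() as *const c_void, data.len());
    out_wav.write(heap as *mut u8);
}

fn main() {
    let mut wav: *mut u8 = std::ptr::null_mut();
    let mut size: usize = 0;
    unsafe {
        write_wav_to_ptr(&mut wav, &mut size, b"RIFF\0\0\0\0WAVE");
        assert_eq!(size, 12);
        // The caller owns the buffer; voicevox_wav_free performs exactly this free.
        libc::free(wav as *mut c_void);
    }
}
```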
diff --git a/example/cpp/unix/simple_tts.cpp b/example/cpp/unix/simple_tts.cpp
index 0e7e39add..4a9d8dea9 100644
--- a/example/cpp/unix/simple_tts.cpp
+++ b/example/cpp/unix/simple_tts.cpp
@@ -17,29 +17,23 @@ int main(int argc, char *argv[]) {

   std::cout << "coreの初期化中..." << std::endl;

-  if (!initialize(false, 0, true)) {
+  auto initialize_options = voicevox_make_default_initialize_options();
+  initialize_options.load_all_models = true;
+  initialize_options.open_jtalk_dict_dir = open_jtalk_dict_path.c_str();
+  if (voicevox_initialize(initialize_options) != VOICEVOX_RESULT_SUCCEED) {
     std::cout << "coreの初期化に失敗しました" << std::endl;
     return 1;
   }

-  VoicevoxResultCode result;
-
-  std::cout << "openjtalk辞書の読み込み中..." << std::endl;
-
-  result = voicevox_load_openjtalk_dict(open_jtalk_dict_path.c_str());
-  if (result != VOICEVOX_RESULT_SUCCEED) {
-    std::cout << voicevox_error_result_to_message(result) << std::endl;
-    return 1;
-  }
-
   std::cout << "音声生成中..." << std::endl;

   int64_t speaker_id = 0;
-  int output_binary_size = 0;
+  size_t output_wav_size = 0;
   uint8_t *output_wav = nullptr;

-  result =
-      voicevox_tts(text.c_str(), speaker_id, &output_binary_size, &output_wav);
+  auto result = voicevox_tts(text.c_str(), speaker_id,
+                             voicevox_make_default_tts_options(),
+                             &output_wav_size, &output_wav);
   if (result != VOICEVOX_RESULT_SUCCEED) {
     std::cout << voicevox_error_result_to_message(result) << std::endl;
     return 1;
@@ -48,8 +42,7 @@ int main(int argc, char *argv[]) {

   std::cout << "音声ファイル保存中..." << std::endl;

   std::ofstream wav_file(OUTPUT_WAV_NAME, std::ios::binary);
-  wav_file.write(reinterpret_cast<const char *>(output_wav),
-                 output_binary_size);
+  wav_file.write(reinterpret_cast<const char *>(output_wav), output_wav_size);
   voicevox_wav_free(output_wav);

   std::cout << "音声ファイル保存完了 (" << OUTPUT_WAV_NAME << ")" << std::endl;
diff --git a/model/yukarin_s.onnx b/model/predict_duration.onnx
similarity index 100%
rename from model/yukarin_s.onnx
rename to model/predict_duration.onnx
diff --git a/model/yukarin_sa.onnx b/model/predict_intonation.onnx
similarity index 100%
rename from model/yukarin_sa.onnx
rename to model/predict_intonation.onnx
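Finally, note the output-buffer contract implied by the tests: `voicevox_predict_duration` and `voicevox_predict_intonation` write `length` floats into the caller's buffer, while decode produces 256 samples per f0 frame (`decode_works` asserts `F0_LENGTH * 256`). A caller sizing buffers for the C API would allocate accordingly; a small sketch (the 256-samples-per-frame figure is taken from the test assertion above, not from a published constant):

```rust
/// Samples produced per f0 frame, per the decode_works assertion.
const SAMPLES_PER_FRAME: usize = 256;

fn decode_output_len(f0_length: usize) -> usize {
    f0_length * SAMPLES_PER_FRAME
}

fn main() {
    // F0_LENGTH = 69 in decode_works ⇒ 17_664 output samples.
    let output = vec![0.0f32; decode_output_len(69)];
    assert_eq!(output.len(), 17_664);
}
```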