Skip to content

Commit

Permalink
move padding to generate_full_intermediate and margin to render_audio…
Browse files Browse the repository at this point in the history
…_segment
  • Loading branch information
Yosshi999 committed Nov 3, 2024
1 parent d1be041 commit eb26303
Showing 1 changed file with 129 additions and 105 deletions.
234 changes: 129 additions & 105 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,15 @@ mod inner {
use super::{AccelerationMode, AsyncForOnnxruntime, InitializeOptions, TtsOptions};

const DEFAULT_SAMPLING_RATE: u32 = 24000;
// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅
const PADDING_DURATION: f64 = 0.4; // パディング秒
const PADDING_FRAME_LENGTH: usize = {
// パディングフレーム数
const fn div_round(num: f64, den: f64) -> u32 {
((num + den / 2.0) / den) as u32
}
div_round(PADDING_DURATION * DEFAULT_SAMPLING_RATE as f64, 256.0) as _
};

/// 音声の中間表現。
pub struct AudioFeature {
Expand All @@ -138,8 +147,6 @@ mod inner {
pub frame_length: usize,
/// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。
pub frame_rate: f64,
/// workaroundとして付け足されているパディング長。
padding_frame_length: usize,
/// 生成時に利用したクエリ。
audio_query: AudioQuery,
}
Expand Down Expand Up @@ -373,28 +380,12 @@ mod inner {
}
}

// 音が途切れてしまうのを避けるworkaround処理が入っている
// NOTE: `render()`内でこのpaddingを取り除くために、padding_frame_lengthにpadding長を保持している。
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
let padding_size =
((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
let start_and_end_padding_size = 2 * padding_size;
let length_with_padding = f0.len() + start_and_end_padding_size;
let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size);
let phoneme_with_padding = make_phoneme_with_padding(
phoneme.as_flattened(),
OjtPhoneme::num_phoneme(),
length_with_padding,
padding_size,
);

let spec = self
.generate_full_intermediate(
f0_with_padding.len(),
f0.len(),
OjtPhoneme::num_phoneme(),
&f0_with_padding,
&phoneme_with_padding,
&f0,
&phoneme.as_flattened(),
style_id,
)
.await?;
Expand All @@ -403,7 +394,6 @@ mod inner {
style_id,
frame_length: f0.len(),
frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0,
padding_frame_length: padding_size,
audio_query: audio_query.clone(),
});

Expand Down Expand Up @@ -455,46 +445,6 @@ mod inner {
pitch,
}
}

fn make_f0_with_padding(
f0_slice: &[f32],
length_with_padding: usize,
padding_size: usize,
) -> Vec<f32> {
// 音が途切れてしまうのを避けるworkaround処理
// 改善したらこの関数を削除する
let mut f0_with_padding = Vec::with_capacity(length_with_padding);
let padding = vec![0.0; padding_size];
f0_with_padding.extend_from_slice(&padding);
f0_with_padding.extend_from_slice(f0_slice);
f0_with_padding.extend_from_slice(&padding);
f0_with_padding
}

fn make_phoneme_with_padding(
phoneme_slice: &[f32],
phoneme_size: usize,
length_with_padding: usize,
padding_size: usize,
) -> Vec<f32> {
// 音が途切れてしまうのを避けるworkaround処理
// 改善したらこの関数を削除する
let mut padding_phoneme = vec![0.0; phoneme_size];
padding_phoneme[0] = 1.0;
let padding_phoneme_len = padding_phoneme.len();
let padding_phonemes: Vec<f32> = padding_phoneme
.into_iter()
.cycle()
.take(padding_phoneme_len * padding_size)
.collect();
let mut phoneme_with_padding =
Vec::with_capacity(phoneme_size * length_with_padding);
phoneme_with_padding.extend_from_slice(&padding_phonemes);
phoneme_with_padding.extend_from_slice(phoneme_slice);
phoneme_with_padding.extend_from_slice(&padding_phonemes);

phoneme_with_padding
}
}

pub(super) async fn render(
Expand All @@ -504,41 +454,10 @@ mod inner {
end: usize,
) -> Result<Vec<u8>> {
// TODO: 44.1kHzなどの対応
const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン
use std::cmp::min;
// 実態(workaround paddingを含まない)上での区間
let clipped_start = min(start, audio.frame_length);
let clipped_end = min(end, audio.frame_length);
// 指定領域が空の区間だった場合、ONNXRuntimeに渡す前に早期リターン
if (clipped_start..clipped_end).is_empty() {
return Ok(vec![]);
}
// マージンがデータからはみ出さないことを保証
// cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
if MARGIN > audio.padding_frame_length + clipped_start
|| MARGIN > audio.padding_frame_length + (audio.frame_length - clipped_end)
{
unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
}
let left_margin = MARGIN;
let right_margin = MARGIN;
// 安全マージンを追加したデータ上での区間
let slice_start = audio.padding_frame_length + clipped_start - left_margin;
let slice_end = audio.padding_frame_length + clipped_end + right_margin;
let segment = audio
.internal_state
.slice(ndarray::s![slice_start..slice_end, ..]);
let wave_with_margin = self
.render_audio_segment(segment.into_owned(), audio.style_id)
let wave = self
.render_audio_segment(audio.internal_state.clone(), start, end, audio.style_id)
.await?;
// 変換前に追加した安全マージンを生成音声から取り除く
let wave = wave_with_margin
.slice(ndarray::s![
left_margin * 256..wave_with_margin.len() - right_margin * 256
])
.into_owned()
.into_raw_vec();
return Ok(to_s16le_pcm(&wave, &audio.audio_query));
return Ok(to_s16le_pcm(&wave.into_raw_vec(), &audio.audio_query));

fn to_s16le_pcm(
wave: &[f32],
Expand Down Expand Up @@ -908,10 +827,12 @@ mod inner {
pub(super) async fn render_audio_segment(
&self,
spec: ndarray::Array2<f32>,
start: usize,
end: usize,
style_id: StyleId,
) -> Result<ndarray::Array1<f32>> {
let status = self.status.clone();
A::unblock(move || status.render_audio_segment(spec, style_id)).await
A::unblock(move || status.render_audio_segment(spec, start, end, style_id)).await
}

pub(super) async fn decode(
Expand Down Expand Up @@ -1000,34 +921,133 @@ mod inner {
/// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。
fn generate_full_intermediate(
&self,
length: usize,
_length: usize,
phoneme_size: usize,
f0: ndarray::Array1<f32>,
phoneme_vector: ndarray::Array1<f32>,
style_id: StyleId,
) -> Result<ndarray::Array2<f32>> {
let (model_id, inner_voice_id) = self.ids_for::<TalkDomain>(style_id)?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// NOTE: `render()`内でこのpaddingを取り除くために、padding_frame_lengthにpadding長を保持している。
// TODO: 改善したらここのpadding処理を取り除く
let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH;
let length_with_padding = f0.len() + start_and_end_padding_size;
let f0_with_padding =
make_f0_with_padding(&f0.to_vec(), length_with_padding, PADDING_FRAME_LENGTH);
let phoneme_with_padding = make_phoneme_with_padding(
&phoneme_vector.to_vec(),
OjtPhoneme::num_phoneme(),
length_with_padding,
PADDING_FRAME_LENGTH,
);

let GenerateFullIntermediateOutput { spec } = self.run_session(
model_id,
GenerateFullIntermediateInput {
f0: f0.into_shape([length, 1]).unwrap(),
phoneme: phoneme_vector.into_shape([length, phoneme_size]).unwrap(),
f0: ndarray::arr1(&f0_with_padding)
.into_shape([length_with_padding, 1])
.unwrap(),
phoneme: ndarray::arr1(&phoneme_with_padding)
.into_shape([length_with_padding, phoneme_size])
.unwrap(),
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
},
)?;
Ok(spec)
return Ok(spec);

fn make_f0_with_padding(
f0_slice: &[f32],
length_with_padding: usize,
padding_size: usize,
) -> Vec<f32> {
// 音が途切れてしまうのを避けるworkaround処理
// 改善したらこの関数を削除する
let mut f0_with_padding = Vec::with_capacity(length_with_padding);
let padding = vec![0.0; padding_size];
f0_with_padding.extend_from_slice(&padding);
f0_with_padding.extend_from_slice(f0_slice);
f0_with_padding.extend_from_slice(&padding);
f0_with_padding
}

fn make_phoneme_with_padding(
phoneme_slice: &[f32],
phoneme_size: usize,
length_with_padding: usize,
padding_size: usize,
) -> Vec<f32> {
// 音が途切れてしまうのを避けるworkaround処理
// 改善したらこの関数を削除する
let mut padding_phoneme = vec![0.0; phoneme_size];
padding_phoneme[0] = 1.0;
let padding_phoneme_len = padding_phoneme.len();
let padding_phonemes: Vec<f32> = padding_phoneme
.into_iter()
.cycle()
.take(padding_phoneme_len * padding_size)
.collect();
let mut phoneme_with_padding =
Vec::with_capacity(phoneme_size * length_with_padding);
phoneme_with_padding.extend_from_slice(&padding_phonemes);
phoneme_with_padding.extend_from_slice(phoneme_slice);
phoneme_with_padding.extend_from_slice(&padding_phonemes);

phoneme_with_padding
}
}

/// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。
fn render_audio_segment(
&self,
spec: ndarray::Array2<f32>,
start: usize,
end: usize,
style_id: StyleId,
) -> Result<ndarray::Array1<f32>> {
let (model_id, _inner_voice_id) = self.ids_for::<TalkDomain>(style_id)?;
let RenderAudioSegmentOutput { wave } =
self.run_session(model_id, RenderAudioSegmentInput { spec })?;
let frame_length_with_padding = spec.nrows();
// 音が途切れてしまうのを避けるworkaround処理が入っている
let frame_length = frame_length_with_padding - PADDING_FRAME_LENGTH * 2;

// [start, end) の区間を両端にマージンを追加した上で音声変換する
const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン
use std::cmp::min;
// 実態(workaround paddingを含まない)上での区間
let clipped_start = min(start, frame_length);
let clipped_end = min(end, frame_length);
// 指定領域が空の区間だった場合、ONNXRuntimeに渡す前に早期リターン
if (clipped_start..clipped_end).is_empty() {
return Ok(ndarray::arr1(&vec![]));
}
// マージンがデータからはみ出さないことを保証
// cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
if MARGIN > PADDING_FRAME_LENGTH + clipped_start
|| MARGIN > PADDING_FRAME_LENGTH + (frame_length - clipped_end)
{
unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
}
let left_margin = MARGIN;
let right_margin = MARGIN;
// 安全マージンを追加したデータ上での区間
let slice_start = PADDING_FRAME_LENGTH + clipped_start - left_margin;
let slice_end = PADDING_FRAME_LENGTH + clipped_end + right_margin;
let spec_segment = spec.slice(ndarray::s![slice_start..slice_end, ..]);
let RenderAudioSegmentOutput {
wave: wave_with_margin,
} = self.run_session(
model_id,
RenderAudioSegmentInput {
spec: spec_segment.into_owned(),
},
)?;
// 変換前に追加した安全マージンを生成音声から取り除く
let wave = wave_with_margin
.slice(ndarray::s![
left_margin * 256..wave_with_margin.len() - right_margin * 256
])
.into_owned();
Ok(wave)
}

Expand All @@ -1047,7 +1067,7 @@ mod inner {
phoneme_vector,
style_id,
)?;
let output = self.render_audio_segment(intermediate, style_id)?;
let output = self.render_audio_segment(intermediate, 0, length, style_id)?;
Ok(output.into_raw_vec())
}
}
Expand Down Expand Up @@ -1565,9 +1585,13 @@ pub(crate) mod blocking {
pub fn render_audio_segment(
&self,
spec: ndarray::Array2<f32>,
start: usize,
end: usize,
style_id: StyleId,
) -> crate::Result<ndarray::Array1<f32>> {
self.0.render_audio_segment(spec, style_id).block_on()
self.0
.render_audio_segment(spec, start, end, style_id)
.block_on()
}

pub fn decode(
Expand Down

0 comments on commit eb26303

Please sign in to comment.