vctk.py (forked from r9y9/deepvoice3_pytorch)
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import audio
from nnmnkwii.datasets import vctk
from nnmnkwii.io import hts
from hparams import hparams
from os.path import exists
import librosa


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    """Preprocess the VCTK corpus found in in_dir and write features to out_dir.

    Returns a list of (spectrogram_filename, mel_filename, n_frames, text,
    speaker_id) tuples, one per utterance.
    """
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []

    speakers = vctk.available_speakers

    td = vctk.TranscriptionDataSource(in_dir, speakers=speakers)
    transcriptions = td.collect_files()
    speaker_ids = td.labels
    wav_paths = vctk.WavFileDataSource(
        in_dir, speakers=speakers).collect_files()

    for index, (speaker_id, text, wav_path) in enumerate(
            zip(speaker_ids, transcriptions, wav_paths)):
        futures.append(executor.submit(
            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text)))
    return [future.result() for future in tqdm(futures)]


def start_at(labels):
    # Return the start time of the first non-silent segment (label != "pau"),
    # in the 100-nanosecond units used by HTS-style label files.
    has_silence = labels[0][-1] == "pau"
    if not has_silence:
        return labels[0][0]
    for i in range(1, len(labels)):
        if labels[i][-1] != "pau":
            return labels[i][0]
    assert False


def end_at(labels):
    # Return the end time of the last non-silent segment (label != "pau"),
    # in the same 100-nanosecond units as start_at.
    has_silence = labels[-1][-1] == "pau"
    if not has_silence:
        return labels[-1][1]
    for i in range(len(labels) - 2, 0, -1):
        if labels[i][-1] != "pau":
            return labels[i][1]
    assert False
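

# Illustration (not part of the original file): start_at/end_at return
# HTS/HTK-style times in 100-nanosecond units, which _process_utterance below
# converts to sample indices. A hedged example with an assumed sample rate:
#
#   sr = 22050
#   start_100ns = 1_250_000           # 0.125 seconds
#   b = int(start_100ns * 1e-7 * sr)  # -> 2756 samples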


def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
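

# Usage sketch (not part of the original file): a minimal, hedged example of
# how build_from_path might be driven from a preprocessing script. The corpus
# location, output directory, worker count, and the "train.txt" metadata
# format below are assumptions, not something this module defines.
if __name__ == "__main__":
    from tqdm import tqdm

    in_dir = "./VCTK-Corpus"      # assumed path to the downloaded VCTK corpus
    out_dir = "./vctk_features"   # assumed output directory for the .npy files
    os.makedirs(out_dir, exist_ok=True)

    metadata = build_from_path(in_dir, out_dir, num_workers=4, tqdm=tqdm)

    # Persist one pipe-separated line per utterance:
    # spectrogram_filename|mel_filename|n_frames|text|speaker_id
    with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for row in metadata:
            f.write("|".join(map(str, row)) + "\n")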