# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
# FileName [ eval_tacotron.py ]
# Synopsis [ Testing script for a trained Tacotron model for the ZeroSpeech TTS-without-T project ]
# Author [ Ting-Wei Liu (Andi611) ]
# Copyright [ Copyleft(c), Speech Lab, NTU, Taiwan ]
"""*********************************************************************************************"""
###############
# IMPORTATION #
###############
import os
import sys
import json
import argparse
import librosa
import librosa.display
import numpy as np
from tqdm import tqdm
#--------------------------------#
import torch
from torch.autograd import Variable
#--------------------------------#
from model.tacotron import audio
from model.tacotron.config import config
from model.tacotron.text import text_to_sequence, symbols
from model.tacotron.tacotron import Tacotron
from convert import get_trainer, encode, parse_encodings, write_encodings
from preprocess import get_spectrograms
from hps.hps import hp, Hps
############
# CONSTANT #
############
USE_CUDA = torch.cuda.is_available()
#######################
# TEST CONFIGURATIONS #
#######################
def get_test_args():
parser = argparse.ArgumentParser(description='testing arguments')
	parser.add_argument('--dataset', choices=['english', 'surprise'], default='english', help='which dataset to test on')
	parser.add_argument('--test_single', default=False, action='store_true', help='test the trained model on a single file')
	parser.add_argument('--eval_t', choices=['V001', 'V002', 'None'], default='None', help='target to be evaluated, must be either V001 or V002')
ckpt_parser = parser.add_argument_group('ckpt')
ckpt_parser.add_argument('--ckpt_dir', type=str, default='./ckpt_tacotron_english/', help='path to the directory where model checkpoints are saved')
ckpt_parser.add_argument('--model_name', type=str, default='checkpoint_step500000.pth-english-V002', help='name for the checkpoint file')
ckpt_parser.add_argument('--encoder_path', type=str, default='./ckpt_english/model.pth-ae-400000-128-multi-6/', help='path to the encoder model')
#---the arguments below will be handled automatically, should not change these---#
path_parser = parser.add_argument_group('path')
path_parser.add_argument('--result_dir', type=str, default='./result/', help='path to output test results')
path_parser.add_argument('--sub_result_dir', type=str, default='./english/test', help='sub result directory for generating zerospeech synthesis results')
path_parser.add_argument('--testing_dir', type=str, default='./data/english/test', help='path to the input test audios')
path_parser.add_argument('--synthesis_list', type=str, default='./data/english/synthesis.txt', help='path to the input test transcripts')
path_parser.add_argument('--speaker2id_path', type=str, default='./data/speaker2id_english.json', help='records speaker and speaker id')
path_parser.add_argument('--multi2idx_path', type=str, default='./data/multi2idx.json', help='records encoding and idx mapping')
path_parser.add_argument('--hps_path', type=str, default='./hps/zerospeech_english.json', help='hyperparameter path, please refer to the default settings in zerospeech.json')
args = parser.parse_args()
#---reparse if switching dataset---#
if args.dataset == 'surprise':
for action in parser._actions:
if ('path' in action.dest or 'synthesis_list' in action.dest or 'dir' in action.dest):
if 'english' in action.default:
action.default = action.default.replace('english', 'surprise')
args = parser.parse_args()
return args
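#---example invocation---#
# A typical run on the english set, evaluating target speaker V002, might look
# like this (paths are the argument defaults above; adjust them to your setup):
#   python eval_tacotron.py --dataset english --eval_t V002 \
#       --ckpt_dir ./ckpt_tacotron_english/ \
#       --model_name checkpoint_step500000.pth-english-V002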
def valid_arguments(valid_target, arg):
	if valid_target not in arg:
		raise RuntimeWarning('The keyword {} should appear in the argument {}; make sure you are running the correct file!'.format(valid_target, arg))
##################
# TEXT TO SPEECH #
##################
def tts(model, text):
"""Convert text to speech waveform given a Tacotron model.
"""
if USE_CUDA:
model = model.cuda()
# NOTE: dropout in the decoder should be activated for generalization!
# model.decoder.eval()
model.encoder.eval()
model.postnet.eval()
sequence = np.array(text_to_sequence(text))
sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
if USE_CUDA:
sequence = sequence.cuda()
# Greedy decoding
mel_outputs, linear_outputs, gate_outputs, alignments = model(sequence)
linear_output = linear_outputs[0].cpu().data.numpy()
spectrogram = audio._denormalize(linear_output)
alignment = alignments[0].cpu().data.numpy()
# Predicted audio signal
waveform = audio.inv_spectrogram(linear_output.T)
return waveform, alignment, spectrogram
####################
# SYNTHESIS SPEECH #
####################
def synthesis_speech(model, text, path):
waveform, alignment, spectrogram = tts(model, text)
librosa.output.write_wav(path, waveform, hp.sr)
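	# NOTE: librosa.output.write_wav was removed in librosa 0.8.0. If running on
	# a newer librosa, an equivalent call (assuming the soundfile package is
	# installed) would be:
	#   import soundfile as sf
	#   sf.write(path, waveform, hp.sr)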
########
# MAIN #
########
def main():
#---initialize---#
args = get_test_args()
HPS = Hps(args.hps_path)
hps = HPS.get_tuple()
trainer = get_trainer(args.hps_path, args.encoder_path, hps.g_mode, hps.enc_mode)
if args.eval_t == 'None':
		print('[Tacotron] - None is not a valid evaluation target! Please specify the target manually; it must be either V001 or V002.')
return
# Tacotron implementation: https://github.com/andi611/TTS-Tacotron-Pytorch
model = Tacotron(n_vocab=len(symbols),
embedding_dim=config.embedding_dim,
mel_dim=config.num_mels,
linear_dim=config.num_freq,
r=config.outputs_per_step,
padding_idx=config.padding_idx,
attention=config.attention,
use_mask=config.use_mask)
#---handle path---#
result_dir = os.path.join(args.result_dir, args.sub_result_dir)
os.makedirs(result_dir, exist_ok=True)
checkpoint_path = os.path.join(args.ckpt_dir, args.model_name)
	if args.dataset == 'english' and not os.path.isdir('./ckpt_tacotron_english'):
		print('[Tacotron] - Recommend using the following name for ckpt_dir: ./ckpt_tacotron_english/')
	elif args.dataset == 'surprise' and not os.path.isdir('./ckpt_tacotron_surprise'):
		print('[Tacotron] - Recommend using the following name for ckpt_dir: ./ckpt_tacotron_surprise/')
#---load and set model---#
	print('[Tacotron] - Testing on the {} set.'.format(args.dataset))
print('[Tacotron] - Loading model: ', checkpoint_path)
	checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)  # load to CPU first; tts() moves the model to GPU when available
model.load_state_dict(checkpoint["state_dict"])
#---load and set mappings---#
print('[Tacotron] - Loading mapping files: ', args.speaker2id_path)
valid_arguments(valid_target=args.dataset, arg=args.speaker2id_path)
with open(args.speaker2id_path, 'r') as f_json:
speaker2id = json.load(f_json)
print('[Tacotron] - Loading mapping files: ', args.multi2idx_path)
with open(args.multi2idx_path, 'r') as f_json:
multi2idx = json.load(f_json)
if not args.test_single:
#---parse testing list---#
print('[Tacotron] - Testing from list: ', args.synthesis_list)
valid_arguments(valid_target=args.dataset, arg=args.synthesis_list)
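		# Each line of the synthesis list is parsed as '<dir>/<speaker>_<utt> <target>',
		# e.g. a (hypothetical) line 'test/V002_0674932509 V001' would yield
		# s_id='V002', utt_id='0674932509', t_id='V001'.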
feeds = []
with open(args.synthesis_list, 'r') as f:
file = f.readlines()
for line in file:
				line = line.strip().split(' ')
feeds.append({'s_id' : line[0].split('/')[1].split('_')[0],
'utt_id' : line[0].split('/')[1].split('_')[1],
't_id' : line[1], })
		print('[Tacotron] - Number of files to be resynthesized: ', len(feeds))
for feed in tqdm(feeds):
if feed['t_id'] == args.eval_t:
wav_path = os.path.join(args.testing_dir, feed['s_id'] + '_' + feed['utt_id'] + '.wav')
_, spec = get_spectrograms(wav_path)
encodings = encode(spec, trainer, hps.seg_len, save=False)
encodings = parse_encodings(encodings)
line = ''.join([multi2idx[encoding] for encoding in encodings])
print(line)
out_path = os.path.join(result_dir, feed['t_id'] + '_' + feed['utt_id'] + '.wav')
synthesis_speech(model, text=line, path=out_path)
else:
wav_path = './data/english/train/voice/V002_0674932509.wav'
# wav_path = './data/english/train/voice/V002_2252538703.wav'
# wav_path = './data/english/train/voice/V002_1665800749.wav'
_, spec = get_spectrograms(wav_path)
encodings = encode(spec, trainer, hps.seg_len, save=False)
		# NOTE: write_encodings saves the extracted encodings; since synthesis_speech
		# below also writes to './result/result.wav', consider giving this call a
		# distinct path so the encoding file is not overwritten.
		write_encodings(path='./result/result.wav', encodings=encodings)
parsed_encodings = parse_encodings(encodings)
line = ''.join([multi2idx[encoding] for encoding in parsed_encodings])
print(line)
synthesis_speech(model, text=line, path='./result/result.wav')
# model.decoder.max_decoder_steps = config.max_decoder_steps # Set large max_decoder steps to handle long sentence outputs
sys.exit(0)
if __name__ == "__main__":
main()