From d4e79eb52e8b01d24703b2dfc0385544092958f3 Mon Sep 17 00:00:00 2001 From: ming024 Date: Thu, 8 Jul 2021 11:40:51 +0800 Subject: [PATCH] update README and demo page --- README.md | 27 ++++-- index.html | 235 +++++++++++++++++++++++++++++++++++++++++++++-- model/modules.py | 6 +- 3 files changed, 249 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d9ac71c03f..90811f18c9 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ On the other hand, pitch spectrograms extracted by continuous wavelet transform ![](./img/model.png) # Updates +- 2021/7/8: Release the checkpoint and audio samples of a multi-speaker English TTS model trained on LibriTTS - 2021/2/26: Support English and Mandarin TTS - 2021/2/26: Support multi-speaker TTS (AISHELL-3 and LibriTTS) - 2021/2/26: Support MelGAN and HiFi-GAN vocoder @@ -27,7 +28,7 @@ pip3 install -r requirements.txt ## Inference -You have to download the [pretrained models](https://drive.google.com/drive/folders/1DOhZGlTLMbbAAFZmZGDdc77kz1PloS7F?usp=sharing) and put them in ``output/ckpt/LJSpeech/`` or ``output/ckpt/AISHELL3``. +You have to download the [pretrained models](https://drive.google.com/drive/folders/1DOhZGlTLMbbAAFZmZGDdc77kz1PloS7F?usp=sharing) and put them in ``output/ckpt/LJSpeech/``, ``output/ckpt/AISHELL3/``, or ``output/ckpt/LibriTTS/``. 
For English single-speaker TTS, run ``` @@ -36,7 +37,12 @@ python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode si For Mandarin multi-speaker TTS, try ``` -python3 synthesize.py --text "大家好" --speaker_id SPEAKER_ID --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml +python3 synthesize.py --text "大家好" --speaker_id SPEAKER_ID --restore_step 600000 --mode single -p config/AISHELL3/preprocess.yaml -m config/AISHELL3/model.yaml -t config/AISHELL3/train.yaml +``` + +For English multi-speaker TTS, run +``` +python3 synthesize.py --text "YOUR_DESIRED_TEXT" --speaker_id SPEAKER_ID --restore_step 800000 --mode single -p config/LibriTTS/preprocess.yaml -m config/LibriTTS/model.yaml -t config/LibriTTS/train.yaml ``` The generated utterances will be put in ``output/result/``. @@ -81,7 +87,7 @@ python3 prepare_align.py config/LJSpeech/preprocess.yaml for some preparations. As described in the paper, [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/) (MFA) is used to obtain the alignments between the utterances and the phoneme sequences. -Alignments for the LJSpeech and AISHELL-3 datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing). +Alignments of the supported datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing). You have to unzip the files in ``preprocessed_data/LJSpeech/TextGrid/``. After that, run the preprocessing script by @@ -129,7 +135,7 @@ The loss curves, synthesized mel-spectrograms, and audios are shown. # Implementation Issues -- Following [xcmyz's implementation](https://github.com/xcmyz/FastSpeech), I use an additional Tacotron-2-styled Postnet after the decoder, which is not used in the original paper. 
+- Following [xcmyz's implementation](https://github.com/xcmyz/FastSpeech), I use an additional Tacotron-2-styled Post-Net after the decoder, which is not used in the original FastSpeech 2. - Gradient clipping is used in the training. - In my experience, using phoneme-level pitch and energy prediction instead of frame-level prediction results in much better prosody, and normalizing the pitch and energy features also helps. Please refer to ``config/README.md`` for more details. @@ -143,12 +149,13 @@ Please inform me if you find any mistakes in this repo, or any useful tips to tr # Citation ``` -@misc{chien2021investigating, +@INPROCEEDINGS{chien2021investigating, + author={Chien, Chung-Ming and Lin, Jheng-Hao and Huang, Chien-yu and Hsu, Po-chun and Lee, Hung-yi}, + booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, title={Investigating on Incorporating Pretrained and Learnable Speaker Representations for Multi-Speaker Multi-Style Text-to-Speech}, - author={Chung-Ming Chien and Jheng-Hao Lin and Chien-yu Huang and Po-chun Hsu and Hung-yi Lee}, year={2021}, - eprint={2103.04088}, - archivePrefix={arXiv}, - primaryClass={eess.AS} -} + volume={}, + number={}, + pages={8588-8592}, + doi={10.1109/ICASSP39728.2021.9413880}} ``` diff --git a/index.html b/index.html index dee8bb0e32..118f4452aa 100644 --- a/index.html +++ b/index.html @@ -52,10 +52,11 @@

English Single-Speaker TTS

Dataset: LJSpeech
- Checkpoint: link + Checkpoint: link
- Config: link + Config: link
Vocoder: HiFi-GAN (LJSpeech) @@ -292,10 +293,11 @@

Mandarin Multi-Speaker TTS

Dataset: AISHELL-3
- Checkpoint: link + Checkpoint: link
- Config: link + Config: link
Vocoder: HiFi-GAN (universal) @@ -337,7 +339,8 @@

Mandarin Multi-Speaker TTS

Speaker: SSB0794
-
Text: 中国,今日有望创下历史单,日,出行人次最高 (zhong1 guo2 sp jin1 ri4 you3 wang4 chuang4 xia4 li4 shi3 dan1 sp ri4 sp chu1 xing2 ren2 ci4 zui4 gao1)
+
Text: 中国,今日有望创下历史单,日,出行人次最高 (zhong1 guo2 sp jin1 ri4 you3 wang4 chuang4 xia4 li4 shi3 dan1 sp ri4 sp + chu1 xing2 ren2 ci4 zui4 gao1)

@@ -489,7 +492,227 @@

Mandarin Multi-Speaker TTS

Speaker: SSB0822
-
Text: 该职位员工可以,居住在西雅,图或,洛,杉矶 (gai1 zhi2 wei4 yuan2 gong1 ke2 yi3 sp ju1 zhu4 zai4 xi1 ya3 sp tu2 huo4 sp luo4 sp shan3 ji1)
+
Text: 该职位员工可以,居住在西雅,图或,洛,杉矶 (gai1 zhi2 wei4 yuan2 gong1 ke2 yi3 sp ju1 zhu4 zai4 xi1 ya3 sp tu2 huo4 + sp luo4 sp shan3 ji1)
+ +

+ +
+ +

+

English Multi-Speaker TTS

+
+ Dataset: LibriTTS +
+
+ Checkpoint: link +
+
+ Config: link +
+
+ Vocoder: HiFi-GAN (universal) +
+ +
+ + + + + + + + + + +
207_131203_000011_000000 (Ground-Truth) 207_131203_000011_000000 (Synthesized)
+
Speaker: 207
+
Text: his mother, however, was a little shy of the company for him, and besides she could not always + spare him.
+ +
+ + + + + + + + + + +
2299_6524_000057_000000 (Ground-Truth) 2299_6524_000057_000000 (Synthesized)
+
Speaker: 2299
+
Text: on the arrival at the hut to my chagrin we found it filled with snow.
+ +
+ + + + + + + + + + +
2388_153731_000003_000000 (Ground-Truth) 2388_153731_000003_000000 (Synthesized)
+
Speaker: 2388
+
Text: the story of the first scientific observation of the corona and the prominences is thrillingly + interesting, and in fact dramatic.
+ +
+ + + + + + + + + + +
3615_14677_000014_000000 (Ground-Truth) 3615_14677_000014_000000 (Synthesized)
+
Speaker: 3615
+
Text: beat three eggs with a pinch of salt; add one pint of milk and two thirds of a cup of + flour.
+ +
+ + + + + + + + + + +
4744_31668_000003_000001 (Ground-Truth) 4744_31668_000003_000001 (Synthesized)
+
Speaker: 4744
+
Text: far from morbid naturally, she did her best to deny the thought, and so simple and unartificial + was her type of mind that for weeks together she would wholly lose it.
+ +
+ + + + + + + + + + +
480_127525_000006_000000 (Ground-Truth) 480_127525_000006_000000 (Synthesized)
+
Speaker: 480
+
Text: i tried and found by experiment that the tide kept sweeping us westward until i had laid her + head due east, or just about right angles to the way we ought to go.
+ +
+ + + + + + + + + + +
5126_27504_000011_000003 (Ground-Truth) 5126_27504_000011_000003 (Synthesized)
+
Speaker: 5126
+
Text: it looked as if our luck was dead out, and we began to think our chance of getting across the + border to queensland, and clear out of the colony that way, looked worse every day.
+ +
+ + + + + + + + + + +
6098_57837_000008_000000 (Ground-Truth) 6098_57837_000008_000000 (Synthesized)
+
Speaker: 6098
+
Text: word was sent of their predicament to the nearest fort, and lieutenant pershing was sent with a + small detachment to their rescue.
+ +
+ + + + + + + + + + +
6544_231862_000065_000001 (Ground-Truth) 6544_231862_000065_000001 (Synthesized)
+
Speaker: 6544
+
Text: denzil did look, and uttered a second cry more startling than the first.
+ +
+ + + + + + + + + + +
968_122545_000053_000000 (Ground-Truth) 968_122545_000053_000000 (Synthesized)
+
Speaker: 968
+
Text: the bee fought the window angrily, up and down, up and down, for several minutes; then found the + open glass and whirled + out into the sunshine, joyfully.

diff --git a/model/modules.py b/model/modules.py index f694d56a6e..9933725ffc 100644 --- a/model/modules.py +++ b/model/modules.py @@ -15,7 +15,7 @@ class VarianceAdaptor(nn.Module): - """ Variance Adaptor """ + """Variance Adaptor""" def __init__(self, preprocess_config, model_config): super(VarianceAdaptor, self).__init__() @@ -159,7 +159,7 @@ def forward( class LengthRegulator(nn.Module): - """ Length Regulator """ + """Length Regulator""" def __init__(self): super(LengthRegulator, self).__init__() @@ -195,7 +195,7 @@ def forward(self, x, duration, max_len): class VariancePredictor(nn.Module): - """ Duration, Pitch and Energy Predictor """ + """Duration, Pitch and Energy Predictor""" def __init__(self, model_config): super(VariancePredictor, self).__init__()