From d4e79eb52e8b01d24703b2dfc0385544092958f3 Mon Sep 17 00:00:00 2001
From: ming024 <r08922080@ntu.edu.tw>
Date: Thu, 8 Jul 2021 11:40:51 +0800
Subject: [PATCH] update README and demo page

---
 README.md        |  27 ++++--
 index.html       | 235 +++++++++++++++++++++++++++++++++++++++++++++--
 model/modules.py |   6 +-
 3 files changed, 249 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index d9ac71c03f..90811f18c9 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ On the other hand, pitch spectrograms extracted by continuous wavelet transform
 ![](./img/model.png)
 
 # Updates
+- 2021/7/8: Release the checkpoint and audio samples of a multi-speaker English TTS model trained on LibriTTS
 - 2021/2/26: Support English and Mandarin TTS
 - 2021/2/26: Support multi-speaker TTS (AISHELL-3 and LibriTTS)
 - 2021/2/26: Support MelGAN and HiFi-GAN vocoder
@@ -27,7 +28,7 @@ pip3 install -r requirements.txt
 
 ## Inference
 
-You have to download the [pretrained models](https://drive.google.com/drive/folders/1DOhZGlTLMbbAAFZmZGDdc77kz1PloS7F?usp=sharing) and put them in ``output/ckpt/LJSpeech/`` or ``output/ckpt/AISHELL3``.
+You have to download the [pretrained models](https://drive.google.com/drive/folders/1DOhZGlTLMbbAAFZmZGDdc77kz1PloS7F?usp=sharing) and put them in ``output/ckpt/LJSpeech/``, ``output/ckpt/AISHELL3/``, or ``output/ckpt/LibriTTS/``.
 
 For English single-speaker TTS, run
 ```
@@ -36,7 +37,12 @@ python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode si
 
 For Mandarin multi-speaker TTS, try
 ```
-python3 synthesize.py --text "大家好" --speaker_id SPEAKER_ID --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
+python3 synthesize.py --text "大家好" --speaker_id SPEAKER_ID --restore_step 600000 --mode single -p config/AISHELL3/preprocess.yaml -m config/AISHELL3/model.yaml -t config/AISHELL3/train.yaml
+```
+
+For English multi-speaker TTS, run
+```
+python3 synthesize.py --text "YOUR_DESIRED_TEXT" --speaker_id SPEAKER_ID --restore_step 800000 --mode single -p config/LibriTTS/preprocess.yaml -m config/LibriTTS/model.yaml -t config/LibriTTS/train.yaml
 ```
 
 The generated utterances will be put in ``output/result/``.
@@ -81,7 +87,7 @@ python3 prepare_align.py config/LJSpeech/preprocess.yaml
 for some preparations.
 
 As described in the paper, [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/) (MFA) is used to obtain the alignments between the utterances and the phoneme sequences.
-Alignments for the LJSpeech and AISHELL-3 datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing).
+Alignments for the supported datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing).
 You have to unzip the files in ``preprocessed_data/LJSpeech/TextGrid/``.
 
 After that, run the preprocessing script by
@@ -129,7 +135,7 @@ The loss curves, synthesized mel-spectrograms, and audios are shown.
 
 # Implementation Issues
 
-- Following [xcmyz's implementation](https://github.com/xcmyz/FastSpeech), I use an additional Tacotron-2-styled Postnet after the decoder, which is not used in the original paper.
+- Following [xcmyz's implementation](https://github.com/xcmyz/FastSpeech), I use an additional Tacotron-2-styled Post-Net after the decoder, which is not used in the original FastSpeech 2.
 - Gradient clipping is used in the training.
 - In my experience, using phoneme-level pitch and energy prediction instead of frame-level prediction results in much better prosody, and normalizing the pitch and energy features also helps. Please refer to ``config/README.md`` for more details.
 
@@ -143,12 +149,13 @@ Please inform me if you find any mistakes in this repo, or any useful tips to tr
 
 # Citation
 ```
-@misc{chien2021investigating,
+@INPROCEEDINGS{chien2021investigating,
+  author={Chien, Chung-Ming and Lin, Jheng-Hao and Huang, Chien-yu and Hsu, Po-chun and Lee, Hung-yi},
+  booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
   title={Investigating on Incorporating Pretrained and Learnable Speaker Representations for Multi-Speaker Multi-Style Text-to-Speech}, 
-  author={Chung-Ming Chien and Jheng-Hao Lin and Chien-yu Huang and Po-chun Hsu and Hung-yi Lee},
   year={2021},
-  eprint={2103.04088},
-  archivePrefix={arXiv},
-  primaryClass={eess.AS}
-}
+  volume={},
+  number={},
+  pages={8588-8592},
+  doi={10.1109/ICASSP39728.2021.9413880}}
 ```
diff --git a/index.html b/index.html
index dee8bb0e32..118f4452aa 100644
--- a/index.html
+++ b/index.html
@@ -52,10 +52,11 @@ <h2>English Single-Speaker TTS</h2>
     <b>Dataset</b>: <a href="https://keithito.com/LJ-Speech-Dataset/">LJSpeech</a>
   </div>
   <div>
-    <b>Checkpoint</b>: <a href="https://drive.google.com/file/d/1r3fYhnblBJ8hDKDSUDtidJ-BN-xAM9pe/view?usp=sharing">link</a>
+    <b>Checkpoint</b>: <a
+      href="https://drive.google.com/file/d/1r3fYhnblBJ8hDKDSUDtidJ-BN-xAM9pe/view?usp=sharing">link</a>
   </div>
   <div>
-    <b>Config</b>: <a href="https://github.com/ming024/FastSpeech2/config/LJSpeech">link</a>
+    <b>Config</b>: <a href="https://github.com/ming024/FastSpeech2/tree/master/config/LJSpeech">link</a>
   </div>
   <div>
     <b>Vocoder</b>: <a href="https://github.com/jik876/hifi-gan">HiFi-GAN (LJSpeech)</a>
@@ -292,10 +293,11 @@ <h2>Mandarin Multi-Speaker TTS</h2>
     <b>Dataset</b>: <a href="http://www.aishelltech.com/aishell_3">AISHELL-3</a>
   </div>
   <div>
-    <b>Checkpoint</b>: <a href="https://drive.google.com/file/d/1uYWd5JlaK-fochQ2JFgIP_wOEkoLPLqs/view?usp=sharing">link</a>
+    <b>Checkpoint</b>: <a
+      href="https://drive.google.com/file/d/1uYWd5JlaK-fochQ2JFgIP_wOEkoLPLqs/view?usp=sharing">link</a>
   </div>
   <div>
-    <b>Config</b>: <a href="https://github.com/ming024/FastSpeech2/config/AISHELL3">link</a>
+    <b>Config</b>: <a href="https://github.com/ming024/FastSpeech2/tree/master/config/AISHELL3">link</a>
   </div>
   <div>
     <b>Vocoder</b>: <a href="https://github.com/jik876/hifi-gan">HiFi-GAN (universal)</a>
@@ -337,7 +339,8 @@ <h2>Mandarin Multi-Speaker TTS</h2>
     </tr>
   </table>
   <div><b>Speaker</b>: <em>SSB0794</em></div>
-  <div><b>Text</b>: <em>中国，今日有望创下历史单，日，出行人次最高 (zhong1 guo2 sp jin1 ri4 you3 wang4 chuang4 xia4 li4 shi3 dan1 sp ri4 sp chu1 xing2 ren2 ci4 zui4 gao1)</em></div>
+  <div><b>Text</b>: <em>中国，今日有望创下历史单，日，出行人次最高 (zhong1 guo2 sp jin1 ri4 you3 wang4 chuang4 xia4 li4 shi3 dan1 sp ri4 sp
+      chu1 xing2 ren2 ci4 zui4 gao1)</em></div>
 
   <hr>
 
@@ -489,7 +492,227 @@ <h2>Mandarin Multi-Speaker TTS</h2>
     </tr>
   </table>
   <div><b>Speaker</b>: <em>SSB0822</em></div>
-  <div><b>Text</b>: <em>该职位员工可以，居住在西雅，图或，洛，杉矶 (gai1 zhi2 wei4 yuan2 gong1 ke2 yi3 sp ju1 zhu4 zai4 xi1 ya3 sp tu2 huo4 sp luo4 sp shan3 ji1)</em></div>
+  <div><b>Text</b>: <em>该职位员工可以，居住在西雅，图或，洛，杉矶 (gai1 zhi2 wei4 yuan2 gong1 ke2 yi3 sp ju1 zhu4 zai4 xi1 ya3 sp tu2 huo4
+      sp luo4 sp shan3 ji1)</em></div>
+
+  </p>
+
+  <hr>
+
+  <p>
+  <h2>English Multi-Speaker TTS</h2>
+  <div>
+    <b>Dataset</b>: <a href="https://research.google/tools/datasets/libri-tts/">LibriTTS</a>
+  </div>
+  <div>
+    <b>Checkpoint</b>: <a
+      href="https://drive.google.com/file/d/1M6BxJtTxYW56dG1Myz9MqZmG_OXJLUqy/view?usp=sharing">link</a>
+  </div>
+  <div>
+    <b>Config</b>: <a href="https://github.com/ming024/FastSpeech2/tree/master/config/LibriTTS">link</a>
+  </div>
+  <div>
+    <b>Vocoder</b>: <a href="https://github.com/jik876/hifi-gan">HiFi-GAN (universal)</a>
+  </div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">207_131203_000011_000000 (Ground-Truth) </th>
+      <th style="text-align: center">207_131203_000011_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/207_131203_000011_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/207_131203_000011_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>207</em></div>
+  <div><b>Text</b>: <em>his mother, however, was a little shy of the company for him, and besides she could not always
+      spare him.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">2299_6524_000057_000000 (Ground-Truth) </th>
+      <th style="text-align: center">2299_6524_000057_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/2299_6524_000057_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/2299_6524_000057_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>2299</em></div>
+  <div><b>Text</b>: <em>on the arrival at the hut to my chagrin we found it filled with snow.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">2388_153731_000003_000000 (Ground-Truth) </th>
+      <th style="text-align: center">2388_153731_000003_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/2388_153731_000003_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/2388_153731_000003_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>2388</em></div>
+  <div><b>Text</b>: <em>the story of the first scientific observation of the corona and the prominences is thrillingly
+      interesting, and in fact dramatic.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">3615_14677_000014_000000 (Ground-Truth) </th>
+      <th style="text-align: center">3615_14677_000014_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/3615_14677_000014_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/3615_14677_000014_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>3615</em></div>
+  <div><b>Text</b>: <em>beat three eggs with a pinch of salt; add one pint of milk and two thirds of a cup of
+      flour.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">4744_31668_000003_000001 (Ground-Truth) </th>
+      <th style="text-align: center">4744_31668_000003_000001 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/4744_31668_000003_000001_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/4744_31668_000003_000001_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>4744</em></div>
+  <div><b>Text</b>: <em>far from morbid naturally, she did her best to deny the thought, and so simple and unartificial
+      was her type of mind that for weeks together she would wholly lose it.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">480_127525_000006_000000 (Ground-Truth) </th>
+      <th style="text-align: center">480_127525_000006_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/480_127525_000006_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/480_127525_000006_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>480</em></div>
+  <div><b>Text</b>: <em>i tried and found by experiment that the tide kept sweeping us westward until i had laid her
+      head due east, or just about right angles to the way we ought to go.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">5126_27504_000011_000003 (Ground-Truth) </th>
+      <th style="text-align: center">5126_27504_000011_000003 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/5126_27504_000011_000003_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/5126_27504_000011_000003_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>5126</em></div>
+  <div><b>Text</b>: <em>it looked as if our luck was dead out, and we began to think our chance of getting across the
+      border to queensland, and clear out of the colony that way, looked worse every day.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">6098_57837_000008_000000 (Ground-Truth) </th>
+      <th style="text-align: center">6098_57837_000008_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/6098_57837_000008_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/6098_57837_000008_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>6098</em></div>
+  <div><b>Text</b>: <em>word was sent of their predicament to the nearest fort, and lieutenant pershing was sent with a
+      small detachment to their rescue.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">6544_231862_000065_000001 (Ground-Truth) </th>
+      <th style="text-align: center">6544_231862_000065_000001 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/6544_231862_000065_000001_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/6544_231862_000065_000001_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>6544</em></div>
+  <div><b>Text</b>: <em>denzil did look, and uttered a second cry more startling than the first.</em></div>
+
+  <hr>
+
+  <table>
+    <tr>
+      <th style="text-align: center">968_122545_000053_000000 (Ground-Truth) </th>
+      <th style="text-align: center">968_122545_000053_000000 (Synthesized) </th>
+    </tr>
+    <tr>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/968_122545_000053_000000_ground-truth.wav" autoplay />
+        </audio></td>
+      <td style="text-align: center"><audio controls="controls">
+          <source src="./demo/LibriTTS/968_122545_000053_000000_synthesized.wav" autoplay />
+        </audio></td>
+    </tr>
+  </table>
+  <div><b>Speaker</b>: <em>968</em></div>
+  <div><b>Text</b>: <em>the bee fought the window angrily, up and down, up and down, for several minutes; then found the
+      open glass and whirled
+      out into the sunshine, joyfully.</em></div>
 
   </p>
 
diff --git a/model/modules.py b/model/modules.py
index f694d56a6e..9933725ffc 100644
--- a/model/modules.py
+++ b/model/modules.py
@@ -15,7 +15,7 @@
 
 
 class VarianceAdaptor(nn.Module):
-    """ Variance Adaptor """
+    """Variance Adaptor"""
 
     def __init__(self, preprocess_config, model_config):
         super(VarianceAdaptor, self).__init__()
@@ -159,7 +159,7 @@ def forward(
 
 
 class LengthRegulator(nn.Module):
-    """ Length Regulator """
+    """Length Regulator"""
 
     def __init__(self):
         super(LengthRegulator, self).__init__()
@@ -195,7 +195,7 @@ def forward(self, x, duration, max_len):
 
 
 class VariancePredictor(nn.Module):
-    """ Duration, Pitch and Energy Predictor """
+    """Duration, Pitch and Energy Predictor"""
 
     def __init__(self, model_config):
         super(VariancePredictor, self).__init__()