forked from gooofy/zamia-speech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_gspv2.py
executable file
·147 lines (111 loc) · 4.93 KB
/
import_gspv2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 by Marc Puels
# Copyright 2016 by G.Bartsch
# Copyright 2015 Language Technology, Technische Universitaet Darmstadt (author: Benjamin Milde)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import codecs
from bs4 import BeautifulSoup
import shutil
import sys
import plac
from pathlib2 import Path
from nltools import misc
#
# convert german-speechdata-package-v2 to voxforge-style packages
#
@plac.annotations(
verbose=("Enable verbose logging", "flag", "v"))
def main(verbose=False):
"""Convert gspv2 corpus to the VoxForge corpus format
The variable `speech_arc` in ~/.speechrc must point to a folder
gspv2 which is used as the source containing the original gspv2 corpus,
i.e. containing the subfolders dev, test, and train.
The variable `speech_corpora` in ~/.speechrc must point to a folder
where the resulting corpus should be written. The script will create
a subfolder gspv2 here for the resulting voxforge-formatted data.
"""
misc.init_app('speech_audio_scan')
config = misc.load_config('.speechrc')
if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
speech_arc_dir = Path(config.get("speech", "speech_arc"))
speech_corpora_dir = Path(config.get("speech", "speech_corpora"))
src_root_dir = speech_arc_dir / "gspv2"
dst_root_dir = speech_corpora_dir / "gspv2"
exit_if_dst_root_dir_exists(dst_root_dir)
speakers = set()
speaker_gender = {}
for folder in ['train', 'test', 'dev']:
destdir = dst_root_dir
src_dir = src_root_dir / folder
num_files = len([f for f in src_dir.glob("*.xml")])
cnt_files = 0
for xml_path in src_dir.glob("*.xml"):
f = str(xml_path)
cnt_files += 1
fbase = f[0:len(f) - 4]
with codecs.open(f, 'r', 'utf-8') as xmlfile:
# remove urls
text = xmlfile.read()
soup = BeautifulSoup(text)
sentence = (soup.recording.sentence.string).strip()
cleaned_sentence = (
soup.recording.cleaned_sentence.string).strip()
sentence_id = int(
(soup.recording.sentence_id.string).strip())
speaker_id = (soup.recording.speaker_id.string).strip()
gender = (soup.recording.gender.string).strip()
name = 'gsp%s' % speaker_id.replace('-', '')
speakerdir = destdir / (name + "-1")
if not speaker_id in speakers:
speakers.add(speaker_id)
speaker_gender[name] = \
'm' if gender == 'male' else 'f'
(speakerdir / "wav").mkdir(parents=True, exist_ok=True)
(speakerdir / "etc").mkdir(parents=True, exist_ok=True)
for mic in ['Yamaha', 'Kinect-Beam', 'Kinect-RAW',
'Realtek', 'Samson']:
srcaudiofn = src_dir / ('%s_%s.wav' % (fbase, mic))
if not srcaudiofn.is_file():
continue
audiofn = Path('%s-%s' % (fbase, mic)).name
dstaudiofn = speakerdir / "wav" / (audiofn + ".wav")
logging.info('%5d/%5d %s %s %s' % (
cnt_files, num_files, name, audiofn,
str(srcaudiofn)))
prompts_orig = speakerdir / "etc" / "prompts-original"
with open(str(prompts_orig), 'a') as promptsf:
promptsf.write((u'%s %s\n' % (
audiofn, cleaned_sentence)).encode('utf8'))
copy_file(str(srcaudiofn), str(dstaudiofn))
# Usually, you don't to run this code. It was taken as the source for
# data/src/speech/gspv2/spk2gender.
#with open('gender.txt', 'w') as genderf:
# for name in sorted(speaker_gender.keys()):
# genderf.write('%s %s\n' % (name, speaker_gender[name]))
def exit_if_dst_root_dir_exists(dst_root_dir):
if dst_root_dir.is_dir():
logging.error(
"Destination folder {} already exists. Please either rename or "
"remove it.".format(dst_root_dir))
sys.exit(1)
def copy_file (src, dst):
logging.debug("copying %s to %s" % (src, dst))
shutil.copy(src, dst)
if __name__ == "__main__":
plac.call(main)