# abook-transcribe.py

#!/usr/bin/env python
# -*- coding: utf-8 -*- 

#
# Copyright 2014, 2018 Guenter Bartsch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import sys
import os
import codecs
import traceback
import logging
import re
import readline
import wave
import datetime

from optparse               import OptionParser
from StringIO               import StringIO
from nltools                import misc
from nltools.tts            import TTS
from nltools.tokenizer      import tokenize
from nltools.sequiturclient import sequitur_gen_ipa
from nltools.phonetics      import ipa2xsampa, xsampa2ipa
from speech_lexicon         import Lexicon

#
# - play back segments
# - edit/review transcripts
# - add missing words to the dictionary
#

SAMPLE_RATE       = 16000

PROC_TITLE        = 'abook-transcribe'
AUDACITY_DURATION = 3.0
SEQUITUR_MODEL    = 'data/models/sequitur-dict-de.ipa-latest'

#
# menu subroutines
#

def play_wav(start=0.0, stop=1.0):

    global tts, segmentfn

    wavef = wave.open(segmentfn, 'rb')

    num_channels = wavef.getnchannels()
    num_frames   = wavef.getnframes()
    frame_rate   = wavef.getframerate()
    sampwidth    = wavef.getsampwidth()

    duration = float(num_frames) / float(frame_rate)

    buf = StringIO()
    wavout = wave.open(buf, 'w')

    wavout.setframerate(frame_rate)
    wavout.setnchannels(num_channels)
    wavout.setsampwidth(sampwidth)
    
    wavef.setpos(int(start * num_frames))
    samples = wavef.readframes(int((stop-start) * num_frames))
    wavout.writeframes(samples)

    wavef.close()
    wavout.close()

    # with open(segmentfn) as wavf:
    #     wav = wavf.read()
    tts.play_wav(buf.getvalue(), async=True)

def audacity():
    """Open the current segment in audacity, running in the background."""

    global segmentfn

    # local import: pipes is not among this file's top-level imports
    import pipes

    # quote the file name so that paths containing blanks or shell
    # metacharacters reach audacity as one single argument
    cmd = 'audacity %s &' % pipes.quote(segmentfn)
    os.system(cmd)

def next_segment():
    """Select the next segment to work on and reset the prompt.

    Sets the global segmentfn to the alphabetically first '.wav' entry of
    segdirfn (as '<segdirfn>/<name>'), or to None if there is none, and
    sets the global prompt to the empty unicode string.
    """

    global segmentfn
    global prompt

    # reset state first, so it is cleared even if listing the directory fails
    segmentfn = None
    prompt    = u''

    wav_names = [name for name in sorted(os.listdir(segdirfn)) if name.endswith('.wav')]
    if wav_names:
        segmentfn = '%s/%s' % (segdirfn, wav_names[0])

def lex_gen_ipa (lex_base, locale, engine, voice, speak=False):

    global tts

    ipas = u''
    try:

        if engine == 'sequitur':
            ipas = sequitur_gen_ipa (SEQUITUR_MODEL, lex_base)
        
        else:
            tts.locale = locale
            tts.engine = engine
            tts.voice  = voice
            ipas = tts.gen_ipa (lex_base)

        if speak:
            tts.locale = 'de'
            tts.engine = 'mary'
            tts.voice  = 'dfki-pavoque-neutral-hsmm'
            tts.say_ipa(ipas, async=True)

    except:
        logging.error('EXCEPTION CAUGHT %s' % traceback.format_exc())

    return ipas

def lex_edit(lex_token):

    global lex, lang

    lex_base  = lex_token.split('_')[0]

    if lex_token in lex:
        lex_entry = lex[lex_token]

    else:
        ipas = lex_gen_ipa(lex_base, 'de', 'sequitur', 'de')
        lex_entry = {'ipa': ipas}
        lex[lex_token] = lex_entry

    ipas = lex_entry['ipa']

    try:
        tts.locale ='de'
        tts.engine ='mary'
        tts.voice  ='dfki-pavoque-neutral-hsmm'
        tts.say_ipa(ipas, async=True)
    except:
        logging.error('EXCEPTION CAUGHT %s' % traceback.format_exc())

    lex_gen = {}

    lex_gen['de-mary']     = lex_gen_ipa(lex_base, 'de', 'mary',     'bits3')
    lex_gen['de-espeak']   = lex_gen_ipa(lex_base, 'de', 'espeak',   'de')
    lex_gen['de-sequitur'] = lex_gen_ipa(lex_base, 'de', 'sequitur', 'de')

    while True:

        print
        print u"Token       : %s" % lex_token
        print u"IPA         : %s" % lex_entry['ipa']
        print

        for engine in sorted(lex_gen):
            print u"%-11s : %s" % (engine, lex_gen[engine])
        print

        if lex_token in lex:
            m = lex.get_multi(lex_token)
            for k in m:
                print u"%s [%s]" % (k, m[k]['ipa'])

        else:
            print u"NEW TOKEN"

        print u"SPEAK  P:de-unitsel  O:de-hsmm                   I:fr-hsmm   U:en-hsmm"
        print u"GEN    G:de-mary     H:de-espeak  J:de-sequitur  K:fr-mary   L:en-mary"
        print u"       E:Edit        Q:Quit "

        try:

            resp = raw_input("Lex> ")

            # quit
            if resp.lower() == 'q':
                break  
        
            # generate de-mary
            elif resp.lower() == 'g':
                lex_entry['ipa'] = lex_gen_ipa (lex_base, 'de', 'mary', 'bits3', True)

            # generate de-espeak
            elif resp.lower() == 'h':
                lex_entry['ipa'] = lex_gen_ipa (lex_base, 'de', 'espeak', 'de', True)
                
            # generate en-mary 
            elif resp.lower() == 'l':
                
                tts.locale ='en-US'
                tts.engine ='mary'
                tts.voice  ='cmu-rms-hsmm'

                ipas = tts.gen_ipa (lex_base)
                tts.say_ipa(ipas, async=True)
                lex_entry['ipa'] = ipas

            # generate fr-mary 
            elif resp.lower() == 'k':
                
                tts.locale ='fr'
                tts.engine ='mary'
                tts.voice  ='upmc-pierre-hsmm'

                ipas = tts.gen_ipa (lex_base)
                tts.say_ipa(ipas, async=True)
                lex_entry['ipa'] = ipas

            # generate de-sequitur
            elif resp.lower() == 'j':
                lex_entry['ipa'] = lex_gen_ipa (lex_base, 'de', 'sequitur', 'de', True)
                
            # speak de mary unitsel 
            elif resp.lower() == 'p':
        
                if len(lex_entry['ipa']) == 0:
                    continue
        
                ipas = lex_entry['ipa']

                tts.locale ='de'
                tts.engine ='mary'
                tts.voice  ='bits3'

                tts.say_ipa(ipas, async=True)

            # speak de mary hsmm
            elif resp.lower() == 'o':
        
                if len(lex_entry['ipa']) == 0:
                    continue
        
                ipas = lex_entry['ipa']

                tts.locale = 'de'
                tts.engine = 'mary'
                tts.voice  = 'dfki-pavoque-neutral-hsmm'

                tts.say_ipa(ipas, async=True)

            # speak fr mary hsmm
            elif resp.lower() == 'i':
       
                if len(lex_entry['ipa']) == 0:
                    continue
        
                ipas = lex_entry['ipa']

                tts.locale ='fr'
                tts.engine ='mary'
                tts.voice  ='upmc-pierre-hsmm'

                tts.say_ipa(ipas, async=True)
       
            # speak en mary hsmm
            elif resp.lower() == 'u':
        
                ipas = lex_entry['ipa']

                tts.locale = 'en-US'
                tts.engine = 'mary'
                tts.voice  = 'cmu-rms-hsmm'

                tts.say_ipa(ipas, async=True)
       
            # edit XS
            elif resp.lower() == 'e':
        
                ipas = lex_entry['ipa']

                xs = ipa2xsampa (lex_token, ipas, stress_to_vowels=False)
                readline.add_history(xs)
                xs = raw_input(xs + '> ')

                ipas = xsampa2ipa (lex_token, xs)
    
                lex_entry['ipa'] = ipas

        except:
            logging.error('EXCEPTION CAUGHT %s' % traceback.format_exc())

    lex.save()
    print "new lexicon saved."
    print

#
# init terminal
#

misc.init_app (PROC_TITLE)

# keep up to 1000 entries in the readline history used by the prompts
readline.set_history_length(1000)

#
# command line
#

# exactly one positional argument is expected: the directory containing
# the wav segments to be transcribed
parser = OptionParser("usage: %prog [options] segmentsdir")

parser.add_option("-s", "--speaker1", dest="speaker1", type = "str", default='alice',
                  help="speaker #1 (default: alice)")
parser.add_option("-S", "--speaker2", dest="speaker2", type = "str", default='bob',
                  help="speaker #2 (default: bob)")
parser.add_option("-l", "--lang", dest="lang", type = "str", default='de',
                  help="language (default: de)")
parser.add_option("-o", "--out-dir", dest="outdir", type = "str", default='abook/out',
                  help="output directory (default: abook/out)")
parser.add_option("-v", "--verbose", action="store_true", dest="verbose", 
                  help="enable debug output")
parser.add_option("-t", "--transcript", dest="transcript", type = "str",
                  help="transcript (optional)")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

lang      = options.lang
segdirfn  = args[0]
speaker1  = options.speaker1
speaker2  = options.speaker2

# wavdirfn  = '%s/wav' % subdirfn
# promptsfn = '%s/etc/prompts-original' % subdirfn

# words of the optional transcript file, in reading order; the main loop
# consumes them from the front
transcript = []

if options.transcript:
    with codecs.open(options.transcript, 'r', 'utf8') as tf:
        # line breaks separate words just like blanks do
        flat = tf.read().replace('\n', ' ').replace('\r', ' ')

    # split on single blanks and drop the empty strings that runs of
    # blanks produce
    transcript = [word for word in flat.split(u' ') if word]

#
# config
#

# read the settings file '.speechrc'; both lookups below use its
# "speech" section
config = misc.load_config('.speechrc')

# NOTE(review): neither value is referenced again in the visible part of
# this script - the lookups may only serve as a check that the options
# exist; confirm before removing them
vf_login    = config.get("speech", "vf_login")
extrasdir   = config.get("speech", "extrasdir_%s" % lang)

#
# TTS (for audio output)
#

# single module-wide TTS object used by play_wav(), lex_gen_ipa() and
# lex_edit(); locale/engine/voice given here are only initial values,
# those functions overwrite them before generating or speaking.
# NOTE(review): meaning of the first two arguments ('local', 0) is not
# visible here - see nltools.tts.TTS
tts = TTS ('local', 0, locale='de', voice='bits3', engine='espeak')

#
# load lexicon
#

# the lexicon is selected by the language option; it is the module-wide
# object the main loop and lex_edit() look tokens up in
logging.info("loading lexicon...")
lex = Lexicon(file_name=options.lang)
logging.info("loading lexicon...done.")

#
# main ui loop
#

next_segment()

while segmentfn:

    print
    print segmentfn
    print prompt

    # any words not covered by our lexicon?

    lex_missing = set()

    tokens = tokenize(prompt, lang=lang)
    ts = u' '.join(tokens)

    for t in tokens:
        if not t in lex:
            lex_missing.add(t)
    if lex_missing:
        print
        print u"MISSING TOKENS: %s" % repr(sorted(lex_missing))

    print

    if transcript:
        ts = u' '.join(transcript[:10])
        print "TS (A:Add, S:Skipt): %s" % ts
        print

    print "Playback: P:All U:1/3 I:2/3 O:3/3 Y:Audacity" 
    resp = raw_input("E:Edit L:Lex 1:%s 2:%s 0:Delete Q:Quit Prompt>" % (speaker1, speaker2))

    if resp.lower() == 'q':
        break
   
    elif resp.lower() == 'u':
        play_wav(0.0,   0.333)
    elif resp.lower() == 'i':
        play_wav(0.333, 0.666)
    elif resp.lower() == 'o':
        play_wav(0.666, 1.0  )

    elif resp.lower() == 'a':
        t = transcript.pop(0)
        if prompt:
            prompt += u' ' + t
        else:
            prompt = t
        readline.add_history(prompt)

    elif resp.lower() == 's':
        t = transcript.pop(0)

    elif resp.lower() == 'p':
        play_wav()

    elif resp.lower() == 'y':
        audacity()

    elif resp.lower() == 'e':
        prompt = raw_input("Prompt> ")

    elif resp.lower() == 'l':
        if not lex_missing:
            print "All words are covered by the dictionary."
            continue
        lex_edit(list(lex_missing)[0])

    elif resp == '0':
        os.remove(segmentfn)
        next_segment()
        play_wav()

    elif resp == '1' or resp == '2':
        if lex_missing:
            print "Not all words are covered by the dictionary."
            continue

        speaker = options.speaker1 if resp == '1' else options.speaker2

        # does a directory for recordings of this speaker already exist?

        speakerdirfn = None
        for fn in os.listdir(options.outdir):
            if fn.startswith(speaker):
                speakerdirfn = '%s/%s' % (options.outdir, fn)
                break
        if not speakerdirfn:
            ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d')
            speakerdirfn = '%s/%s-%s-rec' % (options.outdir, speaker, ds)

        misc.mkdirs('%s/wav' % speakerdirfn)
        misc.mkdirs('%s/etc' % speakerdirfn)

        destfn = '%s/wav/%s' % (speakerdirfn, os.path.basename(segmentfn))
        os.rename(segmentfn, destfn)
        print "moved %s to %s" % (segmentfn, destfn)

        promptsfn = '%s/etc/prompts-original' % speakerdirfn
        with codecs.open(promptsfn, 'a', 'utf8') as promptsf:
            wavbn = os.path.basename(segmentfn)
            wavbn = os.path.splitext(wavbn)[0]
            promptsf.write(u'%s %s\n' % (wavbn, prompt))
        print "%s written." % promptsfn

        next_segment()
        play_wav()

    elif len(resp)>2:
        prompt = resp