Merge pull request #37 from KeneNwogu/main
fix: add speech outputs to sidebar, user can optionally play. fixes #35
Sammybams authored Oct 21, 2024
2 parents 66deea9 + 1552303 commit 5dccb95
Showing 3 changed files with 23 additions and 7 deletions.
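For orientation, here is a minimal sketch of how the pieces in this commit fit together: each assistant response is synthesized into a uniquely named .wav under speech_outputs/, the filename is stored in session state, and the sidebar offers an optional player for every stored clip. The with st.sidebar: context and the synthesize_speech import are assumptions (neither appears in the hunks below), so treat this as an illustration rather than a copy of the repository code.

    import os
    import uuid
    import streamlit as st
    from src.speech_io import synthesize_speech  # assumed import path

    # Keep generated clips and chat messages across Streamlit reruns
    if 'speech_outputs' not in st.session_state:
        st.session_state.speech_outputs = []
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    def send_response(message, response=None):
        dummy_response = "Hello. How are you?"
        st.session_state.messages.append(('assistant', response or dummy_response))
        # Synthesize the reply into a uniquely named .wav under speech_outputs/
        output_file = uuid.uuid4().hex + ".wav"
        synthesize_speech(output_file=output_file, text=response or dummy_response)
        st.session_state.speech_outputs.append(output_file)

    # Sidebar playback: every stored clip gets an optional audio player
    with st.sidebar:
        st.subheader("Speech output responses")
        for speech_output in st.session_state.speech_outputs:
            st.audio(os.path.join('speech_outputs', speech_output), format="audio/wav", start_time=0)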
.gitignore: 3 changes (2 additions & 1 deletion)
@@ -10,4 +10,5 @@ pyvenv.cfg
*__pycache__/
*.DS_Store
*.venv/
*.wav
*.wav
speech_outputs/
main.py: 17 changes (14 additions & 3 deletions)
@@ -8,6 +8,7 @@
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
import openai
import uuid

# Set up page configuration
st.set_page_config(page_title="Speak-To-Docs", page_icon="📝", layout="wide", initial_sidebar_state="expanded")
@@ -94,14 +95,21 @@ def get_llm() -> ChatOpenAI:
logging.error(f"Error extracting content from document: {e}")
else:
st.session_state.uploaded_files = None


st.subheader("Speech output responses")
if 'speech_outputs' in st.session_state:
for speech_output in st.session_state.speech_outputs:
st.audio(os.path.join('speech_outputs', speech_output), format="audio/wav", start_time=0)

def send_response(message, response=None):
dummy_response = "Hello. How are you?"
st.session_state.messages.append(('assistant', response or dummy_response))

# TODO: make async ??
print(response or dummy_response)
synthesize_speech(text=response or dummy_response)
# generate unique file name
output_file = uuid.uuid4().hex + ".wav"
synthesize_speech(output_file=output_file, text=response or dummy_response)
st.session_state.speech_outputs.append(output_file)


# Chat area and audio input handling
@@ -115,6 +123,9 @@ def send_message():

if 'messages' not in st.session_state:
st.session_state.messages = []

if 'speech_outputs' not in st.session_state:
st.session_state.speech_outputs = []

message = st.container()

src/speech_io.py: 10 changes (7 additions & 3 deletions)
@@ -76,20 +76,24 @@ def synthesize_speech(text, output_file="output.wav", voice_name='en-NG-EzinneNe
"""
if not SPEECH_KEY or not SPEECH_REGION:
return False, "Azure Speech Service credentials are missing."

path = "speech_outputs"
os.makedirs(path, exist_ok=True)
output_file = os.path.join(path, output_file)

output = open(output_file, 'w+')
output.close()

try:
# Configure speech service
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
# audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)

# Set the voice for synthesis
speech_config.speech_synthesis_voice_name = voice_name

# Create synthesizer and generate speech
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
result = speech_synthesizer.speak_text_async(text).get()

# Handle the result
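As context for the AudioOutputConfig wiring above, a standalone sketch of file-based synthesis with the Azure Speech SDK follows. The environment variable names and the voice are placeholders, not taken from this repository.

    import os
    import azure.cognitiveservices.speech as speechsdk

    # Write the synthesized audio straight to a .wav file instead of the default speaker
    os.makedirs("speech_outputs", exist_ok=True)
    speech_config = speechsdk.SpeechConfig(subscription=os.environ["SPEECH_KEY"], region=os.environ["SPEECH_REGION"])
    speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"  # placeholder voice
    audio_config = speechsdk.audio.AudioOutputConfig(filename=os.path.join("speech_outputs", "example.wav"))

    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text_async("Hello. How are you?").get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Saved speech_outputs/example.wav")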
