Commit 13b250e: wip
twitchard committed Nov 1, 2024 (1 parent b9ec240)
Showing 9 changed files with 1,598 additions and 199 deletions.
266 changes: 178 additions & 88 deletions evi-react-native-example/EVIExample/App.tsx
@@ -6,7 +6,12 @@ import {
StyleSheet,
ScrollView,
SafeAreaView,
LayoutAnimation,
} from 'react-native';

let outstanding = 0
// We use Hume's low-level typescript SDK for this example.
// The React SDK (@humeai/voice-react) does not support React Native.
import { HumeClient, type Hume } from 'hume'

import * as NativeAudio from './modules/audio';
@@ -17,122 +22,207 @@ interface ChatEntry {
content: string;
}

const isValidBase64 = (str: string) => {
try {
return btoa(atob(str)) === str;
} catch (err) {
return false;
}
}

const hume = new HumeClient({
apiKey: process.env.EXPO_PUBLIC_HUME_API_KEY || ''
})

// EVI can send audio output messages faster than they can be played back.
// It is important to buffer them in a queue so as not to cut off a clip of
// playing audio with a more recent clip.
class AudioQueue {
private tasks: Array<() => Promise<void>> = []
private currentClip: Promise<void> | null = null;

private advance() {
console.log('Advancing audio queue...')
if (this.tasks.length === 0) {
this.currentClip = null;
return
}
this.currentClip = this.tasks.shift()!().then(() => this.advance())
}

public add(playAudio: () => Promise<void>) {
console.log('Adding to queue...')
this.tasks.push(playAudio)

if (!this.currentClip) {
this.advance()
}
}

public clear() {
console.log('Clearing audio queue...')
this.tasks = []
this.currentClip = null
}
}

const audioQueue = new AudioQueue()
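// Usage note (a sketch of the intended contract, not part of the SDK): each task passed to
// `add` should return a promise that resolves only once its clip has finished playing; this
// example assumes NativeAudio.playAudio behaves that way. Clips then play strictly in order:
//
//   audioQueue.add(() => NativeAudio.playAudio(firstBase64Clip))   // placeholder clip names
//   audioQueue.add(() => NativeAudio.playAudio(secondBase64Clip))
//
// The second clip starts only after the first resolves, and clear() drops any clips that
// have not started yet (it does not stop the clip currently playing).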

const App = () => {
const [isConnected, setIsConnected] = useState(false);
const [isMuted, setIsMuted] = useState(false);
const [chatEntries, setChatEntries] = useState<ChatEntry[]>([]);
const addChatEntry = (entry: ChatEntry) => {
  setChatEntries((prev) => [...prev, entry]);
}

// Scroll to the bottom of the chat display when new messages are added
const scrollViewRef = useRef<ScrollView | null>(null);
useEffect(() => {
  if (scrollViewRef.current) {
    LayoutAnimation.configureNext(LayoutAnimation.Presets.easeInEaseOut);
    scrollViewRef.current.scrollToEnd();
  }
}, [chatEntries]);

const chatSocketRef = useRef<Hume.empathicVoice.chat.ChatSocket | null>(null);

useEffect(() => {
  (async () => {
    if (isConnected) {
      try {
        await NativeAudio.getPermissions()
      } catch (error) {
        console.error('Failed to get permissions:', error)
      }
      try {
        await NativeAudio.startRecording()
      } catch (error) {
        console.error('Failed to start recording:', error)
      }

      const chatSocket = hume.empathicVoice.chat.connect({
        configId: process.env.EXPO_PUBLIC_HUME_CONFIG_ID,
      })
      chatSocket.on('open', () => {
        // The code within the native modules converts the default system audio format
        // to linear16 PCM, a standard format recognized by EVI. For linear16 PCM,
        // you must send a `session_settings` message to inform EVI of the correct
        // sample rate.
        chatSocket.sendSessionSettings({
          audio: {
            encoding: "linear16",
            channels: 1,
            sampleRate: NativeAudio.sampleRate,
          }
        })
      })
      chatSocket.on('message', handleIncomingMessage);

      chatSocket.on('error', (error) => {
        console.error("WebSocket Error:", error);
      });

      console.log('Registering handler')
      chatSocket.on('close', () => {
        console.log('Socket closing')
        setIsConnected(false);
      });

      chatSocketRef.current = chatSocket;

      NativeAudio.onAudioInput(({ base64EncodedAudio }: NativeAudio.AudioEventPayload) => {
        if (chatSocket.readyState !== WebSocket.OPEN) {
          console.log('Socket not open, not sending audio input...')
          return
        }
        chatSocket.sendAudioInput({ data: base64EncodedAudio });
      })
    } else {
      try {
        await NativeAudio.stopRecording()
      } catch (error) {
        console.error('Error while stopping recording', error)
      }
      if (chatSocketRef.current) {
        chatSocketRef.current.close();
      }
    }
  })()
  return () => {
    NativeAudio.stopRecording().catch((error) => {
      console.error('Error while stopping recording', error)
    })
    if (chatSocketRef.current && chatSocketRef.current.readyState === WebSocket.OPEN) {
      chatSocketRef.current?.close();
    }
  }
}, [isConnected]);

useEffect(() => {
  if (isMuted) {
    NativeAudio.mute().catch((error) => {
      console.error('Error while muting', error)
    })
  } else {
    NativeAudio.unmute().catch((error) => {
      console.error('Error while unmuting', error)
    });
  }
}, [isMuted])

const handleInterruption = () => {
  console.log("Clearing audio queue...")
  audioQueue.clear()
  NativeAudio.stopPlayback()
}

const handleIncomingMessage = async (message: Hume.empathicVoice.SubscribeEvent) => {
  switch (message.type) {
    case 'error':
      console.error(message);
      break;
    case 'chat_metadata':
      // Contains useful information:
      //  - chat_id: a unique identifier for the chat session, useful if you want to retrieve transcripts later
      //  - chat_group_id: passing a "chat group" allows you to preserve context and resume the same conversation
      //    with EVI in a new websocket connection, e.g. after a disconnection.
      console.log('Received chat metadata:', message);
      break;
    case 'audio_output':
      audioQueue.add(() => NativeAudio.playAudio(message.data));
      break;
    case 'user_message':
    case 'assistant_message':
      if (message.message.role !== 'user' && message.message.role !== 'assistant') {
        console.error(`Unhandled: received message with role: ${message.message.role}`);
        return;
      }
      if (message.type === 'user_message') {
        handleInterruption()
      }
      addChatEntry({
        role: message.message.role,
        timestamp: new Date().toString(),
        content: message.message.content!,
      });
      break;
    case 'user_interruption':
      handleInterruption()
      break;

    // This message type indicates the end of EVI's "turn" in the conversation. It is not
    // needed in this example, but it could be useful in an audio environment without good
    // echo cancellation, where you might auto-mute the user's microphone while EVI is
    // speaking.
    case 'assistant_end':

    // These messages are not needed in this example. They are for EVI's "tool use" feature:
    // https://dev.hume.ai/docs/empathic-voice-interface-evi/tool-use
    case 'tool_call':
    case 'tool_error':
    case 'tool_response':
      console.log(`Received unhandled message type: ${message.type}`);
      break;
    default:
      const _: never = message;
      console.error(`Unexpected message`);
      console.error(message)
      break;
  }
};
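// As a concrete (hypothetical) illustration of the auto-mute idea mentioned above, you could
// reuse this component's isMuted state inside the switch:
//
//   case 'assistant_message':
//     setIsMuted(true);   // keep the mic closed while EVI is speaking
//     // ...existing transcript handling...
//     break;
//   case 'assistant_end':
//     setIsMuted(false);  // reopen the mic once EVI's turn ends
//     break;
//
// The isMuted effect above would then call NativeAudio.mute() / NativeAudio.unmute() accordingly.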

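// Similarly, to resume an earlier conversation after a disconnect (the chat_group_id mentioned
// in the chat_metadata case above), you could record the group id and pass it back when
// reconnecting. Sketch only: the chatGroupId and resumedChatGroupId names are assumptions based
// on the SDK's camelCase conventions, so check the hume package's types before relying on them.
//
//   let lastChatGroupId: string | undefined
//   // in handleIncomingMessage: case 'chat_metadata': lastChatGroupId = message.chatGroupId;
//   const resumedSocket = hume.empathicVoice.chat.connect({
//     configId: process.env.EXPO_PUBLIC_HUME_CONFIG_ID,
//     resumedChatGroupId: lastChatGroupId,
//   })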
@@ -142,7 +232,7 @@ const App = () => {
<View style={styles.header}>
<Text style={styles.headerText}>You are {isConnected ? 'connected' : 'disconnected'}</Text>
</View>
<ScrollView style={styles.chatDisplay} ref={scrollViewRef}>
{chatEntries.map((entry, index) => (
<View
key={index}
Expand All @@ -158,9 +248,9 @@ const App = () => {
<View style={styles.buttonContainer}>
<Button
title={isConnected ? 'Disconnect' : 'Connect'}
onPress={() => setIsConnected(!isConnected)}
/>
<Button title={isMuted ? 'Unmute' : 'Mute'} onPress={() => setIsMuted(!isMuted)} />
</View>
</SafeAreaView>
</View>
32 changes: 32 additions & 0 deletions evi-react-native-example/EVIExample/AudioQueue.test.ts
@@ -0,0 +1,32 @@
import {AudioQueue} from './AudioQueue'
test('AudioQueue', async () => {
const log: Array<string> = []
const aq = new AudioQueue()
log.push('1') // t=0
aq.add(async () => {
// t=0
log.push('2')
await new Promise((resolve) => setTimeout(resolve, 25))
// t=25
log.push('3')
await new Promise((resolve) => setTimeout(resolve, 75))
// t=100
log.push('5')
})
await new Promise((resolve) => setTimeout(resolve, 50))
// t=50
log.push('4')
aq.add(async () => {
// t=100
log.push('6')
await new Promise((resolve) => setTimeout(resolve, 100))
// t=200
log.push('7')
})
await new Promise((resolve) => setTimeout(resolve, 202))
expect(log.join(',')).toBe('1,2,3,4,5,6,7')
// @ts-ignore
expect(aq.tasks).toEqual([])
// @ts-ignore
expect(aq.currentClip).toEqual(null)
})
29 changes: 29 additions & 0 deletions evi-react-native-example/EVIExample/AudioQueue.ts
@@ -0,0 +1,29 @@
export class AudioQueue {
private tasks: Array<() => Promise<void>>
private currentClip: Promise<void> | null;
constructor() {
this.tasks = []
this.currentClip = null
}

private advance() {
if (this.tasks.length === 0) {
this.currentClip = null;
return
}
this.currentClip = this.tasks.shift()!().then(() => this.advance())
}

public add(playAudio: () => Promise<void>) {
this.tasks.push(playAudio)

if (!this.currentClip) {
this.advance()
}
}

public clear() {
this.tasks = []
this.currentClip = null
}
}
