Add support for voice messages (#11)

Also: apply OOP to the Chat and User objects, improve logging, and re-use the system message ID.
daohoangson · Nov 21, 2023 · 7486b25 · 7486b25
1 parent 6d1980d
commit 7486b25
Show file tree

Hide file tree

Showing 14 changed files with 388 additions and 221 deletions.
diff --git a/README.md b/README.md
@@ -8,10 +8,12 @@
 
 ## Roadmap
 
-- [x] Integrate with Assistants API
-- [x] Use GPT-4 Turbo model for conversation
+- [x] Integrate with [Assistants API](https://platform.openai.com/docs/assistants/overview)
+- [x] Use [GPT-4 Turbo](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo) for conversation
 - [x] Use Vision Preview model for image analysis
-- [x] Use DALL-E 3 model for image generation
+- [x] Use [DALL-E 3](https://platform.openai.com/docs/models/dall-e) for image generation
+- [x] Use [whisper-1](https://platform.openai.com/docs/models/whisper) for speech to text transcription
+- [x] Use [tts-1](https://platform.openai.com/docs/models/tts) for text to speech synthesis
 - [x] Add support for memory recall
 - [ ] Allow bot to search for old messages, and maybe continue previous conversation
 - [ ] Add support for reminders

diff --git a/packages/core/src/interfaces/ai/index.ts b/packages/core/src/interfaces/ai/index.ts
@@ -1,2 +1,3 @@
 export * from "./agent";
+export * from "./speech";
 export * from "./tool";
diff --git a/packages/core/src/interfaces/ai/speech.ts b/packages/core/src/interfaces/ai/speech.ts
@@ -0,0 +1,9 @@
+export type SpeechData = { // Audio payload passed to and from the Speech contract.
+  blob: () => Promise<Blob>; // Asynchronously yields the payload content as a Blob.
+  url: string; // Location associated with the payload; presumably where the audio was fetched from — confirm with producers.
+};
+
+export type Speech = { // Contract for converting between text and speech data.
+  fromText(text: string): Promise<SpeechData>; // Text in, speech data out.
+  toText(input: SpeechData): Promise<string>; // Speech data in, text out.
+};
diff --git a/packages/core/src/interfaces/chat.ts b/packages/core/src/interfaces/chat.ts
@@ -10,7 +10,7 @@ export type ChatPhoto = Chat & {
 };
 
 export type ChatText = Chat & {
-  getTextMessage: () => string;
+  getTextMessage: () => Promise<string>;
 };
 
 export type Reply =

diff --git a/packages/functions/src/handlers/telegram-webhook-handler.ts b/packages/functions/src/handlers/telegram-webhook-handler.ts
@@ -3,7 +3,7 @@ import { Config } from "sst/node/config";
 import { kv } from "@bubby/aws";
 import { AppContext } from "@bubby/core/interfaces/app";
 import { ChatPhoto, ChatText } from "@bubby/core/interfaces/chat";
-import { agent } from "@bubby/openai";
+import { agent, speech } from "@bubby/openai";
 import { onMessage } from "@bubby/telegram";
 import { tools } from "src/tools";
 
@@ -18,6 +18,7 @@ export async function handleTelegramWebhook(secretToken: string, update: any) {
   await onMessage({
     onPhoto: (input) => replyToPhoto({ ...input, kv }),
     onText: (input) => replyToText({ ...input, kv }),
+    speech,
     update,
   });
 }
@@ -30,8 +31,8 @@ function replyToPhoto(ctx: AppContext<ChatPhoto>): Promise<void> {
   return respond(ctx, message);
 }
 
-const replyToText = (ctx: AppContext<ChatText>) =>
-  respond(ctx, ctx.chat.getTextMessage());
+const replyToText = async (ctx: AppContext<ChatText>) => // Responds to a text chat once its message text has been resolved.
+  respond(ctx, await ctx.chat.getTextMessage()); // getTextMessage now returns a Promise<string>, so it is awaited before being handed to respond.
 
 const respond = (ctx: AppContext, message: string) =>
   agent.respond({ ctx, message, tools });
diff --git a/packages/functions/src/tools/image.ts b/packages/functions/src/tools/image.ts
@@ -22,7 +22,7 @@ export const analyzeImage: Tool<z.infer<typeof analyzeImageParameters>> = {
   description: "Analyze an image.",
   name: "analyze_image",
   handler: async ({ ctx, parameters }) => {
-    ctx.chat.reply({ type: "system", system: "🚨 Analyzing image..." });
+    ctx.chat.reply({ type: "system", system: "🚨 Analyzing..." });
     return visionAnalyzeImage({ ctx, ...parameters });
   },
   parametersSchema: analyzeImageParameters,

diff --git a/packages/openai/src/agent.ts b/packages/openai/src/agent.ts
@@ -23,7 +23,6 @@ export const agent: Agent = {
         assistantGetNewMessages(threadId, runId, messageIds).then(
           (messages) => {
             for (const message of messages) {
-              console.log(JSON.stringify({ loopCount, message }, null, 2));
               for (const messageContent of message.content) {
                 if (messageContent.type === "text") {
                   const markdown = messageContent.text.value;

diff --git a/packages/openai/src/index.ts b/packages/openai/src/index.ts
@@ -1,4 +1,5 @@
 export * from "./agent";
+export * from "./speech";
 
 // TODO: avoid exporting internal functions
 export { assistantThreadIdInsert } from "./internal/assistant_thread";

diff --git a/packages/openai/src/internal/audio.ts b/packages/openai/src/internal/audio.ts
@@ -0,0 +1,28 @@
+import { toFile } from "openai";
+import { SpeechCreateParams } from "openai/resources/audio/speech";
+import { TranscriptionCreateParams } from "openai/resources/audio/transcriptions";
+
+import { SpeechData } from "@bubby/core/interfaces/ai";
+import { openai } from "./openai";
+
+export async function audioCreateTranscription( // Transcribes speech data to text through openai.audio.transcriptions.
+  speechData: SpeechData // Handed directly to toFile to build the upload.
+): Promise<string> { // Resolves to the `text` field of the transcription response.
+  const body: TranscriptionCreateParams = {
+    file: await toFile(speechData), // NOTE(review): relies on toFile accepting an object with `blob()` and `url` — confirm against the SDK.
+    model: "whisper-1",
+  };
+  const transcription = await openai.audio.transcriptions.create(body);
+  console.log(JSON.stringify(transcription, null, 2)); // Logs the whole response, pretty-printed; this includes the transcribed text.
+  return transcription.text;
+}
+
+export async function audioCreateSpeech(input: string): Promise<SpeechData> { // Synthesizes speech for `input` through openai.audio.speech.
+  const body: SpeechCreateParams = {
+    input,
+    model: "tts-1",
+    response_format: "opus", // NOTE(review): presumably chosen to suit Telegram voice messages — confirm.
+    voice: "alloy",
+  };
+  return openai.audio.speech.create(body); // The SDK response is returned as-is, typed as SpeechData.
+}
diff --git a/packages/openai/src/speech.ts b/packages/openai/src/speech.ts
@@ -0,0 +1,7 @@
+import { Speech } from "@bubby/core/interfaces/ai";
+import { audioCreateSpeech, audioCreateTranscription } from "./internal/audio";
+
+export const speech: Speech = { // Speech implementation built on the helpers in ./internal/audio.
+  fromText: async (text) => audioCreateSpeech(text), // Delegates to audioCreateSpeech.
+  toText: (speechData) => audioCreateTranscription(speechData), // Delegates to audioCreateTranscription.
+};