Prompt reorg for caching + usage collection (#743)
* Optimize chat caching by repositioning definitions in prompt structure.

* Update node sorting logic to include "image" type in layoutPromptNode function

* Add ephemeral property to PromptNode and update sorting logic

* Fix typo and add prompt caching section to context.md

* Add chat usage tracking and refactor session handling

* Update token usage logging and fix token accumulation logic in chat processing

* Add ChatCompletionUsages to GenerationOptions and refactor usage handling

* Update loop to iterate over 'usages' instead of 'result.usages'

* Update log format for token usage details in CLI output
pelikhan authored Oct 1, 2024
1 parent 820ca82 commit a64b1be
Showing 28 changed files with 243 additions and 47 deletions.
5 changes: 5 additions & 0 deletions docs/genaisrc/genaiscript.d.ts


11 changes: 10 additions & 1 deletion docs/src/content/docs/reference/scripts/context.md
@@ -106,7 +106,7 @@ def("DIFF", gitdiff, { language: "diff" })
### Referencing

The `def` function returns a variable name that can be used in the prompt.
The name might be formatted diferently to accommodate the model's preference.
The name might be formatted differently to accommodate the model's preference.

```js "const f = "
const f = def("FILE", file)
@@ -182,6 +182,15 @@ def("FILE", env.files, { sliceTail: 100 })
def("FILE", env.files, { sliceSample: 100 })
```
### Prompt Caching
You can specify `ephemeral: true` to enable prompt caching optimizations. In particular, a `def` with `ephemeral` will be rendered at the back of the prompt
to persist the [cache prefix](https://openai.com/index/api-prompt-caching/).
```js
def("FILE", env.files, { ephemeral: true })
```
## Data definition (`defData`)
The `defData` function offers additional formatting options for converting a data object into a textual representation. It supports rendering objects as YAML, JSON, or CSV (formatted as a markdown table).
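As an editor's illustration (not part of this diff), here is a minimal sketch of how `defData` might be called, assuming it accepts a `format` option matching the renderings listed above:
```js
// sketch: render a small dataset as a CSV (markdown table) in the prompt
// the `format` option name is an assumption based on the description above
defData("ROWS", [
    { file: "chat.ts", additions: 54, deletions: 27 },
    { file: "run.ts", additions: 14, deletions: 5 },
], { format: "csv" })
```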
5 changes: 5 additions & 0 deletions genaisrc/genaiscript.d.ts


5 changes: 5 additions & 0 deletions packages/auto/genaiscript.d.ts


19 changes: 14 additions & 5 deletions packages/cli/src/run.ts
@@ -6,7 +6,10 @@ import { convertDiagnosticsToSARIF } from "./sarif"
import { buildProject } from "./build"
import { diagnosticsToCSV } from "../../core/src/ast"
import { CancellationOptions } from "../../core/src/cancellation"
import { ChatCompletionsProgressReport } from "../../core/src/chattypes"
import {
ChatCompletionsProgressReport,
ChatCompletionUsages,
} from "../../core/src/chattypes"
import { runTemplate } from "../../core/src/promptrunner"
import {
githubCreateIssueComment,
@@ -244,7 +247,7 @@ export async function runScript(
(acc, v) => ({ ...acc, ...parseKeyValuePair(v) }),
{}
)
let tokens = 0
const usages: ChatCompletionUsages = {}
try {
if (options.label) trace.heading(2, options.label)
const { info } = await resolveModelConnectionInfo(script, {
@@ -262,6 +265,7 @@
trace.options.encoder = await resolveTokenEncoder(info.model)
await runtimeHost.models.pullModel(info.model)
result = await runTemplate(prj, script, fragment, {
usages,
inner: false,
infoCb: (args) => {
const { text } = args
@@ -272,7 +276,6 @@
},
partialCb: (args) => {
const { responseChunk, tokensSoFar, inner } = args
tokens = tokensSoFar
if (responseChunk !== undefined) {
if (stream) {
if (!inner) process.stdout.write(responseChunk)
@@ -523,7 +526,13 @@
if (failOnErrors && result.annotations?.some((a) => a.severity === "error"))
return fail("error annotations found", ANNOTATION_ERROR_CODE)

logVerbose("genaiscript: done\n")
if (outTraceFilename) logVerbose(`trace: ${outTraceFilename}`)
logVerbose("genaiscript: done")
for (const [key, value] of Object.entries(usages)) {
if (value.total_tokens > 0)
logVerbose(
`tokens: ${key}, ${value.total_tokens} (${value.prompt_tokens} => ${value.completion_tokens})`
)
}
if (outTraceFilename) logVerbose(` trace: ${outTraceFilename}`)
return { exitCode: 0, result }
}
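For context (editor's note, not part of the diff): given the format string in the loop above, the per-model token summary printed in verbose mode would look roughly like this, with a hypothetical model name and counts:
```text
genaiscript: done
tokens: gpt-4o-mini, 1532 (1250 => 282)
```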
81 changes: 54 additions & 27 deletions packages/core/src/chat.ts
@@ -28,6 +28,8 @@ import {
ChatCompletionResponse,
ChatCompletionsOptions,
ChatCompletionTool,
ChatCompletionUsage,
ChatCompletionUsages,
ChatCompletionUserMessageParam,
CreateChatCompletionRequest,
} from "./chattypes"
@@ -369,7 +371,7 @@ function structurifyChatSession(
err?: any
}
): RunPromptResult {
const { trace, responseType, responseSchema } = options
const { trace, responseType, responseSchema, usages } = options
const { resp, err } = others || {}
const text = assistantText(messages, responseType)
const annotations = parseAnnotations(text)
@@ -426,10 +428,12 @@
error,
genVars,
schemas,
usages,
}
}

async function processChatMessage(
req: CreateChatCompletionRequest,
resp: ChatCompletionResponse,
messages: ChatCompletionMessageParam[],
tools: ToolCallback[],
@@ -443,8 +447,11 @@
maxToolCalls = MAX_TOOL_CALLS,
trace,
cancellationToken,
usages,
} = options

accumulateChatUsage(usages, req.model, resp.usage)

if (resp.text)
messages.push({
role: "assistant",
@@ -534,11 +541,29 @@ export function mergeGenerationOptions(
}
}

function accumulateChatUsage(
usages: ChatCompletionUsages,
model: string,
usage: ChatCompletionUsage
) {
if (!usage) return

const u =
usages[model] ??
(usages[model] = <ChatCompletionUsage>{
completion_tokens: 0,
prompt_tokens: 0,
total_tokens: 0,
})
u.completion_tokens += usage.completion_tokens ?? 0
u.prompt_tokens += usage.prompt_tokens ?? 0
u.total_tokens += usage.total_tokens ?? 0
}

export async function executeChatSession(
connectionToken: LanguageModelConfiguration,
cancellationToken: CancellationToken,
messages: ChatCompletionMessageParam[],
vars: Partial<ExpansionVariables>,
toolDefinitions: ToolCallback[],
schemas: Record<string, JSONSchema>,
completer: ChatCompletionHandler,
@@ -585,34 +610,35 @@
let resp: ChatCompletionResponse
try {
checkCancelled(cancellationToken)
const req: CreateChatCompletionRequest = {
model,
temperature: temperature,
top_p: topP,
max_tokens: maxTokens,
seed,
stream: true,
messages,
tools,
response_format:
responseType === "json_object"
? { type: responseType }
: responseType === "json_schema"
? {
type: "json_schema",
json_schema: {
name: "result",
schema: toStrictJSONSchema(
responseSchema
),
strict: true,
},
}
: undefined,
}
try {
trace.startDetails(`📤 llm request`)
resp = await completer(
{
model,
temperature: temperature,
top_p: topP,
max_tokens: maxTokens,
seed,
stream: true,
messages,
tools,
response_format:
responseType === "json_object"
? { type: responseType }
: responseType === "json_schema"
? {
type: "json_schema",
json_schema: {
name: "result",
schema: toStrictJSONSchema(
responseSchema
),
strict: true,
},
}
: undefined,
},
req,
connectionToken,
genOptions,
trace
@@ -625,6 +651,7 @@
}

const output = await processChatMessage(
req,
resp,
messages,
toolDefinitions,
10 changes: 10 additions & 0 deletions packages/core/src/chattypes.ts
@@ -18,6 +18,15 @@ export interface AICIRequest {
}

// Aliases for OpenAI chat completion types
export type ChatCompletionUsage = Omit<
OpenAI.Completions.CompletionUsage,
"completion_tokens_details"
>

/**
* Per model storage of chat completion usages.
*/
export type ChatCompletionUsages = Record<string, ChatCompletionUsage>

// Text content part of a chat completion
export type ChatCompletionContentPartText =
@@ -99,6 +108,7 @@
toolCalls?: ChatCompletionToolCall[] // List of tool calls made during the response
finishReason?: // Reason why the chat completion finished
"stop" | "length" | "tool_calls" | "content_filter" | "cancel" | "fail"
usage?: ChatCompletionUsage // Usage information for the completion
}

// Alias for OpenAI's API error type
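As an editor's sketch (not part of the commit), the per-model usage map introduced by these types would hold data shaped roughly like this, with hypothetical model names and token counts:
```ts
// hypothetical contents of a ChatCompletionUsages record after calls to two models
const usages: ChatCompletionUsages = {
    "gpt-4o": { prompt_tokens: 1250, completion_tokens: 282, total_tokens: 1532 },
    "gpt-4o-mini": { prompt_tokens: 310, completion_tokens: 95, total_tokens: 405 },
}
```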
5 changes: 5 additions & 0 deletions packages/core/src/genaisrc/genaiscript.d.ts


12 changes: 11 additions & 1 deletion packages/core/src/generation.ts
@@ -1,7 +1,11 @@
// Import necessary modules and interfaces
import { CancellationToken } from "./cancellation"
import { LanguageModel } from "./chat"
import { ChatCompletionMessageParam, ChatCompletionsOptions } from "./chattypes"
import {
ChatCompletionMessageParam,
ChatCompletionsOptions,
ChatCompletionUsages,
} from "./chattypes"
import { MarkdownTrace } from "./trace"

// Represents a code fragment with associated files
@@ -56,6 +60,11 @@ export interface GenerationResult extends GenerationOutput {
*/
finishReason?: string

/**
* Token usage statistics if reported by LLM
*/
usages?: ChatCompletionUsages

/**
* Optional label for the run
*/
@@ -96,4 +105,5 @@ export interface GenerationOptions
}
vars?: PromptParameters // Variables for prompt customization
stats: GenerationStats // Statistics of the generation
usages: ChatCompletionUsages
}