
Add visionModel support #866

Merged · 2 commits · Nov 16, 2024
1 change: 1 addition & 0 deletions docs/src/components/BuiltinTools.mdx
@@ -45,4 +45,5 @@ import { LinkCard } from '@astrojs/starlight/components';
<LinkCard title="user_input_confirm" description="Ask the user to confirm a message." href="/genaiscript/reference/scripts/system#systemuser_input" />
<LinkCard title="user_input_select" description="Ask the user to select an option." href="/genaiscript/reference/scripts/system#systemuser_input" />
<LinkCard title="user_input_text" description="Ask the user to input text." href="/genaiscript/reference/scripts/system#systemuser_input" />
<LinkCard title="vision_ask_image" description="Use vision model to run a query on an image" href="/genaiscript/reference/scripts/system#systemvision_ask_image" />

5 changes: 3 additions & 2 deletions docs/src/content/docs/reference/cli/commands.md
@@ -16,8 +16,9 @@
Runs a GenAIScript against files.

Options:
-m, --model <string> model for the run
-sm, --small-model <string> small model for the run
-m, --model <string> 'large' model alias (default)
-sm, --small-model <string> 'small' alias model
-vm, --vision-model <string> 'vision' alias model

Check notice on line 21 in docs/src/content/docs/reference/cli/commands.md (GitHub Actions / build): The use of aliases for model options ('large', 'small', 'vision') should be clearly explained in the documentation to ensure users understand their purpose and how they differ from each other.

Review comment (generated by pr-docs-review-commit, alias_clarity): The use of 'large', 'small', and 'vision' as model aliases could benefit from additional clarification or context to ensure users understand what these terms specifically refer to in the context of the tool.

-lp, --logprobs enable reporting token probabilities
-tlp, --top-logprobs <number> number of top logprobs (1 to 5)
-ef, --excluded-files <string...> excluded files
66 changes: 66 additions & 0 deletions docs/src/content/docs/reference/scripts/system.mdx
@@ -3350,6 +3350,72 @@ defTool(
`````


### `system.vision_ask_image`

Vision Ask Image

Register tool that uses vision model to run a query on an image

- tool `vision_ask_image`: Use vision model to run a query on an image

`````js wrap title="system.vision_ask_image"
system({
title: "Vision Ask Image",
description:
"Register tool that uses vision model to run a query on an image",
})

defTool(
"vision_ask_image",
"Use vision model to run a query on an image",
{
type: "object",
properties: {
image: {
type: "string",
description: "Image URL or workspace relative filepath",
},
query: {
type: "string",
description: "Query to run on the image",
},
hd: {
type: "boolean",
description: "Use high definition image",
},
},
required: ["image", "query"],
},
async (args) => {
const { image, query, hd } = args
const res = await runPrompt(
(_) => {
_.defImages(image, {
autoCrop: true,
detail: hd ? "high" : "low",
maxWidth: hd ? 1024 : 512,
maxHeight: hd ? 1024 : 512,
})
_.$`Answer this query about the images:`
_.def("QUERY", query)
},
{
model: "vision",
system: [
"system",
"system.assistant",
"system.safety_jailbreak",
"system.safety_harmful_content",
],
}
)
return res
}
)

`````

Review comment (generated by pr-docs-review-commit, tool_description): The description for the system.vision_ask_image tool could be expanded to provide more context or examples of usage, which would enhance user understanding of its functionality.


### `system.zero_shot_cot`

Zero-shot Chain Of Thought
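The `hd` parameter of the new tool drives both the vision `detail` hint and the resize bounds handed to `defImages`. A minimal sketch of that mapping, mirroring the ternaries in the tool body (the helper name `imageOptionsForHd` is illustrative, not part of GenAIScript):

```javascript
// Sketch: derive image-encoding options from the tool's `hd` flag.
function imageOptionsForHd(hd) {
    return {
        autoCrop: true,
        detail: hd ? "high" : "low", // vision API detail hint
        maxWidth: hd ? 1024 : 512,   // resize bound before encoding
        maxHeight: hd ? 1024 : 512,
    }
}

console.log(imageOptionsForHd(true).detail)    // "high"
console.log(imageOptionsForHd(false).maxWidth) // 512
```

Smaller low-detail images keep token cost down for routine queries, while `hd: true` preserves fine detail when the query needs it.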
5 changes: 3 additions & 2 deletions packages/cli/src/cli.ts
@@ -96,8 +96,9 @@ export async function cli() {
.command("run")
.description("Runs a GenAIScript against files.")
.arguments("<script> [files...]")
.option("-m, --model <string>", "model for the run")
.option("-sm, --small-model <string>", "small model for the run")
.option("-m, --model <string>", "'large' model alias (default)")
.option("-sm, --small-model <string>", "'small' alias model")
.option("-vm, --vision-model <string>", "'vision' alias model")
.option("-lp, --logprobs", "enable reporting token probabilities")
.option(
"-tlp, --top-logprobs <number>",
2 changes: 2 additions & 0 deletions packages/cli/src/nodehost.ts
@@ -30,6 +30,7 @@ import {
AZURE_AI_INFERENCE_TOKEN_SCOPES,
MODEL_PROVIDER_AZURE_SERVERLESS_OPENAI,
DOT_ENV_FILENAME,
DEFAULT_VISION_MODEL,
} from "../../core/src/constants"
import { tryReadText } from "../../core/src/fs"
import {
@@ -141,6 +142,7 @@ export class NodeHost implements RuntimeHost {
readonly defaultModelOptions = {
model: DEFAULT_MODEL,
smallModel: DEFAULT_SMALL_MODEL,
visionModel: DEFAULT_VISION_MODEL,
temperature: DEFAULT_TEMPERATURE,
}
readonly defaultEmbeddingsModelOptions = {
2 changes: 2 additions & 0 deletions packages/cli/src/run.ts
@@ -206,6 +206,8 @@ export async function runScript(
if (options.model) host.defaultModelOptions.model = options.model
if (options.smallModel)
host.defaultModelOptions.smallModel = options.smallModel
if (options.visionModel)
host.defaultModelOptions.visionModel = options.visionModel

const fail = (msg: string, exitCode: number, url?: string) => {
logError(url ? `${msg} (see ${url})` : msg)
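The pattern in run.ts overrides a host default only when the corresponding CLI flag was supplied. A reduced, self-contained sketch of that behavior (`applyModelFlags` and the plain-object host are illustrative stand-ins for the real `RuntimeHost`):

```javascript
// Sketch: copy CLI model flags onto host defaults, keeping the
// built-in default whenever a flag is absent.
function applyModelFlags(host, options) {
    if (options.model) host.defaultModelOptions.model = options.model
    if (options.smallModel)
        host.defaultModelOptions.smallModel = options.smallModel
    if (options.visionModel)
        host.defaultModelOptions.visionModel = options.visionModel
    return host
}

const host = {
    defaultModelOptions: {
        model: "openai:gpt-4o",
        smallModel: "openai:gpt-4o-mini",
        visionModel: "openai:gpt-4o",
    },
}
applyModelFlags(host, { visionModel: "github:gpt-4o" })
// Only visionModel changes; model and smallModel keep their defaults.
```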
5 changes: 5 additions & 0 deletions packages/cli/src/test.ts
@@ -65,6 +65,7 @@ function parseModelSpec(m: string): ModelOptions {
return {
model: values["m"],
smallModel: values["s"],
visionModel: values["v"],
temperature: normalizeFloat(values["t"]),
topP: normalizeFloat(values["p"]),
}
@@ -120,11 +121,14 @@ export async function runPromptScriptTests(
testDelay?: string
model?: string
smallModel?: string
visionModel?: string
}
): Promise<PromptScriptTestRunResponse> {
if (options.model) host.defaultModelOptions.model = options.model
if (options.smallModel)
host.defaultModelOptions.smallModel = options.smallModel
if (options.visionModel)
host.defaultModelOptions.visionModel = options.visionModel

const scripts = await listTests({ ids, ...(options || {}) })
if (!scripts.length)
Expand Down Expand Up @@ -163,6 +167,7 @@ export async function runPromptScriptTests(
cli,
model: info.model,
smallModel: info.smallModel,
visionModel: info.visionModel,
models: options.models?.map(parseModelSpec),
provider: "provider.mjs",
testProvider,
15 changes: 10 additions & 5 deletions packages/core/src/chat.ts
@@ -345,11 +345,12 @@
logWarn(
`tool: ${tool.spec.name} response too long (${toolContentTokens} tokens), truncating ${maxToolContentTokens} tokens`
)
toolContent = truncateTextToTokens(
    toolContent,
    maxToolContentTokens,
    encoder
) + "... (truncated)"
toolContent =
    truncateTextToTokens(
        toolContent,
        maxToolContentTokens,
        encoder
    ) + "... (truncated)"

Check failure on line 349 in packages/core/src/chat.ts (GitHub Actions / build): The assignment of `toolContent` to itself is unnecessary and can be removed. It seems like a formatting issue. (pelikhan marked this conversation as resolved.)
}
trace.fence(toolContent, "markdown")
toolResult.push(toolContent)
@@ -735,8 +736,8 @@
): GenerationOptions {
    const res = {
        ...options,
        ...(runOptions || {}),
        model:
            runOptions?.model ??
            options?.model ??
            host.defaultModelOptions.model,

Check failure on line 740 in packages/core/src/chat.ts (GitHub Actions / build): The `toChatCompletionUserMessage` function is called with an empty string as the message content. This might lead to unintended empty messages being sent.

@@ -744,6 +745,10 @@
runOptions?.smallModel ??
options?.smallModel ??
host.defaultModelOptions.smallModel,
visionModel:
runOptions?.visionModel ??
options?.visionModel ??
host.defaultModelOptions.visionModel,
temperature:
runOptions?.temperature ?? host.defaultModelOptions.temperature,
embeddingsModel:
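Each model option in `GenerationOptions` resolves through the same nullish-coalescing chain: per-run value, then script option, then host default. A self-contained sketch for the vision case (the `resolveVisionModel` helper is illustrative):

```javascript
// Sketch: precedence of visionModel sources, mirroring chat.ts.
// `??` falls through only on null/undefined, so runOptions win when set.
function resolveVisionModel(runOptions, options, hostDefaults) {
    return (
        runOptions?.visionModel ??
        options?.visionModel ??
        hostDefaults.visionModel
    )
}

const defaults = { visionModel: "openai:gpt-4o" }
console.log(resolveVisionModel({ visionModel: "azure:gpt-4o" }, {}, defaults)) // "azure:gpt-4o"
console.log(resolveVisionModel(undefined, { visionModel: "github:gpt-4o" }, defaults)) // "github:gpt-4o"
console.log(resolveVisionModel(undefined, undefined, defaults)) // "openai:gpt-4o"
```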
3 changes: 3 additions & 0 deletions packages/core/src/connection.ts
@@ -68,6 +68,9 @@ export async function parseDefaultsFromEnv(env: Record<string, string>) {
if (env.GENAISCRIPT_DEFAULT_SMALL_MODEL)
host.defaultModelOptions.smallModel =
env.GENAISCRIPT_DEFAULT_SMALL_MODEL
if (env.GENAISCRIPT_DEFAULT_VISION_MODEL)
host.defaultModelOptions.visionModel =
env.GENAISCRIPT_DEFAULT_VISION_MODEL
const t = normalizeFloat(env.GENAISCRIPT_DEFAULT_TEMPERATURE)
if (!isNaN(t)) host.defaultModelOptions.temperature = t
if (env.GENAISCRIPT_DEFAULT_EMBEDDINGS_MODEL)
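`parseDefaultsFromEnv` only overwrites a default when the variable is set and non-empty. A reduced sketch of that behavior (plain objects instead of the real host; `GENAISCRIPT_DEFAULT_MODEL` is assumed by analogy with the small/vision variables shown in the diff):

```javascript
// Sketch: apply GENAISCRIPT_DEFAULT_* environment overrides,
// mirroring parseDefaultsFromEnv in connection.ts.
function applyEnvDefaults(defaults, env) {
    if (env.GENAISCRIPT_DEFAULT_MODEL)
        defaults.model = env.GENAISCRIPT_DEFAULT_MODEL
    if (env.GENAISCRIPT_DEFAULT_SMALL_MODEL)
        defaults.smallModel = env.GENAISCRIPT_DEFAULT_SMALL_MODEL
    if (env.GENAISCRIPT_DEFAULT_VISION_MODEL)
        defaults.visionModel = env.GENAISCRIPT_DEFAULT_VISION_MODEL
    return defaults
}

const d = applyEnvDefaults(
    { model: "openai:gpt-4o", smallModel: "openai:gpt-4o-mini", visionModel: "openai:gpt-4o" },
    { GENAISCRIPT_DEFAULT_VISION_MODEL: "azure:gpt-4o" }
)
// d.visionModel is now "azure:gpt-4o"; the other defaults are unchanged.
```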
9 changes: 9 additions & 0 deletions packages/core/src/constants.ts
@@ -53,6 +53,7 @@ export const RETRIEVAL_PERSIST_DIR = "retrieval"
export const HIGHLIGHT_LENGTH = 4000
export const SMALL_MODEL_ID = "small"
export const LARGE_MODEL_ID = "large"
export const VISION_MODEL_ID = "vision"
export const DEFAULT_MODEL = "openai:gpt-4o"
export const DEFAULT_MODEL_CANDIDATES = [
"azure:gpt-4o",
@@ -62,6 +63,14 @@
"github:gpt-4o",
"client:gpt-4",
]
export const DEFAULT_VISION_MODEL = "openai:gpt-4o"
export const DEFAULT_VISION_MODEL_CANDIDATES = [
"azure:gpt-4o",
"azure_serverless:gpt-4o",
DEFAULT_MODEL,
"anthropic:claude-2",
"github:gpt-4o",
]
export const DEFAULT_SMALL_MODEL = "openai:gpt-4o-mini"
export const DEFAULT_SMALL_MODEL_CANDIDATES = [
"azure:gpt-4o-mini",
14 changes: 12 additions & 2 deletions packages/core/src/genaiscript-api-provider.mjs
@@ -22,8 +22,17 @@ class GenAIScriptApiProvider {
}

async callApi(prompt, context) {
const { model, smallModel, temperature, top_p, cache, version, cli, quiet } =
this.config
const {
model,
smallModel,
visionModel,
temperature,
top_p,
cache,
version,
cli,
quiet,
} = this.config
const { vars, logger } = context
try {
let files = vars.files // string or string[]
@@ -52,6 +61,7 @@
if (quiet) args.push("--quiet")
if (model) args.push("--model", model)
if (smallModel) args.push("--small-model", smallModel)
if (visionModel) args.push("--vision-model", visionModel)
if (temperature !== undefined)
args.push("--temperature", temperature)
if (top_p !== undefined) args.push("--top_p", top_p)
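The promptfoo provider forwards each configured model alias to the CLI only when it is present in the config. A trimmed sketch of that argument assembly (`buildModelArgs` is an illustrative name for logic inlined in `callApi`):

```javascript
// Sketch: translate provider config into genaiscript CLI flags,
// mirroring genaiscript-api-provider.mjs.
function buildModelArgs({ model, smallModel, visionModel } = {}) {
    const args = []
    if (model) args.push("--model", model)
    if (smallModel) args.push("--small-model", smallModel)
    if (visionModel) args.push("--vision-model", visionModel)
    return args
}

console.log(buildModelArgs({ visionModel: "openai:gpt-4o" }))
// [ '--vision-model', 'openai:gpt-4o' ]
```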
53 changes: 53 additions & 0 deletions packages/core/src/genaisrc/system.vision_ask_image.genai.mjs
@@ -0,0 +1,53 @@
system({
title: "Vision Ask Image",
description:
"Register tool that uses vision model to run a query on an image",
})

defTool(
"vision_ask_image",
"Use vision model to run a query on an image",
{
type: "object",
properties: {
image: {
type: "string",
description: "Image URL or workspace relative filepath",
},
query: {
type: "string",
description: "Query to run on the image",
},
hd: {
type: "boolean",
description: "Use high definition image",
},
},
required: ["image", "query"],
},
async (args) => {
const { image, query, hd } = args
const res = await runPrompt(
(_) => {
_.defImages(image, {
autoCrop: true,
detail: hd ? "high" : "low",
maxWidth: hd ? 1024 : 512,
maxHeight: hd ? 1024 : 512,
})
_.$`Answer this query about the images:`
_.def("QUERY", query)
},
{
model: "vision",
system: [
"system",
"system.assistant",
"system.safety_jailbreak",
"system.safety_harmful_content",
],
}
)
return res
}
)
2 changes: 1 addition & 1 deletion packages/core/src/host.ts
@@ -125,7 +125,7 @@ export interface Host {

// read a secret from the environment or a .env file
defaultModelOptions: Required<
Pick<ModelOptions, "model" | "smallModel" | "temperature">
Pick<ModelOptions, "model" | "smallModel" | "visionModel" | "temperature">
>
defaultEmbeddingsModelOptions: Required<
Pick<EmbeddingsModelOptions, "embeddingsModel">
4 changes: 3 additions & 1 deletion packages/core/src/image.ts
@@ -15,11 +15,13 @@ export async function imageEncodeForLLM(
) {
// Dynamically import the Jimp library and its alignment enums
const { Jimp, HorizontalAlign, VerticalAlign } = await import("jimp")
const { autoCrop, maxHeight, maxWidth } = options
let { autoCrop, maxHeight, maxWidth } = options

// If the URL is a string, resolve it to a data URI
if (typeof url === "string") url = await resolveFileDataUri(url)

// https://platform.openai.com/docs/guides/vision/calculating-costs#managing-images

// Return the URL if no image processing is required
if (
typeof url === "string" &&
8 changes: 8 additions & 0 deletions packages/core/src/models.ts
@@ -3,10 +3,12 @@ import {
DEFAULT_EMBEDDINGS_MODEL_CANDIDATES,
DEFAULT_MODEL_CANDIDATES,
DEFAULT_SMALL_MODEL_CANDIDATES,
DEFAULT_VISION_MODEL_CANDIDATES,
LARGE_MODEL_ID,
MODEL_PROVIDER_LLAMAFILE,
MODEL_PROVIDER_OPENAI,
SMALL_MODEL_ID,
VISION_MODEL_ID,
} from "./constants"
import { errorMessage } from "./error"
import { LanguageModelConfiguration, host } from "./host"
@@ -107,6 +109,12 @@ export async function resolveModelConnectionInfo(
host.defaultModelOptions.smallModel,
...DEFAULT_SMALL_MODEL_CANDIDATES,
]
} else if (m === VISION_MODEL_ID) {
m = undefined
candidates ??= [
host.defaultModelOptions.visionModel,
...DEFAULT_VISION_MODEL_CANDIDATES,
]
} else if (m === LARGE_MODEL_ID) {
m = undefined
candidates ??= [
Expand Down
Loading
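`resolveModelConnectionInfo` turns the `vision` alias into an ordered candidate list: the host default first, then the built-in fallbacks from constants.ts. A condensed sketch (the `candidatesForAlias` helper is illustrative; the real resolver also probes each candidate's connection info before settling on one):

```javascript
// Sketch: expand a model alias into candidates, mirroring models.ts.
const DEFAULT_VISION_MODEL_CANDIDATES = [
    "azure:gpt-4o",
    "azure_serverless:gpt-4o",
    "openai:gpt-4o",
    "anthropic:claude-2",
    "github:gpt-4o",
]

function candidatesForAlias(alias, hostDefaults) {
    if (alias === "vision")
        return [hostDefaults.visionModel, ...DEFAULT_VISION_MODEL_CANDIDATES]
    return [alias] // not an alias: use the model id as-is
}

console.log(candidatesForAlias("vision", { visionModel: "openai:gpt-4o" })[0])
// "openai:gpt-4o"
```

Listing the host default first means a user- or env-configured vision model always wins over the hard-coded fallbacks.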