From 1c906dfca329fa7172eb1e62050253b7abb36568 Mon Sep 17 00:00:00 2001 From: Peli de Halleux Date: Sat, 21 Dec 2024 16:28:49 +0100 Subject: [PATCH] zod support (#963) --- README.md | 2 +- .../docs/reference/scripts/schemas.mdx | 157 ++++++++++-------- packages/cli/README.md | 2 +- packages/cli/package.json | 10 +- packages/cli/src/runtime.ts | 10 +- packages/core/src/promptdom.ts | 5 +- packages/core/src/types/prompt_template.d.ts | 4 +- packages/core/src/types/prompt_type.d.ts | 2 +- packages/core/src/zod.ts | 19 +++ .../{cityinfo-zod.genai.mts => zod.genai.mts} | 10 +- packages/vscode/README.md | 2 +- 11 files changed, 137 insertions(+), 86 deletions(-) create mode 100644 packages/core/src/zod.ts rename packages/sample/genaisrc/{cityinfo-zod.genai.mts => zod.genai.mts} (66%) diff --git a/README.md b/README.md index 1eedc0dcc9..35dd9df720 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ $`Analyze FILE and extract data to JSON using the ${schema} schema.` ### 📋 Data Schemas -Define, validate, and repair data using [schemas](https://microsoft.github.io/genaiscript/reference/scripts/schemas). +Define, validate, and repair data using [schemas](https://microsoft.github.io/genaiscript/reference/scripts/schemas). Zod support builtin. ```js const data = defSchema("MY_DATA", { type: "array", items: { ... } }) diff --git a/docs/src/content/docs/reference/scripts/schemas.mdx b/docs/src/content/docs/reference/scripts/schemas.mdx index a341a98cac..473d6a9f7f 100644 --- a/docs/src/content/docs/reference/scripts/schemas.mdx +++ b/docs/src/content/docs/reference/scripts/schemas.mdx @@ -1,17 +1,16 @@ --- title: Data Schemas sidebar: - order: 6 + order: 6 description: Learn how to define and use data schemas for structured output in - JSON/YAML with LLM, including validation and repair techniques. + JSON/YAML with LLM, including validation and repair techniques. 
keywords: data schemas, JSON schema, YAML validation, LLM structured output, - schema repair + schema repair genaiscript: - model: openai:gpt-3.5-turbo - + model: openai:gpt-3.5-turbo --- -import { Card } from '@astrojs/starlight/components'; +import { Card } from "@astrojs/starlight/components" It is possible to force the LLM to generate data that conforms to a specific schema. This technique works reasonably well and GenAIScript also provides automatic validation "just in case". @@ -32,11 +31,17 @@ const schema = defSchema("CITY_SCHEMA", { description: "A city with population and elevation information.", properties: { name: { type: "string", description: "The name of the city." }, - population: { type: "number", description: "The population of the city." }, - url: { type: "string", description: "The URL of the city's Wikipedia page." } + population: { + type: "number", + description: "The population of the city.", + }, + url: { + type: "string", + description: "The URL of the city's Wikipedia page.", + }, }, - required: ["name", "population", "url"] - } + required: ["name", "population", "url"], + }, }) $`Generate data using JSON compliant with ${schema}.` @@ -47,9 +52,9 @@ $`Generate data using JSON compliant with ${schema}.`
👤 user - ````markdown wrap CITY_SCHEMA: + ```typescript-schema // A list of cities with population and elevation information. type CITY_SCHEMA = Array<{ @@ -61,46 +66,64 @@ type CITY_SCHEMA = Array<{ url: string, }> ``` + Generate data using JSON compliant with CITY_SCHEMA. ```` -
-
🤖 assistant - ````markdown wrap File ./data.json: + ```json schema=CITY_SCHEMA [ - { - "name": "New York", - "population": 8398748, - "url": "https://en.wikipedia.org/wiki/New_York_City" - }, - { - "name": "Los Angeles", - "population": 3990456, - "url": "https://en.wikipedia.org/wiki/Los_Angeles" - }, - { - "name": "Chicago", - "population": 2705994, - "url": "https://en.wikipedia.org/wiki/Chicago" - } + { + "name": "New York", + "population": 8398748, + "url": "https://en.wikipedia.org/wiki/New_York_City" + }, + { + "name": "Los Angeles", + "population": 3990456, + "url": "https://en.wikipedia.org/wiki/Los_Angeles" + }, + { + "name": "Chicago", + "population": 2705994, + "url": "https://en.wikipedia.org/wiki/Chicago" + } ] ``` ```` -
{/* genaiscript output end */} +### Native zod support + +A [Zod](https://zod.dev/) type can be passed in `defSchema` and it will be automatically converted to JSON schema. +GenAIScript also exports the `z` object from Zod for convenience. +```js +// import from genaiscript +import { z } from "genaiscript/runtime" +// or directly from zod +// import { z } from "zod" +// create schema using zod +const CitySchema = z.array( + z.object({ + name: z.string(), + population: z.number(), + url: z.string(), + }) +) +// JSON schema to constrain the output of the tool. +const schema = defSchema("CITY_SCHEMA", CitySchema) +``` ### Prompt encoding @@ -111,12 +134,12 @@ from TypeChat, the schema is converted TypeScript types before being injected in // A list of cities with population and elevation information. type CITY_SCHEMA = Array<{ // The name of the city. - name: string, + name: string // The population of the city. - population: number, + population: number // The URL of the city's Wikipedia page. - url: string, - }> + url: string +}> ``` You can change this behavior by using the `{ format: "json" }` option. @@ -134,50 +157,46 @@ in the output folder as well.
schema CITY_SCHEMA -- source: +- source: ```json { - "type": "array", - "description": "A list of cities with population and elevation information.", - "items": { - "type": "object", - "description": "A city with population and elevation information.", - "properties": { - "name": { - "type": "string", - "description": "The name of the city." - }, - "population": { - "type": "number", - "description": "The population of the city." - }, - "url": { - "type": "string", - "description": "The URL of the city's Wikipedia page." - } - }, - "required": [ - "name", - "population", - "url" - ] - } + "type": "array", + "description": "A list of cities with population and elevation information.", + "items": { + "type": "object", + "description": "A city with population and elevation information.", + "properties": { + "name": { + "type": "string", + "description": "The name of the city." + }, + "population": { + "type": "number", + "description": "The population of the city." + }, + "url": { + "type": "string", + "description": "The URL of the city's Wikipedia page." + } + }, + "required": ["name", "population", "url"] + } } ``` -- prompt (rendered as typescript): + +- prompt (rendered as typescript): ```ts // A list of cities with population and elevation information. type CITY_SCHEMA = Array<{ // The name of the city. - name: string, + name: string // The population of the city. - population: number, + population: number // The URL of the city's Wikipedia page. - url: string, - }> - + url: string +}> ```
@@ -199,7 +218,7 @@ GenAIScript automatically validates the payload against the schema. :::tip -Not all data formats are equal! Some data formats like JSON introduce ambiguity +Not all data formats are equal! Some data formats like JSON introduce ambiguity and can confuse the LLM. [Read more...](https://betterprogramming.pub/yaml-vs-json-which-is-more-efficient-for-language-models-5bc11dd0f6df). @@ -207,7 +226,7 @@ and can confuse the LLM. ## Repair -GenAIScript will automatically try to repair the data by issues additional messages +GenAIScript will automatically try to repair the data by issuing additional messages back to the LLM with the parsing output. ## Runtime Validation @@ -216,4 +235,4 @@ Use `parsers.validateJSON` to validate JSON when running the script. ```js const validation = parsers.validateJSON(schema, json) -``` \ No newline at end of file +``` diff --git a/packages/cli/README.md b/packages/cli/README.md index 5dc1df8570..c249ab1d4e 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -94,7 +94,7 @@ $`Analyze FILE and extract data to JSON using the ${schema} schema.` ### 📋 Data Schemas -Define, validate, and repair data using [schemas](https://microsoft.github.io/genaiscript/reference/scripts/schemas). +Define, validate, and repair data using [schemas](https://microsoft.github.io/genaiscript/reference/scripts/schemas). Zod support builtin. ```js const data = defSchema("MY_DATA", { type: "array", items: { ... 
} }) diff --git a/packages/cli/package.json b/packages/cli/package.json index 5fe00442cc..ba9e1fee1a 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -67,7 +67,9 @@ "turndown-plugin-gfm": "^1.0.2", "typescript": "5.7.2", "vectra": "^0.9.0", - "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz" + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz", + "zod": "^3.24.1", + "zod-to-json-schema": "^3.24.1" }, "optionalDependencies": { "@huggingface/transformers": "^3.2.1", @@ -112,11 +114,11 @@ "zx": "^8.2.4" }, "scripts": { - "compile:runtime": "tsc src/runtime.ts --skipLibCheck --outDir built --declaration --target es2020 --moduleResolution node && mv built/runtime.js built/runtime.mjs", + "compile:runtime": "tsc src/runtime.ts --skipLibCheck --outDir built --declaration --target es2020 --moduleResolution node --module esnext && mv built/runtime.js built/runtime.mjs", "compile:api": "esbuild src/api.ts --outfile=built/api.mjs", - "compile:cli": "esbuild src/main.ts --metafile=./esbuild.meta.json --bundle --platform=node --target=node20 --outfile=built/genaiscript.cjs --external:tsx --external:esbuild --external:get-tsconfig --external:resolve-pkg-maps --external:dockerode --external:pdfjs-dist --external:web-tree-sitter --external:tree-sitter-wasms --external:promptfoo --external:typescript --external:@lvce-editor/ripgrep --external:gpt-3-encoder --external:mammoth --external:xlsx --external:mathjs --external:@azure/identity --external:gpt-tokenizer --external:playwright --external:@inquirer/prompts --external:jimp --external:turndown --external:turndown-plugin-gfm --external:vectra --external:tabletojson --external:html-to-text --external:@octokit/rest --external:@octokit/plugin-throttling --external:@octokit/plugin-retry --external:@octokit/plugin-paginate-rest --external:skia-canvas --external:@huggingface/transformers --external:@modelcontextprotocol/sdk --external:@anthropic-ai/sdk --external:@anthropic-ai/bedrock-sdk 
--external:es-toolkit && node ../../scripts/patch-cli.mjs", + "compile:cli": "esbuild src/main.ts --metafile=./esbuild.meta.json --bundle --platform=node --target=node20 --outfile=built/genaiscript.cjs --external:tsx --external:esbuild --external:get-tsconfig --external:resolve-pkg-maps --external:dockerode --external:pdfjs-dist --external:web-tree-sitter --external:tree-sitter-wasms --external:promptfoo --external:typescript --external:@lvce-editor/ripgrep --external:gpt-3-encoder --external:mammoth --external:xlsx --external:mathjs --external:@azure/identity --external:gpt-tokenizer --external:playwright --external:@inquirer/prompts --external:jimp --external:turndown --external:turndown-plugin-gfm --external:vectra --external:tabletojson --external:html-to-text --external:@octokit/rest --external:@octokit/plugin-throttling --external:@octokit/plugin-retry --external:@octokit/plugin-paginate-rest --external:skia-canvas --external:@huggingface/transformers --external:@modelcontextprotocol/sdk --external:@anthropic-ai/sdk --external:@anthropic-ai/bedrock-sdk --external:es-toolkit --external:zod --external:zod-to-json-schema && node ../../scripts/patch-cli.mjs", "compile": "yarn compile:api && yarn compile:runtime && yarn compile:cli", - "compile-debug": "esbuild src/main.ts --sourcemap --metafile=./esbuild.meta.json --bundle --platform=node --target=node20 --outfile=built/genaiscript.cjs --external:tsx --external:esbuild --external:get-tsconfig --external:resolve-pkg-maps --external:dockerode --external:pdfjs-dist --external:web-tree-sitter --external:tree-sitter-wasms --external:promptfoo --external:typescript --external:@lvce-editor/ripgrep --external:gpt-3-encoder --external:mammoth --external:xlsx --external:mathjs --external:@azure/identity --external:gpt-tokenizer --external:playwright --external:@inquirer/prompts --external:jimp --external:turndown --external:turndown-plugin-gfm --external:vectra --external:tabletojson --external:html-to-text 
--external:@octokit/rest --external:@octokit/plugin-throttling --external:@octokit/plugin-retry --external:@octokit/plugin-paginate-rest --external:skia-canvas --external:@huggingface/transformers --external:@modelcontextprotocol/sdk --external:@anthropic-ai/sdk --external:@anthropic-ai/bedrock-sdk --external:es-toolkit", + "compile-debug": "esbuild src/main.ts --sourcemap --metafile=./esbuild.meta.json --bundle --platform=node --target=node20 --outfile=built/genaiscript.cjs --external:tsx --external:esbuild --external:get-tsconfig --external:resolve-pkg-maps --external:dockerode --external:pdfjs-dist --external:web-tree-sitter --external:tree-sitter-wasms --external:promptfoo --external:typescript --external:@lvce-editor/ripgrep --external:gpt-3-encoder --external:mammoth --external:xlsx --external:mathjs --external:@azure/identity --external:gpt-tokenizer --external:playwright --external:@inquirer/prompts --external:jimp --external:turndown --external:turndown-plugin-gfm --external:vectra --external:tabletojson --external:html-to-text --external:@octokit/rest --external:@octokit/plugin-throttling --external:@octokit/plugin-retry --external:@octokit/plugin-paginate-rest --external:skia-canvas --external:@huggingface/transformers --external:@modelcontextprotocol/sdk --external:@anthropic-ai/sdk --external:@anthropic-ai/bedrock-sdk --external:es-toolkit --external:zod --external:zod-to-json-schema", "postcompile": "node built/genaiscript.cjs info help > ../../docs/src/content/docs/reference/cli/commands.md", "vis:treemap": "npx --yes esbuild-visualizer --metadata esbuild.meta.json --filename esbuild.treemap.html", "vis:network": "npx --yes esbuild-visualizer --metadata esbuild.meta.json --filename esbuild.network.html --template network", diff --git a/packages/cli/src/runtime.ts b/packages/cli/src/runtime.ts index 6d7ad17e3d..6c4515e3ed 100644 --- a/packages/cli/src/runtime.ts +++ b/packages/cli/src/runtime.ts @@ -1,9 +1,15 @@ /** * GenAIScript supporting runtime */ 
-import { delay as esDelay } from "es-toolkit" +import { delay as _delay } from "es-toolkit" +import { z as zod } from "zod" /** * A helper function to delay the execution of the script */ -export const delay: (ms: number) => Promise = esDelay +export const delay: (ms: number) => Promise = _delay + +/** + * Zod schema generator + */ +export const z = zod diff --git a/packages/core/src/promptdom.ts b/packages/core/src/promptdom.ts index 10eefbc849..71d9f5f63e 100644 --- a/packages/core/src/promptdom.ts +++ b/packages/core/src/promptdom.ts @@ -41,6 +41,7 @@ import { jinjaRenderChatMessage } from "./jinja" import { runtimeHost } from "./host" import { hash } from "./crypto" import { startMcpServer } from "./mcp" +import { tryZodToJsonSchema } from "./zod" // Definition of the PromptNode interface which is an essential part of the code structure. export interface PromptNode extends ContextExpansionOptions { @@ -361,11 +362,13 @@ export function createImageNode( // Function to create a schema node. export function createSchemaNode( name: string, - value: JSONSchema, + value: JSONSchema | ZodTypeLike, options?: DefSchemaOptions ): PromptSchemaNode { assert(!!name) assert(value !== undefined) + // auto zod conversion + value = tryZodToJsonSchema(value as ZodTypeLike) ?? 
(value as JSONSchema) return { type: "schema", name, value, options } } diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index c4dfbbb17a..f6481a5778 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -2604,10 +2604,12 @@ interface McpServerConfig { type McpServersConfig = Record> +type ZodTypeLike = { _def: any, safeParse: any, refine: any } + interface ChatGenerationContext extends ChatTurnGenerationContext { defSchema( name: string, - schema: JSONSchema, + schema: JSONSchema | ZodTypeLike, options?: DefSchemaOptions ): string defImages( diff --git a/packages/core/src/types/prompt_type.d.ts b/packages/core/src/types/prompt_type.d.ts index 97b4ef7097..56c7042f37 100644 --- a/packages/core/src/types/prompt_type.d.ts +++ b/packages/core/src/types/prompt_type.d.ts @@ -239,7 +239,7 @@ declare function fetchText( */ declare function defSchema( name: string, - schema: JSONSchema, + schema: JSONSchema | ZodTypeLike, options?: DefSchemaOptions ): string diff --git a/packages/core/src/zod.ts b/packages/core/src/zod.ts new file mode 100644 index 0000000000..f3fe5a600d --- /dev/null +++ b/packages/core/src/zod.ts @@ -0,0 +1,19 @@ +import { zodToJsonSchema as _zodToJsonSchema } from "zod-to-json-schema" + +/** + * Converts a Zod schema to a JSON schema + * @param z candidate value; converted only when it is truthy and has `_def`, `refine` and `safeParse` members + * @param options extra converter options, spread after the default `target: "openAi"` so they can override it + * @returns a structured clone of the generated JSON schema, or `undefined` when `z` does not look like a Zod type + */ +export function tryZodToJsonSchema( + z: ZodTypeLike, + options?: object +): JSONSchema { + if (!z || !z._def || !z.refine || !z.safeParse) return undefined + const schema = _zodToJsonSchema(z as any, { + target: "openAi", + ...(options || {}), + }) + return structuredClone(schema) as JSONSchema +} diff --git a/packages/sample/genaisrc/cityinfo-zod.genai.mts b/packages/sample/genaisrc/zod.genai.mts similarity index 66% rename from packages/sample/genaisrc/cityinfo-zod.genai.mts rename to packages/sample/genaisrc/zod.genai.mts index 603d5d4540..a63e4d6fd2 100644 
--- a/packages/sample/genaisrc/cityinfo-zod.genai.mts +++ b/packages/sample/genaisrc/zod.genai.mts @@ -1,12 +1,14 @@ script({ files: ["./src/cities.md"], + tests: { + files: ["./src/cities.md"], + }, }) // the data to analyze def("CITIES", env.files) -import { z } from "zod" -import { zodToJsonSchema } from "zod-to-json-schema" +import { z } from "genaiscript/runtime" // create schema using zod const CitySchema = z.array( z.object({ @@ -16,9 +18,7 @@ const CitySchema = z.array( }) ) // JSON schema to constrain the output of the tool. -const schema = defSchema("CITY_SCHEMA", zodToJsonSchema(CitySchema, "citySchema").definitions[ - "citySchema" -] as JSONSchemaArray) +const schema = defSchema("CITY_SCHEMA", CitySchema) // the task` $`Answer with the information of the cities in the CITIES data set, compliant with ${schema}.` diff --git a/packages/vscode/README.md b/packages/vscode/README.md index c649e8b11f..2e9fd84dde 100644 --- a/packages/vscode/README.md +++ b/packages/vscode/README.md @@ -24,7 +24,7 @@ $`Analyze FILE and - 📁 Scripts are [files](https://microsoft.github.io/genaiscript/reference/scripts/)! They can be versioned, shared, forked, ... -- 📊 Define, validate, repair data using [schemas](https://microsoft.github.io/genaiscript/reference/scripts/schemas). +- 📊 Define, validate, repair data using [schemas](https://microsoft.github.io/genaiscript/reference/scripts/schemas). Zod support builtin. ```js wrap const data = defSchema("MY_DATA",