From b8cc9871488017edd4a848f847fa0140270869e8 Mon Sep 17 00:00:00 2001
From: Peli de Halleux
Date: Wed, 23 Oct 2024 12:06:49 -0700
Subject: [PATCH] expose decoder (#792)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* expose decoder

* feat: ✨ add text chunking functionality to encoders

* fix: 🐛 handle undefined content and filename gracefully

* feat: ✨ add line numbering to text chunks

* refactor: :recycle: rename chunkText to chunk function

* refactor: ♻️ improve chunking and line numbering logic

* feat: ✨ add line tracking and summarization script

* feat: :white_check_mark: add tests object to script function

* docs: ✏️ add tokenizers reference documentation
---
 .../docs/reference/scripts/tokenizers.md      |  58 +++++++++
 packages/cli/src/parse.ts                     |   6 +-
 packages/cli/src/run.ts                       |   2 +-
 packages/core/src/anthropic.ts                |   2 +-
 packages/core/src/chat.ts                     |   2 +-
 packages/core/src/encoders.test.ts            |  49 ++++++--
 packages/core/src/encoders.ts                 |  78 ++++++++++--
 packages/core/src/git.ts                      |   2 +-
 packages/core/src/globals.ts                  |  10 +-
 packages/core/src/liner.ts                    |  24 ++++
 packages/core/src/openai.ts                   |   2 +-
 packages/core/src/parsers.ts                  |   4 +-
 packages/core/src/promptdom.ts                |   8 +-
 packages/core/src/prompty.test.ts             |  10 +-
 packages/core/src/prompty.ts                  |   2 +-
 packages/core/src/types/prompt_template.d.ts  | 118 +++++++++++++-----
 packages/sample/genaisrc/chunk.genai.mjs      |  25 ++++
 17 files changed, 332 insertions(+), 70 deletions(-)
 create mode 100644 docs/src/content/docs/reference/scripts/tokenizers.md
 create mode 100644 packages/sample/genaisrc/chunk.genai.mjs

diff --git a/docs/src/content/docs/reference/scripts/tokenizers.md b/docs/src/content/docs/reference/scripts/tokenizers.md
new file mode 100644
index 0000000000..47b40b0c1a
--- /dev/null
+++ b/docs/src/content/docs/reference/scripts/tokenizers.md
@@ -0,0 +1,58 @@
+---
+title: Tokenizers
+description: Tokenizers are used to split text into tokens.
+sidebar:
+    order: 60
+---
+
+The `tokenizers` helper module provides a set of functions to split text into tokens.
+
+```ts
+const n = await tokenizers.count("hello world")
+```
+
+## Choosing your tokenizer
+
+By default, the `tokenizers` module uses the `large` tokenizer. You can change the tokenizer by passing the model identifier.
+
+```ts 'model: "gpt-4o-mini"'
+const n = await tokenizers.count("hello world", { model: "gpt-4o-mini" })
+```
+
+## `count`
+
+Counts the number of tokens in a string.
+
+```ts wrap
+const n = await tokenizers.count("hello world")
+```
+
+## `truncate`
+
+Drops a part of the string to fit into a token budget.
+
+```ts wrap
+const truncated = await tokenizers.truncate("hello world", 5)
+```
+
+## `chunk`
+
+Splits the text into chunks of a given token size. The chunker tries to find
+appropriate chunk boundaries based on the document type.
+
+```ts
+const chunks = await tokenizers.chunk(env.files[0])
+for (const chunk of chunks) {
+    ...
+}
+```
+
+You can configure the chunk size and overlap, and add line numbers.
+
+```ts wrap
+const chunks = await tokenizers.chunk(env.files[0], {
+    chunkSize: 128,
+    chunkOverlap: 10,
+    lineNumbers: true
+})
+```
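+
+## Putting it together
+
+As a rough sketch (the 24-token budget and log format are illustrative, not part of the API), `chunk` and `truncate` can be combined to preview each chunk within a fixed budget:
+
+```ts wrap
+const chunks = await tokenizers.chunk(env.files[0], { chunkSize: 512 })
+for (const chunk of chunks) {
+    // keep roughly the first 24 tokens of each chunk
+    const head = await tokenizers.truncate(chunk.content, 24)
+    console.log(`${chunk.filename}:${chunk.lineStart}-${chunk.lineEnd} ${head}`)
+}
+```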
diff --git a/packages/cli/src/parse.ts b/packages/cli/src/parse.ts
index 4fccf038d2..5401d69a04 100644
--- a/packages/cli/src/parse.ts
+++ b/packages/cli/src/parse.ts
@@ -98,7 +98,7 @@ export async function parseJinja2(
     }
 ) {
     let src = await readFile(file, { encoding: "utf-8" })
-    if (PROMPTY_REGEX.test(file)) src = promptyParse(src).content
+    if (PROMPTY_REGEX.test(file)) src = promptyParse(file, src).content
     else if (MD_REGEX.test(file)) src = splitMarkdown(src).content
 
     const vars: Record<string, string> = parseOptionsVars(
@@ -188,7 +188,7 @@ export async function parseTokens(
     options: { excludedFiles: string[]; model: string }
 ) {
     const { model = DEFAULT_MODEL } = options || {}
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)
 
     const files = await expandFiles(filesGlobs, options?.excludedFiles)
     console.log(`parsing ${files.length} files`)
@@ -222,7 +222,7 @@ export async function prompty2genaiscript(
             : replaceExt(f, ".genai.mts")
         console.log(`${f} -> ${gf}`)
         const content = await readText(f)
-        const doc = promptyParse(content)
+        const doc = promptyParse(f, content)
         const script = promptyToGenAIScript(doc)
         await writeText(gf, script)
     }
diff --git a/packages/cli/src/run.ts b/packages/cli/src/run.ts
index 7b607d7206..a9216a766e 100644
--- a/packages/cli/src/run.ts
+++ b/packages/cli/src/run.ts
@@ -272,7 +272,7 @@ export async function runScript(
                 DOCS_CONFIGURATION_URL
             )
         }
-        trace.options.encoder = await resolveTokenEncoder(info.model)
+        trace.options.encoder = (await resolveTokenEncoder(info.model)).encode
         await runtimeHost.models.pullModel(info.model)
         let tokenColor = 0
diff --git a/packages/core/src/anthropic.ts b/packages/core/src/anthropic.ts
index f2cead1a24..d54f0efcb8 100644
--- a/packages/core/src/anthropic.ts
+++ b/packages/core/src/anthropic.ts
@@ -197,7 +197,7 @@ export const AnthropicChatCompletion: ChatCompletionHandler = async (
     const { requestOptions, partialCb, cancellationToken, inner } = options
     const { headers } = requestOptions || {}
     const { model } = parseModelIdentifier(req.model)
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)
 
     const anthropic = new Anthropic({
         baseURL: cfg.base,
diff --git a/packages/core/src/chat.ts b/packages/core/src/chat.ts
index 5229edef24..74625b7b66 100644
--- a/packages/core/src/chat.ts
+++ b/packages/core/src/chat.ts
@@ -140,7 +140,7 @@ async function runToolCalls(
 ) {
     const projFolder = host.projectFolder()
     const { cancellationToken, trace, model } = options || {}
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)
     assert(!!trace)
     let edits: Edits[] = []
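Note: every call site above follows the same mechanical rewrite — destructure `encode` from the resolved `Tokenizer` instead of treating the result as a bare function. Sketched in isolation (the `model` variable and import are assumed context):

```ts
const { encode: encoder } = await resolveTokenEncoder(model)
const tokens = encoder("some text") // still returns number[], same call shape as before
```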
diff --git a/packages/core/src/encoders.test.ts b/packages/core/src/encoders.test.ts
index 05274b8bc1..b8540cfda0 100644
--- a/packages/core/src/encoders.test.ts
+++ b/packages/core/src/encoders.test.ts
@@ -1,32 +1,67 @@
 import test, { describe } from "node:test"
 import assert from "node:assert"
-import { resolveTokenEncoder } from "./encoders"
-import { encode as defaultEncode } from "gpt-tokenizer"
+import { chunk, resolveTokenEncoder } from "./encoders"
+import { dedent } from "./indent"
 
 describe("resolveTokenEncoder", () => {
     test("gpt-3.5-turbo", async () => {
         const encoder = await resolveTokenEncoder("gpt-3.5-turbo")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
         assert.deepEqual(result, [1985, 1584])
     })
     test("gpt-4", async () => {
         const encoder = await resolveTokenEncoder("gpt-4")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
         assert.deepEqual(result, [1985, 1584])
     })
     test("gpt-4o", async () => {
         const encoder = await resolveTokenEncoder("gpt-4o")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
         assert.deepEqual(result, [3190, 2543])
     })
     test("gpt-4o-mini", async () => {
         const encoder = await resolveTokenEncoder("gpt-4o-mini")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
         assert.deepEqual(result, [3190, 2543])
     })
     test("gpt-4o forbidden", async () => {
         const encoder = await resolveTokenEncoder("gpt-4o")
-        const result = encoder("<|im_end|>")
+        const result = encoder.encode("<|im_end|>")
         assert.deepEqual(result, [27, 91, 321, 13707, 91, 29])
     })
+    test("gpt-4o chunk", async () => {
+        const chunks = await chunk(
+            {
+                filename: "markdown.md",
+                content: dedent`---
+title: What is Markdown? - Understanding Markdown Syntax
+description: Learn about Markdown, a lightweight markup language for formatting plain text, its syntax, and how it differs from WYSIWYG editors.
+keywords: Markdown, markup language, formatting, plain text, syntax
+sidebar: mydoc_sidebar
+---
+
+# Intro
+
+What is Markdown?
+ Markdown is a lightweight markup language that you can use to add formatting elements to plaintext text documents. Created by John Gruber in 2004, Markdown is now one of the world’s most popular markup languages.
+
+## What?
+
+Using Markdown is different than using a WYSIWYG editor. In an application like Microsoft Word, you click buttons to format words and phrases, and the changes are visible immediately. Markdown isn’t like that. When you create a Markdown-formatted file, you add Markdown syntax to the text to indicate which words and phrases should look different.
+
+## Examples
+
+For example, to denote a heading, you add a number sign before it (e.g., # Heading One). Or to make a phrase bold, you add two asterisks before and after it (e.g., **this text is bold**). It may take a while to get used to seeing Markdown syntax in your text, especially if you’re accustomed to WYSIWYG applications. The screenshot below shows a Markdown file displayed in the Visual Studio Code text editor....
+`,
+            },
+            {
+                chunkSize: 128,
+                chunkOverlap: 16,
+                model: "gpt-4o",
+                lineNumbers: true
+            }
+        )
+        console.log(chunks)
+        assert.equal(chunks.length, 3)
+    })
 })
diff --git a/packages/core/src/encoders.ts b/packages/core/src/encoders.ts
index 95b51388c7..91dbf107c2 100644
--- a/packages/core/src/encoders.ts
+++ b/packages/core/src/encoders.ts
@@ -1,26 +1,90 @@
 // Import the function to parse model identifiers
 import { parseModelIdentifier } from "./models"
+import { runtimeHost } from "./host"
+import path from "node:path"
+import { addLineNumbers, indexToLineNumber } from "./liner"
+import { resolveFileContent } from "./file"
+import { NotSupportedError } from "./error"
 
 /**
  * Resolves the appropriate token encoder based on the given model ID.
  * @param modelId - The identifier for the model to resolve the encoder for.
- * @returns A Promise that resolves to a TokenEncoder function.
+ * @returns A Promise that resolves to a Tokenizer object.
  */
-export async function resolveTokenEncoder(
-    modelId: string
-): Promise<TokenEncoder> {
+export async function resolveTokenEncoder(modelId: string): Promise<Tokenizer> {
     // Parse the model identifier to extract the model information
+    if (!modelId) modelId = runtimeHost.defaultModelOptions.model
     const { model } = parseModelIdentifier(modelId)
     const module = model // Assign model to module for dynamic import path
 
     const options = { disallowedSpecial: new Set<string>() }
     try {
         // Attempt to dynamically import the encoder module for the specified model
-        const mod = await import(`gpt-tokenizer/model/${module}`)
-        return (line) => mod.encode(line, options) // Return the encoder function
+        const { encode, decode } = await import(`gpt-tokenizer/model/${module}`)
+        return Object.freeze({
+            model,
+            encode: (line) => encode(line, options), // Return the model-specific encoder function
+            decode,
+        })
     } catch (e) {
         // If the specific model encoder is not found, default to gpt-4o encoder
-        const { encode } = await import("gpt-tokenizer")
-        return (line) => encode(line, options) // Return the default encoder function
+        const { encode, decode } = await import("gpt-tokenizer")
+        return Object.freeze({
+            model: "gpt-4o",
+            encode: (line) => encode(line, options), // Return the default encoder function
+            decode,
+        })
     }
 }
+
+export async function chunk(
+    file: Awaitable<string | WorkspaceFile>,
+    options?: TextChunkerConfig
+): Promise<TextChunk[]> {
+    const f = await file
+    let filename: string
+    let content: string
+    if (typeof f === "string") {
+        filename = undefined
+        content = f
+    } else if (typeof f === "object") {
+        await resolveFileContent(f)
+        filename = f.filename
+        content = f.content
+    } else return []
+
+    const {
+        model,
+        docType: optionsDocType,
+        lineNumbers,
+        ...rest
+    } = options || {}
+    const docType = (
+        optionsDocType || (filename ? path.extname(filename) : undefined)
+    )
+        ?.toLowerCase()
+        ?.replace(/^\./, "")
+    const tokenizer = await resolveTokenEncoder(model)
+    const { TextSplitter } = await import("vectra/lib/TextSplitter")
+    const ts = new TextSplitter({
+        ...rest,
+        docType,
+        tokenizer,
+        keepSeparators: true,
+    })
+    const chunksRaw = ts.split(content)
+    const chunks = chunksRaw.map(({ text, startPos, endPos }) => {
+        const lineStart = indexToLineNumber(content, startPos)
+        const lineEnd = indexToLineNumber(content, endPos)
+        if (lineNumbers) {
+            text = addLineNumbers(text, { startLine: lineStart })
+        }
+        return {
+            content: text,
+            filename,
+            lineStart,
+            lineEnd,
+        } satisfies TextChunk
+    })
+    return chunks
+}
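Note: a quick sketch of the new contract (token values taken from the tests above; unknown models fall back to the `gpt-4o` tokenizer, and the decoder is now exposed alongside the encoder):

```ts
const tokenizer = await resolveTokenEncoder("some-unknown-model")
tokenizer.model // "gpt-4o" — the fallback branch
const tokens = tokenizer.encode("test line") // [3190, 2543]
tokenizer.decode(tokens) // "test line"
```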
diff --git a/packages/core/src/git.ts b/packages/core/src/git.ts
index f1fb381c77..ba5811470a 100644
--- a/packages/core/src/git.ts
+++ b/packages/core/src/git.ts
@@ -282,7 +282,7 @@ export class GitClient implements Git {
         }
         if (!nameOnly && llmify) {
             res = llmifyDiff(res)
-            const encoder = await resolveTokenEncoder(
+            const { encode: encoder } = await resolveTokenEncoder(
                 runtimeHost.defaultModelOptions.model || DEFAULT_MODEL
             )
             const tokens = estimateTokens(res, encoder)
diff --git a/packages/core/src/globals.ts b/packages/core/src/globals.ts
index f206f149d4..41992dba93 100644
--- a/packages/core/src/globals.ts
+++ b/packages/core/src/globals.ts
@@ -12,12 +12,10 @@ import { JSONLStringify, JSONLTryParse } from "./jsonl"
 import { HTMLTablesToJSON, HTMLToMarkdown, HTMLToText } from "./html"
 import { CancelError } from "./error"
 import { fetchText } from "./fetch"
-import { readText } from "./fs"
-import { logVerbose } from "./util"
 import { GitHubClient } from "./github"
 import { GitClient } from "./git"
 import { estimateTokens, truncateTextToTokens } from "./tokens"
-import { resolveTokenEncoder } from "./encoders"
+import { chunk, resolveTokenEncoder } from "./encoders"
 import { runtimeHost } from "./host"
 
 /**
@@ -122,19 +120,21 @@ export function installGlobals() {
     glb.git = new GitClient()
 
     glb.tokenizers = Object.freeze({
+        resolve: resolveTokenEncoder,
         count: async (text, options) => {
-            const encoder = await resolveTokenEncoder(
+            const { encode: encoder } = await resolveTokenEncoder(
                 options?.model || runtimeHost.defaultModelOptions.model
             )
             const c = await estimateTokens(text, encoder)
            return c
         },
         truncate: async (text, maxTokens, options) => {
-            const encoder = await resolveTokenEncoder(
+            const { encode: encoder } = await resolveTokenEncoder(
                 options?.model || runtimeHost.defaultModelOptions.model
             )
             return await truncateTextToTokens(text, maxTokens, encoder, options)
         },
+        chunk: chunk,
     })
 
     /**
diff --git a/packages/core/src/liner.ts b/packages/core/src/liner.ts
index bc64742168..1dbe9e0021 100644
--- a/packages/core/src/liner.ts
+++ b/packages/core/src/liner.ts
@@ -62,3 +62,27 @@ export function extractRange(
     const endLine = lineEnd || lines.length
     return lines.slice(startLine - 1, endLine).join("\n")
 }
+
+/**
+ * Converts a string position index to a line number.
+ * @param text - The text in which to find the line number.
+ * @param index - The position index within the text.
+ * @returns The line number corresponding to the position index, starting from 1.
+ */
+export function indexToLineNumber(text: string, index: number): number {
+    if (
+        text === undefined ||
+        text === null ||
+        index < 0 ||
+        index >= text.length
+    )
+        return -1
+    let lineNumber = 1
+    const n = Math.min(index, text.length)
+    for (let i = 0; i < n; i++) {
+        if (text[i] === "\n") {
+            lineNumber++
+        }
+    }
+    return lineNumber
+}
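Note: a few worked cases for `indexToLineNumber`, derived directly from the implementation above:

```ts
indexToLineNumber("a\nbc\nd", 0) // 1 — "a" sits on the first line
indexToLineNumber("a\nbc\nd", 2) // 2 — index 2 ("b") comes after one newline
indexToLineNumber("a\nbc\nd", 99) // -1 — out-of-range indices are rejected
```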
+ */ +export function indexToLineNumber(text: string, index: number): number { + if ( + text === undefined || + text === null || + index < 0 || + index >= text.length + ) + return -1 + let lineNumber = 1 + const n = Math.min(index, text.length) + for (let i = 0; i < n; i++) { + if (text[i] === "\n") { + lineNumber++ + } + } + return lineNumber +} diff --git a/packages/core/src/openai.ts b/packages/core/src/openai.ts index 38c7dc59f6..8d98b6a859 100644 --- a/packages/core/src/openai.ts +++ b/packages/core/src/openai.ts @@ -73,7 +73,7 @@ export const OpenAIChatCompletion: ChatCompletionHandler = async ( const { headers = {}, ...rest } = requestOptions || {} const { token, source, ...cfgNoToken } = cfg const { model } = parseModelIdentifier(req.model) - const encoder = await resolveTokenEncoder(model) + const { encode: encoder } = await resolveTokenEncoder(model) const cache = !!cacheOrName || !!cacheName const cacheStore = getChatCompletionCache( diff --git a/packages/core/src/parsers.ts b/packages/core/src/parsers.ts index 236b0ca630..286ab75004 100644 --- a/packages/core/src/parsers.ts +++ b/packages/core/src/parsers.ts @@ -34,7 +34,7 @@ export async function createParsers(options: { model: string }): Promise { const { trace, model } = options - const encoder = await resolveTokenEncoder(model) + const { encode: encoder } = await resolveTokenEncoder(model) return Object.freeze({ JSON5: (text, options) => JSON5TryParse(filenameOrFileToContent(text), options?.defaultValue), @@ -120,6 +120,6 @@ export async function createParsers(options: { }, diff: (f1, f2) => llmifyDiff(createDiff(f1, f2)), tidyData: (rows, options) => tidyData(rows, options), - hash: async (text, options) => await hash(text, options) + hash: async (text, options) => await hash(text, options), }) } diff --git a/packages/core/src/promptdom.ts b/packages/core/src/promptdom.ts index c1d956a634..83b2f3dda7 100644 --- a/packages/core/src/promptdom.ts +++ b/packages/core/src/promptdom.ts @@ -568,7 +568,7 @@ async function resolvePromptNode( model: string, root: PromptNode ): Promise<{ errors: number }> { - const encoder = await resolveTokenEncoder(model) + const { encode: encoder } = await resolveTokenEncoder(model) let err = 0 const names = new Set() const uniqueName = (n_: string) => { @@ -742,7 +742,7 @@ async function resolveImportPrompty( args: Record, options: ImportTemplateOptions ) { - const { messages } = promptyParse(f.content) + const { messages } = promptyParse(f.filename, f.content) for (const message of messages) { const txt = jinjaRenderChatMessage(message, args) if (message.role === "assistant") @@ -761,7 +761,7 @@ async function truncatePromptNode( options?: TraceOptions ): Promise { const { trace } = options || {} - const encoder = await resolveTokenEncoder(model) + const { encode: encoder } = await resolveTokenEncoder(model) let truncated = false const cap = (n: { @@ -923,7 +923,7 @@ export async function renderPromptNode( ): Promise { const { trace, flexTokens } = options || {} const { model } = parseModelIdentifier(modelId) - const encoder = await resolveTokenEncoder(model) + const { encode: encoder } = await resolveTokenEncoder(model) await resolvePromptNode(model, node) await tracePromptNode(trace, node) diff --git a/packages/core/src/prompty.test.ts b/packages/core/src/prompty.test.ts index e24a41d715..9db49c1ef2 100644 --- a/packages/core/src/prompty.test.ts +++ b/packages/core/src/prompty.test.ts @@ -4,7 +4,7 @@ import assert from "node:assert/strict" describe("promptyParse", () => { test("correctly 
diff --git a/packages/core/src/prompty.test.ts b/packages/core/src/prompty.test.ts
index e24a41d715..9db49c1ef2 100644
--- a/packages/core/src/prompty.test.ts
+++ b/packages/core/src/prompty.test.ts
@@ -4,7 +4,7 @@ import assert from "node:assert/strict"
 
 describe("promptyParse", () => {
     test("correctly parses an empty markdown string", () => {
-        const result = promptyParse("")
+        const result = promptyParse(undefined, "")
         assert.deepStrictEqual(result, {
             meta: {},
             frontmatter: {},
@@ -15,7 +15,7 @@ describe("promptyParse", () => {
 
     test("correctly parses a markdown string without frontmatter", () => {
         const content = "This is a sample content without frontmatter."
-        const result = promptyParse(content)
+        const result = promptyParse(undefined, content)
         assert.deepStrictEqual(result, {
             meta: {},
             frontmatter: {},
@@ -40,7 +40,7 @@ sample:
 ---
 # Heading
 Content below heading.`
-        const result = promptyParse(markdownString)
+        const result = promptyParse(undefined, markdownString)
         assert.deepStrictEqual(result.frontmatter, {
             name: "Test",
             description: "A test description",
@@ -59,7 +59,7 @@ assistant:
 Assistant's reply
 user:
 Another message from the user`
-        const result = promptyParse(markdownContent)
+        const result = promptyParse(undefined, markdownContent)
         assert.deepStrictEqual(result.messages, [
             { role: "user", content: "User's message" },
             { role: "assistant", content: "Assistant's reply" },
@@ -69,7 +69,7 @@ Another message from the user`
 
     test("correctly handles a markdown string with content but without roles", () => {
         const markdownContent = `Just some content without specifying roles.`
-        const result = promptyParse(markdownContent)
+        const result = promptyParse(undefined, markdownContent)
         assert.deepStrictEqual(result.messages, [
             { role: "system", content: markdownContent },
         ])
diff --git a/packages/core/src/prompty.ts b/packages/core/src/prompty.ts
index 8cbaa02c4e..597dd112dc 100644
--- a/packages/core/src/prompty.ts
+++ b/packages/core/src/prompty.ts
@@ -123,7 +123,7 @@ export function promptyParse(filename: string, text: string): PromptyDocument {
     const { frontmatter = "", content = "" } = splitMarkdown(text)
     const fm = YAMLTryParse(frontmatter) ?? {}
     const meta: PromptArgs = fm ? promptyFrontmatterToMeta(fm) : {}
-    meta.filename = filename
+    if (filename) meta.filename = filename
     const messages: ChatCompletionMessageParam[] = []
 
     // split
diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts
index 6673357d99..00317bd7df 100644
--- a/packages/core/src/types/prompt_template.d.ts
+++ b/packages/core/src/types/prompt_template.d.ts
@@ -108,29 +108,35 @@ type PromptOutputProcessorHandler = (
 
 type PromptTemplateResponseType = "json_object" | "json_schema" | undefined
 
+type ModelType = OptionsOrString<
+    | "large"
+    | "small"
+    | "openai:gpt-4o"
+    | "openai:gpt-4o-mini"
+    | "openai:gpt-3.5-turbo"
+    | "azure:gpt-4o"
+    | "azure:gpt-4o-mini"
+    | "ollama:phi3"
+    | "ollama:llama3"
+    | "ollama:mixtral"
+    | "anthropic:claude-3-5-sonnet-20240620"
+    | "anthropic:claude-3-opus-20240229"
+    | "anthropic:claude-3-sonnet-20240229"
+    | "anthropic:claude-3-haiku-20240307"
+    | "anthropic:claude-2.1"
+    | "anthropic:claude-2.0"
+    | "anthropic:claude-instant-1.2"
+>
+
+type ModelSmallType = OptionsOrString<
+    "openai:gpt-4o-mini" | "openai:gpt-3.5-turbo" | "azure:gpt-4o-mini"
+>
+
 interface ModelConnectionOptions {
     /**
      * Which LLM model to use. Use `large` for the default set of model candidates, `small` for the set of small models like gpt-4o-mini.
      */
-    model?: OptionsOrString<
-        | "large"
-        | "small"
-        | "openai:gpt-4o"
-        | "openai:gpt-4o-mini"
-        | "openai:gpt-3.5-turbo"
-        | "azure:gpt-4o"
-        | "azure:gpt-4o-mini"
-        | "ollama:phi3"
-        | "ollama:llama3"
-        | "ollama:mixtral"
-        | "anthropic:claude-3-5-sonnet-20240620"
-        | "anthropic:claude-3-opus-20240229"
-        | "anthropic:claude-3-sonnet-20240229"
-        | "anthropic:claude-3-haiku-20240307"
-        | "anthropic:claude-2.1"
-        | "anthropic:claude-2.0"
-        | "anthropic:claude-instant-1.2"
-    >
+    model?: ModelType
 
     /**
      * Which LLM model to use for the "small" model.
      *
@@ -138,9 +144,7 @@ interface ModelConnectionOptions {
      * @default gpt-4
      * @example gpt-4
      */
-    smallModel?: OptionsOrString<
-        "openai:gpt-4o-mini" | "openai:gpt-3.5-turbo" | "azure:gpt-4o-mini"
-    >
+    smallModel?: ModelSmallType
 }
 
 interface ModelOptions extends ModelConnectionOptions {
@@ -248,16 +252,11 @@ interface PromptSystemOptions {
     excludedSystem?: ElementOrArray<SystemPromptId>
 }
 
-interface ScriptRuntimeOptions {
+interface ScriptRuntimeOptions extends LineNumberingOptions {
     /**
      * Secrets required by the prompt
      */
     secrets?: string[]
-
-    /**
-     * Default value for emitting line numbers in fenced code blocks.
-     */
-    lineNumbers?: boolean
 }
 
 type PromptParameterType =
@@ -981,7 +980,7 @@ interface RunPromptResult {
     fileEdits?: Record<string, FileUpdate>
     edits?: Edits[]
     changelogs?: ChangeLog[]
-    model?: string
+    model?: ModelType
 }
 
 /**
@@ -1083,19 +1082,60 @@ interface ParseZipOptions {
 }
 
 type TokenEncoder = (text: string) => number[]
+type TokenDecoder = (lines: Iterable<number>) => string
+
+interface Tokenizer {
+    model: string
+    encode: TokenEncoder
+    decode: TokenDecoder
+}
 
 interface CSVParseOptions {
     delimiter?: string
     headers?: string[]
 }
 
+interface TextChunk extends WorkspaceFile {
+    lineStart: number
+    lineEnd: number
+}
+
+interface TextChunkerConfig extends LineNumberingOptions {
+    model?: ModelType
+    chunkSize?: number
+    chunkOverlap?: number
+    docType?: OptionsOrString<
+        | "cpp"
+        | "python"
+        | "py"
+        | "java"
+        | "go"
+        | "c#"
+        | "c"
+        | "cs"
+        | "ts"
+        | "js"
+        | "tsx"
+        | "typescript"
+        | "jsx"
+        | "javascript"
+        | "php"
+        | "md"
+        | "mdx"
+        | "markdown"
+        | "rst"
+        | "rust"
+    >
+}
+
 interface Tokenizers {
     /**
      * Estimates the number of tokens in the content. May not be accurate
      * @param model
      * @param text
      */
-    count(text: string, options?: { model: string }): Promise<number>
+    count(text: string, options?: { model?: ModelType }): Promise<number>
 
     /**
      * Truncates the text to a given number of tokens, approximation.
@@ -1107,8 +1147,24 @@ interface Tokenizers {
     truncate(
         text: string,
         maxTokens: number,
-        options?: { model?: string; last?: boolean }
+        options?: { model?: ModelType; last?: boolean }
     ): Promise<string>
+
+    /**
+     * Tries to resolve a tokenizer for a given model. Defaults to gpt-4o if not found.
+     * @param model
+     */
+    resolve(model?: ModelType): Promise<Tokenizer>
+
+    /**
+     * Chunks the text into smaller pieces based on a token limit and chunking strategy.
+     * @param file file or text content to chunk
+     * @param options
+     */
+    chunk(
+        file: Awaitable<string | WorkspaceFile>,
+        options?: TextChunkerConfig
+    ): Promise<TextChunk[]>
 }
 
 interface HashOptions {
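Note: `docType` is normally inferred from the file extension; for raw string input there is no filename, so a sketch of forcing it (the `source` value is illustrative):

```ts
const source = "function add(a: number, b: number) { return a + b }"
const chunks = await tokenizers.chunk(source, { docType: "ts", chunkSize: 64 })
```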
diff --git a/packages/sample/genaisrc/chunk.genai.mjs b/packages/sample/genaisrc/chunk.genai.mjs
new file mode 100644
index 0000000000..f9ed57c27b
--- /dev/null
+++ b/packages/sample/genaisrc/chunk.genai.mjs
@@ -0,0 +1,25 @@
+script({
+    files: "src/rag/loremipsum.pdf",
+    tests: {},
+})
+
+const chunks = await tokenizers.chunk(env.files[0], {
+    chunkSize: 256,
+    chunkOverlap: 42,
+    lineNumbers: true,
+})
+
+let summary = ""
+for (const chunk of chunks) {
+    const { text } = await runPrompt(
+        (ctx) => {
+            ctx.def("CHUNK", chunk)
+            ctx.def("SUMMARY_SO_FAR", summary, { ignoreEmpty: true })
+            ctx.$`Summarize CHUNK. Use SUMMARY_SO_FAR as a starting point (but do not repeat it).`
+        },
+        { model: "small", system: ["system"] }
+    )
+    summary = text
+}
+
+console.log(summary)
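Note: the sample implements a rolling summarization — each chunk is summarized with the running summary as context, so the final `summary` covers the whole document. As a possible extension (assuming the standard `workspace` file API is available in the script context), the result could be persisted:

```ts
// sketch: output path is illustrative
await workspace.writeText("loremipsum.summary.md", summary)
```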