expose decoder #792

Merged · 9 commits · Oct 23, 2024
58 changes: 58 additions & 0 deletions docs/src/content/docs/reference/scripts/tokenizers.md
@@ -0,0 +1,58 @@
---
title: Tokenizers
description: Tokenizers are used to split text into tokens.
sidebar:
order: 60
---

The `tokenizers` helper module provides a set of functions to split text into tokens.

```ts
const n = await tokenizers.count("hello world")
```

## Choosing your tokenizer

By default, the `tokenizers` module uses the `large` tokenizer. You can select a different tokenizer by passing a model identifier.

```ts 'model: "gpt-4o-mini"'
const n = await tokenizers.count("hello world", { model: "gpt-4o-mini" })
```

## `count`

Counts the number of tokens in a string.

```ts wrap
const n = await tokenizers.count("hello world")
```

## `truncate`

Drops part of the string so that the result fits into a token budget.

```ts wrap
const truncated = await tokenizers.truncate("hello world", 5)
```

## `chunk`

Splits the text into chunks of a given token size. The chunker tries to find
appropriate chunk boundaries based on the document type.

```ts
const chunks = await tokenizers.chunk(env.files[0])
for (const chunk of chunks) {
    // process each chunk, e.g. chunk.content, chunk.lineStart, chunk.lineEnd
}
```

You can configure the chunk size and overlap, and add line numbers.

```ts wrap
const chunks = await tokenizers.chunk(env.files[0], {
chunkSize: 128,
    chunkOverlap: 10,
lineNumbers: true
})
```
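
## `resolve`

This pull request also exposes the decoder: `tokenizers.resolve` returns the full `Tokenizer`, including `decode`. A minimal round-trip sketch (assuming `resolve` accepts the same model identifiers as the other helpers):

```ts
const tokenizer = await tokenizers.resolve("gpt-4o")
const tokens = tokenizer.encode("hello world")
const text = tokenizer.decode(tokens) // back to "hello world"
```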
6 changes: 3 additions & 3 deletions packages/cli/src/parse.ts
@@ -98,7 +98,7 @@ export async function parseJinja2(
}
) {
let src = await readFile(file, { encoding: "utf-8" })
-    if (PROMPTY_REGEX.test(file)) src = promptyParse(src).content
+    if (PROMPTY_REGEX.test(file)) src = promptyParse(file, src).content
else if (MD_REGEX.test(file)) src = splitMarkdown(src).content

const vars: Record<string, any> = parseOptionsVars(
@@ -188,7 +188,7 @@ export async function parseTokens(
options: { excludedFiles: string[]; model: string }
) {
const { model = DEFAULT_MODEL } = options || {}
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)

const files = await expandFiles(filesGlobs, options?.excludedFiles)
console.log(`parsing ${files.length} files`)
@@ -222,7 +222,7 @@ export async function prompty2genaiscript(
: replaceExt(f, ".genai.mts")
console.log(`${f} -> ${gf}`)
const content = await readText(f)
-        const doc = promptyParse(content)
+        const doc = promptyParse(f, content)
const script = promptyToGenAIScript(doc)
await writeText(gf, script)
}
2 changes: 1 addition & 1 deletion packages/cli/src/run.ts
@@ -272,7 +272,7 @@ export async function runScript(
DOCS_CONFIGURATION_URL
)
}
-    trace.options.encoder = await resolveTokenEncoder(info.model)
+    trace.options.encoder = (await resolveTokenEncoder(info.model)).encode
await runtimeHost.models.pullModel(info.model)

let tokenColor = 0
2 changes: 1 addition & 1 deletion packages/core/src/anthropic.ts
@@ -197,7 +197,7 @@ export const AnthropicChatCompletion: ChatCompletionHandler = async (
const { requestOptions, partialCb, cancellationToken, inner } = options
const { headers } = requestOptions || {}
const { model } = parseModelIdentifier(req.model)
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)

const anthropic = new Anthropic({
baseURL: cfg.base,
2 changes: 1 addition & 1 deletion packages/core/src/chat.ts
@@ -140,7 +140,7 @@ async function runToolCalls(
) {
const projFolder = host.projectFolder()
const { cancellationToken, trace, model } = options || {}
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)
assert(!!trace)
let edits: Edits[] = []

49 changes: 42 additions & 7 deletions packages/core/src/encoders.test.ts
@@ -1,32 +1,67 @@
import test, { describe } from "node:test"
import assert from "node:assert"
-import { resolveTokenEncoder } from "./encoders"
-import { encode as defaultEncode } from "gpt-tokenizer"
+import { chunk, resolveTokenEncoder } from "./encoders"
+import { dedent } from "./indent"

describe("resolveTokenEncoder", () => {
test("gpt-3.5-turbo", async () => {
const encoder = await resolveTokenEncoder("gpt-3.5-turbo")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
assert.deepEqual(result, [1985, 1584])
})
test("gpt-4", async () => {
const encoder = await resolveTokenEncoder("gpt-4")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
assert.deepEqual(result, [1985, 1584])
})
test("gpt-4o", async () => {
const encoder = await resolveTokenEncoder("gpt-4o")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
assert.deepEqual(result, [3190, 2543])
})
test("gpt-4o-mini", async () => {
const encoder = await resolveTokenEncoder("gpt-4o-mini")
-        const result = encoder("test line")
+        const result = encoder.encode("test line")
assert.deepEqual(result, [3190, 2543])
})
test("gpt-4o forbidden", async () => {
const encoder = await resolveTokenEncoder("gpt-4o")
-        const result = encoder("<|im_end|>")
+        const result = encoder.encode("<|im_end|>")
assert.deepEqual(result, [27, 91, 321, 13707, 91, 29])
})
test("gpt-4o chunk", async () => {
const chunks = await chunk(
{
filename: "markdown.md",
content: dedent`---
title: What is Markdown? - Understanding Markdown Syntax
description: Learn about Markdown, a lightweight markup language for formatting plain text, its syntax, and how it differs from WYSIWYG editors.
keywords: Markdown, markup language, formatting, plain text, syntax
sidebar: mydoc_sidebar
---

# Intro

What is Markdown?
Markdown is a lightweight markup language that you can use to add formatting elements to plaintext text documents. Created by John Gruber in 2004, Markdown is now one of the world’s most popular markup languages.

## What?

Using Markdown is different than using a WYSIWYG editor. In an application like Microsoft Word, you click buttons to format words and phrases, and the changes are visible immediately. Markdown isn’t like that. When you create a Markdown-formatted file, you add Markdown syntax to the text to indicate which words and phrases should look different.

## Examples

For example, to denote a heading, you add a number sign before it (e.g., # Heading One). Or to make a phrase bold, you add two asterisks before and after it (e.g., **this text is bold**). It may take a while to get used to seeing Markdown syntax in your text, especially if you’re accustomed to WYSIWYG applications. The screenshot below shows a Markdown file displayed in the Visual Studio Code text editor....
`,
},
{
chunkSize: 128,
chunkOverlap: 16,
model: "gpt-4o",
lineNumbers: true
}
)
console.log(chunks)
assert.equal(chunks.length, 3)
})
})
78 changes: 71 additions & 7 deletions packages/core/src/encoders.ts
@@ -1,26 +1,90 @@
// Import the function to parse model identifiers
import { parseModelIdentifier } from "./models"
import { runtimeHost } from "./host"
import path from "node:path"
import { addLineNumbers, indexToLineNumber } from "./liner"
import { resolveFileContent } from "./file"
import { NotSupportedError } from "./error"

/**
* Resolves the appropriate token encoder based on the given model ID.
* @param modelId - The identifier for the model to resolve the encoder for.
* @returns A Promise that resolves to a TokenEncoder function.
*/
-export async function resolveTokenEncoder(
-    modelId: string
-): Promise<TokenEncoder> {
+export async function resolveTokenEncoder(modelId: string): Promise<Tokenizer> {
// Parse the model identifier to extract the model information
if (!modelId) modelId = runtimeHost.defaultModelOptions.model
const { model } = parseModelIdentifier(modelId)
const module = model // Assign model to module for dynamic import path

const options = { disallowedSpecial: new Set<string>() }
try {
// Attempt to dynamically import the encoder module for the specified model
-        const mod = await import(`gpt-tokenizer/model/${module}`)
-        return (line) => mod.encode(line, options) // Return the encoder function
+        const { encode, decode } = await import(`gpt-tokenizer/model/${module}`)
+        return Object.freeze<Tokenizer>({
+            model,
+            encode: (line) => encode(line, options), // Return the model-specific encoder function
+            decode,
+        })
} catch (e) {
// If the specific model encoder is not found, default to gpt-4o encoder
-        const { encode } = await import("gpt-tokenizer")
-        return (line) => encode(line, options) // Return the default encoder function
+        const { encode, decode } = await import("gpt-tokenizer")
+        return Object.freeze<Tokenizer>({
+            model: "gpt-4o",
+            encode: (line) => encode(line, options), // Return the default encoder function
+            decode,
+        })
}
}

export async function chunk(
file: Awaitable<string | WorkspaceFile>,
options?: TextChunkerConfig
): Promise<TextChunk[]> {
const f = await file
let filename: string
let content: string
if (typeof f === "string") {
filename = undefined
content = f
} else if (typeof f === "object") {
await resolveFileContent(f)
filename = f.filename
content = f.content
} else return []

const {
model,
docType: optionsDocType,
lineNumbers,
...rest
} = options || {}
const docType = (
optionsDocType || (filename ? path.extname(filename) : undefined)
)
?.toLowerCase()
?.replace(/^\./, "")
const tokenizer = await resolveTokenEncoder(model)
const { TextSplitter } = await import("vectra/lib/TextSplitter")
const ts = new TextSplitter({
...rest,
docType,
tokenizer,
keepSeparators: true,
})
const chunksRaw = ts.split(content)
const chunks = chunksRaw.map(({ text, startPos, endPos }) => {
const lineStart = indexToLineNumber(content, startPos)
const lineEnd = indexToLineNumber(content, endPos)
if (lineNumbers) {
text = addLineNumbers(text, { startLine: lineStart })
}
return {
content: text,
filename,
lineStart,
lineEnd,
} satisfies TextChunk
})
return chunks
}
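
For reference, a sketch of calling the new `chunk` helper directly with a plain string (hypothetical inputs; the `WorkspaceFile` path is exercised by the test above). With a string input there is no filename, so `docType` is passed explicitly:

```ts
import { chunk } from "./encoders"

// string overload: no file resolution, docType supplied by the caller
const chunks = await chunk("# Intro\n\nSome long markdown text...", {
    chunkSize: 64,
    chunkOverlap: 8,
    model: "gpt-4o",
    docType: "md",
})
for (const c of chunks) console.log(`${c.lineStart}-${c.lineEnd}`)
```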
2 changes: 1 addition & 1 deletion packages/core/src/git.ts
@@ -282,7 +282,7 @@ export class GitClient implements Git {
}
if (!nameOnly && llmify) {
res = llmifyDiff(res)
-            const encoder = await resolveTokenEncoder(
+            const { encode: encoder } = await resolveTokenEncoder(
runtimeHost.defaultModelOptions.model || DEFAULT_MODEL
)
const tokens = estimateTokens(res, encoder)
10 changes: 5 additions & 5 deletions packages/core/src/globals.ts
@@ -12,12 +12,10 @@ import { JSONLStringify, JSONLTryParse } from "./jsonl"
import { HTMLTablesToJSON, HTMLToMarkdown, HTMLToText } from "./html"
import { CancelError } from "./error"
import { fetchText } from "./fetch"
-import { readText } from "./fs"
-import { logVerbose } from "./util"
import { GitHubClient } from "./github"
import { GitClient } from "./git"
import { estimateTokens, truncateTextToTokens } from "./tokens"
-import { resolveTokenEncoder } from "./encoders"
+import { chunk, resolveTokenEncoder } from "./encoders"
import { runtimeHost } from "./host"

/**
@@ -122,19 +120,21 @@ export function installGlobals() {
glb.git = new GitClient()

glb.tokenizers = Object.freeze<Tokenizers>({
+        resolve: resolveTokenEncoder,
count: async (text, options) => {
-            const encoder = await resolveTokenEncoder(
+            const { encode: encoder } = await resolveTokenEncoder(
options?.model || runtimeHost.defaultModelOptions.model
)
const c = await estimateTokens(text, encoder)
return c
},
truncate: async (text, maxTokens, options) => {
-            const encoder = await resolveTokenEncoder(
+            const { encode: encoder } = await resolveTokenEncoder(
options?.model || runtimeHost.defaultModelOptions.model
)
return await truncateTextToTokens(text, maxTokens, encoder, options)
},
+        chunk: chunk,
})

/**
24 changes: 24 additions & 0 deletions packages/core/src/liner.ts
@@ -62,3 +62,27 @@ export function extractRange(
const endLine = lineEnd || lines.length
return lines.slice(startLine - 1, endLine).join("\n")
}

/**
* Converts a string position index to a line number.
* @param text - The text in which to find the line number.
* @param index - The position index within the text.
* @returns The line number corresponding to the position index, starting from 1.
*/
export function indexToLineNumber(text: string, index: number): number {
if (
text === undefined ||
text === null ||
index < 0 ||
index >= text.length
)
return -1
let lineNumber = 1
const n = Math.min(index, text.length)
for (let i = 0; i < n; i++) {
if (text[i] === "\n") {
lineNumber++
}
}
return lineNumber
}
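
A quick illustration of the boundary behavior (illustrative only, not part of the diff):

```ts
import { indexToLineNumber } from "./liner"

indexToLineNumber("one\ntwo\nthree", 0) // 1: start of the first line
indexToLineNumber("one\ntwo\nthree", 4) // 2: the "t" of "two"
indexToLineNumber("one\ntwo\nthree", 99) // -1: index out of range
```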
2 changes: 1 addition & 1 deletion packages/core/src/openai.ts
@@ -73,7 +73,7 @@ export const OpenAIChatCompletion: ChatCompletionHandler = async (
const { headers = {}, ...rest } = requestOptions || {}
const { token, source, ...cfgNoToken } = cfg
const { model } = parseModelIdentifier(req.model)
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)

const cache = !!cacheOrName || !!cacheName
const cacheStore = getChatCompletionCache(
4 changes: 2 additions & 2 deletions packages/core/src/parsers.ts
@@ -34,7 +34,7 @@ export async function createParsers(options: {
model: string
}): Promise<Parsers> {
const { trace, model } = options
-    const encoder = await resolveTokenEncoder(model)
+    const { encode: encoder } = await resolveTokenEncoder(model)
return Object.freeze<Parsers>({
JSON5: (text, options) =>
JSON5TryParse(filenameOrFileToContent(text), options?.defaultValue),
@@ -120,6 +120,6 @@ export async function createParsers(options: {
},
diff: (f1, f2) => llmifyDiff(createDiff(f1, f2)),
tidyData: (rows, options) => tidyData(rows, options),
-        hash: async (text, options) => await hash(text, options)
+        hash: async (text, options) => await hash(text, options),
})
}