Skip to content

Commit

Permalink
Add tests for token encoders and update encoding function to include options

Browse files Browse the repository at this point in the history
  • Loading branch information
pelikhan committed Sep 26, 2024
1 parent 72e9ba9 commit 7a36c27
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 3 deletions.
32 changes: 32 additions & 0 deletions packages/core/src/encoders.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import test, { describe } from "node:test"
import assert from "node:assert"
import { resolveTokenEncoder } from "./encoders"

// Unit tests for resolveTokenEncoder: each case resolves the tokenizer for a
// model id and checks the token ids produced for a fixed input string.
// (The previously-imported `defaultEncode` from "gpt-tokenizer" was unused and
// has been removed.)
describe("resolveTokenEncoder", () => {
    // gpt-3.5-turbo and gpt-4 resolve to the same encoding, so both produce
    // identical token ids for "test line".
    test("gpt-3.5-turbo", async () => {
        const encoder = await resolveTokenEncoder("gpt-3.5-turbo")
        const result = encoder("test line")
        assert.deepEqual(result, [1985, 1584])
    })
    test("gpt-4", async () => {
        const encoder = await resolveTokenEncoder("gpt-4")
        const result = encoder("test line")
        assert.deepEqual(result, [1985, 1584])
    })
    // gpt-4o / gpt-4o-mini use a different encoding and yield different ids.
    test("gpt-4o", async () => {
        const encoder = await resolveTokenEncoder("gpt-4o")
        const result = encoder("test line")
        assert.deepEqual(result, [3190, 2543])
    })
    test("gpt-4o-mini", async () => {
        const encoder = await resolveTokenEncoder("gpt-4o-mini")
        const result = encoder("test line")
        assert.deepEqual(result, [3190, 2543])
    })
    // With an empty `disallowedSpecial` set, the special-token marker is
    // encoded as plain text (six ordinary tokens) rather than throwing.
    test("gpt-4o forbidden", async () => {
        const encoder = await resolveTokenEncoder("gpt-4o")
        const result = encoder("<|im_end|>")
        assert.deepEqual(result, [27, 91, 321, 13707, 91, 29])
    })
})
7 changes: 4 additions & 3 deletions packages/core/src/encoders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ export async function resolveTokenEncoder(
const { model } = parseModelIdentifier(modelId)
const module = model // Assign model to module for dynamic import path

const options = { disallowedSpecial: new Set<string>() }
try {
// Attempt to dynamically import the encoder module for the specified model
const mod = await import(`gpt-tokenizer/model/${module}`)
return mod.encode // Return the encoder function
return (line) => mod.encode(line, options) // Return the encoder function
} catch (e) {
// If the specific model encoder is not found, default to gpt-4 encoder
// If the specific model encoder is not found, default to gpt-4o encoder
const { encode } = await import("gpt-tokenizer")
return encode // Return the default encoder function
return (line) => encode(line, options) // Return the default encoder function
}
}

0 comments on commit 7a36c27

Please sign in to comment.