Skip to content

Commit

Permalink
Add tests for token encoders and update encoding function to include options

Browse files Browse the repository at this point in the history
  • Loading branch information
pelikhan committed Sep 26, 2024
1 parent 72e9ba9 commit 7a36c27
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 3 deletions.
32 changes: 32 additions & 0 deletions packages/core/src/encoders.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import test, { describe } from "node:test"
import assert from "node:assert"
import { resolveTokenEncoder } from "./encoders"

// Unit tests for resolveTokenEncoder: each case resolves the tokenizer for a
// model id and checks the token ids produced for a fixed input string.
// (The previously-imported `defaultEncode` from "gpt-tokenizer" was unused and
// has been removed.)
describe("resolveTokenEncoder", () => {
    // gpt-3.5-turbo and gpt-4 resolve to the same encoding, so both produce
    // identical token ids for "test line".
    test("gpt-3.5-turbo", async () => {
        const encoder = await resolveTokenEncoder("gpt-3.5-turbo")
        const result = encoder("test line")
        assert.deepEqual(result, [1985, 1584])
    })
    test("gpt-4", async () => {
        const encoder = await resolveTokenEncoder("gpt-4")
        const result = encoder("test line")
        assert.deepEqual(result, [1985, 1584])
    })
    // gpt-4o / gpt-4o-mini use a different encoding and yield different ids.
    test("gpt-4o", async () => {
        const encoder = await resolveTokenEncoder("gpt-4o")
        const result = encoder("test line")
        assert.deepEqual(result, [3190, 2543])
    })
    test("gpt-4o-mini", async () => {
        const encoder = await resolveTokenEncoder("gpt-4o-mini")
        const result = encoder("test line")
        assert.deepEqual(result, [3190, 2543])
    })
    // With an empty `disallowedSpecial` set, the special-token marker is
    // encoded as plain text (six ordinary tokens) rather than throwing.
    test("gpt-4o forbidden", async () => {
        const encoder = await resolveTokenEncoder("gpt-4o")
        const result = encoder("<|im_end|>")
        assert.deepEqual(result, [27, 91, 321, 13707, 91, 29])
    })
})
7 changes: 4 additions & 3 deletions packages/core/src/encoders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ export async function resolveTokenEncoder(
const { model } = parseModelIdentifier(modelId)
const module = model // Assign model to module for dynamic import path

const options = { disallowedSpecial: new Set<string>() }
try {
// Attempt to dynamically import the encoder module for the specified model
const mod = await import(`gpt-tokenizer/model/${module}`)
return mod.encode // Return the encoder function
return (line) => mod.encode(line, options) // Return the encoder function
} catch (e) {
// If the specific model encoder is not found, default to gpt-4 encoder
// If the specific model encoder is not found, default to gpt-4o encoder
const { encode } = await import("gpt-tokenizer")
return encode // Return the default encoder function
return (line) => encode(line, options) // Return the default encoder function
}
}

0 comments on commit 7a36c27

Please sign in to comment.