From 655ffabb8b43d029d980e61dd431db94dc3ceec9 Mon Sep 17 00:00:00 2001
From: Mohammed Farghal
Date: Tue, 17 Sep 2024 13:39:04 +0200
Subject: [PATCH 01/10] support embeddings generation in inference.ts

(cherry picked from commit 9ae8773ad13ed87af8f72f167bdd56e02ea66f15)
---
 apps/workers/inference.ts | 51 +++++++++++++++++++++++++++++++++++++--
 apps/workers/package.json |  1 +
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/apps/workers/inference.ts b/apps/workers/inference.ts
index 071f4742..de47cf91 100644
--- a/apps/workers/inference.ts
+++ b/apps/workers/inference.ts
@@ -1,5 +1,6 @@
 import { Ollama } from "ollama";
 import OpenAI from "openai";
+import { encoding_for_model, TiktokenModel } from "tiktoken";
 
 import serverConfig from "@hoarder/shared/config";
 import logger from "@hoarder/shared/logger";
@@ -9,6 +10,10 @@ export interface InferenceResponse {
   totalTokens: number | undefined;
 }
 
+export interface EmbeddingResponse {
+  embeddings: number[][];
+}
+
 export interface InferenceClient {
   inferFromText(prompt: string): Promise<InferenceResponse>;
   inferFromImage(
@@ -16,6 +21,7 @@ export interface InferenceClient {
     contentType: string,
     image: string,
   ): Promise<InferenceResponse>;
+  generateEmbeddingFromText(prompt: string): Promise<EmbeddingResponse>;
 }
 
 export class InferenceClientFactory {
@@ -30,7 +36,6 @@ export class InferenceClientFactory {
     return null;
   }
 }
-
 class OpenAIInferenceClient implements InferenceClient {
   openAI: OpenAI;
 
@@ -87,6 +92,30 @@ class OpenAIInferenceClient implements InferenceClient {
     }
     return { response, totalTokens: chatCompletion.usage?.total_tokens };
   }
+
+  truncateTextTokens(text: string, maxTokens: number, model: string) {
+    const encoding = encoding_for_model(model as TiktokenModel);
+    const encoded = encoding.encode(text);
+    if (encoded.length <= maxTokens) {
+      return text;
+    }
+
+    return new TextDecoder().decode(
+      encoding.decode(encoded.slice(0, maxTokens)),
+    );
+  }
+
+  async generateEmbeddingFromText(prompt: string): Promise<EmbeddingResponse> {
+    const model = serverConfig.inference.textModel;
+    const embedResponse = await this.openAI.embeddings.create({
+      model: model,
+      input: [this.truncateTextTokens(prompt, 2000, model)],
+    });
+    const embedding2D: number[][] = embedResponse.data.map(
+      (embedding: OpenAI.Embedding) => embedding.embedding,
+    );
+    return { embeddings: embedding2D };
+  }
 }
 
 class OllamaInferenceClient implements InferenceClient {
@@ -103,7 +132,6 @@ class OllamaInferenceClient implements InferenceClient {
       model: model,
       format: "json",
       stream: true,
-      keep_alive: serverConfig.inference.ollamaKeepAlive,
       messages: [
         { role: "user", content: prompt, images: image ? [image] : undefined },
       ],
@@ -134,6 +162,17 @@ class OllamaInferenceClient implements InferenceClient {
     return { response, totalTokens };
   }
 
+  async runEmbeddingModel(model: string, prompt: string, image?: string) {
+    const embedding = await this.ollama.embed({
+      model: model,
+      input: prompt,
+      // Truncate the input to fit into the model's max token limit,
+      // in the future we want to add a way to split the input into multiple parts.
+      truncate: true,
+    });
+    return { response: embedding };
+  }
+
   async inferFromText(prompt: string): Promise<InferenceResponse> {
     return await this.runModel(serverConfig.inference.textModel, prompt);
   }
@@ -149,4 +188,12 @@ class OllamaInferenceClient implements InferenceClient {
       image,
     );
   }
+
+  async generateEmbeddingFromText(prompt: string): Promise<EmbeddingResponse> {
+    const embedResponse = await this.runEmbeddingModel(
+      serverConfig.inference.textModel,
+      prompt,
+    );
+    return { embeddings: embedResponse.response.embeddings };
+  }
 }
diff --git a/apps/workers/package.json b/apps/workers/package.json
index bbd5b17d..0532ec66 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -34,6 +34,7 @@
     "puppeteer-extra": "^3.3.6",
     "puppeteer-extra-plugin-adblocker": "^2.13.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",
+    "tiktoken": "^1.0.16",
     "tsx": "^4.7.1",
     "typescript": "^5.3.3",
     "zod": "^3.22.4"

From f94446db5d12e79c6fa11c1bc3de28fbd8c1fb23 Mon Sep 17 00:00:00 2001
From: Mohammed Farghal
Date: Tue, 17 Sep 2024 22:47:05 +0200
Subject: [PATCH 02/10] make AI worker generate embeddings for text bookmark

---
 apps/workers/inference.ts    |   6 +-
 apps/workers/openaiWorker.ts | 112 +++++++++++++++++++++++++++--------
 packages/shared/config.ts    |   4 ++
 3 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/apps/workers/inference.ts b/apps/workers/inference.ts
index de47cf91..039c05a3 100644
--- a/apps/workers/inference.ts
+++ b/apps/workers/inference.ts
@@ -66,7 +66,7 @@ class OpenAIInferenceClient implements InferenceClient {
     image: string,
   ): Promise<InferenceResponse> {
     const chatCompletion = await this.openAI.chat.completions.create({
-      model: serverConfig.inference.imageModel,
+      model: serverConfig.embedding.textModel,
       response_format: { type: "json_object" },
       messages: [
         {
@@ -106,7 +106,7 @@ class OpenAIInferenceClient implements InferenceClient {
   }
 
   async generateEmbeddingFromText(prompt: string): Promise<EmbeddingResponse> {
-    const model = serverConfig.inference.textModel;
+    const model = serverConfig.embedding.textModel;
     const embedResponse = await this.openAI.embeddings.create({
       model: model,
       input: [this.truncateTextTokens(prompt, 2000, model)],
@@ -191,7 +191,7 @@ class OllamaInferenceClient implements InferenceClient {
 
   async generateEmbeddingFromText(prompt: string): Promise<EmbeddingResponse> {
     const embedResponse = await this.runEmbeddingModel(
-      serverConfig.inference.textModel,
+      serverConfig.embedding.textModel,
       prompt,
     );
     return { embeddings: embedResponse.response.embeddings };
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index 8bd2cf4a..6e1fb801 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -23,6 +23,8 @@ import type { InferenceClient } from "./inference";
 import { InferenceClientFactory } from "./inference";
 import { readPDFText, truncateContent } from "./utils";
 
+type BookmarkType = "link" | "text" | "image" | "pdf" | "unsupported";
+
 const openAIResponseSchema = z.object({
   tags: z.array(z.string()),
 });
@@ -60,7 +62,7 @@ async function attemptMarkTaggingStatus(
 
 export class OpenAiWorker {
   static build() {
-    logger.info("Starting inference worker ...");
+    logger.info("Starting AI worker ...");
     const worker = new Runner<ZOpenAIRequest>(
       OpenAIQueue,
       {
@@ -114,17 +116,10 @@ Aim for 3-5 tags. If there are no good tags, leave the array empty.
 function buildPrompt(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
 ) {
-  if (bookmark.link) {
-    if (!bookmark.link.description && !bookmark.link.content) {
-      throw new Error(
-        `No content found for link "${bookmark.id}". Skipping ...`,
-      );
-    }
+  const content = extractTextFromBookmark(bookmark);
+  const bType = bookmarkType(bookmark);
 
-    let content = bookmark.link.content;
-    if (content) {
-      content = truncateContent(content);
-    }
+  if (bType === "link") {
     return `
 ${TEXT_PROMPT_BASE}
 URL: ${bookmark.link.url}
@@ -134,7 +129,7 @@ Content: ${content ?? ""}
 ${TEXT_PROMPT_INSTRUCTIONS}`;
   }
 
-  if (bookmark.text) {
+  if (bType == "text") {
     const content = truncateContent(bookmark.text.text ?? "");
     // TODO: Ensure that the content doesn't exceed the context length of openai
     return `
@@ -224,25 +219,39 @@ async function inferTagsFromText(
   return await inferenceClient.inferFromText(buildPrompt(bookmark));
 }
 
+function bookmarkType(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+): BookmarkType {
+  if (bookmark.link) {
+    return "link";
+  } else if (bookmark.text) {
+    return "text";
+  }
+  switch (bookmark.asset.assetType) {
+    case "image":
+      return "image";
+      break;
+    case "pdf":
+      return "pdf";
+      break;
+    default:
+      return "unsupported";
+  }
+}
+
 async function inferTags(
   jobId: string,
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
   inferenceClient: InferenceClient,
 ) {
   let response;
-  if (bookmark.link || bookmark.text) {
+  const bType = bookmarkType(bookmark);
+  if (bType === "text" || bType == "link") {
     response = await inferTagsFromText(bookmark, inferenceClient);
-  } else if (bookmark.asset) {
-    switch (bookmark.asset.assetType) {
-      case "image":
-        response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
-        break;
-      case "pdf":
-        response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
-        break;
-      default:
-        throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
-    }
+  } else if (bType == "image") {
+    response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+  } else if (bType == "pdf") {
+    response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
   } else {
     throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
   }
@@ -362,6 +371,59 @@ async function connectTags(
   });
 }
 
+// TODO: Make this function accept max tokens as an argument.
+function extractTextFromBookmark(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+): string {
+  if (bookmark.link) {
+    if (!bookmark.link.description && !bookmark.link.content) {
+      throw new Error(
+        `No content found for link "${bookmark.id}". Skipping ...`,
+      );
+    }
+
+    let content = bookmark.link.content;
+    if (content) {
+      content = truncateContent(content);
+    }
+    return content || "";
+  }
+
+  if (bookmark.text) {
+    const content = truncateContent(bookmark.text.text ?? "");
+    return content;
+  }
+  logger.error(`[extractTextFromBookmark] Unsupported bookmark type`);
+  return "";
+}
+
+async function textEmbedBookmark(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  const content = extractTextFromBookmark(bookmark);
+  const embedding = await inferenceClient.generateEmbeddingFromText(content);
+  return embedding;
+}
+
+async function embedBookmark(
+  jobId: string,
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  logger.info(`[embedding][${jobId}] Embedding bookmark ${bookmark.id}`);
+  const bType = bookmarkType(bookmark);
+  logger.info(`[embedding][${jobId}] Bookmark type: ${bType}`);
+  if (bType === "text") {
+    const embedding = await inferenceClient.generateEmbeddingFromText(
+      extractTextFromBookmark(bookmark),
+    );
+    logger.info(
+      `[embeddings] Embedding generated successfully: ${embedding.embeddings}`,
+    );
+  }
+}
+
 async function runOpenAI(job: DequeuedJob<ZOpenAIRequest>) {
   const jobId = job.id ?? "unknown";
"unknown"; @@ -398,4 +460,6 @@ async function runOpenAI(job: DequeuedJob) { // Update the search index await triggerSearchReindex(bookmarkId); + + await embedBookmark(jobId, bookmark, inferenceClient); } diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 21cdb1c8..107cb3a7 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -23,6 +23,7 @@ const allEnv = z.object({ INFERENCE_JOB_TIMEOUT_SEC: z.coerce.number().default(30), INFERENCE_TEXT_MODEL: z.string().default("gpt-4o-mini"), INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"), + EMBEDDING_TEXT_MODEL: z.string().default("text-embedding-3-small"), CRAWLER_HEADLESS_BROWSER: stringBool("true"), BROWSER_WEB_URL: z.string().url().optional(), BROWSER_WEBSOCKET_URL: z.string().url().optional(), @@ -73,6 +74,9 @@ const serverConfigSchema = allEnv.transform((val) => { imageModel: val.INFERENCE_IMAGE_MODEL, inferredTagLang: val.INFERENCE_LANG, }, + embedding: { + textModel: val.EMBEDDING_TEXT_MODEL, + }, crawler: { numWorkers: val.CRAWLER_NUM_WORKERS, headlessBrowser: val.CRAWLER_HEADLESS_BROWSER, From 6f02501ee7568b042793b7936ce895e4a8ca548c Mon Sep 17 00:00:00 2001 From: Mohammed Farghal Date: Tue, 17 Sep 2024 22:47:05 +0200 Subject: [PATCH 03/10] make AI worker generate embeddings for text bookmark --- apps/workers/inference.ts | 4 +- apps/workers/openaiWorker.ts | 112 +++++++++++++++++++++++++++-------- packages/shared/config.ts | 4 ++ 3 files changed, 94 insertions(+), 26 deletions(-) diff --git a/apps/workers/inference.ts b/apps/workers/inference.ts index de47cf91..5c1882b1 100644 --- a/apps/workers/inference.ts +++ b/apps/workers/inference.ts @@ -106,7 +106,7 @@ class OpenAIInferenceClient implements InferenceClient { } async generateEmbeddingFromText(prompt: string): Promise { - const model = serverConfig.inference.textModel; + const model = serverConfig.embedding.textModel; const embedResponse = await this.openAI.embeddings.create({ model: model, input: [this.truncateTextTokens(prompt, 2000, model)], @@ -191,7 +191,7 @@ class OllamaInferenceClient implements InferenceClient { async generateEmbeddingFromText(prompt: string): Promise { const embedResponse = await this.runEmbeddingModel( - serverConfig.inference.textModel, + serverConfig.embedding.textModel, prompt, ); return { embeddings: embedResponse.response.embeddings }; diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts index 8bd2cf4a..6e1fb801 100644 --- a/apps/workers/openaiWorker.ts +++ b/apps/workers/openaiWorker.ts @@ -23,6 +23,8 @@ import type { InferenceClient } from "./inference"; import { InferenceClientFactory } from "./inference"; import { readPDFText, truncateContent } from "./utils"; +type BookmarkType = "link" | "text" | "image" | "pdf" | "unsupported"; + const openAIResponseSchema = z.object({ tags: z.array(z.string()), }); @@ -60,7 +62,7 @@ async function attemptMarkTaggingStatus( export class OpenAiWorker { static build() { - logger.info("Starting inference worker ..."); + logger.info("Starting AI worker ..."); const worker = new Runner( OpenAIQueue, { @@ -114,17 +116,10 @@ Aim for 3-5 tags. If there are no good tags, leave the array empty. function buildPrompt( bookmark: NonNullable>>, ) { - if (bookmark.link) { - if (!bookmark.link.description && !bookmark.link.content) { - throw new Error( - `No content found for link "${bookmark.id}". 
-      );
-    }
+  const content = extractTextFromBookmark(bookmark);
+  const bType = bookmarkType(bookmark);
 
-    let content = bookmark.link.content;
-    if (content) {
-      content = truncateContent(content);
-    }
+  if (bType === "link") {
     return `
 ${TEXT_PROMPT_BASE}
 URL: ${bookmark.link.url}
@@ -134,7 +129,7 @@ Content: ${content ?? ""}
 ${TEXT_PROMPT_INSTRUCTIONS}`;
   }
 
-  if (bookmark.text) {
+  if (bType == "text") {
     const content = truncateContent(bookmark.text.text ?? "");
     // TODO: Ensure that the content doesn't exceed the context length of openai
     return `
@@ -224,25 +219,39 @@ async function inferTagsFromText(
   return await inferenceClient.inferFromText(buildPrompt(bookmark));
 }
 
+function bookmarkType(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+): BookmarkType {
+  if (bookmark.link) {
+    return "link";
+  } else if (bookmark.text) {
+    return "text";
+  }
+  switch (bookmark.asset.assetType) {
+    case "image":
+      return "image";
+      break;
+    case "pdf":
+      return "pdf";
+      break;
+    default:
+      return "unsupported";
+  }
+}
+
 async function inferTags(
   jobId: string,
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
   inferenceClient: InferenceClient,
 ) {
   let response;
-  if (bookmark.link || bookmark.text) {
+  const bType = bookmarkType(bookmark);
+  if (bType === "text" || bType == "link") {
     response = await inferTagsFromText(bookmark, inferenceClient);
-  } else if (bookmark.asset) {
-    switch (bookmark.asset.assetType) {
-      case "image":
-        response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
-        break;
-      case "pdf":
-        response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
-        break;
-      default:
-        throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
-    }
+  } else if (bType == "image") {
+    response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+  } else if (bType == "pdf") {
+    response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
   } else {
     throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
   }
@@ -362,6 +371,59 @@ async function connectTags(
   });
 }
 
+// TODO: Make this function accept max tokens as an argument.
+function extractTextFromBookmark(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+): string {
+  if (bookmark.link) {
+    if (!bookmark.link.description && !bookmark.link.content) {
+      throw new Error(
+        `No content found for link "${bookmark.id}". Skipping ...`,
+      );
+    }
+
+    let content = bookmark.link.content;
+    if (content) {
+      content = truncateContent(content);
+    }
+    return content || "";
+  }
+
+  if (bookmark.text) {
+    const content = truncateContent(bookmark.text.text ?? "");
+    return content;
+  }
+  logger.error(`[extractTextFromBookmark] Unsupported bookmark type`);
+  return "";
+}
+
+async function textEmbedBookmark(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  const content = extractTextFromBookmark(bookmark);
+  const embedding = await inferenceClient.generateEmbeddingFromText(content);
+  return embedding;
+}
+
+async function embedBookmark(
+  jobId: string,
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  logger.info(`[embedding][${jobId}] Embedding bookmark ${bookmark.id}`);
+  const bType = bookmarkType(bookmark);
+  logger.info(`[embedding][${jobId}] Bookmark type: ${bType}`);
+  if (bType === "text") {
+    const embedding = await inferenceClient.generateEmbeddingFromText(
+      extractTextFromBookmark(bookmark),
+    );
+    logger.info(
+      `[embeddings] Embedding generated successfully: ${embedding.embeddings}`,
+    );
+  }
+}
+
 async function runOpenAI(job: DequeuedJob<ZOpenAIRequest>) {
   const jobId = job.id ?? "unknown";
"unknown"; @@ -398,4 +460,6 @@ async function runOpenAI(job: DequeuedJob) { // Update the search index await triggerSearchReindex(bookmarkId); + + await embedBookmark(jobId, bookmark, inferenceClient); } diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 21cdb1c8..107cb3a7 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -23,6 +23,7 @@ const allEnv = z.object({ INFERENCE_JOB_TIMEOUT_SEC: z.coerce.number().default(30), INFERENCE_TEXT_MODEL: z.string().default("gpt-4o-mini"), INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"), + EMBEDDING_TEXT_MODEL: z.string().default("text-embedding-3-small"), CRAWLER_HEADLESS_BROWSER: stringBool("true"), BROWSER_WEB_URL: z.string().url().optional(), BROWSER_WEBSOCKET_URL: z.string().url().optional(), @@ -73,6 +74,9 @@ const serverConfigSchema = allEnv.transform((val) => { imageModel: val.INFERENCE_IMAGE_MODEL, inferredTagLang: val.INFERENCE_LANG, }, + embedding: { + textModel: val.EMBEDDING_TEXT_MODEL, + }, crawler: { numWorkers: val.CRAWLER_NUM_WORKERS, headlessBrowser: val.CRAWLER_HEADLESS_BROWSER, From 8b8ebe11f13e5c13d76bdd29286e419786bf6164 Mon Sep 17 00:00:00 2001 From: Mohammed Farghal Date: Tue, 17 Sep 2024 22:55:55 +0200 Subject: [PATCH 04/10] fix unintentional change -- inference image model --- apps/workers/inference.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/workers/inference.ts b/apps/workers/inference.ts index 039c05a3..5c1882b1 100644 --- a/apps/workers/inference.ts +++ b/apps/workers/inference.ts @@ -66,7 +66,7 @@ class OpenAIInferenceClient implements InferenceClient { image: string, ): Promise { const chatCompletion = await this.openAI.chat.completions.create({ - model: serverConfig.embedding.textModel, + model: serverConfig.inference.imageModel, response_format: { type: "json_object" }, messages: [ { From e14b38f600d0492dc84afa58b94ee851f863116e Mon Sep 17 00:00:00 2001 From: Mohammed Farghal Date: Sat, 21 Sep 2024 17:48:27 +0200 Subject: [PATCH 05/10] support embeddings for PDF bookmarks --- apps/workers/openaiWorker.ts | 57 +++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts index 6e1fb801..5c07410a 100644 --- a/apps/workers/openaiWorker.ts +++ b/apps/workers/openaiWorker.ts @@ -374,6 +374,7 @@ async function connectTags( // TODO: Make this function accept max tokens as an argument. function extractTextFromBookmark( bookmark: NonNullable>>, + jobId: string, ): string { if (bookmark.link) { if (!bookmark.link.description && !bookmark.link.content) { @@ -389,19 +390,55 @@ function extractTextFromBookmark( return content || ""; } - if (bookmark.text) { - const content = truncateContent(bookmark.text.text ?? ""); - return content; + if (!bookmark.text) { + logger.error( + `[extractTextFromBookmark][${jobId}] Unsupported bookmark type, skipping ...`, + ); + return ""; } - logger.error(`[extractTextFromBookmark] Unsupported bookmark type`); - return ""; + const content = truncateContent(bookmark.text.text ?? ""); + if (!content) { + throw new Error( + `[inference][${jobId}] [UNEXPECTED] TruncateContent returned empty content for bookmark "${bookmark.id}". 
+    );
+  }
+  return content;
+}
+
+async function extractTextFromPDFBookmark(
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  jobId: string,
+) {
+  const { asset } = await readAsset({
+    userId: bookmark.userId,
+    assetId: bookmark.asset.assetId,
+  });
+  if (!asset) {
+    throw new Error(
+      `[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`,
+    );
+  }
+  const pdfParse = await readPDFText(asset);
+  if (!pdfParse?.text) {
+    throw new Error(
+      `[inference][${jobId}] PDF text is empty. Please make sure that the PDF includes text and not just images.`,
+    );
+  }
+  const content = truncateContent(pdfParse.text);
+  if (!content) {
+    throw new Error(
+      `[inference][${jobId}] [UNEXPECTED] TruncateContent returned empty content for PDF "${bookmark.id}". Skipping ...`,
+    );
+  }
+  return content;
+}
+
 async function textEmbedBookmark(
+  jobId: string,
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
   inferenceClient: InferenceClient,
 ) {
-  const content = extractTextFromBookmark(bookmark);
+  const content = extractTextFromBookmark(bookmark, jobId);
   const embedding = await inferenceClient.generateEmbeddingFromText(content);
   return embedding;
 }
@@ -416,11 +453,17 @@ async function embedBookmark(
   logger.info(`[embedding][${jobId}] Bookmark type: ${bType}`);
   if (bType === "text") {
     const embedding = await inferenceClient.generateEmbeddingFromText(
-      extractTextFromBookmark(bookmark),
+      extractTextFromBookmark(bookmark, jobId),
     );
     logger.info(
       `[embeddings] Embedding generated successfully: ${embedding.embeddings}`,
     );
+  } else if (bType == "pdf") {
+    const content = await extractTextFromPDFBookmark(bookmark, jobId);
+    const embedding = await inferenceClient.generateEmbeddingFromText(content);
+    logger.info(
+      `[embeddings] Embedding generated successfully: ${embedding.embeddings}`,
+    );
   }
 }

From 08a02c8df4ea403de65986ed1265940c6c994a20 Mon Sep 17 00:00:00 2001
From: Mohammed Farghal
Date: Sat, 21 Sep 2024 19:14:51 +0200
Subject: [PATCH 06/10] Upgrade drizzle-kit

Existing version is not working with the upgraded version of drizzle-orm.
I removed the "driver" to match the new schema of the Config.

Quoting from their Config:

* `driver` - optional param that is responsible for explicitly providing a driver to use when accessing a database
* *Possible values*: `aws-data-api`, `d1-http`, `expo`, `turso`, `pglite`
* If you don't use AWS Data API, D1, Turso or Expo - you don't need this driver.
  You can check a driver strategy choice here: https://orm.
---
 packages/db/drizzle.config.ts | 2 +-
 packages/db/package.json      | 2 +-
 packages/queue/package.json   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/db/drizzle.config.ts b/packages/db/drizzle.config.ts
index 25a7675f..dca30914 100644
--- a/packages/db/drizzle.config.ts
+++ b/packages/db/drizzle.config.ts
@@ -7,9 +7,9 @@ const databaseURL = serverConfig.dataDir
     : "./db.db";
 
 export default {
+  dialect: "sqlite",
   schema: "./schema.ts",
   out: "./drizzle",
-  driver: "better-sqlite",
   dbCredentials: {
     url: databaseURL,
   },
diff --git a/packages/db/package.json b/packages/db/package.json
index 2335d1f8..f55d0640 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -24,7 +24,7 @@
     "@hoarder/tsconfig": "workspace:^0.1.0",
     "@tsconfig/node21": "^21.0.1",
     "@types/better-sqlite3": "^7.6.9",
-    "drizzle-kit": "^0.20.14"
+    "drizzle-kit": "^0.24.2"
   },
   "eslintConfig": {
     "root": true,
diff --git a/packages/queue/package.json b/packages/queue/package.json
index a5d648e3..e034accd 100644
--- a/packages/queue/package.json
+++ b/packages/queue/package.json
@@ -15,7 +15,7 @@
     "@hoarder/prettier-config": "workspace:^0.1.0",
     "@hoarder/tsconfig": "workspace:^0.1.0",
     "@types/better-sqlite3": "^7.6.9",
-    "drizzle-kit": "^0.20.14",
+    "drizzle-kit": "^0.24.2",
     "vitest": "^1.3.1"
   },
   "scripts": {

From adfe2be1c864ebd3e59baac83da93637222c7d81 Mon Sep 17 00:00:00 2001
From: Mohammed Farghal
Date: Sat, 21 Sep 2024 20:17:20 +0200
Subject: [PATCH 07/10] fix formatting and lint

---
 apps/workers/inference.ts    |  2 +-
 apps/workers/openaiWorker.ts | 23 +++++++----------------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/apps/workers/inference.ts b/apps/workers/inference.ts
index 5c1882b1..2b43f1f1 100644
--- a/apps/workers/inference.ts
+++ b/apps/workers/inference.ts
@@ -162,7 +162,7 @@ class OllamaInferenceClient implements InferenceClient {
     return { response, totalTokens };
   }
 
-  async runEmbeddingModel(model: string, prompt: string, image?: string) {
+  async runEmbeddingModel(model: string, prompt: string) {
     const embedding = await this.ollama.embed({
       model: model,
       input: prompt,
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index 5c07410a..7431d267 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -19,9 +19,11 @@ import {
   zOpenAIRequestSchema,
 } from "@hoarder/shared/queues";
 
-import type { InferenceClient } from "./inference";
+import type { EmbeddingResponse, InferenceClient } from "./inference";
 import { InferenceClientFactory } from "./inference";
 import { readPDFText, truncateContent } from "./utils";
+import * as sqliteVec from "sqlite-vec";
+import Database from "better-sqlite3";
 
 type BookmarkType = "link" | "text" | "image" | "pdf" | "unsupported";
@@ -374,33 +376,32 @@ async function connectTags(
 // TODO: Make this function accept max tokens as an argument.
 function extractTextFromBookmark(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
-  jobId: string,
 ): string {
   if (bookmark.link) {
     if (!bookmark.link.description && !bookmark.link.content) {
       throw new Error(
         `No content found for link "${bookmark.id}". Skipping ...`,
       );
     }
 
     let content = bookmark.link.content;
     if (content) {
       content = truncateContent(content);
     }
-    return content || "";
+    return content ?? "";
   }
 
   if (!bookmark.text) {
     logger.error(
-      `[extractTextFromBookmark][${jobId}] Unsupported bookmark type, skipping ...`,
+      `[extractTextFromBookmark] Unsupported bookmark type, skipping ...`,
     );
     return "";
   }
   const content = truncateContent(bookmark.text.text ?? "");
""); if (!content) { throw new Error( - `[inference][${jobId}] [UNEXPECTED] TruncateContent returned empty content for bookmark "${bookmark.id}". Skipping ...`, + `[inference] [UNEXPECTED] TruncateContent returned empty content for bookmark "${bookmark.id}". Skipping ...`, ); } return content; @@ -433,16 +434,6 @@ async function extractTextFromPDFBookmark( return content; } -async function textEmbedBookmark( - jobId: string, - bookmark: NonNullable>>, - inferenceClient: InferenceClient, -) { - const content = extractTextFromBookmark(bookmark, jobId); - const embedding = await inferenceClient.generateEmbeddingFromText(content); - return embedding; -} - async function embedBookmark( jobId: string, bookmark: NonNullable>>, @@ -453,7 +444,7 @@ async function embedBookmark( logger.info(`[embedding][${jobId}] Bookmark type: ${bType}`); if (bType === "text") { const embedding = await inferenceClient.generateEmbeddingFromText( - extractTextFromBookmark(bookmark, jobId), + extractTextFromBookmark(bookmark), ); logger.info( `[embeddings] Embedding generated successfully: ${embedding.embeddings}`, From 5cfd82877f9fcbc240765df0dd065ef709595c7d Mon Sep 17 00:00:00 2001 From: Mohammed Farghal Date: Sat, 21 Sep 2024 20:27:00 +0200 Subject: [PATCH 08/10] add comments about truncate content --- apps/workers/openaiWorker.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts index 7431d267..f28b4d67 100644 --- a/apps/workers/openaiWorker.ts +++ b/apps/workers/openaiWorker.ts @@ -19,11 +19,9 @@ import { zOpenAIRequestSchema, } from "@hoarder/shared/queues"; -import type { EmbeddingResponse, InferenceClient } from "./inference"; +import type { InferenceClient } from "./inference"; import { InferenceClientFactory } from "./inference"; import { readPDFText, truncateContent } from "./utils"; -import * as sqliteVec from "sqlite-vec"; -import Database from "better-sqlite3"; type BookmarkType = "link" | "text" | "image" | "pdf" | "unsupported"; @@ -374,6 +372,8 @@ async function connectTags( } // TODO: Make this function accept max tokens as an argument. +// TODO: Truncate text logic needs to be taken refactored such that the max token are tied to the model +// being used and not done once per bookmark. function extractTextFromBookmark( bookmark: NonNullable>>, ): string { @@ -439,7 +439,7 @@ async function embedBookmark( bookmark: NonNullable>>, inferenceClient: InferenceClient, ) { - logger.info(`[embedding][${jobId}] Embedding bookmark ${bookmark.id}`); + logger.info(`[embedding][${jobId}] ookmark ${bookmark.id}`); const bType = bookmarkType(bookmark); logger.info(`[embedding][${jobId}] Bookmark type: ${bType}`); if (bType === "text") { From c10b8ad6076bc71958064cf55800a70c2d97bab1 Mon Sep 17 00:00:00 2001 From: Mohammed Farghal Date: Sat, 21 Sep 2024 20:35:20 +0200 Subject: [PATCH 09/10] Revert "Upgrade drizzle-kit" This reverts commit 08a02c8df4ea403de65986ed1265940c6c994a20. 
--- packages/db/drizzle.config.ts | 2 +- packages/db/package.json | 2 +- packages/queue/package.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/db/drizzle.config.ts b/packages/db/drizzle.config.ts index dca30914..25a7675f 100644 --- a/packages/db/drizzle.config.ts +++ b/packages/db/drizzle.config.ts @@ -7,9 +7,9 @@ const databaseURL = serverConfig.dataDir : "./db.db"; export default { - dialect: "sqlite", schema: "./schema.ts", out: "./drizzle", + driver: "better-sqlite", dbCredentials: { url: databaseURL, }, diff --git a/packages/db/package.json b/packages/db/package.json index f55d0640..2335d1f8 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -24,7 +24,7 @@ "@hoarder/tsconfig": "workspace:^0.1.0", "@tsconfig/node21": "^21.0.1", "@types/better-sqlite3": "^7.6.9", - "drizzle-kit": "^0.24.2" + "drizzle-kit": "^0.20.14" }, "eslintConfig": { "root": true, diff --git a/packages/queue/package.json b/packages/queue/package.json index e034accd..a5d648e3 100644 --- a/packages/queue/package.json +++ b/packages/queue/package.json @@ -15,7 +15,7 @@ "@hoarder/prettier-config": "workspace:^0.1.0", "@hoarder/tsconfig": "workspace:^0.1.0", "@types/better-sqlite3": "^7.6.9", - "drizzle-kit": "^0.24.2", + "drizzle-kit": "^0.20.14", "vitest": "^1.3.1" }, "scripts": { From a823b603a446f7ddecea90981441f2075a956fff Mon Sep 17 00:00:00 2001 From: Mohammed Farghal Date: Sat, 21 Sep 2024 21:07:05 +0200 Subject: [PATCH 10/10] revert keep alive field in Ollama --- apps/workers/inference.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/workers/inference.ts b/apps/workers/inference.ts index 2b43f1f1..1f9fecdb 100644 --- a/apps/workers/inference.ts +++ b/apps/workers/inference.ts @@ -36,6 +36,7 @@ export class InferenceClientFactory { return null; } } + class OpenAIInferenceClient implements InferenceClient { openAI: OpenAI; @@ -132,6 +133,7 @@ class OllamaInferenceClient implements InferenceClient { model: model, format: "json", stream: true, + keep_alive: serverConfig.inference.ollamaKeepAlive, messages: [ { role: "user", content: prompt, images: image ? [image] : undefined }, ],
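
Addendum (not part of the patch series): a speculative sketch of how the new
`generateEmbeddingFromText()` API could be wired to the sqlite-vec storage that
PATCH 07 imports and PATCH 08 backs out. The table name, the vector dimension
(1536 matches the text-embedding-3-small default), and the integer rowid
mapping are assumptions for illustration, not code from this series.

```typescript
import Database from "better-sqlite3";
import * as sqliteVec from "sqlite-vec";

import { InferenceClientFactory } from "./inference";

const db = new Database("embeddings.db");
sqliteVec.load(db); // load the sqlite-vec extension into this connection

// vec0 virtual tables are keyed by an integer rowid; mapping bookmark ids to
// rowids is left out here and would need its own table in a real schema.
db.exec(
  `CREATE VIRTUAL TABLE IF NOT EXISTS bookmark_embeddings
     USING vec0(embedding float[1536])`,
);

async function embedAndStore(rowId: number, text: string): Promise<void> {
  // InferenceClientFactory.build() returns null when neither OpenAI nor
  // Ollama is configured, so callers have to handle that case.
  const client = InferenceClientFactory.build();
  if (!client) {
    throw new Error("No inference client configured");
  }
  const { embeddings } = await client.generateEmbeddingFromText(text);
  // sqlite-vec accepts vectors as JSON text (or float32 BLOBs); JSON is the
  // simplest to bind through better-sqlite3.
  db.prepare(
    "INSERT INTO bookmark_embeddings(rowid, embedding) VALUES (?, ?)",
  ).run(rowId, JSON.stringify(embeddings[0]));
}

// A nearest-neighbour lookup would then be a MATCH query, e.g.:
//   SELECT rowid, distance FROM bookmark_embeddings
//   WHERE embedding MATCH ? ORDER BY distance LIMIT 5;
```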