From 804851fb344874a2912ab152a303f6c62f70d891 Mon Sep 17 00:00:00 2001 From: Peli de Halleux Date: Mon, 18 Nov 2024 03:05:14 +0000 Subject: [PATCH] =?UTF-8?q?refactor:=20=E2=99=BB=EF=B8=8F=20update=20PDF?= =?UTF-8?q?=20parsing=20to=20use=20structured=20pages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/cli/package.json | 5 ++--- packages/cli/src/playwright.ts | 1 - packages/core/src/parsers.test.ts | 12 +++++------ packages/core/src/parsers.ts | 7 +++---- packages/core/src/pdf.ts | 35 +++++++++++++++++++++---------- yarn.lock | 6 +++--- 6 files changed, 38 insertions(+), 28 deletions(-) diff --git a/packages/cli/package.json b/packages/cli/package.json index 81e0b586d2..678e71968d 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -45,20 +45,19 @@ "jimp": "^1.6.0", "mammoth": "^1.8.0", "mathjs": "^13.2.2", - "playwright": "^1.48.2", "tabletojson": "^4.1.5", - "tree-sitter-wasms": "^0.1.11", "tsx": "^4.19.2", "turndown": "^7.2.0", "typescript": "5.6.3", "vectra": "^0.9.0", - "web-tree-sitter": "^0.22.2", "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz" }, "optionalDependencies": { "@lvce-editor/ripgrep": "^1.4.0", "canvas": "^2.11.2", "pdfjs-dist": "4.8.69", + "playwright": "^1.48.2", + "tree-sitter-wasms": "^0.1.11", "web-tree-sitter": "0.22.2" }, "engines": { diff --git a/packages/cli/src/playwright.ts b/packages/cli/src/playwright.ts index 0c94d53966..3b1303ae03 100644 --- a/packages/cli/src/playwright.ts +++ b/packages/cli/src/playwright.ts @@ -5,7 +5,6 @@ import { runtimeHost } from "../../core/src/host" import { PLAYWRIGHT_VERSION } from "./version" import { ellipseUri } from "../../core/src/url" import { PLAYWRIGHT_DEFAULT_BROWSER } from "../../core/src/constants" -import { log } from "node:console" /** * Manages browser instances using Playwright, including launching, diff --git a/packages/core/src/parsers.test.ts b/packages/core/src/parsers.test.ts index 4f14038b80..44c5d3228f 100644 --- a/packages/core/src/parsers.test.ts +++ b/packages/core/src/parsers.test.ts @@ -8,7 +8,7 @@ import { resolve } from "path" import { TestHost } from "./testhost" import { estimateTokens } from "./tokens" -describe("parsers", () => { +describe("parsers", async () => { let trace: MarkdownTrace let model: string let parsers: Awaited> @@ -46,17 +46,17 @@ describe("parsers", () => { assert.equal(result.key, "value") }) - test("PDF", async () => { - const result = await parsers.PDF({ filename: "./src/rag/loremipsum.pdf" }) - console.log(result) + await test("PDF", async () => { + const result = await parsers.PDF({ filename: "../sample/src/rag/loremipsum.pdf" }) assert(result.file.content.includes("Lorem")) }) - test("PDF-image", async () => { + await test("PDF-image", async () => { const result = await parsers.PDF( - { filename: "src/rag/loremipsum.pdf" }, + { filename: "../sample/src/rag/loremipsum.pdf" }, { renderAsImage: true } ) + console.log(result) assert(result.file.content.includes("Lorem")) }) diff --git a/packages/core/src/parsers.ts b/packages/core/src/parsers.ts index a8bfd525cf..1fb4467efe 100644 --- a/packages/core/src/parsers.ts +++ b/packages/core/src/parsers.ts @@ -91,15 +91,14 @@ export async function createParsers(options: { trace, } const filename = typeof file === "string" ? file : file.filename - const { pages, images, content } = - (await parsePdf(filename, opts)) || {} + const { pages, content } = (await parsePdf(filename, opts)) || {} return { file: { filename, content, }, - pages, - images, + pages: pages?.map((p) => p.content), + images: pages?.map((p) => p.image), } }, code: async (file, query) => { diff --git a/packages/core/src/pdf.ts b/packages/core/src/pdf.ts index acd37136e9..afc1fd8d28 100644 --- a/packages/core/src/pdf.ts +++ b/packages/core/src/pdf.ts @@ -63,6 +63,12 @@ function installPromiseWithResolversShim() { }) } +export interface PDFPage { + index: number + content: string + image?: Buffer +} + /** * Parses PDF files using pdfjs-dist. * @param fileOrUrl - The file path or URL of the PDF @@ -88,8 +94,7 @@ async function PDFTryParse( }) const doc = await loader.promise const numPages = doc.numPages - const pages: string[] = [] - const images: Buffer[] = [] + const pages: PDFPage[] = [] // Iterate through each page and extract text content for (let i = 0; i < numPages; i++) { @@ -105,7 +110,11 @@ async function PDFTryParse( lines = lines.map((line) => line.replace(/[\t ]+$/g, "")) // Collapse trailing spaces - pages.push(lines.join("\n")) + const p: PDFPage = { + index: i + 1, + content: lines.join("\n"), + } + pages.push(p) if (renderAsImage) { const viewport = page.getViewport({ scale: 1.5 }) @@ -121,11 +130,11 @@ async function PDFTryParse( viewport, }).promise const buffer = canvas.toBuffer("image/png") - images.push(buffer) + p.image = buffer } } } - return { ok: true, pages, images } + return { ok: true, pages } } catch (error) { trace?.error(`reading pdf`, error) // Log error if tracing is enabled return { ok: false, error: serializeError(error) } @@ -137,8 +146,10 @@ async function PDFTryParse( * @param pages - Array of page content strings * @returns A single string representing the entire document */ -function PDFPagesToString(pages: string[]) { - return pages?.join("\n\n-------- Page Break --------\n\n") +function PDFPagesToString(pages: PDFPage[]) { + return pages + ?.map((p) => `-------- Page ${p.index} --------\n\n${p.content}`) + .join("\n\n") } /** @@ -150,14 +161,16 @@ function PDFPagesToString(pages: string[]) { export async function parsePdf( filename: string, options?: ParsePDFOptions & TraceOptions -): Promise<{ pages: string[]; images?: Buffer[]; content: string }> { +): Promise<{ pages: PDFPage[]; content: string }> { const { filter } = options || {} - let { pages, images } = await PDFTryParse(filename, undefined, options) + let { pages, ok } = await PDFTryParse(filename, undefined, options) + if (!ok) return { pages: [], content: "" } // Apply filter if provided - if (filter) pages = pages.filter((page, index) => filter(index, page)) + if (filter) + pages = pages.filter((page, index) => filter(index, page.content)) const content = PDFPagesToString(pages) - return { pages, images, content } + return { pages, content } } /** diff --git a/yarn.lock b/yarn.lock index 6c74d2e259..3c49b3c5c7 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4696,9 +4696,9 @@ punycode.js@^2.3.1: integrity sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA== qs@^6.9.1: - version "6.13.0" - resolved "https://registry.yarnpkg.com/qs/-/qs-6.13.0.tgz#6ca3bd58439f7e245655798997787b0d88a51906" - integrity sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg== + version "6.13.1" + resolved "https://registry.yarnpkg.com/qs/-/qs-6.13.1.tgz#3ce5fc72bd3a8171b85c99b93c65dd20b7d1b16e" + integrity sha512-EJPeIn0CYrGu+hli1xilKAPXODtJ12T0sP63Ijx2/khC2JtuaN3JyNIpvmnkmaEtha9ocbG4A4cMcr+TvqvwQg== dependencies: side-channel "^1.0.6"