Skip to content

Commit

Permalink
refactor: ♻️ update PDF parsing to use structured pages
Browse files Browse the repository at this point in the history
  • Loading branch information
pelikhan committed Nov 18, 2024
1 parent ddc245c commit 804851f
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 28 deletions.
5 changes: 2 additions & 3 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,19 @@
"jimp": "^1.6.0",
"mammoth": "^1.8.0",
"mathjs": "^13.2.2",
"playwright": "^1.48.2",
"tabletojson": "^4.1.5",
"tree-sitter-wasms": "^0.1.11",
"tsx": "^4.19.2",
"turndown": "^7.2.0",
"typescript": "5.6.3",
"vectra": "^0.9.0",
"web-tree-sitter": "^0.22.2",
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"
},
"optionalDependencies": {
"@lvce-editor/ripgrep": "^1.4.0",
"canvas": "^2.11.2",
"pdfjs-dist": "4.8.69",
"playwright": "^1.48.2",
"tree-sitter-wasms": "^0.1.11",
"web-tree-sitter": "0.22.2"
},
"engines": {
Expand Down
1 change: 0 additions & 1 deletion packages/cli/src/playwright.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import { runtimeHost } from "../../core/src/host"
import { PLAYWRIGHT_VERSION } from "./version"
import { ellipseUri } from "../../core/src/url"
import { PLAYWRIGHT_DEFAULT_BROWSER } from "../../core/src/constants"
import { log } from "node:console"

/**
* Manages browser instances using Playwright, including launching,
Expand Down
12 changes: 6 additions & 6 deletions packages/core/src/parsers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { resolve } from "path"
import { TestHost } from "./testhost"
import { estimateTokens } from "./tokens"

describe("parsers", () => {
describe("parsers", async () => {
let trace: MarkdownTrace
let model: string
let parsers: Awaited<ReturnType<typeof createParsers>>
Expand Down Expand Up @@ -46,17 +46,17 @@ describe("parsers", () => {
assert.equal(result.key, "value")
})

test("PDF", async () => {
const result = await parsers.PDF({ filename: "./src/rag/loremipsum.pdf" })
console.log(result)
await test("PDF", async () => {
const result = await parsers.PDF({ filename: "../sample/src/rag/loremipsum.pdf" })
assert(result.file.content.includes("Lorem"))
})

test("PDF-image", async () => {
await test("PDF-image", async () => {
const result = await parsers.PDF(
{ filename: "src/rag/loremipsum.pdf" },
{ filename: "../sample/src/rag/loremipsum.pdf" },
{ renderAsImage: true }
)
console.log(result)
assert(result.file.content.includes("Lorem"))
})

Expand Down
7 changes: 3 additions & 4 deletions packages/core/src/parsers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,14 @@ export async function createParsers(options: {
trace,
}
const filename = typeof file === "string" ? file : file.filename
const { pages, images, content } =
(await parsePdf(filename, opts)) || {}
const { pages, content } = (await parsePdf(filename, opts)) || {}
return {
file: <WorkspaceFile>{
filename,
content,
},
pages,
images,
pages: pages?.map((p) => p.content),
images: pages?.map((p) => p.image),
}
},
code: async (file, query) => {
Expand Down
35 changes: 24 additions & 11 deletions packages/core/src/pdf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ function installPromiseWithResolversShim() {
})
}

/**
 * A single page extracted from a PDF document.
 */
export interface PDFPage {
/** 1-based page number (the parser stores the loop index plus one). */
index: number
/** Text of the page: its extracted lines joined with newline characters. */
content: string
/** PNG rendering of the page; only set when rendering as image was requested. */
image?: Buffer
}

/**
* Parses PDF files using pdfjs-dist.
* @param fileOrUrl - The file path or URL of the PDF
Expand All @@ -88,8 +94,7 @@ async function PDFTryParse(
})
const doc = await loader.promise
const numPages = doc.numPages
const pages: string[] = []
const images: Buffer[] = []
const pages: PDFPage[] = []

// Iterate through each page and extract text content
for (let i = 0; i < numPages; i++) {
Expand All @@ -105,7 +110,11 @@ async function PDFTryParse(
lines = lines.map((line) => line.replace(/[\t ]+$/g, ""))

// Collapse trailing spaces
pages.push(lines.join("\n"))
const p: PDFPage = {
index: i + 1,
content: lines.join("\n"),
}
pages.push(p)

if (renderAsImage) {
const viewport = page.getViewport({ scale: 1.5 })
Expand All @@ -121,11 +130,11 @@ async function PDFTryParse(
viewport,
}).promise
const buffer = canvas.toBuffer("image/png")
images.push(buffer)
p.image = buffer
}
}
}
return { ok: true, pages, images }
return { ok: true, pages }
} catch (error) {
trace?.error(`reading pdf`, error) // Log error if tracing is enabled
return { ok: false, error: serializeError(error) }
Expand All @@ -137,8 +146,10 @@ async function PDFTryParse(
* @param pages - Array of parsed pages, each carrying its page index and text content
* @returns A single string representing the entire document
*/
function PDFPagesToString(pages: string[]) {
return pages?.join("\n\n-------- Page Break --------\n\n")
function PDFPagesToString(pages: PDFPage[]) {
return pages
?.map((p) => `-------- Page ${p.index} --------\n\n${p.content}`)
.join("\n\n")
}

/**
Expand All @@ -150,14 +161,16 @@ function PDFPagesToString(pages: string[]) {
export async function parsePdf(
filename: string,
options?: ParsePDFOptions & TraceOptions
): Promise<{ pages: string[]; images?: Buffer[]; content: string }> {
): Promise<{ pages: PDFPage[]; content: string }> {
const { filter } = options || {}
let { pages, images } = await PDFTryParse(filename, undefined, options)
let { pages, ok } = await PDFTryParse(filename, undefined, options)
if (!ok) return { pages: [], content: "" }

// Apply filter if provided
if (filter) pages = pages.filter((page, index) => filter(index, page))
if (filter)
pages = pages.filter((page, index) => filter(index, page.content))
const content = PDFPagesToString(pages)
return { pages, images, content }
return { pages, content }
}

/**
Expand Down
6 changes: 3 additions & 3 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4696,9 +4696,9 @@ punycode.js@^2.3.1:
integrity sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA==

qs@^6.9.1:
version "6.13.0"
resolved "https://registry.yarnpkg.com/qs/-/qs-6.13.0.tgz#6ca3bd58439f7e245655798997787b0d88a51906"
integrity sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==
version "6.13.1"
resolved "https://registry.yarnpkg.com/qs/-/qs-6.13.1.tgz#3ce5fc72bd3a8171b85c99b93c65dd20b7d1b16e"
integrity sha512-EJPeIn0CYrGu+hli1xilKAPXODtJ12T0sP63Ijx2/khC2JtuaN3JyNIpvmnkmaEtha9ocbG4A4cMcr+TvqvwQg==
dependencies:
side-channel "^1.0.6"

Expand Down

0 comments on commit 804851f

Please sign in to comment.