Skip to content

Commit

Permalink
Add script for contextual retrieval with document chunking in genaisrc
Browse files Browse the repository at this point in the history
  • Loading branch information
pelikhan committed Sep 26, 2024
1 parent 42b268e commit 72e9ba9
Showing 1 changed file with 34 additions and 0 deletions.
34 changes: 34 additions & 0 deletions packages/sample/genaisrc/contextual-retreival.genai.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Contextual-retrieval sample: split a document into word-based chunks and ask
// the model, one chunk at a time, for a short context that situates the chunk
// within the whole document.
script({
    // NOTE(review): an empty list presumably disables the default system prompts — confirm.
    system: [],
    files: ["src/azure-lza/azure-azure-resource-manager-bicep.pdf"],
})

// Only the first input file is processed.
const doc = env.files[0]
// 100 whitespace-separated tokens per chunk.
const chunks = splitTextIntoChunks(doc.content, 100)

console.log(
    `Document: ${doc.filename}, ${doc.content.length} characters, ${chunks.length} chunks`
)
for (const chunk of chunks) {
    console.log(`chunk: ${chunk.slice(0, 25) + "..."}`)
    // Chunks are processed sequentially, one inline prompt per chunk.
    // The prompt result is not consumed by this script.
    await runPrompt(
        (_) => {
            _.def("DOCUMENT", doc, { maxTokens: 10000 })
            // Every part of the inline prompt must be written through the inner
            // context `_`; the global `$` would add this sentence to the outer
            // script's prompt instead of the one being built here.
            _.$`Here is the chunk we want to situate within the whole document`
            _.def("CHUNK", chunk)
            _.$`Please give a short succinct context to situate this chunk
within the overall document for the purposes of improving search retrieval of the chunk.
Answer only with the succinct context and nothing else. `
        },
        { cache: "cr" }
    )
}

/**
 * Splits text into chunks of at most `chunkSize` whitespace-separated tokens.
 *
 * Tokens are maximal runs of non-whitespace characters; within a chunk they are
 * re-joined with a single space, so the original whitespace (newlines, tabs,
 * repeated spaces) is not preserved.
 *
 * @param text - The text to split. Empty or whitespace-only text yields `[]`.
 * @param chunkSize - Maximum number of tokens per chunk. Fractional values are
 *   rounded down.
 * @returns The chunks in document order; only the last one may hold fewer than
 *   `chunkSize` tokens.
 * @throws RangeError if `chunkSize` is not a number greater than or equal to 1.
 */
function splitTextIntoChunks(text: string, chunkSize: number): string[] {
    const size = Math.floor(chunkSize)
    // A zero, negative or NaN step would keep the loop below from ever ending.
    if (!(size >= 1)) {
        throw new RangeError(`chunkSize must be at least 1, got ${chunkSize}`)
    }
    // Matching non-whitespace runs (rather than splitting on whitespace) avoids
    // the empty tokens that leading/trailing whitespace or empty input produce.
    const tokens: string[] = text.match(/\S+/g) ?? []
    const chunks: string[] = []
    for (let i = 0; i < tokens.length; i += size) {
        chunks.push(tokens.slice(i, i + size).join(" "))
    }
    return chunks
}

0 comments on commit 72e9ba9

Please sign in to comment.