Skip to content

Commit

Permalink
feat(ai-help): index full docs as well (#9608)
Browse files Browse the repository at this point in the history
* refactor(scripts/ai-help): extract selectDocs helper

* chore(ai-help): add {content,token_count,embedding} columns

* refactor(scripts/ai-help): extract createEmbedding() helper

* feat(scripts/ai-help): create embeddings for full docs
  • Loading branch information
caugner authored Oct 2, 2023
1 parent b38d6c2 commit 9355a64
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 29 deletions.
3 changes: 3 additions & 0 deletions scripts/ai-help.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ create table
url text not null,
slug text not null,
title text not null,
content text null,
token_count integer null,
embedding extensions.vector null,
checksum text null,
constraint mdn_doc_pkey primary key (id),
constraint mdn_doc_url_key unique (url),
Expand Down
102 changes: 73 additions & 29 deletions scripts/ai-help.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ interface IndexedDoc {
url: string;
slug: string;
title: string;
token_count: number | null;
checksum: string;
}

Expand Down Expand Up @@ -52,6 +53,45 @@ export async function updateEmbeddings(directory: string) {
});
const openai = new OpenAIApi(configuration);

// Creates an OpenAI embedding for `content`, returning the vector and the
// token count reported by the API. If the first request is rejected by the
// API (e.g. the input exceeds the model's token limit), it retries once with
// the input truncated to 15000 characters.
const createEmbedding = async (content: string) => {
  // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
  const input = content.replace(/\n/g, " ");

  let embeddingResponse;
  try {
    embeddingResponse = await openai.createEmbedding({
      model: "text-embedding-ada-002",
      input,
    });
  } catch (e: any) {
    // Only API errors carry a `response` payload. Network/timeout failures
    // do not, and destructuring them would throw an unrelated TypeError —
    // rethrow those so the real cause is reported.
    if (!e?.response?.data?.error) {
      throw e;
    }
    const {
      data: {
        error: { message, type },
      },
      status,
      statusText,
    } = e.response;
    console.error(
      `[!] Failed to create embedding (${status} ${statusText}): ${type} - ${message}`
    );
    // Try again with trimmed content (the failure was likely an oversized input).
    embeddingResponse = await openai.createEmbedding({
      model: "text-embedding-ada-002",
      input: input.substring(0, 15000),
    });
  }

  const {
    data: [{ embedding }],
    usage: { total_tokens },
  } = embeddingResponse.data;

  return {
    total_tokens,
    embedding,
  };
};

console.log(`Retrieving all indexed documents...`);
const existingDocs = await fetchAllExistingDocs(supabaseClient);
console.log(`-> Done.`);
Expand Down Expand Up @@ -81,6 +121,20 @@ export async function updateEmbeddings(directory: string) {
checksum,
});
continue;
} else if (existingDoc && existingDoc.token_count === null) {
// (Legacy migration:) Add content, token_count, embedding where missing.
console.log(`-> [${url}] Adding content/token_count/embedding...`);
const { total_tokens, embedding } = await createEmbedding(content);

await supabaseClient
.from("mdn_doc")
.update({
content,
token_count: total_tokens,
embedding,
})
.filter("id", "eq", existingDoc.id)
.throwOnError();
}
}
console.log(
Expand Down Expand Up @@ -108,6 +162,9 @@ export async function updateEmbeddings(directory: string) {
.throwOnError();
}

// Embedding for full document.
const { total_tokens, embedding } = await createEmbedding(content);

// Create/update document record. Intentionally clear checksum until we
// have successfully generated all document sections.
const { data: doc } = await supabaseClient
Expand All @@ -118,6 +175,9 @@ export async function updateEmbeddings(directory: string) {
url,
slug,
title,
content,
token_count: total_tokens,
embedding,
},
{ onConflict: "url" }
)
Expand All @@ -133,29 +193,16 @@ export async function updateEmbeddings(directory: string) {

await Promise.all(
sections.map(async ({ heading, content }) => {
// OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
const input = content.replace(/\n/g, " ");

const embeddingResponse = await openai.createEmbedding({
model: "text-embedding-ada-002",
input,
});

if (embeddingResponse.status !== 200) {
console.error("Embedding request failed", embeddingResponse.data);
throw new Error("Embedding request failed");
}

const [responseData] = embeddingResponse.data.data;
const { total_tokens, embedding } = await createEmbedding(content);

await supabaseClient
.from("mdn_doc_section")
.insert({
doc_id: doc.id,
heading,
content,
token_count: embeddingResponse.data.usage.total_tokens,
embedding: responseData.embedding,
token_count: total_tokens,
embedding: embedding,
})
.select()
.single()
Expand Down Expand Up @@ -260,24 +307,21 @@ function splitAndFilterSections(
}
/**
 * Fetches every row of the `mdn_doc` table via keyset pagination:
 * pages of PAGE_SIZE ordered by ascending `id`, each subsequent page
 * selecting rows with `id` greater than the last row seen.
 *
 * @param supabase - Supabase client used to query the `mdn_doc` table.
 * @returns All rows, each with {id, url, slug, title, checksum, token_count}.
 */
async function fetchAllExistingDocs(supabase: SupabaseClient) {
  const PAGE_SIZE = 1000;
  const selectDocs = () =>
    supabase
      .from("mdn_doc")
      .select("id, url, slug, title, checksum, token_count")
      .order("id")
      .limit(PAGE_SIZE);

  let { data } = await selectDocs().throwOnError();
  // Accumulate pages in place: the previous `allData = [...allData, ...data]`
  // re-copied the whole accumulator on every page, i.e. O(n^2) in total rows.
  const allData = [...data];
  // A short page (fewer than PAGE_SIZE rows) means we've reached the end.
  while (data.length === PAGE_SIZE) {
    const lastItem = data[data.length - 1];
    ({ data } = await selectDocs().gt("id", lastItem.id).throwOnError());
    allData.push(...data);
  }

  return allData;
}

Expand Down

0 comments on commit 9355a64

Please sign in to comment.