Skip to content

Commit

Permalink
PR#2355 Continued + expanded scope (#2365)
Browse files Browse the repository at this point in the history
* #2317 Fetch pinned documents once per folder to reduce the number of queries.

* Reorder the lines to keep const declarations together.

* Add some comments to functions
move the pinned-document fetch for a folder into its own function
move the per-folder watched-documents fetch into its own function as well
remove unused function in documents model

---------

Co-authored-by: Błażej Owczarczyk <[email protected]>
  • Loading branch information
timothycarambat and blazeyo authored Sep 24, 2024
1 parent ac91d0d commit b44889a
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 32 deletions.
7 changes: 7 additions & 0 deletions server/models/documentSyncQueue.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ const DocumentSyncQueue = {
return new Date(Number(new Date()) + queueRecord.staleAfterMs);
},

/**
* Check if the document can be watched based on the metadata fields
* @param {object} metadata - metadata to check
* @param {string} metadata.title - title of the document
* @param {string} metadata.chunkSource - chunk source of the document
* @returns {boolean} - true if the document can be watched, false otherwise
*/
canWatch: function ({ title, chunkSource = null } = {}) {
if (chunkSource.startsWith("link://") && title.endsWith(".html"))
return true; // If is web-link material (prior to feature most chunkSources were links://)
Expand Down
19 changes: 3 additions & 16 deletions server/models/documents.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,33 +57,20 @@ const Document = {
}
},

getOnlyWorkspaceIds: async function (clause = {}) {
try {
const workspaceIds = await prisma.workspace_documents.findMany({
where: clause,
select: {
workspaceId: true,
},
});
return workspaceIds.map((record) => record.workspaceId) || [];
} catch (error) {
console.error(error.message);
return [];
}
},

where: async function (
clause = {},
limit = null,
orderBy = null,
include = null
include = null,
select = null
) {
try {
const results = await prisma.workspace_documents.findMany({
where: clause,
...(limit !== null ? { take: limit } : {}),
...(orderBy !== null ? { orderBy } : {}),
...(include !== null ? { include } : {}),
...(select !== null ? { select: { ...select } } : {}),
});
return results;
} catch (error) {
Expand Down
95 changes: 79 additions & 16 deletions server/utils/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,40 @@ async function viewLocalFiles() {
items: [],
};
const subfiles = fs.readdirSync(folderPath);
const filenames = {};

for (const subfile of subfiles) {
if (path.extname(subfile) !== ".json") continue;
const filePath = path.join(folderPath, subfile);
const rawData = fs.readFileSync(filePath, "utf8");
const cachefilename = `${file}/${subfile}`;
const { pageContent, ...metadata } = JSON.parse(rawData);
const pinnedInWorkspaces = await Document.getOnlyWorkspaceIds({
docpath: cachefilename,
pinned: true,
});
const watchedInWorkspaces = liveSyncAvailable
? await Document.getOnlyWorkspaceIds({
docpath: cachefilename,
watched: true,
})
: [];

subdocs.items.push({
name: subfile,
type: "file",
...metadata,
cached: await cachedVectorInformation(cachefilename, true),
pinnedWorkspaces: pinnedInWorkspaces,
canWatch: liveSyncAvailable
? DocumentSyncQueue.canWatch(metadata)
: false,
// Is file watched in any workspace since sync updates all workspaces where file is referenced
watched: watchedInWorkspaces.length !== 0,
// pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
// watched: false, // boolean to indicate if this document is watched in ANY workspace
});
filenames[cachefilename] = subfile;
}

// Grab the pinned workspaces and watched documents for this folder's documents
// at the time of the query so we don't have to re-query the database for each file
const pinnedWorkspacesByDocument =
await getPinnedWorkspacesByDocument(filenames);
const watchedDocumentsFilenames =
await getWatchedDocumentFilenames(filenames);
for (const item of subdocs.items) {
item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
item.watched =
watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
}

directory.items.push(subdocs);
}
}
Expand All @@ -88,8 +91,13 @@ async function viewLocalFiles() {
return directory;
}

// Searches the vector-cache folder for existing information so we dont have to re-embed a
// document and can instead push directly to vector db.
/**
* Searches the vector-cache folder for existing information so we don't have to re-embed a
* document and can instead push directly to vector db.
* @param {string} filename - the filename to check for cached vector information
* @param {boolean} checkOnly - if true, only check if the file exists, do not return the cached data
* @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} - a promise that resolves to an object containing the existence of the file and its cached chunks; when no filename is given and `checkOnly` is true it resolves to `false` instead
*/
async function cachedVectorInformation(filename = null, checkOnly = false) {
if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

Expand Down Expand Up @@ -218,6 +226,61 @@ function hasVectorCachedFiles() {
return false;
}

/**
 * Look up, with a single query, which workspaces have pinned each of the given documents.
 * @param {Record<string, string>} filenames - map of document docpath -> filename. The docpaths
 * (keys) are used for the database lookup; the filenames (values) key the returned record.
 * @returns {Promise<Record<string, any[]>>} - record of filename -> de-duplicated workspaceIds
 * (exactly as found on the matching rows) that have the document pinned. Filenames without any
 * pinned row are absent from the record.
 */
async function getPinnedWorkspacesByDocument(filenames = {}) {
  // `filenames || {}` also covers an explicit `null`, which a default parameter does not.
  const docpathToFilename = filenames || {};
  const docpaths = Object.keys(docpathToFilename);
  // Nothing to look up, so skip the database round trip entirely.
  if (docpaths.length === 0) return {};

  const pinnedRows = await Document.where(
    { docpath: { in: docpaths }, pinned: true },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );

  const workspacesByFilename = {};
  for (const { workspaceId, docpath } of pinnedRows) {
    // Ignore rows for docpaths we did not ask about instead of filing them under "undefined".
    if (!Object.prototype.hasOwnProperty.call(docpathToFilename, docpath))
      continue;

    const filename = docpathToFilename[docpath];
    if (!workspacesByFilename[filename]) workspacesByFilename[filename] = [];
    // The same workspace may show up on several rows; report it only once.
    if (!workspacesByFilename[filename].includes(workspaceId))
      workspacesByFilename[filename].push(workspaceId);
  }
  return workspacesByFilename;
}

/**
 * Find, with a single query, which of the given documents are watched in at least one workspace.
 * Callers can test a filename for presence in the returned record to know if it is watched.
 * @param {Record<string, string>} filenames - map of document docpath -> filename. The docpaths
 * (keys) are used for the database lookup; the filenames (values) key the returned record.
 * @returns {Promise<Record<string, any>>} - record of filename -> a single workspaceId taken from
 * a matching watched row (when several rows match, the last one wins). Filenames that are not
 * watched anywhere are absent from the record.
 */
async function getWatchedDocumentFilenames(filenames = {}) {
  // `filenames || {}` also covers an explicit `null`, which a default parameter does not.
  const docpathToFilename = filenames || {};
  const docpaths = Object.keys(docpathToFilename);
  // Nothing to look up, so skip the database round trip entirely.
  if (docpaths.length === 0) return {};

  const watchedRows = await Document.where(
    { docpath: { in: docpaths }, watched: true },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );

  const watchedByFilename = {};
  for (const { workspaceId, docpath } of watchedRows) {
    // Ignore rows for docpaths we did not ask about instead of filing them under "undefined".
    if (!Object.prototype.hasOwnProperty.call(docpathToFilename, docpath))
      continue;
    watchedByFilename[docpathToFilename[docpath]] = workspaceId;
  }
  return watchedByFilename;
}

module.exports = {
findDocumentInDocuments,
cachedVectorInformation,
Expand Down

0 comments on commit b44889a

Please sign in to comment.