Skip to content

Commit

Permalink
Support more Confluence URL formats (#2118)
Browse files Browse the repository at this point in the history
* support more Confluence URL formats

* use pattern matching for Confluence URLs and manual splitting as a fallback

* rework entire Confluence flow to prevent issues with custom, local, and cloud spaces

* remove dep

---------

Co-authored-by: Timothy Carambat <[email protected]>
  • Loading branch information
shatfield4 and timothycarambat authored Sep 25, 2024
1 parent 44dddcd commit 4488744
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 113 deletions.
1 change: 1 addition & 0 deletions collector/extensions/resync/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) {
const { success, reason, content } = await fetchConfluencePage({
pageUrl: `https:${source.pathname}`, // need to add back the real protocol
baseUrl: source.searchParams.get('baseUrl'),
spaceKey: source.searchParams.get('spaceKey'),
accessToken: source.searchParams.get('token'),
username: source.searchParams.get('username'),
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ class ConfluencePagesLoader {
}
}

// https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
async fetchAllPagesInSpace(start = 0, limit = this.limit) {
const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
const url = `${this.baseUrl}/wiki/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
const data = await this.fetchConfluenceData(url);
if (data.size === 0) {
return [];
Expand Down
150 changes: 45 additions & 105 deletions collector/utils/extensions/Confluence/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const UrlPattern = require("url-pattern");
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { ConfluencePagesLoader } = require("./ConfluenceLoader");
Expand All @@ -13,28 +12,36 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader");
* @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
* @returns
*/
async function loadConfluence({ pageUrl, username, accessToken }, response) {
if (!pageUrl || !username || !accessToken) {
async function loadConfluence(
{ baseUrl = null, spaceKey = null, username = null, accessToken = null },
response
) {
if (!baseUrl || !spaceKey || !username || !accessToken) {
return {
success: false,
reason:
"You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
};
}

const { valid, result } = validSpaceUrl(pageUrl);
if (!valid) {
if (!validBaseUrl(baseUrl)) {
return {
success: false,
reason:
"Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
reason: "Provided base URL is not a valid URL.",
};
}

const { apiBase: baseUrl, spaceKey, subdomain } = result;
console.log(`-- Working Confluence ${baseUrl} --`);
if (!spaceKey) {
return {
success: false,
reason: "You need to provide a Confluence space key.",
};
}

const { origin, hostname } = new URL(baseUrl);
console.log(`-- Working Confluence ${origin} --`);
const loader = new ConfluencePagesLoader({
baseUrl,
baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
spaceKey,
username,
accessToken,
Expand All @@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
};
}
const outFolder = slugify(
`${subdomain}-confluence-${v4().slice(0, 4)}`
`confluence-${origin}-${v4().slice(0, 4)}`
).toLowerCase();

const outFolderPath =
Expand All @@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
id: v4(),
url: doc.metadata.url + ".page",
title: doc.metadata.title || doc.metadata.source,
docAuthor: subdomain,
docAuthor: origin,
description: doc.metadata.title,
docSource: `${subdomain} Confluence`,
docSource: `${origin} Confluence`,
chunkSource: generateChunkSource(
{ doc, baseUrl, accessToken, username },
{ doc, baseUrl: origin, spaceKey, accessToken, username },
response.locals.encryptionWorker
),
published: new Date().toLocaleString(),
Expand Down Expand Up @@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
async function fetchConfluencePage({
pageUrl,
baseUrl,
spaceKey,
username,
accessToken,
}) {
if (!pageUrl || !baseUrl || !username || !accessToken) {
if (!pageUrl || !baseUrl || !spaceKey || !username || !accessToken) {
return {
success: false,
content: null,
Expand All @@ -132,20 +140,25 @@ async function fetchConfluencePage({
};
}

const { valid, result } = validSpaceUrl(pageUrl);
if (!valid) {
if (!validBaseUrl(baseUrl)) {
return {
success: false,
content: null,
reason:
"Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
reason: "Provided base URL is not a valid URL.",
};
}

if (!spaceKey) {
return {
success: false,
content: null,
reason: "You need to provide a Confluence space key.",
};
}

console.log(`-- Working Confluence Page ${pageUrl} --`);
const { spaceKey } = result;
const loader = new ConfluencePagesLoader({
baseUrl,
baseUrl, // Should be the origin of the baseUrl
spaceKey,
username,
accessToken,
Expand Down Expand Up @@ -190,91 +203,17 @@ async function fetchConfluencePage({
}

/**
* A match result for a url-pattern of a Confluence URL
* @typedef {Object} ConfluenceMatchResult
* @property {string} subdomain - the subdomain of an organization's Confluence space
* @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
* @property {string} apiBase - the correct REST API url to use for loader.
*/

/**
 * Builds the REST API base URL for a Confluence instance from a parsed URL match.
 * Atlassian-hosted spaces resolve to `https://<subdomain>.atlassian.net/wiki`, while
 * custom domains use `https://<subdomain>` as-is with no `/wiki` suffix appended.
 * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match; only `subdomain` is read here
 * @param {boolean} isCustomDomain - when truthy, `subdomain` is treated as the complete host
 * @returns {string} - the resulting REST API URL
 */
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
  // Only the host portion differs between the two hosting styles; the scheme is always https.
  const host = isCustomDomain
    ? matchResult.subdomain
    : `${matchResult.subdomain}.atlassian.net/wiki`;
  return `https://${host}`;
}

/**
* Validates and parses the correct information from a given Confluence URL
* @param {string} spaceUrl - The organization's Confluence URL to parse
* @returns {{
* valid: boolean,
* result: (ConfluenceMatchResult|null),
* }}
* Validates if the provided baseUrl is a valid URL at all.
* @param {string} baseUrl
* @returns {boolean}
*/
function validSpaceUrl(spaceUrl = "") {
let matchResult;
const patterns = {
default: new UrlPattern(
"https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
),
subdomain: new UrlPattern(
"https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
),
custom: new UrlPattern(
"https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
),
};

// If using the default Atlassian Confluence URL pattern.
// We can proceed because the Library/API can use this base url scheme.
matchResult = patterns.default.match(spaceUrl);
if (matchResult)
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult),
},
};

// If using a custom subdomain Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the subdomain.
matchResult = patterns.subdomain.match(spaceUrl);
if (matchResult) {
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult),
},
};
function validBaseUrl(baseUrl) {
try {
new URL(baseUrl);
return true;
} catch (e) {
return false;
}

// If using a base FQDN Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the root domain since /display/ is basically a URL mask.
matchResult = patterns.custom.match(spaceUrl);
if (matchResult) {
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult, true),
},
};
}

// No match
return { valid: false, result: null };
}

/**
Expand All @@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") {
* @returns {string}
*/
function generateChunkSource(
{ doc, baseUrl, accessToken, username },
{ doc, baseUrl, spaceKey, accessToken, username },
encryptionWorker
) {
const payload = {
baseUrl,
spaceKey,
token: accessToken,
username,
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ export default function ConfluenceOptions() {
}
);
const { data, error } = await System.dataConnectors.confluence.collect({
pageUrl: form.get("pageUrl"),
baseUrl: form.get("baseUrl"),
spaceKey: form.get("spaceKey"),
username: form.get("username"),
accessToken: form.get("accessToken"),
});
Expand Down Expand Up @@ -56,17 +57,37 @@ export default function ConfluenceOptions() {
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold flex gap-x-2 items-center">
<p className="font-bold text-white">Confluence Page URL</p>
<p className="font-bold text-white">Confluence base URL</p>
</label>
<p className="text-xs font-normal text-white/50">
URL of a page in the Confluence space.
This is the base URL of your Confluence space.
</p>
</div>
<input
type="url"
name="pageUrl"
name="baseUrl"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="https://example.atlassian.net/wiki/spaces/~7120208c08555d52224113949698b933a3bb56/pages/851969/Test+anythingLLM+page"
placeholder="eg: https://example.atlassian.net, http://localhost:8211, etc..."
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Confluence space key
</label>
<p className="text-xs font-normal text-white/50">
This is the spaces key of your confluence instance that will
be used. Usually begins with ~
</p>
</div>
<input
type="text"
name="spaceKey"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="eg: ~7120208c08555d52224113949698b933a3bb56"
required={true}
autoComplete="off"
spellCheck={false}
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/models/dataConnector.js
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,13 @@ const DataConnector = {
},

confluence: {
collect: async function ({ pageUrl, username, accessToken }) {
collect: async function ({ baseUrl, spaceKey, username, accessToken }) {
return await fetch(`${API_BASE}/ext/confluence`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({
pageUrl,
baseUrl,
spaceKey,
username,
accessToken,
}),
Expand Down

0 comments on commit 4488744

Please sign in to comment.