forked from aws-amplify/docs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from jacoblogan/update-link-script
add pr version of link checker script
- Loading branch information
Showing
6 changed files
with
215 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Link-checker workflow: builds the docs site, serves it locally, and runs the
# link-check script against the local server.
name: CheckPRLinks
on:
  schedule:
    # Weekdays (Mon-Fri) at 17:00 UTC.
    - cron: '0 17 * * 1-5'
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
permissions:
  contents: read
  id-token: write
jobs:
  CheckPRLinks:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        # Actions are pinned to a full commit SHA (with the tag noted) so a
        # moved tag cannot silently change what runs.
        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
      - name: Setup Node.js 16.x
        uses: actions/setup-node@e33196f7422957bea03ed53f6fbb155025ffc7b8 # v3.7.0
        with:
          node-version: 16.x
          cache: 'yarn'
      - name: Install Dependencies
        run: yarn
      - name: Run Build
        run: yarn build
        env:
          # The docs build is memory-hungry; raise V8's old-space limit to 4 GB.
          NODE_OPTIONS: --max_old_space_size=4096
      - name: Run Server
        # Start the built site in the background, give it a moment to come up,
        # then run the link checker against it.
        run: |
          yarn run next-start &
          sleep 5 &&
          yarn run linkcheck
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
import puppeteer from 'puppeteer'; | ||
import axios from 'axios'; | ||
|
||
// Production sitemap, crawled when no local domain override is supplied.
const SITEMAP_URL = 'https://docs.amplify.aws/sitemap.xml';
// Production origin; sitemap URLs are rewritten from this host to the local
// domain when crawling a local build.
const DOMAIN = 'https://docs.amplify.aws';
// Links deliberately excluded from the crawl — presumably placeholder
// endpoints from code samples and hosts that reject automated requests
// (TODO confirm intent).
const CRAWLER_EXCEPTIONS = [
  'https://aaaaaaaaaa.execute-api.us-east-1.amazonaws.com/api',
  'https://aaaaaaaaaaaaaaaaaaaaaaaaaa.appsync-api.us-east-1.amazonaws.com/graphql',
  'https://twitter.com/AWSAmplify'
];
// "New issue" links carry per-page query parameters; they are stripped before
// checking so each resolves to the same URL.
const GITHUB_CREATE_ISSUE_LINK =
  'https://github.com/aws-amplify/docs/issues/new';
// Edit links are skipped entirely (see linkChecker).
const GITHUB_EDIT_LINK = 'https://github.com/aws-amplify/docs/edit/';
|
||
/**
 * Loads the sitemap and collects every `<loc>` URL listed in it.
 *
 * @param {string|undefined} localDomain - when set (e.g. 'http://localhost:3000'),
 *   the sitemap is fetched from that host and each production URL is rewritten
 *   to point at it.
 * @returns {Promise<string[]>} page URLs from the sitemap; empty when the
 *   sitemap does not respond with HTTP 200.
 */
const getSitemapUrls = async (localDomain) => {
  const browser = await puppeteer.launch({ headless: 'new' });
  const siteMapUrls = [];

  try {
    const page = await browser.newPage();
    const siteMap = localDomain ? `${localDomain}/sitemap.xml` : SITEMAP_URL;
    const response = await page.goto(siteMap);

    if (response && response.status() === 200) {
      // Read every <loc> element's text in a single round trip instead of one
      // page.evaluate call per link.
      const urls = await page.evaluate(() =>
        Array.from(document.getElementsByTagName('loc'), (el) => el.innerHTML)
      );

      for (let url of urls) {
        if (localDomain) {
          // Point the crawl at the local build instead of production.
          url = url.replace(DOMAIN, localDomain);
        }
        siteMapUrls.push(url);
      }
    }
  } finally {
    // Original code never awaited close() and leaked the browser when
    // page.goto threw; always release it here.
    await browser.close();
  }

  return siteMapUrls;
};
|
||
/**
 * Visits each sitemap page and harvests every anchor href found on it.
 *
 * @param {string[]} siteMapUrls - pages to visit.
 * @param {Object<string, boolean>} visitedLinks - mutated in place: each page
 *   that loads successfully is marked true so linkChecker skips re-checking it.
 * @returns {Promise<Array<{url: string, parentUrl: string, linkText: string}>>}
 *   links discovered on the pages, minus CRAWLER_EXCEPTIONS.
 */
const retrieveLinks = async (siteMapUrls, visitedLinks) => {
  let browser = await puppeteer.launch({ headless: 'new' });
  let page = await browser.newPage();

  const urlsToVisit = [];

  for (const url of siteMapUrls) {
    try {
      const response = await page.goto(url, { waitUntil: 'domcontentloaded' });
      // Brief pause to let client-side rendering attach its links.
      await new Promise((resolve) => setTimeout(resolve, 100));

      if (response && response.status() === 200) {
        console.log(`successfully visited ${url} to retrieve links`);
        visitedLinks[url] = true;

        // Collect every anchor with an href, remembering which page it came
        // from and its visible text for later reporting.
        const pageLinks = await page.evaluate((parentUrl) => {
          const found = [];
          const anchors = document.getElementsByTagName('a');
          for (const element of anchors) {
            if (element.href) {
              found.push({
                url: element.href,
                parentUrl,
                linkText: element.textContent
              });
            }
          }
          return found;
        }, url);

        for (const link of pageLinks) {
          if (!CRAWLER_EXCEPTIONS.includes(link.url)) {
            urlsToVisit.push(link);
          }
        }
      }
    } catch (e) {
      console.log(`failed to load ${url}: ${e}`);
      // A failed navigation can leave the page unusable; restart with a fresh
      // browser so the remaining URLs still get crawled. The original never
      // awaited close() here, leaking the dead browser process.
      await browser.close();
      browser = await puppeteer.launch({ headless: 'new' });
      page = await browser.newPage();
    }
  }

  await browser.close();

  return urlsToVisit;
};
|
||
/**
 * Serializes a list of link records into a plain-text report.
 *
 * Each object's key/value pairs become one `key - value \n` segment (note:
 * the two-character sequence backslash-n, not a newline), and each object is
 * terminated with a literal `\n \n` separator.
 *
 * @param {Array<Object>} inputs - records to render (e.g. broken-link objects).
 * @returns {string} concatenated report text.
 */
const formatString = (inputs) => {
  const parts = [];
  for (const item of inputs) {
    for (const [key, value] of Object.entries(item)) {
      parts.push(`${key} - ${value} \\n`);
    }
    parts.push('\\n \\n');
  }
  return parts.join('');
};
|
||
/**
 * Crawls the docs site (or a local build) and reports broken links.
 *
 * Fetches the sitemap, gathers every link from each page, then issues an HTTP
 * GET for each unique link and records non-200 status codes. Links that 404
 * are collected as broken unless they match the platform-selector pattern.
 *
 * @param {string|undefined} base - optional local origin
 *   (e.g. 'http://localhost:3000'); production is crawled when omitted.
 * @returns {Promise<string>} human-readable report of broken links
 *   (see formatString).
 */
const linkChecker = async (base) => {
  const visitedLinks = {};
  const statusCodes = {};
  const brokenLinks = [];

  // Platform-selector pages appear in results because the crawler sees
  // disabled platform-dropdown links; their 404s are expected and filtered.
  // NOTE: no /g flag — a global regex keeps lastIndex state across .test()
  // calls and would intermittently skip matches.
  const platformPages = /\/q\/(platform|integration|framework)\/(android|ios|flutter|js|react-native)/;

  const siteMapUrls = await getSitemapUrls(base);
  const urlsToVisit = await retrieveLinks(siteMapUrls, visitedLinks);

  const allPromises = [];

  for (const link of urlsToVisit) {
    let href = link.url;
    if (href.startsWith(GITHUB_CREATE_ISSUE_LINK)) {
      // Drop query parameters so prefilled "new issue" links dedupe to one check.
      href = href.split('?')[0];
    }
    // Edit links are skipped entirely (they 404 for pages created by a PR).
    if (href.startsWith(GITHUB_EDIT_LINK)) continue;
    if (visitedLinks[href]) continue; // already checked (or crawled as a page)
    visitedLinks[href] = true;

    // Shared recorder — the original duplicated this in .then and .catch.
    const recordStatus = (statusCode) => {
      statusCodes[statusCode] = statusCodes[statusCode] || [];
      statusCodes[statusCode].push(href);
    };

    const request = axios
      .get(href)
      .then((response) => {
        const statusCode = response.status;
        if (statusCode && statusCode !== 200) {
          recordStatus(statusCode);
        }
      })
      .catch((e) => {
        // axios rejects on non-2xx responses; the status lives on e.response.
        const statusCode = e?.response?.status;
        if (statusCode) {
          recordStatus(statusCode);
        }
        if (statusCode === 404 && !platformPages.test(link.url)) {
          brokenLinks.push(link);
        }
      });

    allPromises.push(request);
  }

  // All GETs run concurrently; wait for every one to settle via its handlers.
  await Promise.all(allPromises);

  console.log(statusCodes);
  console.log(brokenLinks);

  return formatString(brokenLinks);
};
|
||
// CLI entry point: pass 'Internal' as the first argument to crawl a local
// dev server instead of the production docs site (top-level await; this file
// is an ES module).
const param = process.argv[2];
const base = param === 'Internal' ? 'http://localhost:3000' : undefined;

const results = await linkChecker(base);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters