From 22175848015e8e8d1545b26c8222a96967b8b99d Mon Sep 17 00:00:00 2001 From: Jacob Logan Date: Wed, 27 Sep 2023 04:22:13 -0700 Subject: [PATCH] add pr version of link checker script --- .github/workflows/build.yml | 1 + .github/workflows/check_for_broken_links.yml | 1 + .../workflows/check_pr_for_broken_links.yml | 30 +++ package.json | 1 + tasks/check-links.mjs | 178 ++++++++++++++++++ tasks/link-checker.js | 6 +- 6 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/check_pr_for_broken_links.yml create mode 100644 tasks/check-links.mjs diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 474458f180a..225fc8a459d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,6 +14,7 @@ jobs: uses: actions/setup-node@main with: node-version: 16.x + cache: 'yarn' - name: Install Dependencies run: yarn - name: Run tests diff --git a/.github/workflows/check_for_broken_links.yml b/.github/workflows/check_for_broken_links.yml index 98afe5688ec..9c4cd6850d6 100644 --- a/.github/workflows/check_for_broken_links.yml +++ b/.github/workflows/check_for_broken_links.yml @@ -16,6 +16,7 @@ jobs: uses: actions/setup-node@e33196f7422957bea03ed53f6fbb155025ffc7b8 # v3.7.0 with: node-version: 16.x + cache: yarn - name: Install Dependencies run: yarn - name: Run Link Checker diff --git a/.github/workflows/check_pr_for_broken_links.yml b/.github/workflows/check_pr_for_broken_links.yml new file mode 100644 index 00000000000..0f233422d99 --- /dev/null +++ b/.github/workflows/check_pr_for_broken_links.yml @@ -0,0 +1,30 @@ +name: CheckPRLinks +on: + schedule: + - cron: '0 17 * * 1-5' + workflow_dispatch: +permissions: + contents: read + id-token: write +jobs: + CheckPRLinks: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 + - name: Setup Node.js 16.x + uses: actions/setup-node@e33196f7422957bea03ed53f6fbb155025ffc7b8 # 
v3.7.0 + with: + node-version: 16.x + cache: 'yarn' + - name: Install Dependencies + run: yarn + - name: Run Build + run: yarn build + env: + NODE_OPTIONS: --max_old_space_size=4096 + - name: Run Server + run: | + yarn run next-start & + sleep 5 && + yarn run linkcheck diff --git a/package.json b/package.json index 85448646968..86d90f4121a 100644 --- a/package.json +++ b/package.json @@ -135,6 +135,7 @@ "refresh": "yarn clean && yarn", "test": "jest", "dev": "next dev", + "linkcheck": "node tasks/check-links.mjs Internal", "spellcheck": "cspell \"src/**/*.mdx\" --no-progress", "spellcheck-diff": "git diff --name-only --cached | awk \"/src.*\\.mdx/{print}\" | npx cspell --no-must-find-files --file-list stdin", "build": "node tasks/generate-sitemap.mjs && next build && next export -o client/www/next-build && next-image-export-optimizer --exportFolderPath client/www/next-build", diff --git a/tasks/check-links.mjs b/tasks/check-links.mjs new file mode 100644 index 00000000000..411afac2fb3 --- /dev/null +++ b/tasks/check-links.mjs @@ -0,0 +1,178 @@ +import puppeteer from 'puppeteer'; +import axios from 'axios'; + +const SITEMAP_URL = 'https://docs.amplify.aws/sitemap.xml'; +const DOMAIN = 'https://docs.amplify.aws'; +const CRAWLER_EXCEPTIONS = [ + 'https://aaaaaaaaaa.execute-api.us-east-1.amazonaws.com/api', + 'https://aaaaaaaaaaaaaaaaaaaaaaaaaa.appsync-api.us-east-1.amazonaws.com/graphql', + 'https://twitter.com/AWSAmplify' +]; +const GITHUB_CREATE_ISSUE_LINK = + 'https://github.com/aws-amplify/docs/issues/new'; +const GITHUB_EDIT_LINK = 'https://github.com/aws-amplify/docs/edit/'; + +const getSitemapUrls = async (localDomain) => { + let browser = await puppeteer.launch({ headless: 'new' }); + + const page = await browser.newPage(); + + let siteMap = localDomain ? 
`${localDomain}/sitemap.xml` : SITEMAP_URL; + let response = await page.goto(siteMap); + + const siteMapUrls = []; + + if (response && response.status() && response.status() === 200) { + const urlTags = await page.evaluateHandle(() => { + return document.getElementsByTagName('loc'); + }); + + const numOfLinks = await page.evaluate((e) => e.length, urlTags); + + for (let i = 0; i < numOfLinks; i++) { + let url = await page.evaluate( + (urlTags, i) => urlTags[i].innerHTML, + urlTags, + i + ); + if (localDomain) { + url = url.replace(DOMAIN, localDomain); + } + siteMapUrls.push(url); + } + } + + browser.close(); + + return siteMapUrls; +}; + +const retrieveLinks = async (siteMapUrls, visitedLinks) => { + let browser = await puppeteer.launch({ headless: 'new' }); + + let page = await browser.newPage(); + + const urlsToVisit = []; + + for (let i = 0; i < siteMapUrls.length; i++) { + let url = siteMapUrls[i]; + + try { + let response = await page.goto(url, { waitUntil: 'domcontentloaded' }); + await new Promise((r) => setTimeout(r, 100)); + if (response && response.status() && response.status() === 200) { + console.log(`successfully visited ${url} to retrieve links`); + visitedLinks[url] = true; + + const urlList = await page.evaluate(async (url) => { + let urls = []; + let elements = document.getElementsByTagName('a'); + for (let i = 0; i < elements.length; i++) { + let element = elements[i]; + if (element.href) { + const link = { + url: element.href, + parentUrl: url, + linkText: element.textContent + }; + urls.push(link); + } + } + return urls; + }, url); + + urlList.forEach((link) => { + if (!CRAWLER_EXCEPTIONS.includes(link.url)) { + urlsToVisit.push(link); + } + }); + } + } catch (e) { + console.log(`failed to load ${url}: ${e}`); + browser.close(); + browser = await puppeteer.launch({ headless: 'new' }); + page = await browser.newPage(); + } + } + + browser.close(); + + return urlsToVisit; +}; + +const formatString = (inputs) => { + let retString = ''; + 
inputs.forEach((item) => { + Object.keys(item).forEach((k) => { + retString += `${k} - ${item[k]} \\n`; + }); + retString += '\\n \\n'; + }); + return retString; +}; + +const linkChecker = async (base) => { + const visitedLinks = {}; + const statusCodes = {}; + const brokenLinks = []; + + const siteMapUrls = await getSitemapUrls(base); + + const urlsToVisit = await retrieveLinks(siteMapUrls, visitedLinks); + + let allPromises = []; + + for (let i = 0; i < urlsToVisit.length; i++) { + const link = urlsToVisit[i]; + let href = link.url; + if (href.startsWith(GITHUB_CREATE_ISSUE_LINK)) { + // remove query parameters from github new issue links + href = href.split('?')[0]; + } + if (href.startsWith(GITHUB_EDIT_LINK)) continue; + if (visitedLinks[href]) continue; + visitedLinks[href] = true; + + let request = axios + .get(href) + .then((response) => { + let statusCode = response.status; + if (statusCode && statusCode !== 200) { + statusCodes[statusCode] = statusCodes[statusCode] || []; + statusCodes[statusCode].push(href); + } + }) + .catch((e) => { + let statusCode = e?.response?.status; + if (statusCode) { + statusCodes[statusCode] = statusCodes[statusCode] || []; + statusCodes[statusCode].push(href); + } + if (statusCode === 404) { + // this regular expression is meant to filter out any of the platform selector pages. 
These are appearing in the result set // because the crawler is seeing disabled platform dropdown links const platformPages = /\/q\/(platform|integration|framework)\/(android|ios|flutter|js|react-native)/gm; if (!platformPages.test(link.url)) { brokenLinks.push(link); } } }); allPromises.push(request); } await Promise.all(allPromises); console.log(statusCodes); console.log(brokenLinks); return formatString(brokenLinks); }; let param = process.argv[2]; let base; if (param && param === 'Internal') { base = 'http://localhost:3000'; } let results = await linkChecker(base); diff --git a/tasks/link-checker.js b/tasks/link-checker.js index 05e8e2f212c..56e24d8dbd5 100644 --- a/tasks/link-checker.js +++ b/tasks/link-checker.js @@ -2,6 +2,7 @@ const puppeteer = require('puppeteer'); const axios = require('axios'); const SITEMAP_URL = 'https://docs.amplify.aws/sitemap.xml'; +const DOMAIN = 'https://docs.amplify.aws'; const CRAWLER_EXCEPTIONS = [ 'https://aaaaaaaaaa.execute-api.us-east-1.amazonaws.com/api', 'https://aaaaaaaaaaaaaaaaaaaaaaaaaa.appsync-api.us-east-1.amazonaws.com/graphql', @@ -11,12 +12,13 @@ const GITHUB_CREATE_ISSUE_LINK = 'https://github.com/aws-amplify/docs/issues/new'; const GITHUB_EDIT_LINK = 'https://github.com/aws-amplify/docs/edit/'; -const getSitemapUrls = async () => { +const getSitemapUrls = async (localDomain) => { let browser = await puppeteer.launch({ headless: 'new' }); const page = await browser.newPage(); - let response = await page.goto(SITEMAP_URL); + let siteMap = localDomain ? `${localDomain}/sitemap.xml` : SITEMAP_URL; + let response = await page.goto(siteMap); const siteMapUrls = [];