Merge pull request #7 from jacoblogan/update-link-script
add pr version of link checker script
jacoblogan authored Sep 27, 2023
2 parents 3947574 + 2217584 commit 637e46f
Showing 6 changed files with 215 additions and 2 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build.yml
@@ -14,6 +14,7 @@ jobs:
        uses: actions/setup-node@main
        with:
          node-version: 16.x
          cache: 'yarn'
      - name: Install Dependencies
        run: yarn
      - name: Run tests
1 change: 1 addition & 0 deletions .github/workflows/check_for_broken_links.yml
@@ -16,6 +16,7 @@ jobs:
        uses: actions/setup-node@e33196f7422957bea03ed53f6fbb155025ffc7b8 # v3.7.0
        with:
          node-version: 16.x
          cache: yarn
      - name: Install Dependencies
        run: yarn
      - name: Run Link Checker
30 changes: 30 additions & 0 deletions .github/workflows/check_pr_for_broken_links.yml
@@ -0,0 +1,30 @@
name: CheckPRLinks
on:
  schedule:
    - cron: '0 17 * * 1-5'
  workflow_dispatch:
permissions:
  contents: read
  id-token: write
jobs:
  CheckPRLinks:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
      - name: Setup Node.js 16.x
        uses: actions/setup-node@e33196f7422957bea03ed53f6fbb155025ffc7b8 # v3.7.0
        with:
          node-version: 16.x
          cache: 'yarn'
      - name: Install Dependencies
        run: yarn
      - name: Run Build
        run: yarn build
        env:
          NODE_OPTIONS: --max_old_space_size=4096
      - name: Run Server
        run: |
          yarn run next-start &
          sleep 5 &&
          yarn run linkcheck
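
Note: the `Run Server` step backgrounds the Next.js server and sleeps a fixed 5 seconds before crawling, which can race on a slow runner. A minimal readiness-poll sketch that could replace the fixed sleep — hypothetical, not part of this PR; it assumes `axios` (already a dependency here) and the default port 3000:

import axios from 'axios';

// Hypothetical helper (not in this PR): poll the local server until it
// answers instead of sleeping a fixed 5 seconds.
const waitForServer = async (url, retries = 30) => {
  for (let i = 0; i < retries; i++) {
    try {
      await axios.get(url); // succeeds once the server is answering
      return;
    } catch {
      await new Promise((r) => setTimeout(r, 1000)); // retry after 1 second
    }
  }
  throw new Error(`server at ${url} did not start in time`);
};

await waitForServer('http://localhost:3000');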
1 change: 1 addition & 0 deletions package.json
@@ -135,6 +135,7 @@
"refresh": "yarn clean && yarn",
"test": "jest",
"dev": "next dev",
"linkcheck": "node tasks/check-links.mjs Internal",
"spellcheck": "cspell \"src/**/*.mdx\" --no-progress",
"spellcheck-diff": "git diff --name-only --cached | awk \"/src.*\\.mdx/{print}\" | npx cspell --no-must-find-files --file-list stdin",
"build": "node tasks/generate-sitemap.mjs && next build && next export -o client/www/next-build && next-image-export-optimizer --exportFolderPath client/www/next-build",
178 changes: 178 additions & 0 deletions tasks/check-links.mjs
@@ -0,0 +1,178 @@
import puppeteer from 'puppeteer';
import axios from 'axios';

const SITEMAP_URL = 'https://docs.amplify.aws/sitemap.xml';
const DOMAIN = 'https://docs.amplify.aws';
const CRAWLER_EXCEPTIONS = [
  'https://aaaaaaaaaa.execute-api.us-east-1.amazonaws.com/api',
  'https://aaaaaaaaaaaaaaaaaaaaaaaaaa.appsync-api.us-east-1.amazonaws.com/graphql',
  'https://twitter.com/AWSAmplify'
];
const GITHUB_CREATE_ISSUE_LINK =
  'https://github.com/aws-amplify/docs/issues/new';
const GITHUB_EDIT_LINK = 'https://github.com/aws-amplify/docs/edit/';

const getSitemapUrls = async (localDomain) => {
  const browser = await puppeteer.launch({ headless: 'new' });

  const page = await browser.newPage();

  // when a local domain is passed in, crawl the local build's sitemap instead
  const siteMap = localDomain ? `${localDomain}/sitemap.xml` : SITEMAP_URL;
  const response = await page.goto(siteMap);

  const siteMapUrls = [];

  if (response && response.status() === 200) {
    const urlTags = await page.evaluateHandle(() => {
      return document.getElementsByTagName('loc');
    });

    const numOfLinks = await page.evaluate((e) => e.length, urlTags);

    for (let i = 0; i < numOfLinks; i++) {
      let url = await page.evaluate(
        (urlTags, i) => urlTags[i].innerHTML,
        urlTags,
        i
      );
      if (localDomain) {
        // rewrite production URLs so the same paths hit the local server
        url = url.replace(DOMAIN, localDomain);
      }
      siteMapUrls.push(url);
    }
  }

  await browser.close();

  return siteMapUrls;
};
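
// Illustration (editor's note, not part of the committed file): with no
// argument the production sitemap is crawled; with a local origin, the
// production DOMAIN in each <loc> entry is rewritten so the same paths are
// checked against the local build, e.g.
//   await getSitemapUrls();                        // ['https://docs.amplify.aws/...', ...]
//   await getSitemapUrls('http://localhost:3000'); // ['http://localhost:3000/...', ...]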

const retrieveLinks = async (siteMapUrls, visitedLinks) => {
  let browser = await puppeteer.launch({ headless: 'new' });

  let page = await browser.newPage();

  const urlsToVisit = [];

  for (let i = 0; i < siteMapUrls.length; i++) {
    const url = siteMapUrls[i];

    try {
      const response = await page.goto(url, { waitUntil: 'domcontentloaded' });
      await new Promise((r) => setTimeout(r, 100));
      if (response && response.status() === 200) {
        console.log(`successfully visited ${url} to retrieve links`);
        visitedLinks[url] = true;

        // collect every anchor on the page, remembering where it was found
        const urlList = await page.evaluate(async (url) => {
          const urls = [];
          const elements = document.getElementsByTagName('a');
          for (let i = 0; i < elements.length; i++) {
            const element = elements[i];
            if (element.href) {
              const link = {
                url: element.href,
                parentUrl: url,
                linkText: element.textContent
              };
              urls.push(link);
            }
          }
          return urls;
        }, url);

        urlList.forEach((link) => {
          if (!CRAWLER_EXCEPTIONS.includes(link.url)) {
            urlsToVisit.push(link);
          }
        });
      }
    } catch (e) {
      // restart the browser after a failed page load so one bad page
      // doesn't wedge the rest of the crawl
      console.log(`failed to load ${url}: ${e}`);
      await browser.close();
      browser = await puppeteer.launch({ headless: 'new' });
      page = await browser.newPage();
    }
  }

  await browser.close();

  return urlsToVisit;
};

// format the broken-link entries into a single string, joined with literal
// "\n" sequences (escaped, not real newlines)
const formatString = (inputs) => {
  let retString = '';
  inputs.forEach((item) => {
    Object.keys(item).forEach((k) => {
      retString += `${k} - ${item[k]} \\n`;
    });
    retString += '\\n \\n';
  });
  return retString;
};
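
// Illustration (editor's note, not part of the committed file): the doubled
// backslashes emit literal "\n" sequences rather than real newlines, e.g. for
// a hypothetical input:
//   formatString([{ url: 'https://example.com/broken', linkText: 'Broken' }])
//   // => 'url - https://example.com/broken \nlinkText - Broken \n\n \n'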

const linkChecker = async (base) => {
  const visitedLinks = {};
  const statusCodes = {};
  const brokenLinks = [];

  const siteMapUrls = await getSitemapUrls(base);

  const urlsToVisit = await retrieveLinks(siteMapUrls, visitedLinks);

  const allPromises = [];

  for (let i = 0; i < urlsToVisit.length; i++) {
    const link = urlsToVisit[i];
    let href = link.url;
    if (href.startsWith(GITHUB_CREATE_ISSUE_LINK)) {
      // remove query parameters from github new issue links
      href = href.split('?')[0];
    }
    if (href.startsWith(GITHUB_EDIT_LINK)) continue;
    if (visitedLinks[href]) continue;
    visitedLinks[href] = true;

    const request = axios
      .get(href)
      .then((response) => {
        const statusCode = response.status;
        if (statusCode && statusCode !== 200) {
          statusCodes[statusCode] = statusCodes[statusCode] || [];
          statusCodes[statusCode].push(href);
        }
      })
      .catch((e) => {
        const statusCode = e?.response?.status;
        if (statusCode) {
          statusCodes[statusCode] = statusCodes[statusCode] || [];
          statusCodes[statusCode].push(href);
        }
        if (statusCode === 404) {
          // Filter out the platform selector pages; they show up in the result
          // set because the crawler sees disabled platform dropdown links.
          // Note the regex carries no `g` flag: `test` on a global regex keeps
          // state between calls and would skip alternating matches.
          const platformPages = /\/q\/(platform|integration|framework)\/(android|ios|flutter|js|react-native)/;
          if (!platformPages.test(link.url)) {
            brokenLinks.push(link);
          }
        }
      });

    allPromises.push(request);
  }

  await Promise.all(allPromises);

  console.log(statusCodes);
  console.log(brokenLinks);

  return formatString(brokenLinks);
};
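
// Illustration (editor's note, not part of the committed file): a 404 on a
// disabled platform-selector link is dropped rather than reported, e.g. for a
// hypothetical URL:
//   platformPages.test('https://docs.amplify.aws/lib/q/platform/js/'); // => true, filtered out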

const param = process.argv[2];
let base;
if (param === 'Internal') {
  // 'Internal' points the crawl at the local Next.js server
  base = 'http://localhost:3000';
}

const results = await linkChecker(base);
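
Usage, for reference — the script takes a single optional argument, and `Internal` is the only value it checks:

// Crawl the deployed docs site:
//   node tasks/check-links.mjs
// Crawl a local build on http://localhost:3000 (what `yarn linkcheck` runs):
//   node tasks/check-links.mjs Internal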
6 changes: 4 additions & 2 deletions tasks/link-checker.js
@@ -2,6 +2,7 @@ const puppeteer = require('puppeteer');
const axios = require('axios');

const SITEMAP_URL = 'https://docs.amplify.aws/sitemap.xml';
const DOMAIN = 'https://docs.amplify.aws';
const CRAWLER_EXCEPTIONS = [
  'https://aaaaaaaaaa.execute-api.us-east-1.amazonaws.com/api',
  'https://aaaaaaaaaaaaaaaaaaaaaaaaaa.appsync-api.us-east-1.amazonaws.com/graphql',
@@ -11,12 +12,13 @@ const GITHUB_CREATE_ISSUE_LINK =
  'https://github.com/aws-amplify/docs/issues/new';
const GITHUB_EDIT_LINK = 'https://github.com/aws-amplify/docs/edit/';

-const getSitemapUrls = async () => {
+const getSitemapUrls = async (localDomain) => {
   let browser = await puppeteer.launch({ headless: 'new' });
 
   const page = await browser.newPage();
 
-  let response = await page.goto(SITEMAP_URL);
+  let siteMap = localDomain ? `${localDomain}/sitemap.xml` : SITEMAP_URL;
+  let response = await page.goto(siteMap);
 
   const siteMapUrls = [];

