Merge current scrape with previous results (livgust#52)
* Interim MAImmunizations improvement. (livgust#50)

* Test for the Mass. error page

As of 11am, the site returns a Heroku error page about half the time.
Catch that and fail fast.

Also, check for other CSS selector search failures, although it's not
clear under what circumstances that might happen.

Previously, attempting to iterate over the page navigation produced:

  TypeError: Cannot read property 'evaluate' of undefined
      at ScrapeWebsiteData (/Users/jhawk/src/covid-vaccine-scrapers/site-scrapers/MAImmunizations.js:20:48)
      at processTicksAndRejections (internal/process/task_queues.js:93:5)
      at async GetAvailableAppointments (/Users/jhawk/src/covid-vaccine-scrapers/site-scrapers/MAImmunizations.js:5:18)
      at async Promise.all (index 0)
      at async gatherData (/Users/jhawk/src/covid-vaccine-scrapers/main.js:41:19)
      at async execute (/Users/jhawk/src/covid-vaccine-scrapers/main.js:116:2)
      at async /Users/jhawk/src/covid-vaccine-scrapers/main.js:124:3
  The following data would be published:

The Heroku error page looks like this:
---snip
<!DOCTYPE html><html><head>
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<meta charset="utf-8">
		<title>Application Error</title>
		<style media="screen">
		  html,body,iframe {
			margin: 0;
			padding: 0;
		  }
		  html,body {
			height: 100%;
			overflow: hidden;
		  }
		  iframe {
			width: 100%;
			height: 100%;
			border: 0;
		  }
		</style>
	  </head>
	  <body>
		<iframe src="//www.herokucdn.com/error-pages/application-error.html"></iframe>

	</body></html>
---snip

So we check the page title to detect when we've landed on that error page.

* MAImmunizations.js: Also return false for other failures

Avoid the traceback for every instance of missing page navigation, not
just the Heroku error page.
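
A minimal sketch of the guard described above, assuming a Puppeteer page
object is in scope; the actual check in site-scrapers/MAImmunizations.js
may be structured differently, and the pagination selector here is only
illustrative:

    // Return true when this scrape attempt should be abandoned early.
    async function shouldFailFast(page) {
        // The Heroku error page shown above has <title>Application Error</title>.
        if ((await page.title()) === "Application Error") {
            return true;
        }
        // Also bail out if the expected pagination element is missing
        // (hypothetical selector), instead of letting .evaluate() throw
        // on undefined.
        return (await page.$("nav.pagination")) === null;
    }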

* Return static copy of MA scrape if we fail

* Fetch the current data.json, use it if necessary

Note that we have a bootstrapping problem, which is why the baseline is
here: otherwise we could never update ourselves, because the retrieved
data would never contain the data to update.

Adds a node-fetch dependency.
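
For reference, a condensed sketch of that fetch (the full call appears in
the main.js diff below); the URL is the same production endpoint main.js
reads from:

    const fetch = require("node-fetch");

    // Pull down the currently published results to use as a baseline.
    async function fetchCachedResults() {
        const res = await fetch(
            "https://mzqsa4noec.execute-api.us-east-1.amazonaws.com/prod"
        );
        const unpacked = await res.json();
        // The API wraps the published data.json in a stringified `body` field.
        return JSON.parse(unpacked.body).results;
    }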

* add defaulting data, next step is adding timestamps to scrapers

* forgot to add time allowance

* add timestamps for all scrapers

* respond to codecheck comments

Co-authored-by: John Hawkinson <[email protected]>
livgust and johnhawkinson authored Feb 19, 2021
1 parent 982d0c5 commit 4ac5dc6
Showing 17 changed files with 1,082 additions and 266 deletions.
53 changes: 53 additions & 0 deletions data/dataDefaulter.js
@@ -0,0 +1,53 @@
/* mergeResults
 *
 * Merges cachedResults into currentResults. If secondsOfTolerance is set,
 * will only merge in cachedResults with a timestamp newer than
 * now - secondsOfTolerance.
 */
function mergeResults(currentResults, cachedResults, secondsOfTolerance) {
    if (!(cachedResults && cachedResults.length)) {
        return currentResults;
    } else {
        const combinedResults = [];
        const currentResultsMap = {};
        currentResults.forEach((result) => {
            combinedResults.push(result);
            currentResultsMap[generateKey(result)] = 1;
        });

        cachedResults.forEach((cachedResult) => {
            if (!currentResultsMap[generateKey(cachedResult)]) {
                if (secondsOfTolerance) {
                    const lowerTimeBound =
                        new Date() - secondsOfTolerance * 1000;
                    if (
                        cachedResult.timestamp &&
                        cachedResult.timestamp >= lowerTimeBound
                    ) {
                        combinedResults.push(cachedResult);
                    }
                } else {
                    combinedResults.push(cachedResult);
                }
            }
        });

        return combinedResults;
    }
}

function generateKey(entry) {
    let uniqueIdentifier = "";
    ["name", "street", "city", "zip"].forEach((key) => {
        if (entry[key]) {
            uniqueIdentifier += `${entry[key]
                .toLowerCase()
                .replace(/[^\w]/g, "")}|`;
        }
    });

    return uniqueIdentifier;
}

module.exports.mergeResults = mergeResults;
module.exports.generateKey = generateKey;
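
A hypothetical usage sketch of the two helpers above; the entries are
illustrative, not real scraper output:

    const { mergeResults, generateKey } = require("./data/dataDefaulter");

    const current = [{ name: "Example Site A", city: "Boston", zip: "02108" }];
    const cached = [
        // Same key as the current entry, so it is not merged in again.
        { name: "Example Site A", city: "Boston", zip: "02108" },
        // New key with a fresh timestamp, so it survives even with a tolerance.
        { name: "Example Site B", city: "Worcester", timestamp: new Date() },
    ];

    console.log(generateKey(current[0])); // "examplesitea|boston|02108|"
    console.log(mergeResults(current, cached).length); // 2
    console.log(mergeResults(current, cached, 600).length); // 2 (timestamp newer than now - 600s)
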
228 changes: 128 additions & 100 deletions main.js
@@ -7,120 +7,148 @@ const { addExtra } = require("puppeteer-extra");
const Puppeteer = addExtra(chromium.puppeteer);
const Recaptcha = require("puppeteer-extra-plugin-recaptcha");
const scrapers = require("./site-scrapers");
const fetch = require("node-fetch");
const dataDefaulter = require("./data/dataDefaulter");

//aws-sdk is only a dev dependency because Lambda already includes the package by default.
const AWS = require("aws-sdk");

async function execute() {
//S3 bucket initialization
const s3 = new AWS.S3({
accessKeyId: process.env.AWSACCESSKEYID,
secretAccessKey: process.env.AWSSECRETACCESSKEY,
});
//S3 bucket initialization
const s3 = new AWS.S3({
accessKeyId: process.env.AWSACCESSKEYID,
secretAccessKey: process.env.AWSSECRETACCESSKEY,
});

Puppeteer.use(
Recaptcha({
provider: { id: "2captcha", token: process.env.RECAPTCHATOKEN },
})
);
const cachedResults = await fetch(
"https://mzqsa4noec.execute-api.us-east-1.amazonaws.com/prod"
)
.then((res) => res.json())
.then((unpack) => JSON.parse(unpack.body).results);

const browser = process.env.DEVELOPMENT
? await Puppeteer.launch({
executablePath: process.env.CHROMEPATH,
})
: await Puppeteer.launch({
args: chromium.args,
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: chromium.headless,
ignoreHTTPSErrors: true,
});
Puppeteer.use(
Recaptcha({
provider: { id: "2captcha", token: process.env.RECAPTCHATOKEN },
})
);

const gatherData = async () => {
const results = await Promise.all(
scrapers.map((scraper) =>
scraper(browser).catch((error) => {
//print out the issue but don't fail, this way we still publish updates
//for other locations even if this website's scrape doesn't work
console.log(error);
return null;
})
)
);
browser.close();
const finalResultsArray = [];
for (const result of results) {
if (Array.isArray(result)) {
finalResultsArray.push(...result);
} else if (result) {
//ignore nulls
finalResultsArray.push(result);
}
}
const browser = process.env.DEVELOPMENT
? await Puppeteer.launch({
executablePath: process.env.CHROMEPATH,
})
: await Puppeteer.launch({
args: chromium.args,
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: chromium.headless,
ignoreHTTPSErrors: true,
});

const responseJson = {
results: finalResultsArray,
};
const gatherData = async () => {
const results = await Promise.all(
scrapers.map((scraper) =>
scraper(browser).catch((error) => {
//print out the issue but don't fail, this way we still publish updates
//for other locations even if this website's scrape doesn't work
console.log(error);
return null;
})
)
);
browser.close();
let scrapedResultsArray = [];
for (const result of results) {
if (Array.isArray(result)) {
scrapedResultsArray.push(...result);
} else if (result) {
//ignore nulls
scrapedResultsArray.push(result);
}
}

if (process.env.DEVELOPMENT) {
console.log("The following data would be published:");
console.dir(responseJson, { depth: null });
fs = require("fs");
await new Promise((resolve, reject) => {
fs.writeFile("out.json", JSON.stringify(responseJson), (err) => {
if (err) {
reject(err);
} else {
resolve();
}
});
});
return;
} else {
const params = {
Bucket: process.env.AWSS3BUCKETNAME,
Key: "data.json",
Body: JSON.stringify(responseJson),
};
let finalResultsArray = [];
if (process.argv.length <= 2) {
// Only add default data if we're not testing individual scrapers.
// We are not passing in the optional 3rd arg of mergeResults;
// this means that there is no time limit on stale data being merged in.
finalResultsArray = dataDefaulter.mergeResults(
scrapedResultsArray,
cachedResults
);
} else {
finalResultsArray = scrapedResultsArray;
}

// Uploading files to the bucket
const results = await new Promise((resolve, reject) => {
s3.upload(params, function (err, data) {
if (err) {
reject(err);
}
console.log(`File uploaded successfully. ${data.Location}`);
resolve(data);
});
});
// Timestamped upload
var params2 = params;
const now = new Date();
const timestamp = now.toISOString().substring(0,16).replace(':','')+'Z';
// timestamp form: "2021-02-12T1838Z" (we omit seconds for predictability)
params2.Key = "data-"+timestamp+".json";
// Do the reupload
const results2 = await new Promise((resolve, reject) => {
s3.upload(params2, function (err, data) {
if (err) {
reject(err);
}
console.log(`Timestamped file uploaded successfully. ${data.Location}`);
resolve(data);
});
});
return results;
}
};
await gatherData();
const responseJson = {
results: finalResultsArray,
};

if (process.env.DEVELOPMENT) {
console.log("The following data would be published:");
console.dir(responseJson, { depth: null });
fs = require("fs");
await new Promise((resolve, reject) => {
fs.writeFile(
"out.json",
JSON.stringify(responseJson),
(err) => {
if (err) {
reject(err);
} else {
resolve();
}
}
);
});
return;
} else {
const params = {
Bucket: process.env.AWSS3BUCKETNAME,
Key: "data.json",
Body: JSON.stringify(responseJson),
};

// Uploading files to the bucket
const results = await new Promise((resolve, reject) => {
s3.upload(params, function (err, data) {
if (err) {
reject(err);
}
console.log(`File uploaded successfully. ${data.Location}`);
resolve(data);
});
});
// Timestamped upload
var params2 = params;
const now = new Date();
const timestamp =
now.toISOString().substring(0, 16).replace(":", "") + "Z";
// timestamp form: "2021-02-12T1838Z" (we omit seconds for predictability)
params2.Key = "data-" + timestamp + ".json";
// Do the reupload
const results2 = await new Promise((resolve, reject) => {
s3.upload(params2, function (err, data) {
if (err) {
reject(err);
}
console.log(
`Timestamped file uploaded successfully. ${data.Location}`
);
resolve(data);
});
});
return results;
}
};
await gatherData();
}

exports.handler = execute;

if (process.env.DEVELOPMENT) {
(async () => {
console.log("DEV MODE");
await execute();
process.exit();
})();
(async () => {
console.log("DEV MODE");
await execute();
process.exit();
})();
}
