Merge current scrape with previous results (livgust#52)
* Interim MAImmunizations improvement. (livgust#50)

* Test for the Mass. error page

As of 11am, the site returns a Heroku error page about half the time.
Catch that and fail fast.

Also, check for other CSS selector search failures, although it's not
clear under what circumstances that might happen.

Previously, attempting to iterate over the page navigation produced:

  TypeError: Cannot read property 'evaluate' of undefined
      at ScrapeWebsiteData (/Users/jhawk/src/covid-vaccine-scrapers/site-scrapers/MAImmunizations.js:20:48)
      at processTicksAndRejections (internal/process/task_queues.js:93:5)
      at async GetAvailableAppointments (/Users/jhawk/src/covid-vaccine-scrapers/site-scrapers/MAImmunizations.js:5:18)
      at async Promise.all (index 0)
      at async gatherData (/Users/jhawk/src/covid-vaccine-scrapers/main.js:41:19)
      at async execute (/Users/jhawk/src/covid-vaccine-scrapers/main.js:116:2)
      at async /Users/jhawk/src/covid-vaccine-scrapers/main.js:124:3
  The following data would be published:

The Heroku error page looks like this:
---snip
<!DOCTYPE html><html><head>
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<meta charset="utf-8">
		<title>Application Error</title>
		<style media="screen">
		  html,body,iframe {
			margin: 0;
			padding: 0;
		  }
		  html,body {
			height: 100%;
			overflow: hidden;
		  }
		  iframe {
			width: 100%;
			height: 100%;
			border: 0;
		  }
		</style>
	  </head>
	  <body>
		<iframe src="//www.herokucdn.com/error-pages/application-error.html"></iframe>

	</body></html>
---snip

So we check the page title to detect when we've landed on that error page.

* MAImmunizations.js: Also return false for other failures

Avoid the traceback for every instance of missing page navigation, not
just the Heroku error page.
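
A minimal sketch of the guard described above, assuming a Puppeteer page
object is in scope; the actual check in site-scrapers/MAImmunizations.js
may be structured differently, and the pagination selector here is only
illustrative:

    // Return true when this scrape attempt should be abandoned early.
    async function shouldFailFast(page) {
        // The Heroku error page shown above has <title>Application Error</title>.
        if ((await page.title()) === "Application Error") {
            return true;
        }
        // Also bail out if the expected pagination element is missing
        // (hypothetical selector), instead of letting .evaluate() throw
        // on undefined.
        return (await page.$("nav.pagination")) === null;
    }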

* Return static copy of MA scrape if we fail

* Fetch the current data.json, use it if necessary

Note that we have a bootstrapping problem, which is why the baseline is
here: otherwise we could never update ourselves, because the retrieved
data would never contain the data to update.

Adds a node-fetch dependency.
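
For reference, a condensed sketch of that fetch (the full call appears in
the main.js diff below); the URL is the same production endpoint main.js
reads from:

    const fetch = require("node-fetch");

    // Pull down the currently published results to use as a baseline.
    async function fetchCachedResults() {
        const res = await fetch(
            "https://mzqsa4noec.execute-api.us-east-1.amazonaws.com/prod"
        );
        const unpacked = await res.json();
        // The API wraps the published data.json in a stringified `body` field.
        return JSON.parse(unpacked.body).results;
    }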

* add defaulting data, next step is adding timestamps to scrapers

* forgot to add time allowance

* add timestamps for all scrapers

* respond to codecheck comments

Co-authored-by: John Hawkinson <[email protected]>
livgust and johnhawkinson authored Feb 19, 2021
1 parent 982d0c5 commit 4ac5dc6
Showing 17 changed files with 1,082 additions and 266 deletions.
53 changes: 53 additions & 0 deletions data/dataDefaulter.js
@@ -0,0 +1,53 @@
/* mergeResults
 *
 * Merges cachedResults into currentResults. If secondsOfTolerance is set,
 * will only merge in cachedResults with a timestamp newer than
 * now - secondsOfTolerance.
 */
function mergeResults(currentResults, cachedResults, secondsOfTolerance) {
    if (!(cachedResults && cachedResults.length)) {
        return currentResults;
    } else {
        const combinedResults = [];
        const currentResultsMap = {};
        currentResults.forEach((result) => {
            combinedResults.push(result);
            currentResultsMap[generateKey(result)] = 1;
        });

        cachedResults.forEach((cachedResult) => {
            if (!currentResultsMap[generateKey(cachedResult)]) {
                if (secondsOfTolerance) {
                    const lowerTimeBound =
                        new Date() - secondsOfTolerance * 1000;
                    if (
                        cachedResult.timestamp &&
                        cachedResult.timestamp >= lowerTimeBound
                    ) {
                        combinedResults.push(cachedResult);
                    }
                } else {
                    combinedResults.push(cachedResult);
                }
            }
        });

        return combinedResults;
    }
}

function generateKey(entry) {
    let uniqueIdentifier = "";
    ["name", "street", "city", "zip"].forEach((key) => {
        if (entry[key]) {
            uniqueIdentifier += `${entry[key]
                .toLowerCase()
                .replace(/[^\w]/g, "")}|`;
        }
    });

    return uniqueIdentifier;
}

module.exports.mergeResults = mergeResults;
module.exports.generateKey = generateKey;
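
A hypothetical usage sketch of the two helpers above; the entries are
illustrative, not real scraper output:

    const { mergeResults, generateKey } = require("./data/dataDefaulter");

    const current = [{ name: "Example Site A", city: "Boston", zip: "02108" }];
    const cached = [
        // Same key as the current entry, so it is not merged in again.
        { name: "Example Site A", city: "Boston", zip: "02108" },
        // New key with a fresh timestamp, so it survives even with a tolerance.
        { name: "Example Site B", city: "Worcester", timestamp: new Date() },
    ];

    console.log(generateKey(current[0])); // "examplesitea|boston|02108|"
    console.log(mergeResults(current, cached).length); // 2
    console.log(mergeResults(current, cached, 600).length); // 2 (timestamp newer than now - 600s)
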
228 changes: 128 additions & 100 deletions main.js
@@ -7,120 +7,148 @@ const { addExtra } = require("puppeteer-extra");
const Puppeteer = addExtra(chromium.puppeteer);
const Recaptcha = require("puppeteer-extra-plugin-recaptcha");
const scrapers = require("./site-scrapers");
const fetch = require("node-fetch");
const dataDefaulter = require("./data/dataDefaulter");

//aws-sdk is only a dev dependency because Lambda already includes the package by default.
const AWS = require("aws-sdk");

async function execute() {
//S3 bucket initialization
const s3 = new AWS.S3({
accessKeyId: process.env.AWSACCESSKEYID,
secretAccessKey: process.env.AWSSECRETACCESSKEY,
});
//S3 bucket initialization
const s3 = new AWS.S3({
accessKeyId: process.env.AWSACCESSKEYID,
secretAccessKey: process.env.AWSSECRETACCESSKEY,
});

Puppeteer.use(
Recaptcha({
provider: { id: "2captcha", token: process.env.RECAPTCHATOKEN },
})
);
const cachedResults = await fetch(
"https://mzqsa4noec.execute-api.us-east-1.amazonaws.com/prod"
)
.then((res) => res.json())
.then((unpack) => JSON.parse(unpack.body).results);

const browser = process.env.DEVELOPMENT
? await Puppeteer.launch({
executablePath: process.env.CHROMEPATH,
})
: await Puppeteer.launch({
args: chromium.args,
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: chromium.headless,
ignoreHTTPSErrors: true,
});
Puppeteer.use(
Recaptcha({
provider: { id: "2captcha", token: process.env.RECAPTCHATOKEN },
})
);

const gatherData = async () => {
const results = await Promise.all(
scrapers.map((scraper) =>
scraper(browser).catch((error) => {
//print out the issue but don't fail, this way we still publish updates
//for other locations even if this website's scrape doesn't work
console.log(error);
return null;
})
)
);
browser.close();
const finalResultsArray = [];
for (const result of results) {
if (Array.isArray(result)) {
finalResultsArray.push(...result);
} else if (result) {
//ignore nulls
finalResultsArray.push(result);
}
}
const browser = process.env.DEVELOPMENT
? await Puppeteer.launch({
executablePath: process.env.CHROMEPATH,
})
: await Puppeteer.launch({
args: chromium.args,
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: chromium.headless,
ignoreHTTPSErrors: true,
});

const responseJson = {
results: finalResultsArray,
};
const gatherData = async () => {
const results = await Promise.all(
scrapers.map((scraper) =>
scraper(browser).catch((error) => {
//print out the issue but don't fail, this way we still publish updates
//for other locations even if this website's scrape doesn't work
console.log(error);
return null;
})
)
);
browser.close();
let scrapedResultsArray = [];
for (const result of results) {
if (Array.isArray(result)) {
scrapedResultsArray.push(...result);
} else if (result) {
//ignore nulls
scrapedResultsArray.push(result);
}
}

if (process.env.DEVELOPMENT) {
console.log("The following data would be published:");
console.dir(responseJson, { depth: null });
fs = require("fs");
await new Promise((resolve, reject) => {
fs.writeFile("out.json", JSON.stringify(responseJson), (err) => {
if (err) {
reject(err);
} else {
resolve();
}
});
});
return;
} else {
const params = {
Bucket: process.env.AWSS3BUCKETNAME,
Key: "data.json",
Body: JSON.stringify(responseJson),
};
let finalResultsArray = [];
if (process.argv.length <= 2) {
// Only add default data if we're not testing individual scrapers.
// We are not passing in the optional 3rd arg of mergeResults;
// this means that there is no time limit on stale data being merged in.
finalResultsArray = dataDefaulter.mergeResults(
scrapedResultsArray,
cachedResults
);
} else {
finalResultsArray = scrapedResultsArray;
}

// Uploading files to the bucket
const results = await new Promise((resolve, reject) => {
s3.upload(params, function (err, data) {
if (err) {
reject(err);
}
console.log(`File uploaded successfully. ${data.Location}`);
resolve(data);
});
});
// Timestamped upload
var params2 = params;
const now = new Date();
const timestamp = now.toISOString().substring(0,16).replace(':','')+'Z';
// timestamp form: "2021-02-12T1838Z" (we omit seconds for predictability)
params2.Key = "data-"+timestamp+".json";
// Do the reupload
const results2 = await new Promise((resolve, reject) => {
s3.upload(params2, function (err, data) {
if (err) {
reject(err);
}
console.log(`Timestamped file uploaded successfully. ${data.Location}`);
resolve(data);
});
});
return results;
}
};
await gatherData();
const responseJson = {
results: finalResultsArray,
};

if (process.env.DEVELOPMENT) {
console.log("The following data would be published:");
console.dir(responseJson, { depth: null });
fs = require("fs");
await new Promise((resolve, reject) => {
fs.writeFile(
"out.json",
JSON.stringify(responseJson),
(err) => {
if (err) {
reject(err);
} else {
resolve();
}
}
);
});
return;
} else {
const params = {
Bucket: process.env.AWSS3BUCKETNAME,
Key: "data.json",
Body: JSON.stringify(responseJson),
};

// Uploading files to the bucket
const results = await new Promise((resolve, reject) => {
s3.upload(params, function (err, data) {
if (err) {
reject(err);
}
console.log(`File uploaded successfully. ${data.Location}`);
resolve(data);
});
});
// Timestamped upload
var params2 = params;
const now = new Date();
const timestamp =
now.toISOString().substring(0, 16).replace(":", "") + "Z";
// timestamp form: "2021-02-12T1838Z" (we omit seconds for predictability)
params2.Key = "data-" + timestamp + ".json";
// Do the reupload
const results2 = await new Promise((resolve, reject) => {
s3.upload(params2, function (err, data) {
if (err) {
reject(err);
}
console.log(
`Timestamped file uploaded successfully. ${data.Location}`
);
resolve(data);
});
});
return results;
}
};
await gatherData();
}

exports.handler = execute;

if (process.env.DEVELOPMENT) {
(async () => {
console.log("DEV MODE");
await execute();
process.exit();
})();
(async () => {
console.log("DEV MODE");
await execute();
process.exit();
})();
}
