main.js (forked from livgust/covid-vaccine-scrapers)

const dotenv = require("dotenv");
// note: this only works locally; in Lambda we use environment variables set manually
dotenv.config();

const chromium = require("chrome-aws-lambda");
const { addExtra } = require("puppeteer-extra");
const Puppeteer = addExtra(chromium.puppeteer);
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const Recaptcha = require("puppeteer-extra-plugin-recaptcha");
const scrapers = require("./site-scrapers");
const fetch = require("node-fetch");
const dataDefaulter = require("./data/dataDefaulter");
const file = require("./lib/file");
const s3 = require("./lib/s3");
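
// Fetch the currently published data, run every site scraper, merge the two,
// and publish the combined results (S3 in production, out.json locally).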
async function execute() {
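    // Pull the previously published results first; they act as fallback data
    // for any location whose scrape fails this run (merged in below).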
    const cachedResults = await fetch(
        "https://mzqsa4noec.execute-api.us-east-1.amazonaws.com/prod"
    )
        .then((res) => res.json())
        .then((unpack) => JSON.parse(unpack.body).results);
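
    // Stealth masks the usual headless-Chrome fingerprints; the Recaptcha
    // plugin solves captchas through the 2captcha service.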
    Puppeteer.use(StealthPlugin());
    Puppeteer.use(
        Recaptcha({
            provider: { id: "2captcha", token: process.env.RECAPTCHATOKEN },
        })
    );
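
    // In development, drive a visible local Chrome at CHROMEPATH; on Lambda,
    // use the headless Chromium binary bundled with chrome-aws-lambda.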
    const browser = process.env.DEVELOPMENT
        ? await Puppeteer.launch({
              executablePath: process.env.CHROMEPATH,
              headless: false,
          })
        : await Puppeteer.launch({
              args: chromium.args,
              defaultViewport: chromium.defaultViewport,
              executablePath: await chromium.executablePath,
              headless: chromium.headless,
              ignoreHTTPSErrors: true,
          });

    const gatherData = async () => {
        const results = await Promise.all(
            scrapers.map((scraper) =>
                scraper(browser).catch((error) => {
                    // log the failure but don't rethrow, so we still publish
                    // updates for other locations even if this site's scrape breaks
                    console.log(error);
                    return null;
                })
            )
        );
        await browser.close();
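
        // A scraper may return an array of results, a single result object,
        // or null (on failure); flatten everything into one array.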
        let scrapedResultsArray = [];
        for (const result of results) {
            if (Array.isArray(result)) {
                scrapedResultsArray.push(...result);
            } else if (result) {
                // ignore nulls
                scrapedResultsArray.push(result);
            }
        }

        let finalResultsArray = [];
        if (process.argv.length <= 2) {
            // Only add default data if we're not testing individual scrapers.
            // We don't pass the optional 3rd arg of mergeResults, so there is
            // no time limit on stale data being merged in.
            finalResultsArray = dataDefaulter.mergeResults(
                scrapedResultsArray,
                cachedResults
            );
        } else {
            finalResultsArray = scrapedResultsArray;
        }
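
        // Serialize and publish: in development, dump the payload to the
        // console and out.json; in production, upload it to S3.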
        const responseJson = {
            results: finalResultsArray,
        };
        const webData = JSON.stringify(responseJson);

        if (process.env.DEVELOPMENT) {
            console.log("The following data would be published:");
            console.dir(responseJson, { depth: null });
            file.write("out.json", webData);
            return responseJson;
        } else {
            const uploadResponse = await s3.saveWebData(webData);
            return uploadResponse;
        }
    };

    await gatherData();
}
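
// Lambda entry point.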
exports.handler = execute;
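
// When run directly with DEVELOPMENT set, kick off a scrape immediately and
// exit when it finishes.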
if (process.env.DEVELOPMENT) {
    (async () => {
        console.log("DEV MODE");
        await execute();
        process.exit();
    })();
}
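
// Local usage (a sketch; the path and token are placeholders, point
// CHROMEPATH at whatever Chrome/Chromium binary you have installed):
//   DEVELOPMENT=1 CHROMEPATH="/usr/bin/chromium" RECAPTCHATOKEN=<2captcha token> node main.js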