From 8139dee212ae35dcd8de682c80a1c585c3d03df9 Mon Sep 17 00:00:00 2001 From: Matvey Date: Wed, 9 Oct 2024 16:00:07 +0300 Subject: [PATCH] Compose action (#53) * proper handling of exceptions. * action-router model * Now we throw errors and action is more error-prone. * compose action * Fix everything... * Working service! * Docs * Fixes after review * Fixes after review --- README.md | 45 ++++++++++++++++++ actions/action.js | 33 +++++++++++++ actions/click.js | 29 ++++++++++++ actions/compose.js | 35 ++++++++++++++ actions/fill_form.js | 29 ++++++++++++ actions/goback.js | 6 +++ actions/goforward.js | 6 +++ actions/goto.js | 27 +++++++++++ actions/har.js | 11 +++++ actions/mhtml.js | 11 +++++ actions/recaptcha_solver.js | 37 +++++++++++++++ actions/screenshot.js | 13 +++++ actions/scroll.js | 24 ++++++++++ app.js | 2 + helpers/exceptions.js | 12 ++++- helpers/middlewares/process_exception.js | 6 ++- helpers/utils.js | 1 + package.json | 2 +- routes/action.js | 42 ++--------------- routes/click.js | 41 +++------------- routes/compose.js | 23 +++++++++ routes/fill_form.js | 41 +++------------- routes/goback.js | 18 +++---- routes/goforward.js | 18 +++---- routes/goto.js | 38 +++------------ routes/har.js | 23 ++------- routes/mhtml.js | 17 ++----- routes/recaptcha_solver.js | 60 +++--------------------- routes/screenshot.js | 19 ++------ routes/scroll.js | 27 ++--------- 30 files changed, 406 insertions(+), 290 deletions(-) create mode 100644 actions/action.js create mode 100644 actions/click.js create mode 100644 actions/compose.js create mode 100644 actions/fill_form.js create mode 100644 actions/goback.js create mode 100644 actions/goforward.js create mode 100644 actions/goto.js create mode 100644 actions/har.js create mode 100644 actions/mhtml.js create mode 100644 actions/recaptcha_solver.js create mode 100644 actions/screenshot.js create mode 100644 actions/scroll.js create mode 100644 routes/compose.js diff --git a/README.md b/README.md index 91ffc10..ca753b1 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,51 @@ Example request body: } ``` +### **/compose** + +This POST method allows to combine several puppeteer actions into one. +Note that the method does not expect nested composite actions inside its body. + +Example request body: +```json5 +{ + "actions": [ + { + "endpoint": "goto", + "body": { + "url": "", + "harRecording": false, + }, + }, + { + "endpoint": "click", + "body": { + "selector": "", + }, + }, + { + "endpoint": "click", + "body": { + "selector": "", + }, + }, + { + "endpoint": "scroll", + "body": {}, + }, + { + "endpoint": "screenshot", + "body": { + "options": { + "full_page": true, + "type": "jpeg", + }, + }, + } + ], +} +``` + ### **/scroll** This POST method allows to scroll page to the first element that is matched by selector and returns page result. diff --git a/actions/action.js b/actions/action.js new file mode 100644 index 0000000..92d7371 --- /dev/null +++ b/actions/action.js @@ -0,0 +1,33 @@ +const exceptions = require('../helpers/exceptions'); +const utils = require('../helpers/utils'); // For usage inside user's action(page, request) function + +/** + * Content-Type: application/javascript + * body = js function as pattern: + * async function action(page, request) { + * ... + * some actions with page in puppeteer syntax + * ... + * return { + * context_id: page.browserContext().id, + * page_id: page.target()._targetId, + * html: await page.content(), + * cookies: await page.cookies() + * }; + * }; + */ +exports.action = async function action(page, request) { + eval(request.body.toString()); + + // check action function existence + if (!(typeof action === "function" && action.length >= 1)) { + throw new exceptions.IncorrectArgumentError("Invalid action function.\n" + + "Valid action function: \"async function action(page, request) " + + "{ ... some actions with request and page in puppeteer " + + "syntax};\""); + } + + return { + data: await action(page, request) + } +} diff --git a/actions/click.js b/actions/click.js new file mode 100644 index 0000000..b676300 --- /dev/null +++ b/actions/click.js @@ -0,0 +1,29 @@ +const utils = require('../helpers/utils'); + +const DEFAULT_TIMEOUT = 1000; // 1 second + +/* + * body = { + * "selector": "", // A selector to search for element to click. If there are multiple elements satisfying the selector, the first will be clicked. + * "clickOptions": { + * "button", // <"left"|"right"|"middle"> Defaults to left. + * "clickCount", // defaults to 1. + * "delay" // Time to wait between mousedown and mouseup in milliseconds. Defaults to 0. + * }, + * "waitOptions": {...}, // same as in goto action, defaults to 1s timeout + * "navigationOptions": {...} // same as in goto action + * } + */ +exports.click = async function click(page, request) { + await page.hover(request.body.selector); + if (request.body.navigationOptions) { + await Promise.all([ + page.waitForNavigation(request.body.navigationOptions), + page.click(request.body.selector, request.body.clickOptions), + ]); + } else { + await page.click(request.body.selector, request.body.clickOptions); + } + const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT }; + return await utils.getContents(page, waitOptions); +} diff --git a/actions/compose.js b/actions/compose.js new file mode 100644 index 0000000..9866ffa --- /dev/null +++ b/actions/compose.js @@ -0,0 +1,35 @@ +endpoint2action = { + action: require("./action").action, + click: require("./click").click, + fill_form: require("./fill_form").fillForm, + back: require("./goback").goBack, + forward: require("./goforward").goForward, + goto: require("./goto").goto, + har: require("./har").har, + mhtml: require("./mhtml").captureSnapshot, + recaptcha_solver: require("./recaptcha_solver").recaptchaSolver, + screenshot: require("./screenshot").screenshot, + scroll: require("./scroll").scroll, +} + +async function compose(page, request) { + const originalClosePage = request.query.closePage; + const originalBody = structuredClone(request.body); + + request.query.closePage = false; + delete request.body["actions"]; + + let response; + try { + for (const action of originalBody["actions"]) { + request.body = action["body"]; + response = await endpoint2action[action["endpoint"]](page, request); + } + } finally { + request.query.closePage = originalClosePage; + request.body = originalBody; + } + + return response; +} +exports.compose = compose; diff --git a/actions/fill_form.js b/actions/fill_form.js new file mode 100644 index 0000000..4800223 --- /dev/null +++ b/actions/fill_form.js @@ -0,0 +1,29 @@ +const utils = require('../helpers/utils'); + +/* + * body = { + * "inputMapping": { A dictionary where each key is a CSS selector, and each value is another dictionary containing details about the input for that element: + * "selector": The CSS selector for the input element (used as the key). + * "value": The text to be inputted into the element. + * "delay": A delay (in milliseconds) between each keystroke when inputting the text. Defaults to 0 if not provided. + * }, + * "submitButton": The CSS selector for the form's submit button. If provided, the button will be clicked after filling in the form. + * } + */ +exports.fillForm = async function fillForm(page, request) { + const inputMapping = request.body.inputMapping; + const submitButton = request.body.submitButton; + + for (const [selector, params] of Object.entries(inputMapping)) { + const value = params.value; + const delay = params.delay || 0; + await page.type(selector, value, { delay }); + } + + if (submitButton) { + await page.click(submitButton); + } + + return await utils.getContents(page); + +} diff --git a/actions/goback.js b/actions/goback.js new file mode 100644 index 0000000..73b3c17 --- /dev/null +++ b/actions/goback.js @@ -0,0 +1,6 @@ +const utils = require('../helpers/utils'); + +exports.goBack = async function goBack(page, request) { + await page.goBack(request.body.navigationOptions); + return await utils.getContents(page, request.body.waitOptions); +} diff --git a/actions/goforward.js b/actions/goforward.js new file mode 100644 index 0000000..80e1d90 --- /dev/null +++ b/actions/goforward.js @@ -0,0 +1,6 @@ +const utils = require('../helpers/utils'); + +exports.goForward = async function goForward(page, request) { + await page.goForward(request.body.navigationOptions); + return await utils.getContents(page, request.body.waitOptions); +} diff --git a/actions/goto.js b/actions/goto.js new file mode 100644 index 0000000..a225d56 --- /dev/null +++ b/actions/goto.js @@ -0,0 +1,27 @@ +const utils = require('../helpers/utils'); + +/* + * body = { + * "url": URL to navigate page to. The url should include scheme, e.g. https://. + * "navigationOptions": { Navigation parameters which might have the following properties: + * "timeout": Maximum navigation time in milliseconds, defaults to 30 seconds, pass 0 to disable timeout. The default value can be changed by using the page.setDefaultNavigationTimeout(timeout) or page.setDefaultTimeout(timeout) methods. + * "waitUntil": > When to consider navigation succeeded, defaults to load. Given an array of event strings, navigation is considered to be successful after all events have been fired. Events can be either: + * load - consider navigation to be finished when the load event is fired. + * domcontentloaded - consider navigation to be finished when the DOMContentLoaded event is fired. + * networkidle0 - consider navigation to be finished when there are no more than 0 network connections for at least 500 ms. + * networkidle2 - consider navigation to be finished when there are no more than 2 network connections for at least 500 ms. + * "referer" Referer header value. If provided it will take preference over the referer header value set by page.setExtraHTTPHeaders(). + * }, + * "waitOptions": { + * "timeout": Wait for given timeout in milliseconds + * "selector": Wait for element by selector (see https://pptr.dev/api/puppeteer.page.waitforselector) + * "xpath": Wait for element by xpath (see https://pptr.dev/api/puppeteer.page.waitforxpath) + * "options": Options to wait for elements (see https://pptr.dev/api/puppeteer.waitforselectoroptions) + * }, + * "harRecording": true, + * } + */ +exports.goto = async function goto(page, request) { + await page.goto(request.body.url, request.body.navigationOptions); + return await utils.getContents(page, request.body.waitOptions); +} diff --git a/actions/har.js b/actions/har.js new file mode 100644 index 0000000..3dd4213 --- /dev/null +++ b/actions/har.js @@ -0,0 +1,11 @@ +const exceptions = require("../helpers/exceptions"); + +exports.har = async function har(page, request) { + if (!(page.harWriter)){ + throw new exceptions.NoHarWriterError(); + } + + return { + har: JSON.stringify(await page.harWriter.stop()) // TODO: do we really need JSON.stringify? + }; +} diff --git a/actions/mhtml.js b/actions/mhtml.js new file mode 100644 index 0000000..3292866 --- /dev/null +++ b/actions/mhtml.js @@ -0,0 +1,11 @@ +/* + * Captures mhtml snapshot of a page + */ +exports.captureSnapshot = async function captureSnapshot(page, request) { + const cdpSession = await page.target().createCDPSession(); + const { data } = await cdpSession.send('Page.captureSnapshot', { format: 'mhtml' }); + await cdpSession.detach() + return { + mhtml: data, + }; +} diff --git a/actions/recaptcha_solver.js b/actions/recaptcha_solver.js new file mode 100644 index 0000000..1f0e9a9 --- /dev/null +++ b/actions/recaptcha_solver.js @@ -0,0 +1,37 @@ +const utils = require('../helpers/utils') + +const DEFAULT_TIMEOUT = 1000; // 1 second + +/* + * This module introduces new ability to puppeteer-service. + * It is capable of solving recaptchas on a given web-page. + * If there is no recaptcha on the page nothing bad will happen. + * If there is recaptcha it solves it and then inserts the special code + * into the page automatically. + * + * Returns useful information about recaptcha_solving. + * For more information about return value visit + * https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object + */ +exports.recaptchaSolver = async function recaptchaSolver(page, request) { + let recaptcha_data; + + if (request.body.solve_recaptcha) { + recaptcha_data = await page.solveRecaptchas(); + } else { + recaptcha_data = await page.findRecaptchas(); + } + + const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT }; + const contents = await utils.getContents(page, waitOptions); + + if (request.query.closePage || + (request.body.close_on_empty && recaptcha_data['captchas'].length === 0)) { + await page.close(); + } + + return { + ...contents, + recaptcha_data: recaptcha_data, + } +} diff --git a/actions/screenshot.js b/actions/screenshot.js new file mode 100644 index 0000000..0b9be4b --- /dev/null +++ b/actions/screenshot.js @@ -0,0 +1,13 @@ +/* + * Method that returns screenshots of pages + * more description of options you can see on GitHub: + * https://github.com/GoogleChrome/puppeteer/blob/v1.19.0/docs/api.md#pagescreenshotoptions + */ +exports.screenshot = async function screenshot(page, request) { + delete request.body.options.path; // no path for saving images + request.body.options.encoding = "base64"; // return in base64 + let screenshot = await page.screenshot(request.body.options); + return { + screenshot: screenshot + }; +} diff --git a/actions/scroll.js b/actions/scroll.js new file mode 100644 index 0000000..258220e --- /dev/null +++ b/actions/scroll.js @@ -0,0 +1,24 @@ +const utils = require('../helpers/utils'); + +const DEFAULT_TIMEOUT = 1000; // 1 second + +/* + * Method that scrolls page to a certain selector. + * Example body: + * body = { + * "selector": "", // A selector to search for element to scroll + * "waitOptions": {...}, // same as in goto action, defaults to 1s timeout + * } + */ +exports.scroll = async function scroll(page, request) { + if (request.body.selector) { + await page.hover(request.body.selector); + } else { + await page.evaluate(() => { + // scroll down until the bottom of the page to trigger scroll event even at the bottom of a page + window.scrollBy(0, document.body.scrollHeight) + }); + } + const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT}; + return utils.getContents(page, waitOptions); +} diff --git a/app.js b/app.js index d2bbcb5..5a6f75d 100644 --- a/app.js +++ b/app.js @@ -10,6 +10,7 @@ const bodyParser = require('body-parser'); const AsyncLock = require('async-lock'); const indexRouter = require('./routes/index'); +const composeRouter = require('./routes/compose'); const healthCheckRouter = require('./routes/health_check'); const gotoRouter = require('./routes/goto'); const backRouter = require('./routes/goback'); @@ -108,6 +109,7 @@ app.use(bodyParser.raw({ inflate: true, limit: '200kb', type: 'application/javas app.use(cookieParser()); app.use('/', indexRouter); +app.use('/compose', composeRouter); app.use('/health_check', healthCheckRouter); app.use('/goto', gotoRouter); app.use('/back', backRouter); diff --git a/helpers/exceptions.js b/helpers/exceptions.js index c867a98..b01655c 100644 --- a/helpers/exceptions.js +++ b/helpers/exceptions.js @@ -1,3 +1,11 @@ +exports.IncorrectArgumentError = class IncorrectArgumentError extends Error { + constructor(message="Passed incorrect argument", ...args) { + super(message, ...args); + this.message = message; + this.name = "IncorrectArgumentError"; + } +} + exports.PageNotFoundError = class PageNotFoundError extends Error { constructor(message="Page not found", ...args) { super(message, ...args); @@ -23,9 +31,9 @@ exports.TooManyContextsError = class TooManyContextsError extends Error { } exports.NoHarWriterError = class NoHarWriterError extends Error { - constructor(message="There is no initialized Har Writer on the page to which the Har action was applied.", ...args) { + constructor(message="There is no initialized Har Writer on the page to which the Har action was applied", ...args) { super(message, ...args); this.message = message; this.name = "NoHarWriterError"; } -} \ No newline at end of file +} diff --git a/helpers/middlewares/process_exception.js b/helpers/middlewares/process_exception.js index 363d4f0..3b01770 100644 --- a/helpers/middlewares/process_exception.js +++ b/helpers/middlewares/process_exception.js @@ -16,7 +16,11 @@ exports.processExceptionMiddleware = async function processExceptionMiddleware(e res.header('scrapy-puppeteer-service-context-id', contextId); } - if (err instanceof exceptions.TooManyContextsError) { + if (err instanceof exceptions.IncorrectArgumentError) { + res.status(400); + } else if (err instanceof exceptions.NoHarWriterError) { + res.status(400); + }else if (err instanceof exceptions.TooManyContextsError) { res.status(429); // Too Many Requests } else if (err.contextId) { // there was a context, but something went wrong res.status(500); diff --git a/helpers/utils.js b/helpers/utils.js index 76d72a7..3babba3 100644 --- a/helpers/utils.js +++ b/helpers/utils.js @@ -89,6 +89,7 @@ async function getIds(page) { pageId: page.target()._targetId, } } +exports.getIds = getIds; exports.getContents = async function getContents(page, waitFor) { if (waitFor) { diff --git a/package.json b/package.json index 76318dc..c208b80 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "scrapy-puppeteer-service", - "version": "0.3.7", + "version": "0.3.8", "private": true, "scripts": { "start": "node ./bin/www" diff --git a/routes/action.js b/routes/action.js index 790275d..52329c3 100644 --- a/routes/action.js +++ b/routes/action.js @@ -1,47 +1,13 @@ const express = require('express'); + +const {action} = require('../actions/action'); const utils = require('../helpers/utils'); + const router = express.Router(); -/** - * Content-Type: application/javascript - * body = js function as pattern: - * async function action(page, request) { - * ... - * some actions with page in puppeteer syntax - * ... - * return { - * context_id: page.browserContext().id, - * page_id: page.target()._targetId, - html: await page.content(), - cookies: await page.cookies() - * }; - * }; - */ router.post('/', async function (req, res, next) { - - //TODO better request error handling - // if (!("action" in req.body)) { - // res.status(400); - // res.send("No action in request") - // } - try { - eval(req.body.toString()); - - //check action function exists - if (!(typeof action === "function" && action.length >= 1)) { - res.status(400); - res.send("Valid action function: \"async function action(page, request) " + - "{ ... some actions with request and page in puppeteer " + - "syntax};\""); - throw new Error("Invalid action function"); - } - - let response = await utils.performAction(req, async (page, request) => { - return { - data: await action(page, request) - } - }); + let response = await utils.performAction(req, action); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response); } catch (e) { diff --git a/routes/click.js b/routes/click.js index 5c4ff47..9351e63 100644 --- a/routes/click.js +++ b/routes/click.js @@ -1,47 +1,18 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); -const DEFAULT_TIMEOUT = 1000; // 1 second +const {click} = require('../actions/click'); +const utils = require('../helpers/utils'); +const exceptions = require('../helpers/exceptions'); -async function action(page, request) { - await page.hover(request.body.selector); - if (request.body.navigationOptions) { - await Promise.all([ - page.waitForNavigation(request.body.navigationOptions), - page.click(request.body.selector, request.body.clickOptions), - ]); - } else { - await page.click(request.body.selector, request.body.clickOptions); - } - const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT }; - return await utils.getContents(page, waitOptions); -} +const router = express.Router(); -/** - body = { - "selector": "", // A selector to search for element to click. If there are multiple elements satisfying the selector, the first will be clicked. - "clickOptions": { - "button", //<"left"|"right"|"middle"> Defaults to left. - "clickCount", // defaults to 1. - "delay" // Time to wait between mousedown and mouseup in milliseconds. Defaults to 0. - }, - "waitOptions": {...}, // same as in goto action, defaults to 1s timeout - "navigationOptions": {...} // same as in goto action - } - */ router.post('/', async function (req, res, next) { - - //TODO better request error handling if (!("selector" in req.body)) { - res.status(400); - res.send("No selector to click in request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No selector to click in request"); } try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, click); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response) } catch (e) { diff --git a/routes/compose.js b/routes/compose.js new file mode 100644 index 0000000..1436f52 --- /dev/null +++ b/routes/compose.js @@ -0,0 +1,23 @@ +const express = require('express'); + +const {compose} = require('../actions/compose'); +const exceptions = require('../helpers/exceptions'); +const utils = require("../helpers/utils"); + +const router = express.Router(); + +router.post('/', async function (req, res, next){ + if (!(req.body instanceof Object)) { + throw new exceptions.IncorrectArgumentError("Body of compose method should be an Object"); + } + + try { + let response = await utils.performAction(req, compose); + res.header('scrapy-puppeteer-service-context-id', response.contextId); + res.send(response); + } catch (e) { + next(e); + } +}); + +module.exports = router; diff --git a/routes/fill_form.js b/routes/fill_form.js index fa2e26e..9c250d6 100644 --- a/routes/fill_form.js +++ b/routes/fill_form.js @@ -1,45 +1,18 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); - -async function action(page, request) { - const inputMapping = request.body.inputMapping; - const submitButton = request.body.submitButton; - - for (const [selector, params] of Object.entries(inputMapping)) { - const value = params.value; - const delay = params.delay || 0; - await page.type(selector, value, { delay }); - } - if (submitButton) { - await page.click(submitButton); - } - - return await utils.getContents(page); +const {fillForm} = require('../actions/fill_form'); +const utils = require('../helpers/utils'); +const exceptions = require('../helpers/exceptions'); -} +const router = express.Router(); -// body = { -// "inputMapping": { A dictionary where each key is a CSS selector, and each value is another dictionary containing details about the input for that element: -// "selector": The CSS selector for the input element (used as the key). -// "value": The text to be inputted into the element. -// "delay": A delay (in milliseconds) between each keystroke when inputting the text. Defaults to 0 if not provided. -// }, -// "submitButton": The CSS selector for the form's submit button. If provided, the button will be clicked after filling in the form. -// } -// router.post('/', async function (req, res, next) { - if (!req.body.inputMapping) { - res.status(400); - res.send("No inputMapping provided in fill_form request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No inputMapping provided in fill_form request"); } try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, fillForm); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response); } catch (e) { @@ -47,4 +20,4 @@ router.post('/', async function (req, res, next) { } }); -module.exports = router; \ No newline at end of file +module.exports = router; diff --git a/routes/goback.js b/routes/goback.js index 32d94d7..139fee4 100644 --- a/routes/goback.js +++ b/routes/goback.js @@ -1,24 +1,18 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); +const {goBack} = require('../actions/goback'); +const utils = require('../helpers/utils'); +const exceptions = require('../helpers/exceptions'); -async function action(page, request) { - await page.goBack(request.body.navigationOptions); - return await utils.getContents(page, request.body.waitOptions); -} +const router = express.Router(); router.post('/', async function (req, res, next) { - if (!req.query.contextId || !req.query.pageId) { - res.status(400); - res.send("No page in request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No page in request"); } try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, goBack); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response) } catch (e) { diff --git a/routes/goforward.js b/routes/goforward.js index 0b150ce..9b392ba 100644 --- a/routes/goforward.js +++ b/routes/goforward.js @@ -1,24 +1,18 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); +const {goForward} = require('../actions/goforward'); +const utils = require('../helpers/utils'); +const exceptions = require('../helpers/exceptions'); -async function action(page, request) { - await page.goForward(request.body.navigationOptions); - return await utils.getContents(page, request.body.waitOptions); -} +const router = express.Router(); router.post('/', async function (req, res, next) { - if (!req.query.contextId || !req.query.pageId) { - res.status(400); - res.send("No page in request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No page in request"); } try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, goForward); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response) } catch (e) { diff --git a/routes/goto.js b/routes/goto.js index cb2df1b..0f0d556 100644 --- a/routes/goto.js +++ b/routes/goto.js @@ -1,44 +1,18 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); +const {goto} = require('../actions/goto'); +const utils = require('../helpers/utils'); +const exceptions = require('../helpers/exceptions'); -async function action(page, request) { - await page.goto(request.body.url, request.body.navigationOptions); - return await utils.getContents(page, request.body.waitOptions); -} +const router = express.Router(); -// body = { -// "url": URL to navigate page to. The url should include scheme, e.g. https://. -// "navigationOptions": { Navigation parameters which might have the following properties: -// "timeout": Maximum navigation time in milliseconds, defaults to 30 seconds, pass 0 to disable timeout. The default value can be changed by using the page.setDefaultNavigationTimeout(timeout) or page.setDefaultTimeout(timeout) methods. -// "waitUntil": > When to consider navigation succeeded, defaults to load. Given an array of event strings, navigation is considered to be successful after all events have been fired. Events can be either: -// load - consider navigation to be finished when the load event is fired. -// domcontentloaded - consider navigation to be finished when the DOMContentLoaded event is fired. -// networkidle0 - consider navigation to be finished when there are no more than 0 network connections for at least 500 ms. -// networkidle2 - consider navigation to be finished when there are no more than 2 network connections for at least 500 ms. -// "referer" Referer header value. If provided it will take preference over the referer header value set by page.setExtraHTTPHeaders(). -// }, -// "waitOptions": { -// "timeout": Wait for given timeout in milliseconds -// "selector": Wait for element by selector (see https://pptr.dev/api/puppeteer.page.waitforselector) -// "xpath": Wait for element by xpath (see https://pptr.dev/api/puppeteer.page.waitforxpath) -// "options": Options to wait for elements (see https://pptr.dev/api/puppeteer.waitforselectoroptions) -// }, -// "harRecording": true, -// } -// router.post('/', async function (req, res, next) { - if (!req.body.url) { - res.status(400); - res.send("No URL provided in goto request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No URL provided in goto request"); } try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, goto); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response); } catch (e) { diff --git a/routes/har.js b/routes/har.js index 679db37..18024b7 100644 --- a/routes/har.js +++ b/routes/har.js @@ -1,26 +1,13 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); -const PuppeteerHar = require('puppeteer-har'); -const exceptions = require("../helpers/exceptions"); -async function action(page, request) { - - if (!(page.harWriter)){ - throw new exceptions.NoHarWriterError(); - } +const {har} = require('../actions/har'); +const utils = require('../helpers/utils'); - harData = await page.harWriter.stop(); - harJson = JSON.stringify(harData); - return { - har: harJson - }; -} +const router = express.Router(); router.post('/', async function (req, res, next) { - try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, har); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response); } catch (e) { @@ -28,4 +15,4 @@ router.post('/', async function (req, res, next) { } }); -module.exports = router; \ No newline at end of file +module.exports = router; diff --git a/routes/mhtml.js b/routes/mhtml.js index 8ea8b3f..f3aea23 100644 --- a/routes/mhtml.js +++ b/routes/mhtml.js @@ -1,22 +1,11 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); +const {captureSnapshot} = require('../actions/mhtml'); +const utils = require('../helpers/utils'); -async function captureSnapshot(page, request) { - const cdpSession = await page.target().createCDPSession(); - const { data } = await cdpSession.send('Page.captureSnapshot', { format: 'mhtml' }); - await cdpSession.detach() - return { - mhtml: data, - }; -} +const router = express.Router(); -/** - * Captures mhtml snapshot of a page - */ router.post('/', async function (req, res, next) { - try { const response = await utils.performAction(req, captureSnapshot); res.header('scrapy-puppeteer-service-context-id', response.contextId); diff --git a/routes/recaptcha_solver.js b/routes/recaptcha_solver.js index af49d0b..0046ee1 100644 --- a/routes/recaptcha_solver.js +++ b/routes/recaptcha_solver.js @@ -1,56 +1,13 @@ const express = require('express') const router = express.Router() -const utils = require('../helpers/utils') - -const DEFAULT_TIMEOUT = 1000; // 1 second - -/* - * This module introduces new ability to puppeteer-service. - * It is capable of solving recaptchas on the given web-page. - * If there is no recaptcha on the page nothing bad will happen. - * If there is recaptcha it solves it and then inserts the special code - * into the page automatically. - * - * Returns useful information about recaptcha_solving. - * For more information about return value visit - * https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object - */ - -/** - * - * @param page - page with possible recaptcha. - * @param request - request to the page. - */ - -async function action(page, request) { - let recaptcha_data; - - if (request.body.solve_recaptcha) { - recaptcha_data = await page.solveRecaptchas(); - } else { - recaptcha_data = await page.findRecaptchas(); - } - - const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT }; - const contents = await utils.getContents(page, waitOptions); - if (request.query.closePage || - (request.body.close_on_empty && recaptcha_data['captchas'].length === 0)) { - await page.close(); - } - - return { - ...contents, - recaptcha_data: recaptcha_data, - } -} +const {recaptchaSolver} = require('../actions/recaptcha_solver'); +const utils = require('../helpers/utils') +const exceptions = require('../helpers/exceptions'); router.post('/', async function (req, res, next) { if (!req.query.contextId || !req.query.pageId) { - res.status(400); - res.send("No page in request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No page in request"); } if (!process.env.TOKEN_2CAPTCHA) { @@ -61,14 +18,11 @@ router.post('/', async function (req, res, next) { } if (!("solve_recaptcha" in req.body)) { - res.status(400); - res.send("No solve_recaptcha parameter in request"); - next(); - return; + throw new exceptions.IncorrectArgumentError("No solve_recaptcha parameter in request"); } try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, recaptchaSolver); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response) } catch (e) { @@ -76,4 +30,4 @@ router.post('/', async function (req, res, next) { } }); -module.exports = router; \ No newline at end of file +module.exports = router; diff --git a/routes/screenshot.js b/routes/screenshot.js index 559c0f5..1f0177f 100644 --- a/routes/screenshot.js +++ b/routes/screenshot.js @@ -1,24 +1,13 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); +const utils = require('../helpers/utils'); +const {screenshot} = require('../actions/screenshot'); -async function action(page, request) { - delete request.body.options.path; // no path for saving images - request.body.options.encoding = "base64"; // return in base64 - let screenshot = await page.screenshot(request.body.options); - return { - screenshot: screenshot - }; -} +const router = express.Router(); -// Method that returns screenshots of pages -// more description of options you can see on GitHub: -// https://github.com/GoogleChrome/puppeteer/blob/v1.19.0/docs/api.md#pagescreenshotoptions router.post('/', async function (req, res, next) { - try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, screenshot); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response); } catch (e) { diff --git a/routes/scroll.js b/routes/scroll.js index d9fb455..63e61e8 100644 --- a/routes/scroll.js +++ b/routes/scroll.js @@ -1,32 +1,13 @@ const express = require('express'); -const utils = require('../helpers/utils'); -const router = express.Router(); -const DEFAULT_TIMEOUT = 1000; // 1 second +const {scroll} = require('../actions/scroll'); +const utils = require('../helpers/utils'); -async function action(page, request) { - if (request.body.selector) { - await page.hover(request.body.selector); - } else { - await page.evaluate(() => { - // scroll down until the bottom of the page to trigger scroll event even at the bottom of a page - window.scrollBy(0, document.body.scrollHeight) - }); - } - const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT}; - return utils.getContents(page, waitOptions); -} +const router = express.Router(); -// Method that scrolls page to a certain selector. -// Example body: -// body = { -// "selector": "", // A selector to search for element to scroll -// "waitOptions": {...} // same as in goto action, defaults to 1s timeout -// } router.post('/', async function (req, res, next) { - try { - let response = await utils.performAction(req, action); + let response = await utils.performAction(req, scroll); res.header('scrapy-puppeteer-service-context-id', response.contextId); res.send(response); } catch (e) {