Skip to content

Commit

Permalink
Compose action (#53)
Browse files Browse the repository at this point in the history
* proper handling of exceptions.

* action-router model

* Now we throw errors and action is more error-prone.

* compose action

* Fix everything...

* Working service!

* Docs

* Fixes after review

* Fixes after review
  • Loading branch information
MatthewZMSU authored Oct 9, 2024
1 parent b37e0ae commit 8139dee
Show file tree
Hide file tree
Showing 30 changed files with 406 additions and 290 deletions.
45 changes: 45 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,51 @@ Example request body:
}
```

### **/compose**

This POST method combines several puppeteer actions into a single request: the actions are executed in order on the same page, and the result of the last action is returned.
Note that nested `compose` actions are not supported inside its body.

Example request body:
```json5
{
"actions": [
{
"endpoint": "goto",
"body": {
"url": "<URL>",
"harRecording": false,
},
},
{
"endpoint": "click",
"body": {
"selector": "<SELECTOR>",
},
},
{
"endpoint": "click",
"body": {
"selector": "<SELECTOR>",
},
},
{
"endpoint": "scroll",
"body": {},
},
{
"endpoint": "screenshot",
"body": {
"options": {
"fullPage": true,
"type": "jpeg",
},
},
}
],
}
```

### **/scroll**

This POST method scrolls the page to the first element matched by the selector and returns the page result.
Expand Down
33 changes: 33 additions & 0 deletions actions/action.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
const exceptions = require('../helpers/exceptions');
const utils = require('../helpers/utils'); // For usage inside user's action(page, request) function

/**
* Content-Type: application/javascript
* body = js function as pattern:
* async function action(page, request) {
* ...
* some actions with page in puppeteer syntax
* ...
* return {
* context_id: page.browserContext().id,
* page_id: page.target()._targetId,
* html: await page.content(),
* cookies: await page.cookies()
* };
* };
*/
exports.action = async function action(page, request) {
eval(request.body.toString());

// check action function existence
if (!(typeof action === "function" && action.length >= 1)) {
throw new exceptions.IncorrectArgumentError("Invalid action function.\n" +
"Valid action function: \"async function action(page, request) " +
"{ ... some actions with request and page in puppeteer " +
"syntax};\"");
}

return {
data: await action(page, request)
}
}
29 changes: 29 additions & 0 deletions actions/click.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
const utils = require('../helpers/utils');

const DEFAULT_TIMEOUT = 1000; // 1 second

/*
* body = {
* "selector": "", // <string> A selector to search for element to click. If there are multiple elements satisfying the selector, the first will be clicked.
* "clickOptions": {
* "button", // <"left"|"right"|"middle"> Defaults to left.
* "clickCount", // <number> defaults to 1.
* "delay" // <number> Time to wait between mousedown and mouseup in milliseconds. Defaults to 0.
* },
* "waitOptions": {...}, // same as in goto action, defaults to 1s timeout
* "navigationOptions": {...} // same as in goto action
* }
*/
exports.click = async function click(page, request) {
await page.hover(request.body.selector);
if (request.body.navigationOptions) {
await Promise.all([
page.waitForNavigation(request.body.navigationOptions),
page.click(request.body.selector, request.body.clickOptions),
]);
} else {
await page.click(request.body.selector, request.body.clickOptions);
}
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT };
return await utils.getContents(page, waitOptions);
}
35 changes: 35 additions & 0 deletions actions/compose.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Maps the "endpoint" name used in a compose request to the action that handles it.
// Declared with `const`: a bare assignment would create an implicit global and
// throw a ReferenceError under strict mode.
const endpoint2action = {
    action: require("./action").action,
    click: require("./click").click,
    fill_form: require("./fill_form").fillForm,
    back: require("./goback").goBack,
    forward: require("./goforward").goForward,
    goto: require("./goto").goto,
    har: require("./har").har,
    mhtml: require("./mhtml").captureSnapshot,
    recaptcha_solver: require("./recaptcha_solver").recaptchaSolver,
    screenshot: require("./screenshot").screenshot,
    scroll: require("./scroll").scroll,
};

/**
 * Runs several actions one after another on the same page.
 *
 * body = {
 *     "actions": [                   // executed in the given order
 *         {
 *             "endpoint": <string>,  // a key of endpoint2action, e.g. "goto" or "click"
 *             "body": <object>       // the body that action expects; defaults to {}
 *         },
 *         ...
 *     ]
 * }
 *
 * While the actions run, request.body is replaced by the body of the current action
 * and request.query.closePage is forced to false; both are restored afterwards,
 * whether the actions succeed or fail.
 *
 * @param page - page every action operates on
 * @param request - compose request; temporarily modified as described above
 * @returns the result of the last action (undefined for an empty "actions" list)
 * @throws {IncorrectArgumentError} if "actions" is not an array or names an unknown
 *     endpoint; in that case no action is executed at all
 */
async function compose(page, request) {
    const actions = request.body?.actions;
    if (!Array.isArray(actions)) {
        throw incorrectComposeArgument("Compose body must contain an \"actions\" array.");
    }

    // Validate every entry up front so that a malformed request cannot leave the page
    // half-processed. Object.hasOwn keeps inherited keys such as "toString" out.
    for (const action of actions) {
        const endpoint = action?.endpoint;
        if (typeof endpoint !== "string" || !Object.hasOwn(endpoint2action, endpoint)) {
            throw incorrectComposeArgument(
                `Unknown compose endpoint: ${JSON.stringify(endpoint)}. ` +
                `Supported endpoints: ${Object.keys(endpoint2action).join(", ")}.`);
        }
    }

    const originalClosePage = request.query.closePage;
    const originalBody = request.body;

    // An action that honours closePage must not close the page in the middle of the chain.
    request.query.closePage = false;

    let response;
    try {
        for (const action of actions) {
            request.body = action.body ?? {};
            response = await endpoint2action[action.endpoint](page, request);
        }
    } finally {
        request.query.closePage = originalClosePage;
        request.body = originalBody;
    }

    return response;
}

// Builds the error reported for a malformed compose request.
function incorrectComposeArgument(message) {
    // Required here because this module has no top-level import of the exceptions helper.
    const exceptions = require("../helpers/exceptions");
    return new exceptions.IncorrectArgumentError(message);
}
exports.compose = compose;
29 changes: 29 additions & 0 deletions actions/fill_form.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
const utils = require('../helpers/utils');

/*
* body = {
* "inputMapping": { A dictionary where each key is a CSS selector, and each value is another dictionary containing details about the input for that element:
* "selector": <string> The CSS selector for the input element (used as the key).
* "value": <string> The text to be inputted into the element.
* "delay": <number> A delay (in milliseconds) between each keystroke when inputting the text. Defaults to 0 if not provided.
* },
* "submitButton": <string> The CSS selector for the form's submit button. If provided, the button will be clicked after filling in the form.
* }
*/
exports.fillForm = async function fillForm(page, request) {
const inputMapping = request.body.inputMapping;
const submitButton = request.body.submitButton;

for (const [selector, params] of Object.entries(inputMapping)) {
const value = params.value;
const delay = params.delay || 0;
await page.type(selector, value, { delay });
}

if (submitButton) {
await page.click(submitButton);
}

return await utils.getContents(page);

}
6 changes: 6 additions & 0 deletions actions/goback.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
const utils = require('../helpers/utils');

exports.goBack = async function goBack(page, request) {
await page.goBack(request.body.navigationOptions);
return await utils.getContents(page, request.body.waitOptions);
}
6 changes: 6 additions & 0 deletions actions/goforward.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
const utils = require('../helpers/utils');

exports.goForward = async function goForward(page, request) {
await page.goForward(request.body.navigationOptions);
return await utils.getContents(page, request.body.waitOptions);
}
27 changes: 27 additions & 0 deletions actions/goto.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
const utils = require('../helpers/utils');

/*
* body = {
* "url": <string> URL to navigate page to. The url should include scheme, e.g. https://.
* "navigationOptions": { Navigation parameters which might have the following properties:
* "timeout": <number> Maximum navigation time in milliseconds, defaults to 30 seconds, pass 0 to disable timeout. The default value can be changed by using the page.setDefaultNavigationTimeout(timeout) or page.setDefaultTimeout(timeout) methods.
* "waitUntil": <string|Array<string>> When to consider navigation succeeded, defaults to load. Given an array of event strings, navigation is considered to be successful after all events have been fired. Events can be either:
* load - consider navigation to be finished when the load event is fired.
* domcontentloaded - consider navigation to be finished when the DOMContentLoaded event is fired.
* networkidle0 - consider navigation to be finished when there are no more than 0 network connections for at least 500 ms.
* networkidle2 - consider navigation to be finished when there are no more than 2 network connections for at least 500 ms.
* "referer" <string> Referer header value. If provided it will take preference over the referer header value set by page.setExtraHTTPHeaders().
* },
* "waitOptions": {
* "timeout": <number> Wait for given timeout in milliseconds
* "selector": <string> Wait for element by selector (see https://pptr.dev/api/puppeteer.page.waitforselector)
* "xpath": <string> Wait for element by xpath (see https://pptr.dev/api/puppeteer.page.waitforxpath)
* "options": <object> Options to wait for elements (see https://pptr.dev/api/puppeteer.waitforselectoroptions)
* },
* "harRecording": true,
* }
*/
exports.goto = async function goto(page, request) {
await page.goto(request.body.url, request.body.navigationOptions);
return await utils.getContents(page, request.body.waitOptions);
}
11 changes: 11 additions & 0 deletions actions/har.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
const exceptions = require("../helpers/exceptions");

exports.har = async function har(page, request) {
if (!(page.harWriter)){
throw new exceptions.NoHarWriterError();
}

return {
har: JSON.stringify(await page.harWriter.stop()) // TODO: do we really need JSON.stringify?
};
}
11 changes: 11 additions & 0 deletions actions/mhtml.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/*
* Captures mhtml snapshot of a page
*/
exports.captureSnapshot = async function captureSnapshot(page, request) {
const cdpSession = await page.target().createCDPSession();
const { data } = await cdpSession.send('Page.captureSnapshot', { format: 'mhtml' });
await cdpSession.detach()
return {
mhtml: data,
};
}
37 changes: 37 additions & 0 deletions actions/recaptcha_solver.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
const utils = require('../helpers/utils')

const DEFAULT_TIMEOUT = 1000; // 1 second

/*
* This module introduces new ability to puppeteer-service.
* It is capable of solving recaptchas on a given web-page.
* If there is no recaptcha on the page nothing bad will happen.
* If there is recaptcha it solves it and then inserts the special code
* into the page automatically.
*
* Returns useful information about recaptcha_solving.
* For more information about return value visit
* https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
*/
exports.recaptchaSolver = async function recaptchaSolver(page, request) {
let recaptcha_data;

if (request.body.solve_recaptcha) {
recaptcha_data = await page.solveRecaptchas();
} else {
recaptcha_data = await page.findRecaptchas();
}

const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT };
const contents = await utils.getContents(page, waitOptions);

if (request.query.closePage ||
(request.body.close_on_empty && recaptcha_data['captchas'].length === 0)) {
await page.close();
}

return {
...contents,
recaptcha_data: recaptcha_data,
}
}
13 changes: 13 additions & 0 deletions actions/screenshot.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/*
* Method that returns screenshots of pages
* more description of options you can see on GitHub:
* https://github.com/GoogleChrome/puppeteer/blob/v1.19.0/docs/api.md#pagescreenshotoptions
*/
exports.screenshot = async function screenshot(page, request) {
delete request.body.options.path; // no path for saving images
request.body.options.encoding = "base64"; // return in base64
let screenshot = await page.screenshot(request.body.options);
return {
screenshot: screenshot
};
}
24 changes: 24 additions & 0 deletions actions/scroll.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
const utils = require('../helpers/utils');

const DEFAULT_TIMEOUT = 1000; // 1 second

/*
* Method that scrolls page to a certain selector.
* Example body:
* body = {
* "selector": "", // <string> A selector to search for element to scroll
* "waitOptions": {...}, // same as in goto action, defaults to 1s timeout
* }
*/
exports.scroll = async function scroll(page, request) {
if (request.body.selector) {
await page.hover(request.body.selector);
} else {
await page.evaluate(() => {
// scroll down until the bottom of the page to trigger scroll event even at the bottom of a page
window.scrollBy(0, document.body.scrollHeight)
});
}
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT};
return utils.getContents(page, waitOptions);
}
2 changes: 2 additions & 0 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ const bodyParser = require('body-parser');
const AsyncLock = require('async-lock');

const indexRouter = require('./routes/index');
const composeRouter = require('./routes/compose');
const healthCheckRouter = require('./routes/health_check');
const gotoRouter = require('./routes/goto');
const backRouter = require('./routes/goback');
Expand Down Expand Up @@ -108,6 +109,7 @@ app.use(bodyParser.raw({ inflate: true, limit: '200kb', type: 'application/javas
app.use(cookieParser());

// Mount each router under its URL prefix; '/compose' is served by routes/compose.
app.use('/', indexRouter);
app.use('/compose', composeRouter);
app.use('/health_check', healthCheckRouter);
app.use('/goto', gotoRouter);
app.use('/back', backRouter);
Expand Down
12 changes: 10 additions & 2 deletions helpers/exceptions.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
exports.IncorrectArgumentError = class IncorrectArgumentError extends Error {
constructor(message="Passed incorrect argument", ...args) {
super(message, ...args);
this.message = message;
this.name = "IncorrectArgumentError";
}
}

exports.PageNotFoundError = class PageNotFoundError extends Error {
constructor(message="Page not found", ...args) {
super(message, ...args);
Expand All @@ -23,9 +31,9 @@ exports.TooManyContextsError = class TooManyContextsError extends Error {
}

exports.NoHarWriterError = class NoHarWriterError extends Error {
constructor(message="There is no initialized Har Writer on the page to which the Har action was applied.", ...args) {
constructor(message="There is no initialized Har Writer on the page to which the Har action was applied", ...args) {
super(message, ...args);
this.message = message;
this.name = "NoHarWriterError";
}
}
}
6 changes: 5 additions & 1 deletion helpers/middlewares/process_exception.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ exports.processExceptionMiddleware = async function processExceptionMiddleware(e
res.header('scrapy-puppeteer-service-context-id', contextId);
}

if (err instanceof exceptions.TooManyContextsError) {
if (err instanceof exceptions.IncorrectArgumentError) {
res.status(400);
} else if (err instanceof exceptions.NoHarWriterError) {
res.status(400);
}else if (err instanceof exceptions.TooManyContextsError) {
res.status(429); // Too Many Requests
} else if (err.contextId) { // there was a context, but something went wrong
res.status(500);
Expand Down
1 change: 1 addition & 0 deletions helpers/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ async function getIds(page) {
pageId: page.target()._targetId,
}
}
exports.getIds = getIds;

exports.getContents = async function getContents(page, waitFor) {
if (waitFor) {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "scrapy-puppeteer-service",
"version": "0.3.7",
"version": "0.3.8",
"private": true,
"scripts": {
"start": "node ./bin/www"
Expand Down
Loading

0 comments on commit 8139dee

Please sign in to comment.