From 615340c4ad66d2370c024b40c9bfae553f9a21b5 Mon Sep 17 00:00:00 2001 From: Matthew Date: Thu, 4 Jul 2024 09:20:40 +0300 Subject: [PATCH] Context limit middleware (#45) * Base * separated middlewares * readme * Too many requests error * Now limit_context is a small service. * Typos * Typos * Delete useless variable * Version update --- README.md | 1 + app.js | 7 ++- helpers/exceptions.js | 8 +++ helpers/limit_context.js | 18 +++++++ helpers/middlewares.js | 66 ------------------------ helpers/middlewares/index.js | 3 ++ helpers/middlewares/logging.js | 30 +++++++++++ helpers/middlewares/process_exception.js | 38 ++++++++++++++ helpers/utils.js | 20 ++++++- package.json | 2 +- 10 files changed, 122 insertions(+), 71 deletions(-) create mode 100644 helpers/limit_context.js delete mode 100644 helpers/middlewares.js create mode 100644 helpers/middlewares/index.js create mode 100644 helpers/middlewares/logging.js create mode 100644 helpers/middlewares/process_exception.js diff --git a/README.md b/README.md index 1f1aef0..949b8ba 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,7 @@ Here we list them all with their purpose. * `VIEWPORT_HEIGHT = 720` - height of the browser's window * `TOKEN_2CAPTCHA = undefined` - token of [2captcha service](https://2captcha.com) * `STEALTH_BROWSING = true` - should the service use the [stealth browsing](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth) mode +* `MAX_CONCURRENT_CONTEXTS = undefined` - maximum number of browser contexts that may be open at the same time; if unset (or not a number), the number of contexts is not limited ## Notes on memory usage You need to explicitly close the browser tab once you don't need it (e.g. at the end of the parse method). 
diff --git a/app.js b/app.js index f6990db..5478c8e 100644 --- a/app.js +++ b/app.js @@ -21,7 +21,8 @@ const mhtmlRouter = require('./routes/mhtml'); const harRouter = require('./routes/har'); const closeContextRouter = require('./routes/close_context'); -const middlewares = require('./helpers/middlewares') +const middlewares = require('./helpers/middlewares'); +const limitContext = require('./helpers/limit_context'); const loggers = require("./helpers/loggers"); const app = express(); @@ -36,12 +37,14 @@ const VIEWPORT_WIDTH = parseInt(process.env.VIEWPORT_WIDTH) || 1280; const VIEWPORT_HEIGHT = parseInt(process.env.VIEWPORT_HEIGHT) || 720; const TOKEN_2CAPTCHA = process.env.TOKEN_2CAPTCHA; const STEALTH_BROWSING = (process.env.STEALTH_BROWSING || "true").toLowerCase() === "true"; +const MAX_CONCURRENT_CONTEXTS = process.env.MAX_CONCURRENT_CONTEXTS === "Infinity" ? Infinity : parseInt(process.env.MAX_CONCURRENT_CONTEXTS); +limitContext.initContextCounter(MAX_CONCURRENT_CONTEXTS); loggers.initLogger(LOG_LEVEL, LOG_FILE, LOGSTASH_HOST, LOGSTASH_PORT); async function setupBrowser() { try { - if (TOKEN_2CAPTCHA) { // If token is given then RecapcthaPlugin is activated + if (TOKEN_2CAPTCHA) { // If token is given then RecaptchaPlugin is activated puppeteer.use( RecaptchaPlugin({ provider: { diff --git a/helpers/exceptions.js b/helpers/exceptions.js index afc7545..eee261f 100644 --- a/helpers/exceptions.js +++ b/helpers/exceptions.js @@ -13,3 +13,11 @@ exports.ContextNotFoundError = class ContextNotFoundError extends Error { this.name = "ContextNotFoundError"; } } + +exports.TooManyContextsError = class TooManyContextsError extends Error { + constructor(message="Could not create new context due to restriction", ...args) { + super(message, ...args); + this.message = message; + this.name = "TooManyContextsError"; + } +} diff --git a/helpers/limit_context.js b/helpers/limit_context.js new file mode 100644 index 0000000..3bff9e5 --- /dev/null +++ b/helpers/limit_context.js 
@@ -0,0 +1,18 @@ +let contextCounter = 0; + +function incContextCounter() {} +exports.incContextCounter = incContextCounter; // Empty function or incrementer + +function decContextCounter() {} +exports.decContextCounter = decContextCounter; // Empty function or decrementer + +function canCreateContext() { return true; } +exports.canCreateContext = canCreateContext; // Truish function or checker if the context can be created + +exports.initContextCounter = function (maxContextCounter) { + if (!isNaN(maxContextCounter)) { + exports.incContextCounter = () => { contextCounter++ }; + exports.decContextCounter = () => { contextCounter-- }; + exports.canCreateContext = () => { return contextCounter < maxContextCounter } + } +} diff --git a/helpers/middlewares.js b/helpers/middlewares.js deleted file mode 100644 index 62f30b2..0000000 --- a/helpers/middlewares.js +++ /dev/null @@ -1,66 +0,0 @@ -const morgan = require('morgan'); -const loggers = require('./loggers'); -const exceptions = require("./exceptions"); - -/*** - * Middleware for logging HTTP request-response. -***/ -exports.logHTTPMiddleware = function logHTTPMiddleware() { - const logger = loggers.getLogger(); - - return morgan( - loggers.HTTPFormat, - { - stream: { - write: (message) => logger.http(message), - }, - } - ); -} - -/*** - * Middleware for processing exceptions. -***/ -exports.processExceptionMiddleware = async function processExceptionMiddleware(err, req, res, next) { - if (res.headersSent) { - return next(err); - } - - const contextId = err.contextId || req.query.contextId; - const pageId = err.pageId || req.query.pageId; - const errorMessage = err.message || 'Unknown error'; - - if (contextId) { - res.header('scrapy-puppeteer-service-context-id', contextId); - } - - if (err.contextId) { // there was a context, but something went wrong - res.status(500); - } else { // No context. 
Possibly, our service was restarted - if (err instanceof exceptions.PageNotFoundError || err instanceof exceptions.ContextNotFoundError) { - res.status(422); - } else { - res.status(500); - } - } - - res.send({ - contextId, - pageId, - error: errorMessage - }); - - next(err); -} - -/*** - * Middleware for logging exceptions. -***/ -exports.logExceptionMiddleware = async function logExceptionMiddleware(err, req, res, next) { - loggers.getLogger().error({ - message: err, - contextId: req.query["contextId"], - pageId: req.query["pageId"], - }); - next(); -} diff --git a/helpers/middlewares/index.js b/helpers/middlewares/index.js new file mode 100644 index 0000000..2a03cae --- /dev/null +++ b/helpers/middlewares/index.js @@ -0,0 +1,3 @@ +exports.logHTTPMiddleware = require('./logging').logHTTPMiddleware; +exports.logExceptionMiddleware = require('./logging').logExceptionMiddleware; +exports.processExceptionMiddleware = require('./process_exception').processExceptionMiddleware; diff --git a/helpers/middlewares/logging.js b/helpers/middlewares/logging.js new file mode 100644 index 0000000..3b9b84e --- /dev/null +++ b/helpers/middlewares/logging.js @@ -0,0 +1,30 @@ +const loggers = require("../loggers"); +const morgan = require("morgan"); + +/*** + * Returns the middleware for logging HTTP request-response. + ***/ +exports.logHTTPMiddleware = function logHTTPMiddleware() { + const logger = loggers.getLogger(); + + return morgan( + loggers.HTTPFormat, + { + stream: { + write: (message) => logger.http(message), + }, + } + ); +} + +/*** + * Middleware for logging exceptions. 
+ ***/ +exports.logExceptionMiddleware = async function logExceptionMiddleware(err, req, res, next) { + loggers.getLogger().error({ + message: err, + contextId: req.query["contextId"], + pageId: req.query["pageId"], + }); + next(); +} diff --git a/helpers/middlewares/process_exception.js b/helpers/middlewares/process_exception.js new file mode 100644 index 0000000..363d4f0 --- /dev/null +++ b/helpers/middlewares/process_exception.js @@ -0,0 +1,38 @@ +const exceptions = require("../exceptions"); + +/*** + * Middleware for processing exceptions. + ***/ +exports.processExceptionMiddleware = async function processExceptionMiddleware(err, req, res, next) { + if (res.headersSent) { + return next(err); + } + + const contextId = err.contextId || req.query.contextId; + const pageId = err.pageId || req.query.pageId; + const errorMessage = err.message || 'Unknown error'; + + if (contextId) { + res.header('scrapy-puppeteer-service-context-id', contextId); + } + + if (err instanceof exceptions.TooManyContextsError) { + res.status(429); // Too Many Requests + } else if (err.contextId) { // there was a context, but something went wrong + res.status(500); + } else { // No context. 
Possibly, our service was restarted + if (err instanceof exceptions.PageNotFoundError || err instanceof exceptions.ContextNotFoundError) { + res.status(422); // Unprocessable Entity + } else { + res.status(500); + } + } + + res.send({ + contextId, + pageId, + error: errorMessage + }); + + next(err); +} diff --git a/helpers/utils.js b/helpers/utils.js index afc5772..34ad788 100644 --- a/helpers/utils.js +++ b/helpers/utils.js @@ -1,5 +1,6 @@ const exceptions = require("./exceptions"); const { proxyRequest } = require('puppeteer-proxy'); +const limitContext = require('./limit_context'); const PROXY_URL_KEY = 'puppeteer-service-proxy-url' @@ -26,6 +27,7 @@ exports.closeContexts = async function closeContexts(browser, contextIds) { const closePromises = []; for (const context of browser.browserContexts()) { if (contextIds.includes(context.id)) { + limitContext.decContextCounter(); closePromises.push(context.close()); } } @@ -106,6 +108,20 @@ async function newPage(context) { return page; } +async function newContext(browser, options = {}) { + if (!limitContext.canCreateContext()) { + throw new exceptions.TooManyContextsError(); + } + + try { + limitContext.incContextCounter(); + return await browser.createIncognitoBrowserContext(options); + } catch (err) { + limitContext.decContextCounter(); + throw err; + } +} + function getProxy(request) { if ('body' in request && 'proxy' in request.body) { return request.body.proxy; @@ -127,12 +143,12 @@ exports.getBrowserPage = async function getBrowserPage(browser, request) { } const proxy = getProxy(request); if (!proxy) { - const context = await browser.createIncognitoBrowserContext(); + const context = await newContext(browser); return newPage(context); } const { origin: proxyServer, username, password } = new URL(proxy); - const context = await browser.createIncognitoBrowserContext({ proxyServer }); + const context = await newContext(browser, { proxyServer }); context[PROXY_URL_KEY] = proxy; const page = await 
newPage(context); if (username) { diff --git a/package.json b/package.json index 360ff90..9db0cf0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "scrapy-puppeteer-service", - "version": "0.3.0", + "version": "0.3.1", "private": true, "scripts": { "start": "node ./bin/www"