diff --git a/integration-tests/worker/CHANGELOG.md b/integration-tests/worker/CHANGELOG.md index ad7244b76..bb0c78d97 100644 --- a/integration-tests/worker/CHANGELOG.md +++ b/integration-tests/worker/CHANGELOG.md @@ -1,5 +1,27 @@ # @openfn/integration-tests-worker +## 1.0.31 + +### Patch Changes + +- Updated dependencies [eb10b1f] +- Updated dependencies [281391b] +- Updated dependencies [281391b] +- Updated dependencies [2857fe6] + - @openfn/ws-worker@0.6.0 + - @openfn/engine-multi@0.3.1 + - @openfn/lightning-mock@1.1.10 + +## 1.0.30 + +### Patch Changes + +- Updated dependencies [9b9ca0c] +- Updated dependencies [9b9ca0c] + - @openfn/ws-worker@0.6.0 + - @openfn/engine-multi@0.3.0 + - @openfn/lightning-mock@1.1.9 + ## 1.0.29 ### Patch Changes diff --git a/integration-tests/worker/package.json b/integration-tests/worker/package.json index f82c02b94..c7da14c47 100644 --- a/integration-tests/worker/package.json +++ b/integration-tests/worker/package.json @@ -1,7 +1,7 @@ { "name": "@openfn/integration-tests-worker", "private": true, - "version": "1.0.29", + "version": "1.0.31", "description": "Lightning WOrker integration tests", "author": "Open Function Group ", "license": "ISC", diff --git a/integration-tests/worker/test/benchmark.test.ts b/integration-tests/worker/test/benchmark.test.ts index fe309de8c..bc304b1d3 100644 --- a/integration-tests/worker/test/benchmark.test.ts +++ b/integration-tests/worker/test/benchmark.test.ts @@ -8,7 +8,7 @@ import { run, humanMb } from '../src/util'; let lightning; let worker; -const maxConcurrency = 1; +const maxConcurrency = 20; test.before(async () => { const lightningPort = 4322; @@ -90,7 +90,7 @@ test.serial.skip('run 100 attempts', async (t) => { lightning.on('run:complete', (evt) => { // May want to disable this but it's nice feedback - //console.log('Completed ', evt.attemptId); + t.log('Completed ', evt.attemptId); if (evt.payload.reason !== 'success') { t.log('Atempt failed:'); diff --git a/integration-tests/worker/test/exit-reasons.test.ts b/integration-tests/worker/test/exit-reasons.test.ts index f8e981276..05af1681b 100644 --- a/integration-tests/worker/test/exit-reasons.test.ts +++ b/integration-tests/worker/test/exit-reasons.test.ts @@ -48,7 +48,6 @@ test('crash: syntax error', async (t) => { const result = await run(attempt); const { reason, error_type, error_message } = result; - t.is(reason, 'crash'); t.is(error_type, 'CompileError'); t.regex(error_message, /Unexpected token \(1:9\)$/); @@ -77,7 +76,31 @@ test('exception: autoinstall error', async (t) => { ); }); -test('kill: oom', async (t) => { +test('kill: oom (small, kill worker)', async (t) => { + const attempt = { + id: crypto.randomUUID(), + jobs: [ + { + adaptor: '@openfn/language-common@latest', + body: `fn((s) => { + s.data = []; + while(true) { + s.data.push(new Array(1e6).fill("xyz")) + } + })`, + }, + ], + }; + + const result = await run(attempt); + + const { reason, error_type, error_message } = result; + t.is(reason, 'kill'); + t.is(error_type, 'OOMError'); + t.is(error_message, 'Run exceeded maximum memory usage'); +}); + +test('kill: oom (large, kill vm)', async (t) => { const attempt = { id: crypto.randomUUID(), jobs: [ @@ -86,7 +109,7 @@ test('kill: oom', async (t) => { body: `fn((s) => { s.data = []; while(true) { - s.data.push(new Array(1e5).fill("xyz")) + s.data.push(new Array(1e9).fill("xyz")) } })`, }, diff --git a/integration-tests/worker/test/integration.test.ts b/integration-tests/worker/test/integration.test.ts index 522624968..b1dc71c4e 100644 --- a/integration-tests/worker/test/integration.test.ts +++ b/integration-tests/worker/test/integration.test.ts @@ -333,15 +333,19 @@ test('a timeout error should still call run-complete', (t) => { id: crypto.randomUUID(), jobs: [ { - adaptor: '@openfn/language-common@latest', // version lock to something stable? - body: 'fn((s) => new Promise((resolve) => setTimeout(() => resolve(s), 1000)))', + // don't try to autoinstall an adaptor because it'll count in the timeout + body: 'export default [(s) => new Promise((resolve) => setTimeout(() => resolve(s), 2000))]', }, ], options: { - timeout: 100, + runTimeout: 500, }, }; + lightning.once('attempt:start', (event) => { + t.log('attempt started'); + }); + lightning.once('run:complete', (event) => { t.is(event.payload.reason, 'kill'); t.is(event.payload.error_type, 'TimeoutError'); @@ -421,8 +425,8 @@ test('an OOM error should still call run-complete', (t) => { // }); // }); -// TODO this probably needs to move out into another test file -// not going to do it now as I've changed too much too quickly already... +// TODO this test is a bit different now +// I think it's worth keeping test('stateful adaptor should create a new client for each attempt', (t) => { return new Promise(async (done) => { // We want to create our own special worker here @@ -454,22 +458,15 @@ test('stateful adaptor should create a new client for each attempt', (t) => { if (id === attempt2.id) { const one = results[attempt1.id]; const two = results[attempt2.id]; - // The module should be isolated within the same thread - t.is(one.threadId, two.threadId); + // The two attempts should run in different threads + t.not(one.threadId, two.threadId); t.not(one.clientId, two.clientId); + done(); } }); - // Note that this API doesn't work!! - // shaeme, it would be useful - // lightning.waitForResult(attempt.id, (result) => { - // console.log(result) - // t.pass() - // done() - // }) - const engineArgs = { repoDir: path.resolve('./dummy-repo'), maxWorkers: 1, diff --git a/integration-tests/worker/test/server.test.ts b/integration-tests/worker/test/server.test.ts index 0f0e3b866..7f5975b60 100644 --- a/integration-tests/worker/test/server.test.ts +++ b/integration-tests/worker/test/server.test.ts @@ -52,7 +52,7 @@ const getPort = () => ++portgen; test.serial('worker should start, respond to 200, and close', async (t) => { workerProcess = await spawnServer(); - // The runnign server should respond to a get at root + // The running server should respond to a get at root let { status } = await fetch('http://localhost:2222/'); t.is(status, 200); diff --git a/packages/cli/CHANGELOG.md b/packages/cli/CHANGELOG.md index d6362ff39..561ee4d38 100644 --- a/packages/cli/CHANGELOG.md +++ b/packages/cli/CHANGELOG.md @@ -1,5 +1,12 @@ # @openfn/cli +## 0.4.15 + +### Patch Changes + +- Updated dependencies [0f22694] + - @openfn/runtime@0.2.5 + ## 0.4.14 ### Patch Changes diff --git a/packages/cli/package.json b/packages/cli/package.json index b0d22c059..67b286895 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -1,6 +1,6 @@ { "name": "@openfn/cli", - "version": "0.4.14", + "version": "0.4.15", "description": "CLI devtools for the openfn toolchain.", "engines": { "node": ">=18", diff --git a/packages/engine-multi/CHANGELOG.md b/packages/engine-multi/CHANGELOG.md index 70afc318c..b19f63df9 100644 --- a/packages/engine-multi/CHANGELOG.md +++ b/packages/engine-multi/CHANGELOG.md @@ -1,5 +1,17 @@ # engine-multi +## 0.3.0 + +### Minor Changes + +- 9b9ca0c: Replace workerpool with new child_process pool implementation +- 281391b: Replace timeout option with attemptTimeoutMs + +### Patch Changes + +- Updated dependencies [0f22694] + - @openfn/runtime@0.2.5 + ## 0.2.6 ### Patch Changes diff --git a/packages/engine-multi/README.md b/packages/engine-multi/README.md index 4705e7864..0f1ee4892 100644 --- a/packages/engine-multi/README.md +++ b/packages/engine-multi/README.md @@ -4,6 +4,34 @@ A runtime engine which runs multiple jobs in worker threads. A long-running node service, suitable for integration with a Worker, for executing workflows. +# Architecture + +The engine runs in the main process and exposes an `execute` function, which can be called by some wrapping service (ie, the Lightning Worker). + +The engine maintains a pool of long-lived child processes. For every `execute` call, the engine will pick an idle child process from the pool and execute the workflow within it. + +Inside the child process, we actually execute the runtime inside a worker thread. Each child process has exactly one worker, which is created on demand and destroyed on completion. + +So the process tree looks a bit like this: + +``` +-- main thread (execute, compile, autoinstall) + -- child_process (spawn worker) + -- worker_thread (@openfn/runtime) +``` + +Pooled child-processes are lazily spawned. If a worker never needs to run more than one task concurrently, it will only have one child process. + +![architecture diagram](docs/architecture.png) + +This architecture has several benefits: + +- Each run executes in a clean sandbox inside a worker_thread /inside/ a child process. A double-buffered sandbox. +- The child process can always control the thread, even if the thread locks the CPU, to shut it down +- If the worker thread blows its memory limit, other runs will be unaffected as they are in different child processes + +At the time of writing, compilation and autoinstall are run on the main thread - not in the child process. + ## Usage The Engine runs Workflows or Execution Plans. A plan MUST have an id. @@ -82,37 +110,3 @@ engine.execute(plan, { resolvers }); ``` Initial state and credentials are at the moment pre-loaded, with a "fully resolved" state object passed into the runtime. The Runtime has the ability to lazy load but implementing lazy loading across the worker_thread interface has proven tricky. - -## Architecture - -The Engine uses a dedicated worker found in src/worker/worker.ts. Most of the actual logic is in worker-helper.ts, and is shared by both the real worker (which calls out to @openfn/runtime), and the mock worker (which simulates and evals a run). The mock worker is mostly used in unit tests. - -The main interface to the engine, API, exposes a very limited and controlled interface to consumers. api.ts provides the main export and is a thin API wrapper around the main implementation, providing defauls and validation. - -The main implementation is in engine.ts, which exposes a much broader interface, with more options. This is potentially dangerous to consumers, but is extremely useful for unit testing here. For example, the dedicated worker path can be set here, as can the whitelist. - -When execute is called and passed a plan, the engine first generates an execution context. This contains an event emitter just for that workflower and some contextualised state. - -## Security Considerations & Memory Management - -The engine uses workerpool to maintain a pool of worker threads. - -As workflows come in to be executed, they are passed to workerpool which will pick an idle worker and execute the workflow within it. - -workerpool has no natural environment hardening, which means workflows running in the same thread will share an environment. Globals set in workflow A will be available to workflow B, and by the same token an adaptor loaded for workflow A will be shared with workflow B. - -Also, because the thread is long-lived, modules imported into the sandbox will be shared. - -We have several mitgations against this, ensuring a safe, secure and stable execution environment: - -- The runtime sandbox itself ensures that each job runs in an isolated context. If a job escapes the sandbox, it will have access to the thread's global scope -- Each workflow appends a unique id to all its imports, busting the node cache and forcing each module to be re-initialised. This means workers cannot share adaptors and all state is reset. -- To preserve memory, worker threads are regularly purged, meaning destroyed (note that this comes with a performance hit and undermines the use of worker pooling entirely!). When each workflow is complete, if there are no pending tasks to execute, all worker threads are destroyed. - -Inside the worker thread, we ensure that: - -- The parent `process.env` is not visible (by default in workerpool the woker will "inherit" the parent env) -- The parent process is not accessible (check this) -- The parent scope is not visible (this is innate in workerpool design). - -After initialisation, the only way that the parent process and child thread can communicate is a) through the sendMessage() interface (which really means the child can only send messages that the parent is expecting), b) through a shared memory buffer (usage of which is limited and controlled by the parent), and c) returning a value from a function execution. diff --git a/packages/engine-multi/docs/architecture.png b/packages/engine-multi/docs/architecture.png new file mode 100644 index 000000000..72f8eb82a Binary files /dev/null and b/packages/engine-multi/docs/architecture.png differ diff --git a/packages/engine-multi/package.json b/packages/engine-multi/package.json index d2a1bc464..07172ae22 100644 --- a/packages/engine-multi/package.json +++ b/packages/engine-multi/package.json @@ -1,11 +1,11 @@ { "name": "@openfn/engine-multi", - "version": "0.2.6", + "version": "0.3.0", "description": "Multi-process runtime engine", "main": "dist/index.js", "type": "module", "scripts": { - "test": "pnpm ava", + "test": "pnpm ava --serial", "test:types": "pnpm tsc --noEmit --project tsconfig.json", "build": "tsup --config ./tsup.config.js", "build:watch": "pnpm build --watch", @@ -17,12 +17,10 @@ "@openfn/compiler": "workspace:*", "@openfn/language-common": "2.0.0-rc3", "@openfn/logger": "workspace:*", - "@openfn/runtime": "workspace:*", - "workerpool": "^6.5.1" + "@openfn/runtime": "workspace:*" }, "devDependencies": { "@types/node": "^18.15.13", - "@types/workerpool": "^6.4.4", "ava": "5.3.1", "ts-node": "^10.9.1", "tslib": "^2.4.0", @@ -31,7 +29,8 @@ "typescript": "^5.1.6" }, "files": [ - "dist", + "dist/index.js", + "dist/worker/", "README.md", "CHANGELOG.md" ] diff --git a/packages/engine-multi/src/api.ts b/packages/engine-multi/src/api.ts index 6b0beac59..d949fcaa1 100644 --- a/packages/engine-multi/src/api.ts +++ b/packages/engine-multi/src/api.ts @@ -22,7 +22,7 @@ export type LazyResolvers = { expressions?: Resolver; }; -export type RTEOptions = Partial< +export type APIOptions = Partial< Omit & { // Needed here for unit tests to support json expressions. Would rather exclude tbh compile?: { @@ -38,7 +38,7 @@ const DEFAULT_MEMORY_LIMIT = 500; // Create the engine and handle user-facing stuff, like options parsing // and defaulting const createAPI = async function ( - options: RTEOptions = {} + options: APIOptions = {} ): Promise { let { repoDir } = options; @@ -50,7 +50,7 @@ const createAPI = async function ( } logger.info('repoDir set to ', repoDir); - const engineOptions = { + const engineOptions: EngineOptions = { logger, // TODO should resolvers be set here on passed to execute? @@ -65,17 +65,16 @@ const createAPI = async function ( // TODO should we disable autoinstall overrides? autoinstall: options.autoinstall, - minWorkers: options.minWorkers, maxWorkers: options.maxWorkers, memoryLimitMb: options.memoryLimitMb || DEFAULT_MEMORY_LIMIT, - - purge: options.hasOwnProperty('purge') ? options.purge : true, + attemptTimeoutMs: options.attemptTimeoutMs, statePropsToRemove: options.statePropsToRemove ?? [ 'configuration', 'response', ], }; + logger.info(`memory limit set to ${options.memoryLimitMb}mb`); logger.info(`statePropsToRemove set to: `, engineOptions.statePropsToRemove); diff --git a/packages/engine-multi/src/api/call-worker.ts b/packages/engine-multi/src/api/call-worker.ts index 257175705..4ecc1d441 100644 --- a/packages/engine-multi/src/api/call-worker.ts +++ b/packages/engine-multi/src/api/call-worker.ts @@ -1,100 +1,49 @@ -import { fileURLToPath } from 'node:url'; -import path from 'node:path'; -import workerpool from 'workerpool'; - -import { PURGE } from '../events'; - -import type { EngineAPI } from '../types'; -import type { Logger } from '@openfn/logger'; +import { Logger } from '@openfn/logger'; +import createPool from '../worker/pool'; +import { CallWorker } from '../types'; // All events coming out of the worker need to include a type key -type WorkerEvent = { +export type WorkerEvent = { type: string; [key: string]: any; }; type WorkerOptions = { - purge?: boolean; - minWorkers?: number; maxWorkers?: number; env?: any; timeout?: number; // ms - memoryLimitMb?: number; + silent?: boolean; // don't forward stdout to the parent }; -// Adds a `callWorker` function to the API object, which will execute a task in a worker +// Create a worker pool and return helper functions +// to use and destroy it export default function initWorkers( - engine: EngineAPI, workerPath: string, options: WorkerOptions = {}, - logger?: Logger + logger: Logger ) { - const workers = createWorkers(workerPath, options); - engine.callWorker = ( - task: string, - args: any[] = [], - events: any = {}, - timeout?: number - ) => { - const promise = workers.exec(task, args, { + const { env = {}, maxWorkers = 5, silent } = options; + + const workers = createPool( + workerPath, + { + maxWorkers, + env, + silent, + }, + logger + ); + + const callWorker: CallWorker = (task, args = [], events = [], options = {}) => + workers.exec(task, args, { + ...options, on: ({ type, ...args }: WorkerEvent) => { // just call the callback events[type]?.(args); }, }); - if (timeout) { - promise.timeout(timeout); - } - - return promise; - }; - - engine.purge = () => { - const { pendingTasks } = workers.stats(); - if (pendingTasks == 0) { - logger?.debug('Purging workers'); - engine.emit(PURGE); - workers.terminate(); - } - }; + const closeWorkers = async (instant?: boolean) => workers.destroy(instant); - // This will force termination (with grace period if allowed) - engine.closeWorkers = async (instant?: boolean) => workers.terminate(instant); -} - -export function createWorkers(workerPath: string, options: WorkerOptions) { - const { - env = {}, - minWorkers = 0, - maxWorkers = 5, // what's a good default here? Keeping it low to be conservative - memoryLimitMb, - } = options; - - let resolvedWorkerPath; - - if (workerPath) { - // If a path to the worker has been passed in, just use it verbatim - // We use this to pass a mock worker for testing purposes - resolvedWorkerPath = workerPath; - } else { - // By default, we load ./worker.js but can't rely on the working dir to find it - const dirname = path.dirname(fileURLToPath(import.meta.url)); - resolvedWorkerPath = path.resolve(dirname, workerPath || './worker.js'); - } - - return workerpool.pool(resolvedWorkerPath, { - minWorkers, - maxWorkers, - workerThreadOpts: { - execArgv: ['--no-warnings', '--experimental-vm-modules'], - // Important to override the child env so that it cannot access the parent env - env, - resourceLimits: { - // This is a fair approximation for heapsize - // Note that it's still possible to OOM the process without hitting this limit - maxOldGenerationSizeMb: memoryLimitMb, - }, - }, - }); + return { callWorker, closeWorkers }; } diff --git a/packages/engine-multi/src/api/execute.ts b/packages/engine-multi/src/api/execute.ts index ca2710a68..e4ff9c67e 100644 --- a/packages/engine-multi/src/api/execute.ts +++ b/packages/engine-multi/src/api/execute.ts @@ -1,9 +1,7 @@ -// Execute a compiled workflow -import { Promise as WorkerPoolPromise } from 'workerpool'; +import { timestamp } from '@openfn/logger'; import * as workerEvents from '../worker/events'; -import { ExecutionContext } from '../types'; - +import type { ExecutionContext } from '../types'; import autoinstall from './autoinstall'; import compile from './compile'; import { @@ -16,7 +14,7 @@ import { jobError, } from './lifecycle'; import preloadCredentials from './preload-credentials'; -import { ExecutionError, ExitError, OOMError, TimeoutError } from '../errors'; +import { ExecutionError } from '../errors'; const execute = async (context: ExecutionContext) => { const { state, callWorker, logger, options } = context; @@ -46,12 +44,51 @@ const execute = async (context: ExecutionContext) => { ); } + // Map any regexes in the whitelist to strings + const whitelist = options.whitelist?.map((w) => w.toString()); + const runOptions = { adaptorPaths, - whitelist: options.whitelist, + whitelist, statePropsToRemove: options.statePropsToRemove, }; + const workerOptions = { + memoryLimitMb: options.memoryLimitMb, + timeout: options.attemptTimeoutMs, + }; + + // Put out a log with the memory limit for the attempt + // This is a bit annoying but the log needs to be associated with the attempt + // and not just emitted to stdout + // The runtime can't do it because it doesn't know the memory limit + if (workerOptions.memoryLimitMb) { + await log(context, { + type: workerEvents.LOG, + workflowId: state.plan.id!, + threadId: '-', // no thread at this point + message: { + level: 'debug', + message: [`Memory limit: ${workerOptions.memoryLimitMb}mb`], + name: 'RTE', + time: timestamp().toString(), + }, + }); + } + if (workerOptions.timeout) { + await log(context, { + type: workerEvents.LOG, + workflowId: state.plan.id!, + threadId: '-', // no thread at this point + message: { + level: 'debug', + message: [`Timeout: ${workerOptions.timeout / 1000}s`], + name: 'RTE', + time: timestamp().toString(), + }, + }); + } + const events = { [workerEvents.WORKFLOW_START]: (evt: workerEvents.WorkflowStartEvent) => { workflowStart(context, evt); @@ -79,28 +116,15 @@ const execute = async (context: ExecutionContext) => { }, }; + // TODO in the new world order, what sorts of errors are being caught here? return callWorker( 'run', [state.plan, runOptions], events, - options.timeout + workerOptions ).catch((e: any) => { - // Catch process.exit from inside the thread - // This approach is not pretty - we are banking on replacing workerpool soon - if (e.message.match(/^Workerpool Worker terminated Unexpectedly/)) { - const [_match, exitCode] = e.message.match(/exitCode: `(\d+)`/); - if (exitCode === '111111') { - // This means a controlled exit from inside the worker - // The error has already been reported and we should do nothing - return; - } - e = new ExitError(parseInt(exitCode)); - } else if (e.code === 'ERR_WORKER_OUT_OF_MEMORY') { - e = new OOMError(); - } else if (e instanceof WorkerPoolPromise.TimeoutError) { - // Map the workerpool error to our own - e = new TimeoutError(options.timeout!); - } + // TODO are timeout errors being handled nicely here? + // actually I think the occur outside of here, in the pool error(context, { workflowId: state.plan.id, error: e }); logger.error(`Critical error thrown by ${state.plan.id}`, e); diff --git a/packages/engine-multi/src/api/lifecycle.ts b/packages/engine-multi/src/api/lifecycle.ts index c5c3dc80c..33fac1737 100644 --- a/packages/engine-multi/src/api/lifecycle.ts +++ b/packages/engine-multi/src/api/lifecycle.ts @@ -104,7 +104,6 @@ export const jobError = ( event: internalEvents.JobErrorEvent ) => { const { threadId, state, error, duration, jobId, next } = event; - context.emit(externalEvents.JOB_ERROR, { threadId, state, diff --git a/packages/engine-multi/src/api/validate-worker.ts b/packages/engine-multi/src/api/validate-worker.ts index 4f75b9485..fb29b9ffc 100644 --- a/packages/engine-multi/src/api/validate-worker.ts +++ b/packages/engine-multi/src/api/validate-worker.ts @@ -6,11 +6,12 @@ import { EngineAPI } from '../types'; // Call a handshake task in a worker thread // This really jsut validates that the worker path exists -export default async (api: EngineAPI) => { +export default async (api: EngineAPI, timeout = 5000) => { try { - await api.callWorker('handshake', []); - } catch { - // Throw a nice error if the worker isn't valid + // TODO argument drive this + await api.callWorker('handshake', [], {}, { timeout }); + } catch (e) { + console.error(e); throw new Error('Invalid worker path'); } }; diff --git a/packages/engine-multi/src/engine.ts b/packages/engine-multi/src/engine.ts index ca760765d..f04dc41df 100644 --- a/packages/engine-multi/src/engine.ts +++ b/packages/engine-multi/src/engine.ts @@ -21,6 +21,10 @@ import type { EngineAPI, EventHandler, WorkflowState } from './types'; import type { Logger } from '@openfn/logger'; import type { AutoinstallOptions } from './api/autoinstall'; +const DEFAULT_ATTEMPT_TIMEOUT = 1000 * 60 * 10; // ms + +const DEFAULT_MEMORY_LIMIT_MB = 500; + // For each workflow, create an API object with its own event emitter // this is a bt wierd - what if the emitter went on state instead? const createWorkflowEvents = ( @@ -72,16 +76,17 @@ export type EngineOptions = { autoinstall?: AutoinstallOptions; - minWorkers?: number; maxWorkers?: number; memoryLimitMb?: number; whitelist?: RegExp[]; // Timeout for the whole workflow - timeout?: number; + // timeout?: number; - purge?: boolean; + // Default timeouts in ms(used if an attempt does not provide its own) + attemptTimeoutMs?: number; + runTimeoutMs?: number; statePropsToRemove?: string[]; }; @@ -89,7 +94,14 @@ export type EngineOptions = { export type ExecuteOptions = { sanitize?: SanitizePolicies; resolvers?: LazyResolvers; - timeout?: number; + + // timeout?: number; // DEPRECATED + + // NB this deliberately uses old terminology + attemptTimeoutMs?: number; + runTimeout?: number; + + memoryLimitMb?: number; }; // This creates the internal API @@ -101,6 +113,9 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { const contexts: Record = {}; const deferredListeners: Record[]> = {}; + const defaultTimeout = options.attemptTimeoutMs || DEFAULT_ATTEMPT_TIMEOUT; + const defaultMemoryLimit = options.memoryLimitMb || DEFAULT_MEMORY_LIMIT_MB; + let resolvedWorkerPath; if (workerPath) { // If a path to the worker has been passed in, just use it verbatim @@ -113,7 +128,7 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { dirname, // TODO there are too many assumptions here, it's an argument for the path just to be // passed by the mian api or the unit test - workerPath || '../dist/worker/worker.js' + workerPath || '../dist/worker/thread/run.js' ); } @@ -121,17 +136,14 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { const engine = new Engine() as EngineAPI; - initWorkers( - engine, + const { callWorker, closeWorkers } = initWorkers( resolvedWorkerPath, { - minWorkers: options.minWorkers, maxWorkers: options.maxWorkers, - purge: options.purge, - memoryLimitMb: options.memoryLimitMb, }, options.logger ); + engine.callWorker = callWorker; await validateWorker(engine); @@ -154,10 +166,7 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { // TODO too much logic in this execute function, needs farming out // I don't mind having a wrapper here but it must be super thin // TODO maybe engine options is too broad? - const executeWrapper = ( - plan: ExecutionPlan, - opts: Partial = {} - ) => { + const executeWrapper = (plan: ExecutionPlan, opts: ExecuteOptions = {}) => { options.logger!.debug('executing plan ', plan?.id ?? ''); const workflowId = plan.id!; // TODO throw if plan is invalid @@ -168,12 +177,13 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { const context = new ExecutionContext({ state, logger: options.logger!, - callWorker: engine.callWorker, + callWorker, options: { ...options, sanitize: opts.sanitize, resolvers: opts.resolvers, - timeout: opts.timeout, + attemptTimeoutMs: opts.attemptTimeoutMs ?? defaultTimeout, + memoryLimitMb: opts.memoryLimitMb ?? defaultMemoryLimit, }, }); @@ -192,9 +202,6 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { // @ts-ignore execute(context).finally(() => { delete contexts[workflowId]; - if (options.purge && Object.keys(contexts).length === 0) { - engine.purge?.(); - } }); }, 1); @@ -231,7 +238,7 @@ const createEngine = async (options: EngineOptions, workerPath?: string) => { // How does this work if deferred? }; - const destroy = (instant?: boolean) => engine.closeWorkers(instant); + const destroy = (instant?: boolean) => closeWorkers(instant); return Object.assign(engine, { options, diff --git a/packages/engine-multi/src/errors.ts b/packages/engine-multi/src/errors.ts index 2da37b270..c6020e84f 100644 --- a/packages/engine-multi/src/errors.ts +++ b/packages/engine-multi/src/errors.ts @@ -11,6 +11,7 @@ export class EngineError extends Error { export class TimeoutError extends EngineError { severity = 'kill'; type = 'TimeoutError'; + name = 'TimeoutError'; duration; constructor(durationInMs: number) { super(); diff --git a/packages/engine-multi/src/events.ts b/packages/engine-multi/src/events.ts index ce2fbf93f..a7af03b1c 100644 --- a/packages/engine-multi/src/events.ts +++ b/packages/engine-multi/src/events.ts @@ -4,6 +4,10 @@ import { JSONLog } from '@openfn/logger'; import { Versions } from './types'; +// If the worker thread exists a process safely, it'll return this error code +// any other error code is unexpected +export const HANDLED_EXIT_CODE = 111111; + // Top level API events - these are what the engine publishes externally // should it just be start, log, job-start, job-complete, end etc? // What about engine-level logging? CLI-level stuff? @@ -29,8 +33,6 @@ export const AUTOINSTALL_COMPLETE = 'autoinstall-complete'; export const AUTOINSTALL_ERROR = 'autoinstall-error'; -export const PURGE = 'purge-workers'; - export type EventMap = { [WORKFLOW_START]: WorkflowStartPayload; [WORKFLOW_COMPLETE]: WorkflowCompletePayload; @@ -41,7 +43,6 @@ export type EventMap = { [WORKFLOW_ERROR]: WorkflowErrorPayload; [AUTOINSTALL_COMPLETE]: AutoinstallCompletePayload; [AUTOINSTALL_ERROR]: AutoinstallErrorPayload; - [PURGE]: null; }; export type ExternalEvents = keyof EventMap; diff --git a/packages/engine-multi/src/test/bad-worker.js b/packages/engine-multi/src/test/bad-worker.js new file mode 100644 index 000000000..a2022aad0 --- /dev/null +++ b/packages/engine-multi/src/test/bad-worker.js @@ -0,0 +1,2 @@ +// This worker will not respond to a handshake task because no listener has been set up +const handshake = () => true; diff --git a/packages/engine-multi/src/test/worker-functions.js b/packages/engine-multi/src/test/worker-functions.ts similarity index 68% rename from packages/engine-multi/src/test/worker-functions.js rename to packages/engine-multi/src/test/worker-functions.ts index ae7e6cf78..0c516e07e 100644 --- a/packages/engine-multi/src/test/worker-functions.js +++ b/packages/engine-multi/src/test/worker-functions.ts @@ -1,53 +1,46 @@ import path from 'node:path'; -import workerpool from 'workerpool'; -import { threadId } from 'node:worker_threads'; -import v8 from 'v8'; +import { register, publish, threadId } from '../worker/thread/runtime'; import { increment } from './counter.js'; -workerpool.worker({ - handshake: () => true, - test: (result = 42) => { - const { pid, scribble } = process; - - workerpool.workerEmit({ - type: 'message', +const tasks = { + test: async (result = 42) => { + publish('test-message', { result, - pid, - scribble, }); return result; }, - readEnv: (key) => { + wait: (duration = 500) => + new Promise((resolve) => { + setTimeout(() => resolve(1), duration); + }), + readEnv: async (key: string) => { if (key) { return process.env[key]; } return process.env; }, - threadId: () => threadId, + threadId: async () => threadId, + processId: async () => process.pid, // very very simple intepretation of a run function // Most tests should use the mock-worker instead - run: (plan, _adaptorPaths) => { + run: async (plan: any, _adaptorPaths: any) => { const workflowId = plan.id; - workerpool.workerEmit({ - type: 'worker:workflow-start', + publish('worker:workflow-start', { workflowId, - threadId, }); try { const [job] = plan.jobs; const result = eval(job.expression); - workerpool.workerEmit({ - type: 'worker:workflow-complete', + publish('worker:workflow-complete', { workflowId, state: result, - threadId, }); } catch (err) { // console.error(err); // // @ts-ignore TODO sort out error typing - // workerpool.workerEmit({ + // publish({ // type: 'worker:workflow-error', // workflowId, // message: err.message, @@ -58,13 +51,13 @@ workerpool.worker({ } }, - timeout: () => { - while (true) {} + throw: async () => { + throw new Error('test_error'); }, // Experiments with freezing the global scope // We may do this in the actual worker - freeze: () => { + freeze: async () => { // This is not a deep freeze, so eg global.Error is not frozen // Also some things like Uint8Array are not freezable, so these remain ways to scribble Object.freeze(global); @@ -74,21 +67,32 @@ workerpool.worker({ Object.freeze(this); }, - setGlobalX: (newValue = 42) => { + setGlobalX: async (newValue = 42) => { + // @ts-ignore global.x = newValue; }, - getGlobalX: () => global.x, + getGlobalX: async () => { + // @ts-ignore + return global.x; + }, - writeToGlobalError: (obj) => { + // @ts-ignore + writeToGlobalError: async (obj) => { Object.assign(Error, obj); + + // @ts-ignore + console.log(Error.y); }, - getFromGlobalError: (key) => Error[key], + getFromGlobalError: async (key: string) => { + // @ts-ignore + return Error[key]; + }, // Tests of module state across executions // Ie, does a module get re-initialised between runs? (No.) - incrementStatic: () => increment(), + incrementStatic: async () => increment(), incrementDynamic: async () => { const { increment } = await import(path.resolve('src/test/counter.js')); return increment(); @@ -98,7 +102,7 @@ workerpool.worker({ // is enghuh to OOM the _process_, taking the whole engine out // This function should blow the thread's memory without // killing the parent process - blowMemory: () => { + blowMemory: async () => { let data = []; while (true) { data.push(Array(1e6).fill('mario')); @@ -115,4 +119,6 @@ workerpool.worker({ // stats.heap_size_limit / 1024 / 1024 // } Mb\n heap used = ${hprocess.memoryUsage().heapUsed / 1024 / 1024}mb` // ); -}); +}; + +register(tasks); diff --git a/packages/engine-multi/src/types.ts b/packages/engine-multi/src/types.ts index f3ab80fd1..b1b323a63 100644 --- a/packages/engine-multi/src/types.ts +++ b/packages/engine-multi/src/types.ts @@ -1,12 +1,10 @@ -// ok first of allI want to capture the key interfaces -import workerpool from 'workerpool'; - import type { Logger, SanitizePolicies } from '@openfn/logger'; import type { ExecutionPlan } from '@openfn/runtime'; import type { EventEmitter } from 'node:events'; import type { ExternalEvents, EventMap } from './events'; import type { EngineOptions } from './engine'; +import type { ExecOpts } from './worker/pool'; export type Resolver = (id: string) => Promise; @@ -30,12 +28,12 @@ export type WorkflowState = { options: any; // TODO this is wf specific options, like logging policy }; -export type CallWorker = ( +export type CallWorker = ( task: string, args: any[], events?: any, - timeout?: number -) => workerpool.Promise; + options?: Omit +) => Promise; export type ExecutionContextConstructor = { state: WorkflowState; @@ -65,7 +63,6 @@ export interface ExecutionContext extends EventEmitter { export interface EngineAPI extends EventEmitter { callWorker: CallWorker; closeWorkers: (instant?: boolean) => void; - purge?: () => void; } export interface RuntimeEngine { diff --git a/packages/engine-multi/src/util/serialize-error.ts b/packages/engine-multi/src/util/serialize-error.ts new file mode 100644 index 000000000..382e35cf5 --- /dev/null +++ b/packages/engine-multi/src/util/serialize-error.ts @@ -0,0 +1,7 @@ +export default (error: any) => { + return { + message: error.message, + type: error.subtype || error.type || error.name, + severity: error.severity || 'crash', + }; +}; diff --git a/packages/engine-multi/src/worker/child/create-thread.ts b/packages/engine-multi/src/worker/child/create-thread.ts new file mode 100644 index 000000000..29dd8be4d --- /dev/null +++ b/packages/engine-multi/src/worker/child/create-thread.ts @@ -0,0 +1,31 @@ +// creates a worker thread + +import { Worker } from 'node:worker_threads'; +import { ENGINE_RUN_TASK } from '../events'; + +const scriptPath = process.argv[2]; + +type ThreadOptions = { + memoryLimitMb?: number; +}; + +const createThread = ( + task: string, + args: any[] = [], + options: ThreadOptions = {} +) => { + const worker = new Worker(scriptPath, { + resourceLimits: { + maxOldGenerationSizeMb: options.memoryLimitMb, + }, + }); + + worker.postMessage({ + type: ENGINE_RUN_TASK, + task, + args, + }); + + return worker; +}; +export default createThread; diff --git a/packages/engine-multi/src/worker/child/runner.ts b/packages/engine-multi/src/worker/child/runner.ts new file mode 100644 index 000000000..1a9f6ba25 --- /dev/null +++ b/packages/engine-multi/src/worker/child/runner.ts @@ -0,0 +1,44 @@ +// this is the child process runtime environment +// it lists to run calls, spins up a thread, and forwards events +import type { WorkerEvent } from '../../api/call-worker'; +import { OOMError } from '../../errors'; +import { + ENGINE_REJECT_TASK, + ENGINE_RESOLVE_TASK, + ENGINE_RUN_TASK, +} from '../events'; +import createThread from './create-thread'; +import serializeError from '../../util/serialize-error'; + +process.on('message', async (evt: WorkerEvent) => { + if (evt.type === ENGINE_RUN_TASK) { + const { args, options } = evt; + run(evt.task, args, options); + } +}); + +const run = async (task: string, args: any[] = [], options = {}) => { + const thread = createThread(task, args, options); + + thread.on('error', (e) => { + // @ts-ignore + if (e.code === 'ERR_WORKER_OUT_OF_MEMORY') { + e = new OOMError(); + + process.send!({ + type: ENGINE_REJECT_TASK, + error: serializeError(e), + }); + } + }); + + thread.on('message', (evt) => { + process.send!(evt); + + if (evt.type === ENGINE_RESOLVE_TASK || evt.type === ENGINE_REJECT_TASK) { + // TODO wait for this to finish (or throw) + // then destroy the thread + thread.terminate(); + } + }); +}; diff --git a/packages/engine-multi/src/worker/events.ts b/packages/engine-multi/src/worker/events.ts index 0b3c4fb17..9e871e338 100644 --- a/packages/engine-multi/src/worker/events.ts +++ b/packages/engine-multi/src/worker/events.ts @@ -5,6 +5,12 @@ import { JSONLog } from '@openfn/logger'; import { Versions } from '../types'; +// events used by the internal thread runtime + +export const ENGINE_RUN_TASK = 'engine:run_task'; +export const ENGINE_RESOLVE_TASK = 'engine:resolve_task'; +export const ENGINE_REJECT_TASK = 'engine:reject_task'; + // These events are basically duplicates of the externally published ones // (ie those consumed by the lightning worker) // But I want them to be explicity named and typed to avoid confusion diff --git a/packages/engine-multi/src/worker/pool.ts b/packages/engine-multi/src/worker/pool.ts new file mode 100644 index 000000000..74b699259 --- /dev/null +++ b/packages/engine-multi/src/worker/pool.ts @@ -0,0 +1,280 @@ +import { fileURLToPath } from 'node:url'; +import { ChildProcess, fork } from 'node:child_process'; +import path from 'node:path'; + +import readline from 'node:readline/promises'; + +import { ExitError, OOMError, TimeoutError } from '../errors'; +import { + ENGINE_REJECT_TASK, + ENGINE_RESOLVE_TASK, + ENGINE_RUN_TASK, +} from './events'; +import { HANDLED_EXIT_CODE } from '../events'; +import { Logger } from '@openfn/logger'; + +type PoolOptions = { + capacity?: number; // defaults to 5 + maxWorkers?: number; // alias for capacity. Which is best? + env?: Record; // default environment for workers + + silent?: boolean; +}; + +type RunTaskEvent = { + type: typeof ENGINE_RUN_TASK; + task: string; + args: any[]; +}; + +export type ExecOpts = { + // for parity with workerpool, but this will change later + on?: (event: any) => void; + + timeout?: number; // ms + + memoryLimitMb?: number; +}; + +export type ChildProcessPool = Array; + +type QueuedTask = { + task: string; + args: any[]; + opts: ExecOpts; + resolve: (...args: any[]) => any; + reject: (...args: any[]) => any; +}; + +let root = path.dirname(fileURLToPath(import.meta.url)); +while (!root.endsWith('engine-multi')) { + root = path.resolve(root, '..'); +} +const envPath = path.resolve(root, 'dist/worker/child/runner.js'); + +// Restore a child at the first non-child process position +// this encourages the child to be reused before creating a new one +export const returnToPool = ( + pool: ChildProcessPool, + worker: ChildProcess | false +) => { + let idx = pool.findIndex((child) => child); + if (idx === -1) idx = pool.length; + pool.splice(idx, 0, worker); +}; + +// creates a new pool of workers which use the same script +function createPool(script: string, options: PoolOptions = {}, logger: Logger) { + const capacity = options.capacity || options.maxWorkers || 5; + + logger.debug(`pool: Creating new child process pool | capacity: ${capacity}`); + let destroyed = false; + + // a pool of processes + const pool: ChildProcessPool = new Array(capacity).fill(false); + + const queue: QueuedTask[] = []; + + // Keep track of all the workers we created + const allWorkers: Record = {}; + + const init = (child: ChildProcess | false) => { + if (!child) { + // create a new child process and load the module script into it + child = fork(envPath, [script], { + execArgv: ['--experimental-vm-modules', '--no-warnings'], + + env: options.env || {}, + + // This pipes the stderr stream onto the child, so we can read it later + stdio: ['ipc', 'ignore', 'pipe'], + }); + + logger.debug('pool: Created new child process', child.pid); + allWorkers[child.pid!] = child; + } else { + logger.debug('pool: Using existing child process', child.pid); + } + return child; + }; + + const finish = (worker: ChildProcess | false) => { + if (worker) { + logger.debug('pool: finished task in worker', worker.pid); + worker.removeAllListeners(); + } + + if (destroyed) { + killWorker(worker); + } else { + returnToPool(pool, worker); + + const next = queue.pop(); + if (next) { + // TODO actually I think if there's a queue we should empty it first + const { task, args, resolve, reject, opts } = next; + logger.debug('pool: Picking up deferred task', task); + + exec(task, args, opts).then(resolve).catch(reject); + } + } + }; + + const exec = (task: string, args: any[] = [], opts: ExecOpts = {}) => { + // TODO Throw if destroyed + if (destroyed) { + throw new Error('Worker destroyed'); + } + + const promise = new Promise(async (resolve, reject) => { + // TODO what should we do if a process in the pool dies, perhaps due to OOM? + const onExit = async (code: number) => { + if (code !== HANDLED_EXIT_CODE) { + logger.debug('pool: Worker exited unexpectedly'); + clearTimeout(timeout); + + // Read the stderr stream from the worked to see if this looks like an OOM error + const rl = readline.createInterface({ + input: worker.stderr!, + crlfDelay: Infinity, + }); + + try { + for await (const line of rl) { + logger.debug(line); + if (line.match(/JavaScript heap out of memory/)) { + reject(new OOMError()); + + killWorker(worker); + // restore a placeholder to the queue + finish(false); + return; + } + } + } catch (e) { + // do nothing + } + + reject(new ExitError(code)); + finish(worker); + } + }; + + let timeout: NodeJS.Timeout; + let didTimeout = false; + if (!pool.length) { + logger.debug('pool: Deferring task', task); + return queue.push({ task, args, opts, resolve, reject }); + } + + const worker = init(pool.pop()!); + + // Start a timeout running + if (opts.timeout && opts.timeout !== Infinity) { + // Setup a handler to kill the running worker after the timeout expires + const timeoutWorker = () => { + logger.debug( + `pool: Timed out task "${task}" in worker ${worker.pid} (${opts.timeout}ms)` + ); + // Disconnect the on-exit handler + worker.off('exit', onExit); + + // Kill the worker, just in case it's still processing away + killWorker(worker); + + // Restore a placeholder in the pool + pool.splice(0, 0, false); + + reject(new TimeoutError(opts.timeout!)); + }; + + timeout = setTimeout(() => { + timeoutWorker(); + }, opts.timeout); + } + + try { + logger.debug(`pool: Running task "${task}" in worker ${worker.pid}`); + worker.send({ + type: ENGINE_RUN_TASK, + task, + args, + options: { + memoryLimitMb: opts.memoryLimitMb, + }, + } as RunTaskEvent); + } catch (e) { + // swallow errors here + // this may occur if the inner worker is invalid + } + + worker.on('exit', onExit); + + worker.on('message', (evt: any) => { + // forward the message out of the pool + opts.on?.(evt); + + // Listen to a complete event to know the work is done + if (evt.type === ENGINE_RESOLVE_TASK) { + clearTimeout(timeout); + if (!didTimeout) { + resolve(evt.result); + + finish(worker); + } + } else if (evt.type === ENGINE_REJECT_TASK) { + // Note that this is an unexpected error + // Actual engine errors should return a workflow:error event and resolve + clearTimeout(timeout); + if (!didTimeout) { + const e = new Error(evt.error.message); + // @ts-ignore + e.severity = evt.error.severity; + e.name = evt.error.type; + reject(e); + + finish(worker); + } + } + }); + }); + + return promise; + }; + + const killWorker = (worker: ChildProcess | false) => { + if (worker) { + logger.debug('pool: destroying worker ', worker.pid); + worker.kill(); + delete allWorkers[worker.pid!]; + } + }; + + const destroy = (immediate = false) => { + destroyed = true; + + // Drain the pool + while (pool.length) { + killWorker(pool.pop()!); + } + + if (immediate) { + Object.values(allWorkers).forEach(killWorker); + } + // TODO set a timeout and force any outstanding workers to die + }; + + const api = { + exec, + destroy, + + // for debugging and testing + _pool: pool, + _queue: queue, + _allWorkers: allWorkers, + }; + + return api; +} + +export default createPool; diff --git a/packages/engine-multi/src/worker/worker-helper.ts b/packages/engine-multi/src/worker/thread/helpers.ts similarity index 66% rename from packages/engine-multi/src/worker/worker-helper.ts rename to packages/engine-multi/src/worker/thread/helpers.ts index 66c64d15f..b3489a338 100644 --- a/packages/engine-multi/src/worker/worker-helper.ts +++ b/packages/engine-multi/src/worker/thread/helpers.ts @@ -1,12 +1,16 @@ // utilities to run inside the worker // This is designed to minimize the amount of code we have to mock -import workerpool from 'workerpool'; -import { threadId } from 'node:worker_threads'; +import process from 'node:process'; + import createLogger, { SanitizePolicies } from '@openfn/logger'; -import * as workerEvents from './events'; -import { ExecutionError } from '../errors'; +import * as workerEvents from '../events'; +import { HANDLED_EXIT_CODE } from '../../events'; +import { ExecutionError, ExitError } from '../../errors'; + +import { publish } from './runtime'; +import serializeError from '../../util/serialize-error'; export const createLoggers = ( workflowId: string, @@ -15,9 +19,8 @@ export const createLoggers = ( const log = (message: string) => { // Apparently the json log stringifies the message // We don't really want it to do that - workerpool.workerEmit({ + publish(workerEvents.LOG, { workflowId, - type: workerEvents.LOG, message: JSON.parse(message), } as workerEvents.LogEvent); }; @@ -48,38 +51,27 @@ export const createLoggers = ( return { logger, jobLogger }; }; -export function publish( +// Execute wrapper function +export const execute = async ( workflowId: string, - type: T, - payload: Omit -) { - workerpool.workerEmit({ - workflowId, - threadId, - type, - ...payload, - }); -} - -async function helper(workflowId: string, execute: () => Promise) { - publish(workflowId, workerEvents.WORKFLOW_START, {}); - + executeFn: () => Promise +) => { const handleError = (err: any) => { - publish(workflowId, workerEvents.ERROR, { + publish(workerEvents.ERROR, { // @ts-ignore workflowId, - threadId, - // Map the error out of the thread in a serializable format - error: { - message: err.message, - type: err.subtype || err.type || err.name, - severity: err.severity || 'crash', - }, + error: serializeError(err), // TODO job id maybe }); }; + process.on('exit', (code: number) => { + if (code !== HANDLED_EXIT_CODE) { + handleError(new ExitError(code)); + } + }); + // catch-all for any uncaught errors, which likely come from asynchronous code // (probably in an adaptor) // Note that if this occurs after the execute promise resolved, @@ -91,24 +83,26 @@ async function helper(workflowId: string, execute: () => Promise) { e.severity = 'crash'; // Downgrade this to a crash because it's likely not our fault handleError(e); - // Close down the process justto be 100% sure that all async code stops + // Close down the process just to be 100% sure that all async code stops // This is in a timeout to give the emitted message time to escape // There is a TINY WINDOW in which async code can still run and affect the next attempt // This should all go away when we replace workerpool setTimeout(() => { - process.exit(111111); + process.exit(HANDLED_EXIT_CODE); }, 2); }); + publish(workerEvents.WORKFLOW_START, { + workflowId, + }); + try { - const result = await execute(); - publish(workflowId, workerEvents.WORKFLOW_COMPLETE, { state: result }); + const result = await executeFn(); + publish(workerEvents.WORKFLOW_COMPLETE, { workflowId, state: result }); // For tests return result; } catch (err: any) { handleError(err); } -} - -export default helper; +}; diff --git a/packages/engine-multi/src/worker/mock-worker.ts b/packages/engine-multi/src/worker/thread/mock-run.ts similarity index 84% rename from packages/engine-multi/src/worker/mock-worker.ts rename to packages/engine-multi/src/worker/thread/mock-run.ts index f4465c841..54d6de72b 100644 --- a/packages/engine-multi/src/worker/mock-worker.ts +++ b/packages/engine-multi/src/worker/thread/mock-run.ts @@ -7,9 +7,9 @@ * This mock handler does nothing and returns after a while, ignoring the source argument * and reading instructions out of state object. */ -import workerpool from 'workerpool'; -import helper, { createLoggers, publish } from './worker-helper'; -import * as workerEvents from './events'; +import { register, publish } from './runtime'; +import { execute, createLoggers } from './helpers'; +import * as workerEvents from '../events'; type MockJob = { id?: string; @@ -30,13 +30,15 @@ type MockExecutionPlan = { // This is a fake runtime handler which will return a fixed value, throw, and // optionally delay -function mock(plan: MockExecutionPlan) { +function mockRun(plan: MockExecutionPlan) { const [job] = plan.jobs; const { jobLogger } = createLoggers(plan.id!); + const workflowId = plan.id; return new Promise((resolve) => { const jobId = job.id || ''; setTimeout(async () => { - publish(plan.id, workerEvents.JOB_START, { + publish(workerEvents.JOB_START, { + workflowId, jobId, versions: { node: '1', runtime: '1', compiler: '1', engine: '1' }, }); @@ -63,7 +65,8 @@ function mock(plan: MockExecutionPlan) { }; } } - publish(plan.id, workerEvents.JOB_COMPLETE, { + publish(workerEvents.JOB_COMPLETE, { + workflowId, jobId, duration: 100, state, @@ -75,7 +78,7 @@ function mock(plan: MockExecutionPlan) { }); } -workerpool.worker({ +register({ run: async (plan: MockExecutionPlan, _options?: any) => - helper(plan.id, () => mock(plan)), + execute(plan.id, () => mockRun(plan)), }); diff --git a/packages/engine-multi/src/worker/thread/run.ts b/packages/engine-multi/src/worker/thread/run.ts new file mode 100644 index 000000000..196853859 --- /dev/null +++ b/packages/engine-multi/src/worker/thread/run.ts @@ -0,0 +1,64 @@ +// This is the run command that will be executed inside the worker thread +// Most of the heavy lifting is actually handled by execute +import run from '@openfn/runtime'; +import type { ExecutionPlan } from '@openfn/runtime'; +import type { SanitizePolicies } from '@openfn/logger'; +import type { NotifyEvents } from '@openfn/runtime'; + +import { register, publish } from './runtime'; +import { execute, createLoggers } from './helpers'; +import serializeError from '../../util/serialize-error'; +import { JobErrorPayload } from '../../events'; + +type RunOptions = { + adaptorPaths: Record; + whitelist?: RegExp[]; + sanitize: SanitizePolicies; + statePropsToRemove?: string[]; + // TODO timeout +}; + +const eventMap = { + 'job-error': (evt: JobErrorPayload) => ({ + ...evt, + error: serializeError(evt.error), + }), +}; + +register({ + run: (plan: ExecutionPlan, runOptions: RunOptions) => { + const { adaptorPaths, whitelist, sanitize, statePropsToRemove } = + runOptions; + const { logger, jobLogger } = createLoggers(plan.id!, sanitize); + // TODO I would like to pull these options out of here + + const options = { + // disable the run/step timeout + timeout: 0, + strict: false, + logger, + jobLogger, + linker: { + modules: adaptorPaths, + whitelist, + cacheKey: plan.id, + }, + statePropsToRemove, + callbacks: { + // TODO: this won't actually work across the worker boundary + // For now I am preloading credentials + // resolveCredential: async (id: string) => {}, + notify: (name: NotifyEvents, payload: any) => { + // @ts-ignore + const mappedPayload = eventMap[name]?.(payload) ?? payload; + publish(`worker:${name}`, { + workflowId: plan.id, + ...mappedPayload, + }); + }, + }, + }; + + return execute(plan.id!, () => run(plan, undefined, options)); + }, +}); diff --git a/packages/engine-multi/src/worker/thread/runtime.ts b/packages/engine-multi/src/worker/thread/runtime.ts new file mode 100644 index 000000000..4f772923a --- /dev/null +++ b/packages/engine-multi/src/worker/thread/runtime.ts @@ -0,0 +1,68 @@ +// this is the core runtime for inside the thread +import { parentPort, threadId } from 'node:worker_threads'; +import { + ENGINE_REJECT_TASK, + ENGINE_RESOLVE_TASK, + ENGINE_RUN_TASK, +} from '../events'; + +type TaskRegistry = Record Promise>; + +export const processId = process.pid; + +export { threadId } from 'node:worker_threads'; + +const tasks: TaskRegistry = { + // startup validation script + handshake: async () => { + return true; + }, +}; + +export const register = (newTasks: TaskRegistry) => { + Object.assign(tasks, newTasks); +}; + +type Event = { + type: string; + threadId: number; + processId: number; + [key: string]: any; +}; + +export const publish = ( + type: string, + payload: Omit +) => { + parentPort!.postMessage({ + type, + threadId, + processId, + ...payload, + }); +}; + +const run = (task: string, args: any[]) => { + tasks[task](...args) + .then((result) => { + publish(ENGINE_RESOLVE_TASK, { + result, + }); + }) + .catch((e) => { + publish(ENGINE_REJECT_TASK, { + error: { + severity: e.severity || 'crash', + message: e.message, + type: e.type || e.name, + }, + }); + }); +}; + +parentPort!.on('message', async (evt) => { + if (evt.type === ENGINE_RUN_TASK) { + const args = evt.args || []; + run(evt.task, args); + } +}); diff --git a/packages/engine-multi/src/worker/worker.ts b/packages/engine-multi/src/worker/worker.ts deleted file mode 100644 index 203eae4c8..000000000 --- a/packages/engine-multi/src/worker/worker.ts +++ /dev/null @@ -1,59 +0,0 @@ -// Dedicated worker for running jobs -// Security thoughts: the process inherits the node command arguments -// (it has to for experimental modules to work) -// Is this a concern? If secrets are passed in they could be visible -// The sandbox should help - -// What about imports in a worker thread? -// Is there an overhead in reimporting stuff (presumably!) -// Should we actually be pooling workers by adaptor[+version] -// Does this increase the danger of sharing state between jobs? -// Suddenly it's a liability for the same environent in the same adaptor -// to be running the same jobs - break out of the sandbox and who knows what you can get -import workerpool from 'workerpool'; -import run from '@openfn/runtime'; -import type { ExecutionPlan } from '@openfn/runtime'; -import type { SanitizePolicies } from '@openfn/logger'; -import helper, { createLoggers, publish } from './worker-helper'; -import { NotifyEvents } from '@openfn/runtime'; - -type RunOptions = { - adaptorPaths: Record; - whitelist?: RegExp[]; - sanitize: SanitizePolicies; - statePropsToRemove?: string[]; - // TODO timeout -}; - -workerpool.worker({ - // startup validation script - handshake: () => true, - - run: (plan: ExecutionPlan, runOptions: RunOptions) => { - const { adaptorPaths, whitelist, sanitize, statePropsToRemove } = - runOptions; - const { logger, jobLogger } = createLoggers(plan.id!, sanitize); - const options = { - strict: false, - logger, - jobLogger, - linker: { - modules: adaptorPaths, - whitelist, - cacheKey: plan.id, - }, - statePropsToRemove, - callbacks: { - // TODO: this won't actually work across the worker boundary - // For now I am preloading credentials - // resolveCredential: async (id: string) => {}, - notify: (name: NotifyEvents, payload: any) => { - // convert runtime notify events to internal engine events - publish(plan.id!, `worker:${name}`, payload); - }, - }, - }; - - return helper(plan.id!, () => run(plan, undefined, options)); - }, -}); diff --git a/packages/engine-multi/test/__repo__/node_modules/helper_1.0.0/index.cjs b/packages/engine-multi/test/__repo__/node_modules/helper_1.0.0/index.cjs index a12a1a2b0..42a79abc5 100644 --- a/packages/engine-multi/test/__repo__/node_modules/helper_1.0.0/index.cjs +++ b/packages/engine-multi/test/__repo__/node_modules/helper_1.0.0/index.cjs @@ -1,7 +1,6 @@ module.exports = { exit: function() { return function (state) { - console.log('exiting process') process.exit(42) return state; } diff --git a/packages/engine-multi/test/api.test.ts b/packages/engine-multi/test/api.test.ts index 7a9091dac..fd7fb85aa 100644 --- a/packages/engine-multi/test/api.test.ts +++ b/packages/engine-multi/test/api.test.ts @@ -1,7 +1,6 @@ import test from 'ava'; import createAPI from '../src/api'; import { createMockLogger } from '@openfn/logger'; -import { PURGE } from '../src/events'; import pkg from '../package.json' assert { type: 'json' }; import { RuntimeEngine } from '../src/types'; @@ -51,10 +50,37 @@ test.serial('engine api uses default options', async (t) => { t.deepEqual(api.options.statePropsToRemove, ['configuration', 'response']); t.false(api.options.noCompile); - t.true(api.options.purge); t.truthy(api.options.whitelist); }); +test.serial('engine api uses custom options', async (t) => { + const options = { + logger, // no test + + repoDir: 'a/b/c', + whitelist: ['/@openfn/'], + + // noCompile + // autoinstall + + maxWorkers: 29, + memoryLimitMb: 99, + attemptTimeoutMs: 33, + statePropsToRemove: ['z'], + }; + + api = await createAPI(options); + + t.truthy(api.options); + + t.is(api.options.repoDir, 'a/b/c'); + t.true(api.options.whitelist![0] instanceof RegExp); + t.is(api.options.maxWorkers, 29); + t.is(api.options.memoryLimitMb, 99); + t.is(api.options.attemptTimeoutMs, 33); + t.deepEqual(api.options.statePropsToRemove, ['z']); +}); + // Note that this runs with the actual runtime worker // I won't want to do deep testing on execute here - I just want to make sure the basic // exeuction functionality is working. It's more a test of the api surface than the inner @@ -119,32 +145,3 @@ test.serial('should listen to workflow-complete', async (t) => { }); }); }); - -test.serial('should purge workers after a single run', async (t) => { - return new Promise(async (done) => { - api = await createAPI({ - logger, - // Disable compilation - compile: { - skip: true, - }, - }); - - const plan = { - id: 'a', - jobs: [ - { - expression: 'export default [s => s]', - // with no adaptor it shouldn't try to autoinstall - }, - ], - }; - - api.on(PURGE, () => { - t.pass('workers purged'); - done(); - }); - - api.execute(plan); - }); -}); diff --git a/packages/engine-multi/test/api/call-worker.test.ts b/packages/engine-multi/test/api/call-worker.test.ts index 1887d877c..314314527 100644 --- a/packages/engine-multi/test/api/call-worker.test.ts +++ b/packages/engine-multi/test/api/call-worker.test.ts @@ -1,122 +1,166 @@ import test from 'ava'; import path from 'node:path'; import EventEmitter from 'node:events'; -import { Promise as WorkerPoolPromise } from 'workerpool'; +import { createMockLogger } from '@openfn/logger'; import initWorkers from '../../src/api/call-worker'; import { EngineAPI } from '../../src/types'; -import { PURGE } from '../../src/events'; -let api = new EventEmitter() as EngineAPI; +let engine = new EventEmitter() as EngineAPI; -const workerPath = path.resolve('src/test/worker-functions.js'); +const workerPath = path.resolve('dist/test/worker-functions.js'); +const logger = createMockLogger(); test.before(() => { - initWorkers(api, workerPath); + const { callWorker, closeWorkers } = initWorkers( + workerPath, + { + maxWorkers: 1, + }, + logger + ); + engine.callWorker = callWorker; + engine.closeWorkers = closeWorkers; }); -test.after(() => api.closeWorkers()); +test.after(() => engine.closeWorkers()); test.serial('initWorkers should add a callWorker function', (t) => { - t.assert(typeof api.callWorker === 'function'); + t.assert(typeof engine.callWorker === 'function'); }); test.serial('callWorker should return the default result', async (t) => { - const result = await api.callWorker('test', []); + const result = await engine.callWorker('test', []); t.is(result, 42); }); test.serial('callWorker should return a custom result', async (t) => { - const result = await api.callWorker('test', [84]); + const result = await engine.callWorker('test', [84]); t.is(result, 84); }); test.serial('callWorker should trigger an event callback', async (t) => { - return new Promise((done) => { - const onCallback = ({ result }) => { - t.is(result, 11); - done(); - }; + const onCallback = ({ result }) => { + t.is(result, 11); + }; - api.callWorker('test', [11], { message: onCallback }); - }); + await engine.callWorker('test', [11], { 'test-message': onCallback }); }); test.serial( 'callWorker should throw TimeoutError if it times out', async (t) => { - await t.throwsAsync(() => api.callWorker('timeout', [11], {}, 10), { - instanceOf: WorkerPoolPromise.TimeoutError, - }); + await t.throwsAsync( + () => engine.callWorker('wait', [5000], {}, { timeout: 100 }), + { + name: 'TimeoutError', + } + ); } ); -// Dang, this doesn't work, the worker threads run in the same process -test.skip('callWorker should execute with a different process id', async (t) => { - return new Promise((done) => { - const onCallback = ({ pid }) => { - t.not(process.pid, pid); - done(); +test.serial( + 'callWorker should not freak out after a timeout error', + async (t) => { + await t.throwsAsync( + () => engine.callWorker('wait', [5000], {}, { timeout: 100 }), + { + name: 'TimeoutError', + } + ); + + const onCallback = (evt) => { + t.pass('all ok'); }; - api.callWorker('test', [], { message: onCallback }); + await engine.callWorker('test', [], { 'test-message': onCallback }); + } +); + +test.serial('callWorker should execute in one process', async (t) => { + const ids: number[] = []; + + await engine.callWorker('test', [], { + 'test-message': ({ processId }) => { + ids.push(processId); + }, }); + + await engine.callWorker('test', [], { + 'test-message': ({ processId }) => { + ids.push(processId); + }, + }); + + t.log(ids); + t.is(ids[0], ids[1]); }); -test.serial('callWorker should execute in a different process', async (t) => { - return new Promise((done) => { - // @ts-ignore - process.scribble = 'xyz'; +test.serial('callWorker should execute in two different threads', async (t) => { + const ids: number[] = []; - const onCallback = ({ scribble }) => { - // @ts-ignore - t.not(process.scribble, scribble); - done(); - }; + await engine.callWorker('test', [], { + 'test-message': ({ threadId }) => { + ids.push(threadId); + }, + }); - api.callWorker('test', [], { message: onCallback }); + await engine.callWorker('test', [], { + 'test-message': ({ threadId }) => { + ids.push(threadId); + }, }); + + t.log(ids); + t.not(ids[0], ids[1]); }); test.serial( - 'If null env is passed, worker thread should be able to access parent env', + 'Even if null env is passed, worker thread should not be able to access parent env', async (t) => { - const badAPI = {} as EngineAPI; const env = null; - initWorkers(badAPI, workerPath, { env }); + const { callWorker, closeWorkers } = initWorkers( + workerPath, + { + env, + }, + logger + ); // Set up a special key on process.env const code = '76ytghjs'; process.env.TEST = code; // try and read that key inside the thread - const result = await badAPI.callWorker('readEnv', ['TEST']); + const result = await callWorker('readEnv', ['TEST']); - // voila, the kingdom is yours - t.is(result, code); + // Sorry pal, no dice + t.not(result, code); - badAPI.closeWorkers(); + closeWorkers(); } ); test.serial( 'By default, worker thread cannot access parent env if env not set (no options arg)', async (t) => { - const defaultAPI = {} as EngineAPI; - - initWorkers(defaultAPI, workerPath /* no options passed*/); + const { callWorker, closeWorkers } = initWorkers( + workerPath, + undefined, + logger + ); // Set up a special key on process.env const code = '76ytghjs'; process.env.TEST = code; // try and read that key inside the thread - const result = await defaultAPI.callWorker('readEnv', ['TEST']); + const result = await callWorker('readEnv', ['TEST']); // No fish t.is(result, undefined); - defaultAPI.closeWorkers(); + closeWorkers(); } ); @@ -125,42 +169,49 @@ test.serial( async (t) => { const defaultAPI = {} as EngineAPI; - initWorkers(defaultAPI, workerPath, { maxWorkers: 1 }); + const { callWorker, closeWorkers } = initWorkers( + workerPath, + { maxWorkers: 1 }, + logger + ); // Set up a special key on process.env const code = '76ytghjs'; process.env.TEST = code; // try and read that key inside the thread - const result = await defaultAPI.callWorker('readEnv', ['TEST']); + const result = await callWorker('readEnv', ['TEST']); // No fish t.is(result, undefined); - defaultAPI.closeWorkers(); + closeWorkers(); } ); test.serial( 'Worker thread cannot access parent env if custom env is passted', async (t) => { - const customAPI = {} as EngineAPI; const env = { NODE_ENV: 'production' }; - initWorkers(customAPI, workerPath, { env }); + const { callWorker, closeWorkers } = initWorkers( + workerPath, + { env }, + logger + ); // Set up a special key on process.env const code = '76ytghjs'; process.env.TEST = code; // try and read that key inside the thread - const result = await customAPI.callWorker('readEnv', ['TEST']); + const result = await callWorker('readEnv', ['TEST']); // No fish t.is(result, undefined); - const result2 = await customAPI.callWorker('readEnv', ['NODE_ENV']); + const result2 = await callWorker('readEnv', ['NODE_ENV']); t.is(result2, 'production'); - customAPI.closeWorkers(); + closeWorkers(); } ); diff --git a/packages/engine-multi/test/api/compile.test.ts b/packages/engine-multi/test/api/compile.test.ts deleted file mode 100644 index 214a689ad..000000000 --- a/packages/engine-multi/test/api/compile.test.ts +++ /dev/null @@ -1,8 +0,0 @@ -import test from 'ava'; - -// Because compile is going to change I'm not too excited about doing a lot of work here right nopw -// just filling the space - -test.todo('should compile multiple expressions in a job'); - -test.todo('should log as it compiles'); diff --git a/packages/engine-multi/test/api/execute.test.ts b/packages/engine-multi/test/api/execute.test.ts index a99ea0e40..bdc039be1 100644 --- a/packages/engine-multi/test/api/execute.test.ts +++ b/packages/engine-multi/test/api/execute.test.ts @@ -1,6 +1,5 @@ import path from 'node:path'; import test from 'ava'; -import { WorkflowState } from '../../src/types'; import initWorkers from '../../src/api/call-worker'; import execute from '../../src/api/execute'; import { createMockLogger } from '@openfn/logger'; @@ -12,21 +11,27 @@ import { WORKFLOW_LOG, WORKFLOW_START, } from '../../src/events'; -import { RTEOptions } from '../../src/api'; import ExecutionContext from '../../src/classes/ExecutionContext'; -import loadVersions from '../../src/util/load-versions'; -const workerPath = path.resolve('dist/worker/mock.js'); +import type { RTEOptions } from '../../src/api'; +import type { WorkflowState } from '../../src/types'; +import { ExecuteOptions } from '../../src/engine'; + +const workerPath = path.resolve('dist/test/mock-run.js'); const createContext = ({ state, options }) => { + const logger = createMockLogger(); + const { callWorker } = initWorkers(workerPath, {}, logger); + const ctx = new ExecutionContext({ state: state || { workflowId: 'x' }, - logger: createMockLogger(), - callWorker: () => {}, + logger, + callWorker, options, - versions: loadVersions(), }); - initWorkers(ctx, workerPath); + + ctx.callWorker = callWorker; + return ctx; }; @@ -78,6 +83,32 @@ test.serial('should emit a workflow-start event', async (t) => { t.is(workflowStart.workflowId, 'x'); }); +test.serial('should emit a log event with the memory limit', async (t) => { + const state = { + id: 'x', + plan, + } as WorkflowState; + + const logs = []; + + const context = createContext({ + state, + options: { + ...options, + memoryLimitMb: 666, + }, + }); + + context.on(WORKFLOW_LOG, (evt) => { + logs.push(evt); + }); + + await execute(context); + + const log = logs.find(({ name }) => name === 'RTE'); + t.is(log.message[0], 'Memory limit: 666mb'); +}); + test.serial('should emit a workflow-complete event', async (t) => { let workflowComplete; const state = { @@ -197,9 +228,9 @@ test.serial('should emit error on timeout', async (t) => { }, } as WorkflowState; - const wfOptions = { + const wfOptions: ExecuteOptions = { ...options, - timeout: 10, + attemptTimeoutMs: 10, }; let event; @@ -266,3 +297,27 @@ test.serial('should emit CompileError if compilation fails', async (t) => { await execute(context); }); + +test.serial('should stringify the whitelist array', async (t) => { + let passedOptions; + + const state = { + id: 'x', + plan, + } as WorkflowState; + + const opts = { + ...options, + whitelist: [/abc/], + }; + + const context = createContext({ state, options: opts }); + context.callWorker = (_command, args) => { + passedOptions = args[1]; + }; + + await execute(context); + + t.truthy(passedOptions); + t.deepEqual(passedOptions.whitelist, ['/abc/']); +}); diff --git a/packages/engine-multi/test/api/validate-worker.test.ts b/packages/engine-multi/test/api/validate-worker.test.ts index 699d75ba6..5566c4972 100644 --- a/packages/engine-multi/test/api/validate-worker.test.ts +++ b/packages/engine-multi/test/api/validate-worker.test.ts @@ -1,26 +1,30 @@ import test from 'ava'; import path from 'node:path'; +import { createMockLogger } from '@openfn/logger'; import initWorkers from '../../src/api/call-worker'; -import { EngineAPI } from '../../src/types'; import validateWorker from '../../src/api/validate-worker'; -let api = {} as EngineAPI; - -const workerPath = path.resolve('src/test/worker-functions.js'); - -test.beforeEach(() => { - api = {} as EngineAPI; -}); +const logger = createMockLogger(); test('validate should not throw if the worker path is valid', async (t) => { - initWorkers(api, workerPath); - await t.notThrowsAsync(() => validateWorker(api)); + const workerPath = path.resolve('dist/test/worker-functions.js'); + const api = initWorkers(workerPath, {}, logger); + await t.notThrowsAsync(() => validateWorker(api as any, 500)); }); test('validate should throw if the worker path is invalid', async (t) => { - initWorkers(api, 'a/b/c.js'); - await t.throwsAsync(() => validateWorker(api), { + const workerPath = 'a/b/c.js'; + const api = initWorkers(workerPath, { silent: true }, logger); + await t.throwsAsync(() => validateWorker(api as any, 500), { + message: 'Invalid worker path', + }); +}); + +test('validate should throw if the worker does not respond to a handshake', async (t) => { + const workerPath = path.resolve('src/test/bad-worker.js'); + const api = initWorkers(workerPath, { silent: true }, logger); + await t.throwsAsync(() => validateWorker(api as any, 500), { message: 'Invalid worker path', }); }); diff --git a/packages/engine-multi/test/engine.test.ts b/packages/engine-multi/test/engine.test.ts index a38d9e957..b4d79ae70 100644 --- a/packages/engine-multi/test/engine.test.ts +++ b/packages/engine-multi/test/engine.test.ts @@ -2,7 +2,7 @@ import test from 'ava'; import path from 'node:path'; import { createMockLogger } from '@openfn/logger'; -import createEngine from '../src/engine'; +import createEngine, { ExecuteOptions } from '../src/engine'; import * as e from '../src/events'; import { ExecutionPlan } from '@openfn/runtime'; @@ -14,14 +14,13 @@ const options = { // This uses the mock worker, not the actual runtime // It will still exercise all the lifecycle logic found in the worker-helper, // Just not the runtime logic - workerPath: path.resolve('dist/mock-worker.js'), + workerPath: path.resolve('dist/test/mock-run.js'), logger, repoDir: '.', // doesn't matter for the mock noCompile: true, // messy - needed to allow an expression to be passed as json autoinstall: { handleIsInstalled: async () => true, }, - purge: true, }; let engine; @@ -67,11 +66,11 @@ test.serial('get workflow state', async (t) => { test.serial('use the default worker path', async (t) => { engine = await createEngine({ logger, repoDir: '.' }); - t.true(engine.workerPath.endsWith('worker/worker.js')); + t.true(engine.workerPath.endsWith('worker/thread/run.js')); }); test.serial('use a custom worker path', async (t) => { - const workerPath = path.resolve('src/test/worker-functions.js'); + const workerPath = path.resolve('dist/test/worker-functions.js'); engine = await createEngine(options, workerPath); t.is(engine.workerPath, workerPath); }); @@ -80,7 +79,7 @@ test.serial( 'execute with test worker and trigger workflow-complete', async (t) => { return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); + const p = path.resolve('dist/test/worker-functions.js'); engine = await createEngine(options, p); const plan = { @@ -105,7 +104,7 @@ test.serial( test.serial('execute does not return internal state stuff', async (t) => { return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); + const p = path.resolve('dist/test/worker-functions.js'); engine = await createEngine(options, p); const plan = { @@ -140,7 +139,7 @@ test.serial('execute does not return internal state stuff', async (t) => { test.serial('listen to workflow-complete', async (t) => { return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); + const p = path.resolve('dist/test/worker-functions.js'); engine = await createEngine(options, p); const plan = { @@ -167,7 +166,7 @@ test.serial('listen to workflow-complete', async (t) => { test.serial('call listen before execute', async (t) => { return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); + const p = path.resolve('dist/test/worker-functions.js'); engine = await createEngine(options, p); const plan = { @@ -193,7 +192,7 @@ test.serial('call listen before execute', async (t) => { test.serial('catch and emit errors', async (t) => { return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); + const p = path.resolve('dist/test/worker-functions.js'); engine = await createEngine(options, p); const plan = { @@ -216,117 +215,70 @@ test.serial('catch and emit errors', async (t) => { }); }); -test.serial('timeout the whole attempt and emit an error', async (t) => { - return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); - engine = await createEngine(options, p); - - const plan = { - id: 'a', - jobs: [ - { - expression: 'while(true) {}', - }, - ], - }; - - const opts = { - timeout: 10, - }; - - engine.listen(plan.id, { - [e.WORKFLOW_ERROR]: ({ message, type }) => { - t.is(type, 'TimeoutError'); - t.regex(message, /failed to return within 10ms/); - done(); - }, - }); +test.serial( + 'timeout the whole attempt and emit an error (timeout on attempt)', + async (t) => { + return new Promise(async (done) => { + const p = path.resolve('dist/test/worker-functions.js'); + engine = await createEngine(options, p); - engine.execute(plan, opts); - }); -}); + const plan = { + id: 'a', + jobs: [ + { + expression: 'while(true) {}', + }, + ], + }; -test.serial('Purge workers when a run is complete', async (t) => { - return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); - engine = await createEngine(options, p); + const opts: ExecuteOptions = { + attemptTimeoutMs: 10, + }; - const plan = { - id: 'a', - jobs: [ - { - expression: '34', + engine.listen(plan.id, { + [e.WORKFLOW_ERROR]: ({ message, type }) => { + t.is(type, 'TimeoutError'); + t.regex(message, /failed to return within 10ms/); + done(); }, - ], - }; + }); - engine.on(e.PURGE, () => { - t.pass('purge event called'); - done(); + engine.execute(plan, opts); }); + } +); - engine.execute(plan); - }); -}); - -test.serial('Purge workers when run errors', async (t) => { - return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); - engine = await createEngine(options, p); - - const plan = { - id: 'a', - jobs: [ +test.serial( + 'timeout the whole attempt and emit an error (default engine timeout) ', + async (t) => { + return new Promise(async (done) => { + const p = path.resolve('dist/test/worker-functions.js'); + engine = await createEngine( { - expression: 'throw new Error("test")', + ...options, + attemptTimeoutMs: 22, }, - ], - }; - - engine.on(e.PURGE, () => { - t.pass('purge event called'); - done(); - }); + p + ); - engine.execute(plan); - }); -}); - -test.serial("Don't purge if purge is false", async (t) => { - return new Promise(async (done) => { - const p = path.resolve('src/test/worker-functions.js'); - engine = await createEngine( - { - ...options, - purge: false, - }, - p - ); + const plan = { + id: 'a', + jobs: [ + { + expression: 'while(true) {}', + }, + ], + }; - const plan = { - id: 'a', - jobs: [ - { - expression: '34', + engine.listen(plan.id, { + [e.WORKFLOW_ERROR]: ({ message, type }) => { + t.is(type, 'TimeoutError'); + t.regex(message, /failed to return within 22ms/); + done(); }, - ], - }; - - engine.on(e.PURGE, () => { - t.fail('purge event called'); - done(); - }); + }); - engine.execute(plan).on(e.WORKFLOW_COMPLETE, () => { - setTimeout(() => { - t.pass('no purge called within 50ms'); - done(); - }, 50); + engine.execute(plan); }); - }); -}); - -// I'm not actually going to use the destroy API (not for graceful shutdown anyway) -// So it doesn't feel too important to implement these tests -test.todo('destroy immediately, killing active workflows'); -test.todo('destroy gracefully, allowing active workflows to complete'); + } +); diff --git a/packages/engine-multi/test/errors.test.ts b/packages/engine-multi/test/errors.test.ts index ad6f4f9e3..9b3bce873 100644 --- a/packages/engine-multi/test/errors.test.ts +++ b/packages/engine-multi/test/errors.test.ts @@ -72,7 +72,7 @@ test.serial('syntax error: illegal throw', (t) => { }); }); -test.serial('syntax error: oom error', (t) => { +test.serial('thread oom error', (t) => { return new Promise((done) => { const plan = { id: 'a', @@ -92,13 +92,42 @@ test.serial('syntax error: oom error', (t) => { engine.execute(plan).on(WORKFLOW_ERROR, (evt) => { t.is(evt.type, 'OOMError'); t.is(evt.severity, 'kill'); + t.is(evt.message, 'Run exceeded maximum memory usage'); + done(); + }); + }); +}); + +test.serial('vm oom error', (t) => { + return new Promise((done) => { + const plan = { + id: 'b', + jobs: [ + { + expression: `export default [(s) => { + s.a = []; + while(true) { + s.a.push(new Array(1e8).fill("oom")); + } + return s; + }]`, + }, + ], + }; + + engine.execute(plan).on(WORKFLOW_ERROR, (evt) => { + t.is(evt.type, 'OOMError'); + t.is(evt.severity, 'kill'); + t.is(evt.message, 'Run exceeded maximum memory usage'); done(); }); }); }); // https://github.com/OpenFn/kit/issues/509 -test.serial('execution error from async code', (t) => { +// TODO this passes standalone, but will trigger an exception in the next test +// This should start working again once we spin up the worker thread +test.serial.skip('execution error from async code', (t) => { return new Promise((done) => { const plan = { id: 'a', @@ -131,7 +160,7 @@ test.serial('emit a crash error on process.exit()', (t) => { jobs: [ { adaptor: 'helper@1.0.0', - expression: `export default [exit()]`, + expression: 'export default [exit()]', }, ], }; diff --git a/packages/engine-multi/test/integration.test.ts b/packages/engine-multi/test/integration.test.ts index b9379ce97..129770fe7 100644 --- a/packages/engine-multi/test/integration.test.ts +++ b/packages/engine-multi/test/integration.test.ts @@ -192,6 +192,30 @@ test.serial('run without error if no state is returned', (t) => { }); }); +test.serial('errors get nicely serialized', (t) => { + return new Promise(async (done) => { + api = await createAPI({ + logger, + }); + + const plan = createPlan([ + { + expression: `${withFn}fn((s) => s.data.x.y)`, + }, + ]); + + api.execute(plan).on('job-error', (evt) => { + t.is(evt.error.type, 'TypeError'); + t.is(evt.error.severity, 'fail'); + t.is( + evt.error.message, + "TypeError: Cannot read properties of undefined (reading 'y')" + ); + done(); + }); + }); +}); + test.serial( 'execute should remove the configuration and response keys', (t) => { @@ -309,7 +333,14 @@ test.serial('preload credentials', (t) => { }); test.serial('accept initial state', (t) => { - return new Promise((done) => { + return new Promise(async (done) => { + api = await createAPI({ + logger, + compile: { + skip: true, + }, + }); + const plan = createPlan(); // important! The runtime must use both x and y as initial state diff --git a/packages/engine-multi/test/security.test.ts b/packages/engine-multi/test/security.test.ts new file mode 100644 index 000000000..45b42634d --- /dev/null +++ b/packages/engine-multi/test/security.test.ts @@ -0,0 +1,85 @@ +/** + * These tests run code in the engine and demonstate + * the natural sandboxing provided by our architecture + * + * These do NOT use the runtime engine or the runtime sandbox + */ + +import test from 'ava'; +import path from 'node:path'; +import { createMockLogger } from '@openfn/logger'; +import createEngine from '../src/engine'; + +const logger = createMockLogger('', { level: 'debug' }); + +const options = { + logger, + env: { PRIVATE: 'xyz' }, + repoDir: '.', + // Important: ensure there's only one child process in the pool for these tests + maxWorkers: 1, +}; + +let engine; + +test.before(async () => { + engine = await createEngine( + options, + path.resolve('dist/test/worker-functions.js') + ); +}); + +test.afterEach(async () => { + logger._reset(); +}); + +test.serial('parent env is hidden from sandbox', async (t) => { + // Note that these use the underlying callworker function, + // not execute + const result2 = await engine.callWorker('readEnv', ['PRIVATE']); + t.falsy(result2, 'xyz'); + + delete process.env.TEST; +}); + +test.serial('sandbox does not share a global scope', async (t) => { + t.is(global.x, undefined); + + // Set a global inside the first task + await engine.callWorker('setGlobalX', [9]); + + // (this should not affect us outside) + t.is(global.x, undefined); + + // the next task should not be able to read that value + const result = await engine.callWorker('getGlobalX', []); + t.falsy(result); +}); + +test.serial( + 'statically imported modules should isolate scope and state across runs', + async (t) => { + const count1 = await engine.callWorker('incrementStatic', []); + t.is(count1, 1); + + const count2 = await engine.callWorker('incrementStatic', []); + t.is(count2, 1); + + const count3 = await engine.callWorker('incrementStatic', []); + t.is(count3, 1); + } +); + +test.serial( + 'dynamically imported modules should isolate scope and state across runs', + async (t) => { + const count1 = await engine.callWorker('incrementDynamic', []); + t.is(count1, 1); + + const count2 = await engine.callWorker('incrementDynamic', []); + t.is(count2, 1); + + const count3 = await engine.callWorker('incrementDynamic', []); + t.is(count3, 1); + } +); diff --git a/packages/engine-multi/test/worker/mock-worker.test.ts b/packages/engine-multi/test/worker/mock-worker.test.ts index a611a167b..19486175e 100644 --- a/packages/engine-multi/test/worker/mock-worker.test.ts +++ b/packages/engine-multi/test/worker/mock-worker.test.ts @@ -7,13 +7,22 @@ */ import path from 'node:path'; import test from 'ava'; -import workerpool from 'workerpool'; +import createPool from '../../src/worker/pool'; import { createPlan } from '../../src/test/util'; import * as e from '../../src/worker/events'; +import { createMockLogger } from '@openfn/logger'; -const workers = workerpool.pool(path.resolve('dist/worker/mock.js')); +const logger = createMockLogger('', { level: 'debug' }); + +const workers = createPool( + path.resolve('dist/test/mock-run.js'), + { + capacity: 1, + }, + logger +); test('execute a mock plan inside a worker thread', async (t) => { const plan = createPlan(); @@ -91,14 +100,16 @@ test('execute a mock plan with delay', async (t) => { }); await workers.exec('run', [plan]); const elapsed = new Date().getTime() - start; - t.assert(elapsed > 50); + t.log(elapsed); + t.assert(elapsed > 40); }); test('Publish workflow-start event', async (t) => { const plan = createPlan(); + plan.id = 'xx'; let didFire = false; await workers.exec('run', [plan], { - on: ({ type, ...args }) => { + on: ({ type }) => { if (type === e.WORKFLOW_START) { didFire = true; } diff --git a/packages/engine-multi/test/worker/pool.test.ts b/packages/engine-multi/test/worker/pool.test.ts new file mode 100644 index 000000000..9695e71aa --- /dev/null +++ b/packages/engine-multi/test/worker/pool.test.ts @@ -0,0 +1,452 @@ +import test from 'ava'; +import path from 'node:path'; + +import createPool, { + ChildProcessPool, + returnToPool, +} from '../../src/worker/pool'; +import { createMockLogger } from '@openfn/logger'; + +const workerPath = path.resolve('dist/test/worker-functions.js'); + +const logger = createMockLogger(); + +test('create a pool with empty processes (5 by default)', (t) => { + const pool = createPool('.', {}, logger); + + t.is(pool._pool.length, 5); + t.true(pool._pool.every((f) => f === false)); +}); + +test('create a pool with 10 empty processes', (t) => { + const pool = createPool('.', { capacity: 10 }, logger); + + t.is(pool._pool.length, 10); + t.true(pool._pool.every((f) => f === false)); +}); + +test('run a task and return the result', async (t) => { + const pool = createPool(workerPath, {}, logger); + const result = await pool.exec('test', []); + t.is(result, 42); +}); + +test('run a task with arguments and return the result', async (t) => { + const pool = createPool(workerPath, {}, logger); + const result = await pool.exec('test', [22]); + t.is(result, 22); +}); + +test('task runs inside a different process id to the parent', async (t) => { + const pool = createPool(workerPath, {}, logger); + const parentPid = process.pid; + + const childPid = await pool.exec('threadId', []); + + t.log('parent pid: ', parentPid); + t.log('child pid: ', childPid); + + t.truthy(parentPid); + t.truthy(childPid); + t.not(parentPid, childPid); +}); + +test.serial( + 'tasks in the same worker should have the same processId', + async (t) => { + const pool = createPool(workerPath, { maxWorkers: 1 }, logger); + + const ids = {}; + + const saveProcessId = (id: string) => { + if (!ids[id]) { + ids[id] = 0; + } + ids[id]++; + }; + + // Run 4 jobs and return the processId for each + // With only one worker thread they should all be the same + await Promise.all([ + pool + .exec('processId', []) + .then((id: unknown) => saveProcessId(id as string)), + pool + .exec('processId', []) + .then((id: unknown) => saveProcessId(id as string)), + pool + .exec('processId', []) + .then((id: unknown) => saveProcessId(id as string)), + pool + .exec('processId', []) + .then((id: unknown) => saveProcessId(id as string)), + ]); + + const allUsedIds = Object.keys(ids); + + t.is(allUsedIds.length, 1); + t.is(ids[allUsedIds[0]], 4); + } +); + +test('Remove a worker from the pool and release it when finished', async (t) => { + const pool = createPool(workerPath, {}, logger); + + t.is(pool._pool.length, 5); + const p = pool.exec('test', []); + t.is(pool._pool.length, 4); + return p.then(() => { + t.is(pool._pool.length, 5); + + // the first thing in the queue should be a worker + t.true(pool[0] !== false); + }); +}); + +test('run a wait task', async (t) => { + const pool = createPool(workerPath, {}, logger); + await pool.exec('wait', []); + t.pass(); +}); + +test('add tasks to a queue if the pool is empty', async (t) => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + t.is(pool._pool.length, 1); + + const p1 = pool.exec('wait', []); + t.is(pool._queue.length, 0); + t.is(pool._pool.length, 0); + + const p2 = pool.exec('wait', []); + t.is(pool._queue.length, 1); + t.is(pool._pool.length, 0); + + await Promise.all([p1, p2]); + + t.is(pool._queue.length, 0); + t.is(pool._pool.length, 1); +}); + +test('add tasks with args to a queue if the pool is empty', async (t) => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + t.is(pool._pool.length, 1); + + const p1 = pool.exec('wait', []); + t.is(pool._queue.length, 0); + t.is(pool._pool.length, 0); + + const p2 = pool.exec('test', [11]); + t.is(pool._queue.length, 1); + t.is(pool._pool.length, 0); + + const [_r1, r2] = await Promise.all([p1, p2]); + + t.is(r2, 11); +}); + +test('run through a queue of tasks', async (t) => { + const count = 30; + const capacity = 10; + + const pool = createPool(workerPath, { capacity }, logger); + t.is(pool._queue.length, 0); + t.is(pool._pool.length, capacity); + + const queue = new Array(count).map(() => pool.exec('wait', [20])); + + const results = await Promise.all(queue); + + t.is(results.length, count); + t.is(pool._queue.length, 0); + t.is(pool._pool.length, capacity); +}); + +// This might be a bit of an artificial test +// because the actual inner runtime should never throw +test('throw if the task throws', async (t) => { + const pool = createPool(workerPath, {}, logger); + + try { + await pool.exec('throw', []); + } catch (e) { + // NB e is not an error isntance + t.is(e.message, 'test_error'); + } +}); + +test('throw if memory limit is exceeded', async (t) => { + const pool = createPool(workerPath, {}, logger); + + try { + await pool.exec('blowMemory', [], { memoryLimitMb: 100 }); + } catch (e) { + t.is(e.message, 'Run exceeded maximum memory usage'); + t.is(e.name, 'OOMError'); + } +}); + +test('destroy should handle un-initialised workers', async (t) => { + const pool = createPool(workerPath, { capacity: 10 }, logger); + pool.destroy(); + t.is(pool._pool.length, 0); +}); + +test('destroy should close all child processes', async (t) => { + // warm up a pool + const pool = createPool(workerPath, { capacity: 10 }, logger); + + const queue = new Array(10).fill(true).map(() => pool.exec('test')); + await Promise.all(queue); + + const workers = Object.values(pool._allWorkers); + + // now destroy it + pool.destroy(); + + // check that every child is disconnected + t.true(workers.every((child) => child.killed)); + + // the pool should be empty + t.is(pool._pool.length, 0); +}); + +test('destroy gracefully', (t) => { + return new Promise((done) => { + const pool = createPool(workerPath, {}, logger); + const workers = Object.values(pool._allWorkers); + + t.is(pool._pool.length, 5); + + pool.exec('wait', [100]).then((result) => { + t.is(result, 1); + setTimeout(() => { + t.true(workers.every((child) => child.killed)); + t.is(pool._pool.length, 0); + + done(); + }, 1); + }); + + pool.destroy(); + + t.is(pool._pool.length, 0); + }); +}); + +// TODO should the worker throw on sigterm? +test('destroy immediately', (t) => { + return new Promise((done) => { + const pool = createPool(workerPath, {}, logger); + + t.is(pool._pool.length, 5); + + // this should not return + pool + .exec('wait', [100]) + .then(() => { + t.fail('Task should not have returned!'); + }) + .catch((e) => { + // TODO the pool currnetly throws an ExitError if it is interrupted + // Is this correct? + t.log(e); + // console.log(e); + }); + + pool.destroy(true); + + t.is(pool._pool.length, 0); + + setTimeout(() => { + t.pass(); + done(); + }, 1000); // not sure why but this needs to be quite a long delay + }); +}); + +// TODO is this right? +// If we've claimed and the claimed attempt is waiting, we should probably run it +// so this is invalid +test.skip("don't process the queue after destroy", () => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + + pool.exec('wait', [100]); + pool.exec('wait', [100]); +}); + +test('throw on exec if destroyed', (t) => { + const pool = createPool(workerPath, {}, logger); + + t.is(pool._pool.length, 5); + + pool.destroy(true); + + t.throws(() => pool.exec('test'), { + message: 'Worker destroyed', + }); +}); + +test('listen to an event', async (t) => { + const pool = createPool(workerPath, {}, logger); + + await pool.exec('test', [20], { + on: (evt) => { + if (evt.type === 'test-message') { + t.log(evt); + t.pass(); + } + }, + }); +}); + +test('listen to an event in two successive tasks', async (t) => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + + let count = 0; + + await pool.exec('test', [20], { + on: (evt) => { + if (evt.type === 'test-message') { + count++; + } + }, + }); + + await pool.exec('test', [20], { + on: (evt) => { + if (evt.type === 'test-message') { + count++; + } + }, + }); + + t.is(count, 2); +}); + +test('listen to an event in two successive tasks after a queue', async (t) => { + return new Promise((done) => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + + let count = 0; + + pool.exec('test', [20], { + on: (evt) => { + if (evt.type === 'test-message') { + count++; + } + }, + }); + + pool + .exec('test', [20], { + on: (evt) => { + if (evt.type === 'test-message') { + count++; + } + }, + }) + .then(() => { + t.is(count, 2); + done(); + }); + t.is(pool._queue.length, 1); + }); +}); + +test('throw if task times out', async (t) => { + const pool = createPool(workerPath, {}, logger); + + await t.throwsAsync(() => pool.exec('test', [], { timeout: 5 }), { + name: 'TimeoutError', + message: 'Workflow failed to return within 5ms', + }); +}); + +test('throw if a deferred task times out', async (t) => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + + // start one task off + pool.exec('wait', [100]); + + // now add a second to the queue + // this should still throw + await t.throwsAsync(() => pool.exec('test', [], { timeout: 5 }), { + name: 'TimeoutError', + message: 'Workflow failed to return within 5ms', + }); +}); + +test('after timeout, destroy the worker and reset the pool', async (t) => { + return new Promise((done) => { + const pool = createPool(workerPath, { capacity: 2 }, logger); + t.deepEqual(pool._pool, [false, false]); + + pool.exec('test', [], { timeout: 5 }).catch(() => { + t.true(worker.killed); + t.deepEqual(pool._pool, [false, false]); + done(); + }); + + t.not(pool._pool, [false, false]); + let [worker] = Object.values(pool._allWorkers); + t.false(worker.killed); + }); +}); + +// Ie, running two tasks in the same worker process should isolate events properly +test('events should disconnect between executions', (t) => { + return new Promise(async (done) => { + const pool = createPool(workerPath, { capacity: 1 }, logger); + + const counts = { + a: 0, + b: 0, + c: 0, + }; + + const on = (event) => { + if (event.type === 'test-message') { + counts[event.result] += 1; + } + }; + + await pool.exec('test', ['a'], { on }); + await pool.exec('test', ['b'], { on }); + await pool.exec('test', ['c'], { on }); + + t.is(counts.a, 1); + t.is(counts.b, 1); + t.is(counts.c, 1); + + done(); + }); +}); + +test('returnToPool: add to the start of a full pool', (t) => { + const pool = [ + { pid: 1 }, + { pid: 2 }, + { pid: 3 }, + { pid: 4 }, + ] as ChildProcessPool; + + returnToPool(pool, { pid: 5 } as any); + + t.deepEqual(pool[0], { pid: 5 }); +}); + +test('returnToPool: add to the end of an empty pool', (t) => { + const pool = [false, false, false, false] as ChildProcessPool; + + returnToPool(pool, { pid: 5 } as any); + + t.deepEqual(pool[4], { pid: 5 }); +}); + +test('returnToPool: add to the middle of a empty pool', (t) => { + const pool = [false, false, { pid: 1 }, { pid: 2 }] as ChildProcessPool; + + returnToPool(pool, { pid: 5 } as any); + + t.deepEqual(pool[2], { pid: 5 }); +}); diff --git a/packages/engine-multi/test/worker/worker-pool.test.ts b/packages/engine-multi/test/worker/worker-pool.test.ts deleted file mode 100644 index f2186ba2d..000000000 --- a/packages/engine-multi/test/worker/worker-pool.test.ts +++ /dev/null @@ -1,255 +0,0 @@ -import path from 'node:path'; -import test from 'ava'; -import v8 from 'v8'; -import workerpool from 'workerpool'; - -const workerPath = path.resolve('src/test/worker-functions.js'); - -let pool; - -const createPool = () => workerpool.pool({ maxWorkers: 1 }); - -// note that a dedicated pool does not allow arbitrary code execution -const createDedicatedPool = (opts = {}) => - workerpool.pool(workerPath, { maxWorkers: 1, ...opts }); - -test.afterEach(() => pool.terminate(true)); - -test.serial('run an expression inside a worker', async (t) => { - pool = createPool(); - - const fn = () => 42; - - const result = await pool.exec(fn, []); - - t.is(result, 42); -}); - -test.serial('expressions should have the same threadId', async (t) => { - pool = createDedicatedPool(); - - const ids = {}; - - const saveThreadId = (id: string) => { - if (!ids[id]) { - ids[id] = 0; - } - ids[id]++; - }; - - // Run 4 jobs and return the threadId for each - // With only one worker thread they should all be the same - await Promise.all([ - pool.exec('threadId', []).then(saveThreadId), - pool.exec('threadId', []).then(saveThreadId), - pool.exec('threadId', []).then(saveThreadId), - pool.exec('threadId', []).then(saveThreadId), - ]); - - const allUsedIds = Object.keys(ids); - - t.is(allUsedIds.length, 1); - t.is(ids[allUsedIds[0]], 4); -}); - -// This is only true by default and is easily overridable -test.serial('thread has access to parent env', async (t) => { - pool = createDedicatedPool(); - - process.env.TEST = 'foobar'; - - const result = await pool.exec('readEnv', ['TEST']); - - t.is(result, 'foobar'); - - delete process.env.TEST; -}); - -test.serial('parent env can be hidden from thread', async (t) => { - pool = createDedicatedPool({ - workerThreadOpts: { - env: { PRIVATE: 'xyz' }, - }, - }); - - process.env.TEST = 'foobar'; - - const result = await pool.exec('readEnv', ['TEST']); - t.is(result, undefined); - - const result2 = await pool.exec('readEnv', ['PRIVATE']); - t.is(result2, 'xyz'); - - delete process.env.TEST; -}); - -test.serial('worker should not have access to host globals', async (t) => { - pool = createPool(); - global.x = 22; - - const fn = () => global.x; - - const result = await pool.exec(fn, []); - - t.is(result, undefined); - delete global.x; -}); - -test.serial('worker should not mutate host global scope', async (t) => { - pool = createPool(); - - t.is(global.x, undefined); - - const fn = () => { - global.x = 9; - }; - - await pool.exec(fn, []); - - t.is(global.x, undefined); -}); - -// This is potentially a security concern for jobs which escape the runtime sandbox -test.serial('workers share a global scope', async (t) => { - pool = createPool(); - - t.is(global.x, undefined); - - const fn1 = () => { - global.x = 9; - }; - - // Set a global inside the worker - await pool.exec(fn1, []); - - // (should not affect us outside) - t.is(global.x, undefined); - - const fn2 = () => global.x; - - // Call into the same worker and reads the global scope again - const result = await pool.exec(fn2, []); - - // And yes, the internal global x has a value of 9 - t.is(result, 9); -}); - -test.serial('get/set global x', async (t) => { - pool = createDedicatedPool(); - - await pool.exec('setGlobalX', [11]); - const result = await pool.exec('getGlobalX'); - - t.is(result, 11); -}); - -test.serial('get/set global error', async (t) => { - pool = createDedicatedPool(); - - await pool.exec('writeToGlobalError', [{ y: 222 }]); - const result = await pool.exec('getFromGlobalError', ['y']); - - t.is(result, 222); -}); - -test.serial('freeze prevents global scope being mutated', async (t) => { - pool = createDedicatedPool(); - - // Freeze the scope - await pool.exec('freeze', []); - - t.is(global.x, undefined); - - await t.throwsAsync(pool.exec('setGlobalX', [11]), { - message: 'Cannot add property x, object is not extensible', - }); -}); - -test.serial('freeze does not prevent global Error being mutated', async (t) => { - pool = createDedicatedPool(); - - // Freeze the scope - await pool.exec('freeze', []); - - t.is(global.x, undefined); - - await pool.exec('writeToGlobalError', [{ y: 222 }]); - const result = await pool.exec('getFromGlobalError', ['y']); - - t.is(result, 222); -}); - -// test imports inside the worker -// this is basically testing that imported modules do not get re-intialised -test.serial('static imports should share state across runs', async (t) => { - pool = createDedicatedPool(); - - const count1 = await pool.exec('incrementStatic', []); - t.is(count1, 1); - - const count2 = await pool.exec('incrementStatic', []); - t.is(count2, 2); - - const count3 = await pool.exec('incrementStatic', []); - t.is(count3, 3); -}); - -test.serial('dynamic imports should share state across runs', async (t) => { - pool = createDedicatedPool(); - - const count1 = await pool.exec('incrementDynamic', []); - t.is(count1, 1); - - const count2 = await pool.exec('incrementDynamic', []); - t.is(count2, 2); - - const count3 = await pool.exec('incrementDynamic', []); - t.is(count3, 3); -}); - -// This is kinda done in the tests above, it's just to setup the next test -test.serial('module scope is shared within a thread', async (t) => { - pool = createDedicatedPool({ maxWorkers: 1 }); - - const result = await Promise.all([ - pool.exec('incrementDynamic', []), - pool.exec('incrementDynamic', []), - pool.exec('incrementDynamic', []), - ]); - - t.deepEqual(result, [1, 2, 3]); -}); - -test.serial('module scope is isolated across threads', async (t) => { - pool = createDedicatedPool({ maxWorkers: 3 }); - - const result = await Promise.all([ - pool.exec('incrementDynamic', []), - pool.exec('incrementDynamic', []), - pool.exec('incrementDynamic', []), - ]); - - t.deepEqual(result, [1, 1, 1]); -}); - -test.serial('worker should die if it blows the memory limit', async (t) => { - pool = createDedicatedPool({ - workerThreadOpts: { - // See resourceLimits for more docs - // Note for the record that these limits do NOT include arraybuffers - resourceLimits: { - // This is basically heap size - // Note that this needs to be at least like 200mb to not blow up in test - maxOldGenerationSizeMb: 100, - }, - }, - }); - - await t.throwsAsync(() => pool.exec('blowMemory', []), { - code: 'ERR_WORKER_OUT_OF_MEMORY', - message: - 'Worker terminated due to reaching memory limit: JS heap out of memory', - }); -}); - -test.todo('threads should all have the same rss memory'); diff --git a/packages/engine-multi/tsup.config.js b/packages/engine-multi/tsup.config.js index 8934555d4..e98427fe3 100644 --- a/packages/engine-multi/tsup.config.js +++ b/packages/engine-multi/tsup.config.js @@ -5,8 +5,10 @@ export default { splitting: false, entry: { index: 'src/index.ts', - 'worker/worker': 'src/worker/worker.ts', - // TODO I don't actually want to build this into the dist, I don't think? - 'worker/mock': 'src/worker/mock-worker.ts', + 'worker/thread/run': 'src/worker/thread/run.ts', + 'worker/child/runner': 'src/worker/child/runner.ts', + + 'test/mock-run': 'src/worker/thread/mock-run.ts', + 'test/worker-functions': 'src/test/worker-functions.ts', }, }; diff --git a/packages/lightning-mock/CHANGELOG.md b/packages/lightning-mock/CHANGELOG.md index 691c50948..f07474b89 100644 --- a/packages/lightning-mock/CHANGELOG.md +++ b/packages/lightning-mock/CHANGELOG.md @@ -1,5 +1,21 @@ # @openfn/lightning-mock +## 1.1.10 + +### Patch Changes + +- Updated dependencies [281391b] + - @openfn/engine-multi@0.3.1 + +## 1.1.9 + +### Patch Changes + +- Updated dependencies [0f22694] +- Updated dependencies [9b9ca0c] + - @openfn/runtime@0.2.5 + - @openfn/engine-multi@0.3.0 + ## 1.1.8 ### Patch Changes diff --git a/packages/lightning-mock/package.json b/packages/lightning-mock/package.json index 3369c5d3a..c1fe76e05 100644 --- a/packages/lightning-mock/package.json +++ b/packages/lightning-mock/package.json @@ -1,6 +1,6 @@ { "name": "@openfn/lightning-mock", - "version": "1.1.8", + "version": "1.1.10", "private": true, "description": "A mock Lightning server", "main": "dist/index.js", diff --git a/packages/lightning-mock/src/api-sockets.ts b/packages/lightning-mock/src/api-sockets.ts index 128cb41c2..9c4b17140 100644 --- a/packages/lightning-mock/src/api-sockets.ts +++ b/packages/lightning-mock/src/api-sockets.ts @@ -1,5 +1,5 @@ import { WebSocketServer } from 'ws'; -import createLogger, { Logger } from '@openfn/logger'; +import createLogger, { LogLevel, Logger } from '@openfn/logger'; import type { Server } from 'http'; import createPheonixMockSocketServer, { @@ -82,7 +82,8 @@ const createSocketAPI = ( state: ServerState, path: string, httpServer: Server, - logger?: Logger + logger?: Logger, + logLevel?: LogLevel ) => { // set up a websocket server to listen to connections const server = new WebSocketServer({ @@ -97,7 +98,7 @@ const createSocketAPI = ( // @ts-ignore server typings server, state, - logger: logger && createLogger('PHX', { level: 'debug' }), + logger: logger && createLogger('PHX', { level: logLevel }), }); wss.registerEvents('worker:queue', { @@ -111,7 +112,7 @@ const createSocketAPI = ( }); const startAttempt = (attemptId: string) => { - logger && logger.debug(`joining channel attempt:${attemptId}`); + logger?.info(`joining channel attempt:${attemptId}`); // mark the attempt as started on the server state.pending[attemptId] = { @@ -194,7 +195,7 @@ const createSocketAPI = ( if (attempts.length) { logger?.info(`Claiming ${attempts.length} attempts`); } else { - logger?.info('No claims (queue empty)'); + logger?.debug('No claims (queue empty)'); } ws.reply({ ref, join_ref, topic, payload }); @@ -317,6 +318,8 @@ const createSocketAPI = ( }; } + logger?.info(`LOG [${attemptId}] ${evt.payload.message[0]}`); + ws.reply({ ref, join_ref, diff --git a/packages/lightning-mock/src/server.ts b/packages/lightning-mock/src/server.ts index bd5f09f21..a0b8a53f6 100644 --- a/packages/lightning-mock/src/server.ts +++ b/packages/lightning-mock/src/server.ts @@ -80,7 +80,7 @@ const createLightningServer = (options: LightningOptions = {}) => { // Only create a http logger if there's a top-level logger passed // This is a bit flaky really but whatever if (options.logger) { - const httpLogger = createLogger('HTTP', { level: 'debug' }); + const httpLogger = createLogger('HTTP', { level: options.logLevel }); const klogger = koaLogger((str) => httpLogger.debug(str)); app.use(klogger); } @@ -90,7 +90,8 @@ const createLightningServer = (options: LightningOptions = {}) => { state, '/worker', // TODO I should option drive this server, - options.logger && logger + options.logger, + options.logLevel ); app.use(createDevAPI(app as any, state, logger, api)); diff --git a/packages/lightning-mock/src/start.ts b/packages/lightning-mock/src/start.ts index 557d7b07f..5b746a8c7 100644 --- a/packages/lightning-mock/src/start.ts +++ b/packages/lightning-mock/src/start.ts @@ -1,12 +1,13 @@ import yargs from 'yargs'; import { hideBin } from 'yargs/helpers'; -import createLogger from '@openfn/logger'; +import createLogger, { LogLevel } from '@openfn/logger'; import createLightningServer from './server'; type Args = { _: string[]; port?: number; + log?: LogLevel; }; const args = yargs(hideBin(process.argv)) @@ -17,13 +18,20 @@ const args = yargs(hideBin(process.argv)) type: 'number', default: 8888, }) + .option('log', { + alias: 'l', + description: 'Log level', + type: 'string', + default: 'debug', + }) .parse() as Args; -const logger = createLogger('LNG', { level: 'debug' }); +const logger = createLogger('LNG', { level: args.log }); createLightningServer({ port: args.port, logger, + logLevel: args.log, }); logger.success('Started mock Lightning server on ', args.port); diff --git a/packages/runtime/CHANGELOG.md b/packages/runtime/CHANGELOG.md index ec7744d68..9d1a0158a 100644 --- a/packages/runtime/CHANGELOG.md +++ b/packages/runtime/CHANGELOG.md @@ -1,5 +1,12 @@ # @openfn/runtime +## 0.2.5 + +### Patch Changes + +- 0f22694: Accept the linker's whitelist as strings +- ignore timeout if it has a value of 0 or false + ## 0.2.4 ### Patch Changes diff --git a/packages/runtime/package.json b/packages/runtime/package.json index 5e38e314a..f4c1be126 100644 --- a/packages/runtime/package.json +++ b/packages/runtime/package.json @@ -1,6 +1,6 @@ { "name": "@openfn/runtime", - "version": "0.2.4", + "version": "0.2.5", "description": "Job processing runtime.", "type": "module", "exports": { diff --git a/packages/runtime/src/execute/expression.ts b/packages/runtime/src/execute/expression.ts index 4bbf61dc7..324f611bb 100644 --- a/packages/runtime/src/execute/expression.ts +++ b/packages/runtime/src/execute/expression.ts @@ -31,10 +31,7 @@ export default ( let duration = Date.now(); const { logger, opts = {} } = ctx; try { - const timeout = opts.timeout || TIMEOUT; - - logger.debug('Intialising pipeline'); - logger.debug(`Timeout set to ${timeout}ms`); + const timeout = opts.timeout ?? TIMEOUT; // Setup an execution context const context = buildContext(initialState, opts); @@ -52,13 +49,16 @@ export default ( ); // Run the pipeline + let tid; logger.debug(`Executing expression (${operations.length} operations)`); + if (timeout) { + logger.debug(`Timeout set to ${timeout}ms`); - const tid = setTimeout(() => { - logger.error(`Error: Timeout (${timeout}ms) expired!`); - logger.error(' Set a different timeout by passing "-t 10000" ms)'); - reject(new TimeoutError(timeout)); - }, timeout); + tid = setTimeout(() => { + logger.error(`Error: Timeout expired (${timeout}ms)`); + reject(new TimeoutError(timeout)); + }, timeout); + } // Note that any errors will be trapped by the containing Job const result = await reducer(initialState); diff --git a/packages/runtime/src/runtime.ts b/packages/runtime/src/runtime.ts index 97e57222a..6d91ea408 100644 --- a/packages/runtime/src/runtime.ts +++ b/packages/runtime/src/runtime.ts @@ -10,6 +10,7 @@ import type { import type { LinkerOptions } from './modules/linker'; import executePlan from './execute/plan'; import clone from './util/clone'; +import parseRegex from './util/regex'; export const TIMEOUT = 5 * 60 * 1000; // 5 minutes @@ -40,6 +41,12 @@ export type Options = { statePropsToRemove?: string[]; }; +type RawOptions = Omit & { + linker?: Omit & { + whitelist?: Array; + }; +}; + const defaultState = { data: {}, configuration: {} }; // Log nothing by default @@ -50,7 +57,7 @@ const defaultLogger = createMockLogger(); const run = ( expressionOrXPlan: string | Operation[] | ExecutionPlan, state?: State, - opts: Options = {} + opts: RawOptions = {} ) => { const logger = opts.logger || defaultLogger; @@ -61,6 +68,14 @@ const run = ( if (!opts.hasOwnProperty('statePropsToRemove')) { opts.statePropsToRemove = ['configuration']; } + if (opts.linker?.whitelist) { + opts.linker.whitelist = opts.linker.whitelist.map((w) => { + if (typeof w === 'string') { + return parseRegex(w); + } + return w; + }); + } // TODO the plan doesn't have an id, should it be given one? // Ditto the jobs? @@ -90,7 +105,7 @@ const run = ( plan.initialState = defaultState; } - return executePlan(plan, opts, logger); + return executePlan(plan, opts as Options, logger); }; export default run; diff --git a/packages/runtime/src/util/regex.ts b/packages/runtime/src/util/regex.ts new file mode 100644 index 000000000..8ae7fad29 --- /dev/null +++ b/packages/runtime/src/util/regex.ts @@ -0,0 +1,20 @@ +// Util function to parse a regex from a string +// supports abd, /abc/ and /abc/g formats + +const parse = (str: string) => { + let pattern = ''; + let flags = ''; + + // Treat this as a full regex + if (str.startsWith('/')) { + const last = str.lastIndexOf('/'); + pattern = str.substring(1, last); + flags = str.substring(last + 1); + } else { + pattern = str; + } + + return new RegExp(pattern, flags); +}; + +export default parse; diff --git a/packages/runtime/test/repo/get-latest-installed-version.ts b/packages/runtime/test/repo/get-latest-installed-version.test.ts similarity index 100% rename from packages/runtime/test/repo/get-latest-installed-version.ts rename to packages/runtime/test/repo/get-latest-installed-version.test.ts diff --git a/packages/runtime/test/runtime.test.ts b/packages/runtime/test/runtime.test.ts index 76249c5b9..48c735f16 100644 --- a/packages/runtime/test/runtime.test.ts +++ b/packages/runtime/test/runtime.test.ts @@ -560,3 +560,51 @@ test('run from an adaptor with error', async (t) => { // This should safely return with an error t.truthy(result.errors['job-1']); }); + +test('accept a whitelist as a regex', async (t) => { + const expression = ` + import { call } from 'blah'; + export default [call("22")]; + `; + + try { + await run( + expression, + {}, + { + linker: { + whitelist: [/^@opennfn/], + }, + } + ); + } catch (error) { + t.truthy(error); + t.is(error.severity, 'crash'); + t.is(error.type, 'ImportError'); + t.is(error.message, 'module blacklisted: blah'); + } +}); + +test('accept a whitelist as a string', async (t) => { + const expression = ` + import { call } from 'blah'; + export default [call("22")]; + `; + + try { + await run( + expression, + {}, + { + linker: { + whitelist: ['/^@opennfn/'], + }, + } + ); + } catch (error) { + t.truthy(error); + t.is(error.severity, 'crash'); + t.is(error.type, 'ImportError'); + t.is(error.message, 'module blacklisted: blah'); + } +}); diff --git a/packages/runtime/test/util/regex.ts b/packages/runtime/test/util/regex.ts new file mode 100644 index 000000000..d47860be8 --- /dev/null +++ b/packages/runtime/test/util/regex.ts @@ -0,0 +1,57 @@ +import test from 'ava'; +import parse from '../../src/util/regex'; + +test('parse abc', (t) => { + const re = parse('abc'); + + t.true(re instanceof RegExp); + + t.true(re.test('abc')); + t.true(re.test('123abc123')); + t.false(re.test('a b c')); + t.is(re.flags, ''); +}); + +test('parse /abc/', (t) => { + const re = parse('/abc/'); + + t.true(re instanceof RegExp); + + t.true(re.test('abc')); + t.true(re.test('123abc123')); + t.false(re.test('a b c')); + t.is(re.flags, ''); +}); + +test('parse /abc/ig', (t) => { + const re = parse('/abc/ig'); + + t.true(re instanceof RegExp); + + t.true(re.test('ABC')); + t.true(re.test('123abc123')); + t.false(re.test('a b c')); + t.is(re.flags, 'gi'); +}); + +test('parse a/b/c', (t) => { + const re = parse('a/b/c'); + + t.true(re instanceof RegExp); + + t.true(re.test('a/b/c')); + t.true(re.test('123a/b/c123')); + t.false(re.test('abc')); + t.is(re.flags, ''); +}); + +test('parse /a/b/c/i', (t) => { + const re = parse('/a/b/c/i'); + + t.true(re instanceof RegExp); + + t.true(re.test('a/B/c')); + t.true(re.test('123a/b/c123')); + t.false(re.test('abc')); + t.is(re.flags, 'i'); +}); diff --git a/packages/ws-worker/CHANGELOG.md b/packages/ws-worker/CHANGELOG.md index 70a309254..1d2956e96 100644 --- a/packages/ws-worker/CHANGELOG.md +++ b/packages/ws-worker/CHANGELOG.md @@ -1,5 +1,30 @@ # ws-worker +## 0.6.0 + +### Patch Changes + +- eb10b1f: Updated start env vars and arguments +- 281391b: Support attemptTimeoutMs in attempt options + Better server logging at startup + Support start arguments from the environment (but prefer CLI) +- 2857fe6: Send the exit reason to the attempt logs +- Updated dependencies [281391b] + - @openfn/engine-multi@0.3.0 + +## 0.6.0 + +### Minor Changes + +- 9b9ca0c: New worker pool engine + +### Patch Changes + +- Updated dependencies [0f22694] +- Updated dependencies [9b9ca0c] + - @openfn/runtime@0.2.5 + - @openfn/engine-multi@0.3.0 + ## 0.5.0 ### Minor Changes diff --git a/packages/ws-worker/package.json b/packages/ws-worker/package.json index cff4eeedb..dcbc05722 100644 --- a/packages/ws-worker/package.json +++ b/packages/ws-worker/package.json @@ -1,6 +1,6 @@ { "name": "@openfn/ws-worker", - "version": "0.5.0", + "version": "0.6.0", "description": "A Websocket Worker to connect Lightning to a Runtime Engine", "main": "dist/index.js", "type": "module", diff --git a/packages/ws-worker/src/api/execute.ts b/packages/ws-worker/src/api/execute.ts index 53787070d..2386a72f9 100644 --- a/packages/ws-worker/src/api/execute.ts +++ b/packages/ws-worker/src/api/execute.ts @@ -1,6 +1,5 @@ import { ATTEMPT_COMPLETE, - AttemptCompletePayload, ATTEMPT_LOG, AttemptLogPayload, ATTEMPT_START, @@ -17,17 +16,16 @@ import type { JSONLog, Logger } from '@openfn/logger'; import type { RuntimeEngine, Resolvers, - WorkflowCompletePayload, - WorkflowErrorPayload, WorkflowStartPayload, } from '@openfn/engine-multi'; import { ExecutionPlan } from '@openfn/runtime'; -import { calculateAttemptExitReason, calculateJobExitReason } from './reasons'; // TODO: I want to move all event handlers out into their own files // TODO just export the index yeah? import handleRunComplete from '../events/run-complete'; import handleRunStart from '../events/run-start'; +import handleAttemptComplete from '../events/atttempt-complete'; +import handleAttemptError from '../events/attempt-error'; import createThrottler from '../util/throttle'; const enc = new TextDecoder('utf-8'); @@ -112,9 +110,9 @@ export function execute( addEvent('job-error', throttle(onJobError)), addEvent('workflow-log', throttle(onJobLog)), // This will also resolve the promise - addEvent('workflow-complete', throttle(onWorkflowComplete)), + addEvent('workflow-complete', throttle(handleAttemptComplete)), - addEvent('workflow-error', throttle(onWorkflowError)) + addEvent('workflow-error', throttle(handleAttemptError)) // TODO send autoinstall logs ); @@ -146,7 +144,7 @@ export function execute( engine.execute(plan, { resolvers, ...options }); } catch (e: any) { // TODO what if there's an error? - onWorkflowError(context, { + handleAttemptError(context, { workflowId: plan.id!, message: e.message, type: e.type, @@ -202,50 +200,6 @@ export function onWorkflowStart( return sendEvent(channel, ATTEMPT_START); } -export async function onWorkflowComplete( - { state, channel, onFinish }: Context, - _event: WorkflowCompletePayload -) { - // TODO I dont think the attempt final dataclip IS the last job dataclip - // Especially not in parallelisation - const result = state.dataclips[state.lastDataclipId!]; - const reason = calculateAttemptExitReason(state); - await sendEvent(channel, ATTEMPT_COMPLETE, { - final_dataclip_id: state.lastDataclipId!, - ...reason, - }); - onFinish({ reason, state: result }); -} - -export async function onWorkflowError( - context: Context, - event: WorkflowErrorPayload -) { - const { state, channel, logger, onFinish } = context; - - try { - // Ok, let's try that, let's just generate a reason from the event - const reason = calculateJobExitReason('', { data: {} }, event); - - // If there's a job still running, make sure it gets marked complete - if (state.activeJob) { - await onJobError(context, { error: event }); - } - - await sendEvent(channel, ATTEMPT_COMPLETE, { - final_dataclip_id: state.lastDataclipId!, - ...reason, - }); - - onFinish({ reason }); - } catch (e: any) { - logger.error('ERROR in workflow-error handler:', e.message); - logger.error(e); - - onFinish({}); - } -} - export function onJobLog({ channel, state }: Context, event: JSONLog) { const timeInMicroseconds = BigInt(event.time) / BigInt(1e3); diff --git a/packages/ws-worker/src/api/reasons.ts b/packages/ws-worker/src/api/reasons.ts index 3c259d5a8..4fba5021a 100644 --- a/packages/ws-worker/src/api/reasons.ts +++ b/packages/ws-worker/src/api/reasons.ts @@ -41,7 +41,7 @@ const isLeafNode = (state: AttemptState, job: JobNode) => { return !hasDownstream; }; -const calculateAttemptExitReason = (state: AttemptState) => { +const calculateAttemptExitReason = (state: AttemptState): ExitReason => { if (state.plan && state.reasons) { // A crash or greater will trigger an error, and the error // basically becomes the exit reason diff --git a/packages/ws-worker/src/events/attempt-error.ts b/packages/ws-worker/src/events/attempt-error.ts new file mode 100644 index 000000000..5b6321db4 --- /dev/null +++ b/packages/ws-worker/src/events/attempt-error.ts @@ -0,0 +1,37 @@ +import { calculateJobExitReason } from '../api/reasons'; + +import type { WorkflowErrorPayload } from '@openfn/engine-multi'; + +import { ATTEMPT_COMPLETE, AttemptCompletePayload } from '../events'; +import { sendEvent, Context, onJobError } from '../api/execute'; +import logFinalReason from '../util/log-final-reason'; + +export default async function onAttemptError( + context: Context, + event: WorkflowErrorPayload +) { + const { state, channel, logger, onFinish } = context; + + try { + // Ok, let's try that, let's just generate a reason from the event + const reason = calculateJobExitReason('', { data: {} }, event); + // If there's a job still running, make sure it gets marked complete + if (state.activeJob) { + await onJobError(context, { error: event }); + } + + await logFinalReason(context, reason); + + await sendEvent(channel, ATTEMPT_COMPLETE, { + final_dataclip_id: state.lastDataclipId!, + ...reason, + }); + + onFinish({ reason }); + } catch (e: any) { + logger.error('ERROR in workflow-error handler:', e.message); + logger.error(e); + + onFinish({}); + } +} diff --git a/packages/ws-worker/src/events/atttempt-complete.ts b/packages/ws-worker/src/events/atttempt-complete.ts new file mode 100644 index 000000000..89c37f0ad --- /dev/null +++ b/packages/ws-worker/src/events/atttempt-complete.ts @@ -0,0 +1,27 @@ +import type { WorkflowCompletePayload } from '@openfn/engine-multi'; + +import { ATTEMPT_COMPLETE, AttemptCompletePayload } from '../events'; +import { calculateAttemptExitReason } from '../api/reasons'; +import { sendEvent, Context } from '../api/execute'; +import logFinalReason from '../util/log-final-reason'; + +export default async function onWorkflowComplete( + context: Context, + _event: WorkflowCompletePayload +) { + const { state, channel, onFinish } = context; + + // TODO I dont think the attempt final dataclip IS the last job dataclip + // Especially not in parallelisation + const result = state.dataclips[state.lastDataclipId!]; + + const reason = calculateAttemptExitReason(state); + await logFinalReason(context, reason); + + await sendEvent(channel, ATTEMPT_COMPLETE, { + final_dataclip_id: state.lastDataclipId!, + ...reason, + }); + + onFinish({ reason, state: result }); +} diff --git a/packages/ws-worker/src/mock/runtime-engine.ts b/packages/ws-worker/src/mock/runtime-engine.ts index 0108ca2af..068575fb2 100644 --- a/packages/ws-worker/src/mock/runtime-engine.ts +++ b/packages/ws-worker/src/mock/runtime-engine.ts @@ -36,6 +36,9 @@ const helpers = { fn: (f: Function) => (s: any) => f(s), wait: (duration: number) => (s: any) => new Promise((resolve) => setTimeout(() => resolve(s), duration)), + err: () => { + throw new Error('test_err'); + }, }; // The mock runtime engine creates a fake engine interface diff --git a/packages/ws-worker/src/server.ts b/packages/ws-worker/src/server.ts index 9a2cd3e1c..1a0647239 100644 --- a/packages/ws-worker/src/server.ts +++ b/packages/ws-worker/src/server.ts @@ -126,8 +126,6 @@ function createServer(engine: RuntimeEngine, options: ServerOptions = {}) { const logger = options.logger || createMockLogger(); const port = options.port || DEFAULT_PORT; - logger.debug('Starting server'); - const app = new Koa() as ServerApp; app.id = humanId({ separator: '-', capitalize: false }); const router = new Router(); @@ -146,7 +144,7 @@ function createServer(engine: RuntimeEngine, options: ServerOptions = {}) { app.destroyed = false; app.server = app.listen(port); - logger.success(`ws-worker ${app.id} listening on ${port}`); + logger.success(`Worker ${app.id} listening on ${port}`); process.send?.('READY'); diff --git a/packages/ws-worker/src/start.ts b/packages/ws-worker/src/start.ts index 9c76b8a3b..3be19c417 100644 --- a/packages/ws-worker/src/start.ts +++ b/packages/ws-worker/src/start.ts @@ -20,43 +20,54 @@ type Args = { capacity?: number; runMemory?: number; statePropsToRemove?: string[]; + maxRunDurationSeconds: number; }; const { + WORKER_BACKOFF, + WORKER_CAPACITY, + WORKER_LIGHTNING_SERVICE_URL, + WORKER_LOG_LEVEL, + WORKER_MAX_RUN_DURATION_SECONDS, + WORKER_MAX_RUN_MEMORY_MB, + WORKER_PORT, WORKER_REPO_DIR, WORKER_SECRET, - MAX_RUN_MEMORY, - STATE_PROPS_TO_REMOVE, + WORKER_STATE_PROPS_TO_REMOVE, } = process.env; const args = yargs(hideBin(process.argv)) .command('server', 'Start a ws-worker server') .option('port', { alias: 'p', - description: 'Port to run the server on', + description: 'Port to run the server on. Env: WORKER_PORT', type: 'number', - default: 2222, + default: WORKER_PORT || 2222, }) // TODO maybe this is positional and required? // frees up -l for the log .option('lightning', { - alias: 'l', + alias: ['l', 'lightning-service-url'], description: - 'Base url to Lightning websocket endpoint, eg, ws://localhost:4000/worker. Set to "mock" to use the default mock server', - default: 'ws://localhost:4000/worker', + 'Base url to Lightning websocket endpoint, eg, ws://localhost:4000/worker. Set to "mock" to use the default mock server. Env: WORKER_LIGHTNING_SERVICE_URL', + default: WORKER_LIGHTNING_SERVICE_URL || 'ws://localhost:4000/worker', }) .option('repo-dir', { alias: 'd', - description: 'Path to the runtime repo (where modules will be installed)', + description: + 'Path to the runtime repo (where modules will be installed). Env: WORKER_REPO_DIR', default: WORKER_REPO_DIR, }) .option('secret', { alias: 's', - description: 'Worker secret (comes from WORKER_SECRET by default)', + description: + 'Worker secret. (comes from WORKER_SECRET by default). Env: WORKER_SECRET', + default: WORKER_SECRET, }) .option('log', { - description: 'Worker secret (comes from WORKER_SECRET by default)', - default: 'info', + description: + 'Set the log level for stdout (default to info, set to debug for verbose output). Env: WORKER_LOG_LEVEL', + default: WORKER_LOG_LEVEL || 'debug', type: 'string', }) .option('loop', { @@ -70,28 +81,40 @@ const args = yargs(hideBin(process.argv)) type: 'boolean', }) .option('backoff', { - description: 'Claim backoff rules: min/max (s)', - default: '1/10', + description: + 'Claim backoff rules: min/max (in seconds). Env: WORKER_BACKOFF', + default: WORKER_BACKOFF || '1/10', }) .option('capacity', { - description: 'max concurrent workers', - default: 5, + description: 'max concurrent workers. Env: WORKER_CAPACITY', + default: WORKER_CAPACITY ? parseInt(WORKER_CAPACITY) : 5, type: 'number', }) .option('state-props-to-remove', { description: - 'A list of properties to remove from the final state returned by a job', - default: STATE_PROPS_TO_REMOVE ?? ['configuration', 'response'], + 'A list of properties to remove from the final state returned by a job. Env: WORKER_STATE_PROPS_TO_REMOVE', + default: WORKER_STATE_PROPS_TO_REMOVE ?? ['configuration', 'response'], type: 'array', }) .option('run-memory', { - description: 'Maximum memory allocated to a single run, in mb', + description: + 'Maximum memory allocated to a single run, in mb. Env: WORKER_MAX_RUN_MEMORY_MB', + type: 'number', + default: WORKER_MAX_RUN_MEMORY_MB + ? parseInt(WORKER_MAX_RUN_MEMORY_MB) + : 500, + }) + .option('max-run-duration-seconds', { + alias: 't', + description: + 'Default attempt timeout for the server, in seconds. Env: WORKER_MAX_RUN_DURATION_SECONDS', type: 'number', - default: MAX_RUN_MEMORY ? parseInt(MAX_RUN_MEMORY) : 500, + default: WORKER_MAX_RUN_DURATION_SECONDS || 60 * 5, // 5 minutes }) .parse() as Args; const logger = createLogger('SRV', { level: args.log }); + if (args.lightning === 'mock') { args.lightning = 'ws://localhost:8888/worker'; if (!args.secret) { @@ -99,19 +122,18 @@ if (args.lightning === 'mock') { args.secret = 'abdefg'; } } else if (!args.secret) { - if (!WORKER_SECRET) { - logger.error('WORKER_SECRET is not set'); - process.exit(1); - } - - args.secret = WORKER_SECRET; + logger.error('WORKER_SECRET is not set'); + process.exit(1); } + const [minBackoff, maxBackoff] = args.backoff .split('/') .map((n: string) => parseInt(n, 10) * 1000); function engineReady(engine: any) { - createWorker(engine, { + logger.debug('Creating worker server...'); + + const workerOptions = { port: args.port, lightning: args.lightning, logger, @@ -123,7 +145,11 @@ function engineReady(engine: any) { max: maxBackoff, }, maxWorkflows: args.capacity, - }); + }; + const { logger: _l, secret: _s, ...humanOptions } = workerOptions; + logger.debug('Worker options:', humanOptions); + + createWorker(engine, workerOptions); } if (args.mock) { @@ -132,13 +158,18 @@ if (args.mock) { engineReady(engine); }); } else { - createRTE({ + const engineOptions = { repoDir: args.repoDir, memoryLimitMb: args.runMemory, maxWorkers: args.capacity, statePropsToRemove: args.statePropsToRemove, - }).then((engine) => { - logger.debug('engine created'); + attemptTimeoutMs: args.maxRunDurationSeconds * 1000, + }; + logger.debug('Creating runtime engine...'); + logger.debug('Engine options:', engineOptions); + + createRTE(engineOptions).then((engine) => { + logger.debug('Engine created!'); engineReady(engine); }); } diff --git a/packages/ws-worker/src/types.d.ts b/packages/ws-worker/src/types.d.ts index baad44e99..3923a9e98 100644 --- a/packages/ws-worker/src/types.d.ts +++ b/packages/ws-worker/src/types.d.ts @@ -74,7 +74,18 @@ export type Attempt = { }; export type AttemptOptions = { + // This is what Lightning will ssend us + // Note that this is the NEW terminology, so it's the timeout for the whole "attempt" + runTimeout?: number; + + // this is the internal old terminology, which will be deprecated soon + attemptTimeoutMs?: number; + + attemptTimeout?: number; // deprecated + + // deprecated alias for timeout. Maps to "attemptTimeout" internally timeout?: number; + sanitize?: SanitizePolicies; }; diff --git a/packages/ws-worker/src/util/convert-attempt.ts b/packages/ws-worker/src/util/convert-attempt.ts index d2bbd14f7..28bc2e8ee 100644 --- a/packages/ws-worker/src/util/convert-attempt.ts +++ b/packages/ws-worker/src/util/convert-attempt.ts @@ -33,6 +33,18 @@ const mapTriggerEdgeCondition = (edge: Edge) => { return condition; }; +const mapOptions = (options: AttemptOptions): AttemptOptions => { + const { attemptTimeout, timeout, runTimeout, ...opts } = options; + + const to = runTimeout || attemptTimeout || timeout; + + if (to) { + (opts as AttemptOptions).attemptTimeoutMs = to; + } + + return opts; +}; + export default ( attempt: Attempt ): { plan: ExecutionPlan; options: AttemptOptions } => { @@ -125,6 +137,6 @@ export default ( return { plan: plan as ExecutionPlan, - options, + options: mapOptions(options), }; }; diff --git a/packages/ws-worker/src/util/log-final-reason.ts b/packages/ws-worker/src/util/log-final-reason.ts new file mode 100644 index 000000000..aaa37c492 --- /dev/null +++ b/packages/ws-worker/src/util/log-final-reason.ts @@ -0,0 +1,19 @@ +import { timestamp } from '@openfn/logger'; +import { Context, onJobLog } from '../api/execute'; +import { ExitReason } from '../types'; + +export default async (context: Context, reason: ExitReason) => { + const time = (timestamp() - BigInt(10e6)).toString(); + + let message = `Run complete with status: ${reason.reason}`; + if (reason.reason !== 'success') { + message += `\n${reason.error_type}: ${reason.error_message || 'unknown'}`; + } + + await onJobLog(context, { + time, + message: [message], + level: 'info', + name: 'R/T', + }); +}; diff --git a/packages/ws-worker/test/api/execute.test.ts b/packages/ws-worker/test/api/execute.test.ts index 1af641b48..6419bdc67 100644 --- a/packages/ws-worker/test/api/execute.test.ts +++ b/packages/ws-worker/test/api/execute.test.ts @@ -14,8 +14,6 @@ import { onJobLog, execute, onWorkflowStart, - onWorkflowComplete, - onWorkflowError, loadDataclip, loadCredential, sendEvent, @@ -27,11 +25,6 @@ import { stringify, createAttemptState } from '../../src/util'; import type { ExecutionPlan } from '@openfn/runtime'; import type { Attempt, AttemptState } from '../../src/types'; -import { - JOB_COMPLETE, - JOB_ERROR, - WORKFLOW_COMPLETE, -} from '@openfn/engine-multi'; const enc = new TextEncoder(); @@ -198,163 +191,56 @@ test('workflowStart should send an empty attempt:start event', async (t) => { await onWorkflowStart({ channel }); }); -test('workflowComplete should send an attempt:complete event', async (t) => { - const result = { answer: 42 }; - - const state = { - reasons: {}, - dataclips: { - x: result, - }, - lastDataclipId: 'x', - }; - - const channel = mockChannel({ - [ATTEMPT_COMPLETE]: (evt) => { - t.deepEqual(evt.final_dataclip_id, 'x'); - }, - }); - - const event = {}; - - const context = { channel, state, onFinish: () => {} }; - await onWorkflowComplete(context, event); -}); - -test('workflowComplete should call onFinish with final dataclip', async (t) => { - const result = { answer: 42 }; - - const state = { - reasons: {}, - dataclips: { - x: result, - }, - lastDataclipId: 'x', - }; - - const channel = mockChannel({ - [ATTEMPT_COMPLETE]: () => true, - }); - - const context = { - channel, - state, - onFinish: ({ state: finalState }) => { - t.deepEqual(result, finalState); - }, - }; - - const event = { state: result }; - - await onWorkflowComplete(context, event); -}); - -test('workflowError should trigger runComplete with a reason', async (t) => { - const jobId = 'job-1'; - - const state = { - reasons: {}, - dataclips: {}, - lastDataclipId: 'x', - activeJob: jobId, - activeRun: 'b', - errors: {}, - }; - - const channel = mockChannel({ - [RUN_COMPLETE]: (evt) => { - t.is(evt.reason, 'crash'); - t.is(evt.error_message, 'it crashed'); - return true; - }, - [ATTEMPT_COMPLETE]: () => true, - }); - - const event = { - severity: 'crash', - type: 'Err', - message: 'it crashed', - }; - - const context = { channel, state, onFinish: () => {} }; - - await onWorkflowError(context, event); -}); - -test('workflow error should send reason to onFinish', async (t) => { - const jobId = 'job-1'; - - const state = { - reasons: {}, - dataclips: {}, - lastDataclipId: 'x', - activeJob: jobId, - activeRun: 'b', - errors: {}, - }; - - const channel = mockChannel({ - [RUN_COMPLETE]: (evt) => true, - [ATTEMPT_COMPLETE]: () => true, - }); - - const event = { - error: { - severity: 'crash', - type: 'Err', - message: 'it crashed', - }, - state: {}, - }; - - const context = { - channel, - state, - onFinish: (evt) => { - t.is(evt.reason.reason, 'crash'); - }, - }; - - await onWorkflowError(context, event); -}); - -test('workflowError should not call job complete if the job is not active', async (t) => { - const state = { - reasons: {}, - dataclips: {}, - lastDataclipId: 'x', - activeJob: undefined, - activeRun: undefined, - errors: {}, - }; - - const channel = mockChannel({ - [RUN_COMPLETE]: (evt) => { - t.fail('should not call!'); - return true; - }, - [ATTEMPT_COMPLETE]: () => true, - }); - - const event = { - error: { - severity: 'crash', - type: 'Err', - message: 'it crashed', - }, - state: {}, - }; - - const context = { - channel, - state, - onFinish: () => { - t.pass(); - }, - }; - - await onWorkflowError(context, event); -}); +// test('workflowComplete should send an attempt:complete event', async (t) => { +// const result = { answer: 42 }; + +// const state = { +// reasons: {}, +// dataclips: { +// x: result, +// }, +// lastDataclipId: 'x', +// }; + +// const channel = mockChannel({ +// [ATTEMPT_COMPLETE]: (evt) => { +// t.deepEqual(evt.final_dataclip_id, 'x'); +// }, +// }); + +// const event = {}; + +// const context = { channel, state, onFinish: () => {} }; +// await onWorkflowComplete(context, event); +// }); + +// test('workflowComplete should call onFinish with final dataclip', async (t) => { +// const result = { answer: 42 }; + +// const state = { +// reasons: {}, +// dataclips: { +// x: result, +// }, +// lastDataclipId: 'x', +// }; + +// const channel = mockChannel({ +// [ATTEMPT_COMPLETE]: () => true, +// }); + +// const context = { +// channel, +// state, +// onFinish: ({ state: finalState }) => { +// t.deepEqual(result, finalState); +// }, +// }; + +// const event = { state: result }; + +// await onWorkflowComplete(context, event); +// }); // TODO what if an error? test('loadDataclip should fetch a dataclip', async (t) => { diff --git a/packages/ws-worker/test/channels/attempt.test.ts b/packages/ws-worker/test/channels/attempt.test.ts index e8d37d6a3..c387b563c 100644 --- a/packages/ws-worker/test/channels/attempt.test.ts +++ b/packages/ws-worker/test/channels/attempt.test.ts @@ -25,7 +25,7 @@ test('loadAttempt should return an execution plan and options', async (t) => { ...attempts['attempt-1'], options: { sanitize: 'obfuscate', - timeout: 10, + runTimeout: 10, }, }; @@ -45,7 +45,8 @@ test('loadAttempt should return an execution plan and options', async (t) => { }, ], }); - t.deepEqual(options, attempt.options); + t.is(options.sanitize, 'obfuscate'); + t.is(options.attemptTimeoutMs, 10); }); test('should join an attempt channel with a token', async (t) => { @@ -56,7 +57,7 @@ test('should join an attempt channel with a token', async (t) => { join: () => ({ status: 'ok' }), [GET_ATTEMPT]: () => ({ id: 'a', - options: { timeout: 10 }, + options: { runTimeout: 10 }, }), }), }); @@ -70,7 +71,7 @@ test('should join an attempt channel with a token', async (t) => { t.truthy(channel); t.deepEqual(plan, { id: 'a', jobs: [] }); - t.deepEqual(options, { timeout: 10 }); + t.deepEqual(options, { attemptTimeoutMs: 10 }); }); test('should fail to join an attempt channel with an invalid token', async (t) => { diff --git a/packages/ws-worker/test/events/attempt-complete.test.ts b/packages/ws-worker/test/events/attempt-complete.test.ts new file mode 100644 index 000000000..46c7669bd --- /dev/null +++ b/packages/ws-worker/test/events/attempt-complete.test.ts @@ -0,0 +1,144 @@ +import test from 'ava'; +import handleAttemptComplete from '../../src/events/atttempt-complete'; + +import { mockChannel } from '../../src/mock/sockets'; +import { ATTEMPT_COMPLETE, ATTEMPT_LOG } from '../../src/events'; +import { createAttemptState } from '../../src/util'; + +test('should send an attempt:complete event', async (t) => { + const result = { answer: 42 }; + const plan = { id: 'attempt-1', jobs: [] }; + + const state = createAttemptState(plan); + state.dataclips = { + x: result, + }; + state.lastDataclipId = 'x'; + + const channel = mockChannel({ + [ATTEMPT_LOG]: () => true, + [ATTEMPT_COMPLETE]: (evt) => { + t.deepEqual(evt.final_dataclip_id, 'x'); + }, + }); + + const event = {}; + + const context = { channel, state, onFinish: () => {} }; + await handleAttemptComplete(context, event); +}); + +test('should call onFinish with final dataclip', async (t) => { + const result = { answer: 42 }; + const plan = { id: 'attempt-1', jobs: [] }; + + const state = createAttemptState(plan); + state.dataclips = { + x: result, + }; + state.lastDataclipId = 'x'; + + const channel = mockChannel({ + [ATTEMPT_LOG]: () => true, + [ATTEMPT_COMPLETE]: () => true, + }); + + const context = { + channel, + state, + onFinish: ({ state: finalState }) => { + t.deepEqual(result, finalState); + }, + }; + + const event = { state: result }; + + await handleAttemptComplete(context, event); +}); + +test('should send a reason log and return reason for success', async (t) => { + const result = { answer: 42 }; + const plan = { id: 'attempt-1', jobs: [] }; + + const state = createAttemptState(plan); + state.dataclips = { + x: result, + }; + state.lastDataclipId = 'x'; + + let logEvent; + let completeEvent; + + const channel = mockChannel({ + [ATTEMPT_LOG]: (e) => { + logEvent = e; + }, + [ATTEMPT_COMPLETE]: (e) => { + completeEvent = e; + }, + }); + + const context = { + channel, + state, + onFinish: ({ state: finalState }) => { + t.deepEqual(result, finalState); + }, + }; + + const event = { state: result }; + + await handleAttemptComplete(context, event); + + t.is(logEvent.message[0], 'Run complete with status: success'); + t.is(completeEvent.reason, 'success'); + t.falsy(completeEvent.error_type); + t.falsy(completeEvent.error_message); +}); + +test('should send a reason log and return reason for fail', async (t) => { + const result = { answer: 42 }; + const plan = { id: 'attempt-1', jobs: [{ id: 'x' }] }; + + const state = createAttemptState(plan); + state.dataclips = { + x: result, + }; + state.lastDataclipId = 'x'; + state.reasons = { + x: { + reason: 'fail', + error_message: 'err', + error_type: 'TEST', + }, + }; + + let logEvent; + let completeEvent; + + const channel = mockChannel({ + [ATTEMPT_LOG]: (e) => { + logEvent = e; + }, + [ATTEMPT_COMPLETE]: (e) => { + completeEvent = e; + }, + }); + + const context = { + channel, + state, + onFinish: ({ state: finalState }) => { + t.deepEqual(result, finalState); + }, + }; + + const event = { state: result }; + + await handleAttemptComplete(context, event); + + t.is(logEvent.message[0], 'Run complete with status: fail\nTEST: err'); + t.is(completeEvent.reason, 'fail'); + t.is(completeEvent.error_type, 'TEST'); + t.is(completeEvent.error_message, 'err'); +}); diff --git a/packages/ws-worker/test/events/attempt-error.test.ts b/packages/ws-worker/test/events/attempt-error.test.ts new file mode 100644 index 000000000..0d1f540a9 --- /dev/null +++ b/packages/ws-worker/test/events/attempt-error.test.ts @@ -0,0 +1,138 @@ +import test from 'ava'; +import onAttemptError from '../../src/events/attempt-error'; + +import { mockChannel } from '../../src/mock/sockets'; +import { ATTEMPT_COMPLETE, ATTEMPT_LOG, RUN_COMPLETE } from '../../src/events'; +import { createAttemptState } from '../../src/util'; + +const plan = { id: 'attempt-1', jobs: [] }; + +test('attemptError should trigger runComplete with a reason', async (t) => { + const jobId = 'job-1'; + + const state = createAttemptState(plan); + state.lastDataclipId = 'x'; + state.activeRun = 'b'; + state.activeJob = jobId; + + const channel = mockChannel({ + [ATTEMPT_LOG]: () => true, + [RUN_COMPLETE]: (evt) => { + t.is(evt.reason, 'crash'); + t.is(evt.error_message, 'it crashed'); + return true; + }, + [ATTEMPT_COMPLETE]: () => true, + }); + + const event = { + severity: 'crash', + type: 'Err', + message: 'it crashed', + }; + + const context = { channel, state, onFinish: () => {} }; + + await onAttemptError(context, event); +}); + +test('workflow error should send reason to onFinish', async (t) => { + const jobId = 'job-1'; + + const state = createAttemptState(plan); + state.lastDataclipId = 'x'; + state.activeRun = 'b'; + state.activeJob = jobId; + + const channel = mockChannel({ + [ATTEMPT_LOG]: () => true, + [RUN_COMPLETE]: (evt) => true, + [ATTEMPT_COMPLETE]: () => true, + }); + + const event = { + error: { + severity: 'crash', + type: 'Err', + message: 'it crashed', + }, + state: {}, + }; + + const context = { + channel, + state, + onFinish: (evt) => { + t.is(evt.reason.reason, 'crash'); + }, + }; + + await onAttemptError(context, event); +}); + +test('attemptError should not call job complete if the job is not active', async (t) => { + const state = createAttemptState(plan); + state.lastDataclipId = 'x'; + + const channel = mockChannel({ + [ATTEMPT_LOG]: () => true, + [RUN_COMPLETE]: (evt) => { + t.fail('should not call!'); + return true; + }, + [ATTEMPT_COMPLETE]: () => true, + }); + + const event = { + error: { + severity: 'crash', + type: 'Err', + message: 'it crashed', + }, + state: {}, + }; + + const context = { + channel, + state, + onFinish: () => { + t.pass(); + }, + }; + + await onAttemptError(context, event); +}); + +test('attemptError should log the reason', async (t) => { + const jobId = 'job-1'; + + const state = createAttemptState({ + id: 'attempt-1', + jobs: [{ id: 'job-1' }], + }); + state.lastDataclipId = 'x'; + state.activeRun = 'b'; + state.activeJob = jobId; + + const event = { + severity: 'crash', + type: 'Err', + message: 'it crashed', + }; + state.reasons['x'] = event; + + let logEvent; + + const channel = mockChannel({ + [ATTEMPT_LOG]: (e) => { + logEvent = e; + }, + [RUN_COMPLETE]: (evt) => true, + [ATTEMPT_COMPLETE]: () => true, + }); + + const context = { channel, state, onFinish: () => {} }; + + await onAttemptError(context, event); + t.is(logEvent.message[0], 'Run complete with status: crash\nErr: it crashed'); +}); diff --git a/packages/ws-worker/test/lightning.test.ts b/packages/ws-worker/test/lightning.test.ts index b110bcc85..99242cc95 100644 --- a/packages/ws-worker/test/lightning.test.ts +++ b/packages/ws-worker/test/lightning.test.ts @@ -49,18 +49,15 @@ const getAttempt = (ext = {}, jobs?: any) => ({ ...ext, }); -test.serial( - `events: lightning should respond to a claim ${e.CLAIM} event`, - (t) => { - return new Promise((done) => { - lng.on(e.CLAIM, (evt) => { - const response = evt.payload; - t.deepEqual(response, []); - done(); - }); +test.serial(`events: lightning should respond to a ${e.CLAIM} event`, (t) => { + return new Promise((done) => { + lng.on(e.CLAIM, (evt) => { + const response = evt.payload; + t.deepEqual(response, []); + done(); }); - } -); + }); +}); test.serial( `events: lightning should respond to a ${e.CLAIM} event with an attempt id and token`, @@ -294,22 +291,20 @@ test.serial( `events: lightning should receive a ${e.RUN_COMPLETE} event even if the attempt fails`, (t) => { return new Promise((done) => { - // This attempt should timeout - const attempt = getAttempt({ options: { timeout: 100 } }, [ + const attempt = getAttempt({}, [ { id: 'z', adaptor: '@openfn/language-common@1.0.0', - body: 'wait(1000)', + body: 'err()', }, ]); lng.onSocketEvent(e.RUN_COMPLETE, attempt.id, ({ payload }) => { - t.not(payload.reason, 'success'); + t.is(payload.reason, 'fail'); t.pass('called run complete'); }); lng.onSocketEvent(e.ATTEMPT_COMPLETE, attempt.id, ({ payload }) => { - t.not(payload.reason, 'success'); done(); }); @@ -632,6 +627,63 @@ test.serial( } ); +test.serial(`worker should send a success reason in the logs`, (t) => { + return new Promise((done) => { + let log; + + const attempt = { + id: 'attempt-1', + jobs: [ + { + body: 'fn((s) => { return s })', + }, + ], + }; + + lng.onSocketEvent(e.ATTEMPT_LOG, attempt.id, ({ payload }) => { + if (payload.message[0].match(/Run complete with status: success/)) { + log = payload.message[0]; + } + }); + + lng.onSocketEvent(e.ATTEMPT_COMPLETE, attempt.id, () => { + t.truthy(log); + done(); + }); + + lng.enqueueAttempt(attempt); + }); +}); + +test.serial(`worker should send a fail reason in the logs`, (t) => { + return new Promise((done) => { + let log; + + const attempt = { + id: 'attempt-1', + jobs: [ + { + body: 'fn((s) => { throw "blah" })', + }, + ], + }; + + lng.onSocketEvent(e.ATTEMPT_LOG, attempt.id, ({ payload }) => { + if (payload.message[0].match(/Run complete with status: fail/)) { + log = payload.message[0]; + } + }); + + lng.onSocketEvent(e.ATTEMPT_COMPLETE, attempt.id, () => { + t.truthy(log); + t.regex(log, /JobError: blah/i); + done(); + }); + + lng.enqueueAttempt(attempt); + }); +}); + // Note that this test HAS to be last // Remember this uses the mock engine, so it's not a good test of workerpool's behaviours test.serial( diff --git a/packages/ws-worker/test/reasons.test.ts b/packages/ws-worker/test/reasons.test.ts index 8224b4ee5..cbb50208a 100644 --- a/packages/ws-worker/test/reasons.test.ts +++ b/packages/ws-worker/test/reasons.test.ts @@ -25,7 +25,6 @@ test.before(async () => { // Note: this is the REAL engine, not a mock engine = await createRTE({ maxWorkers: 1, - purge: false, logger, autoinstall: { handleIsInstalled: async () => false, @@ -40,7 +39,7 @@ test.before(async () => { test.after(async () => engine.destroy()); // Wrap up an execute call, capture the on complete state -const execute = async (plan) => +const execute = async (plan, options = {}) => new Promise<{ reason: ExitReason; state: any }>((done) => { // Ignore all channel events // In these test we assume that the correct messages are sent to the channel @@ -57,7 +56,7 @@ const execute = async (plan) => }; // @ts-ignore - doExecute(channel, engine, logger, plan, {}, onFinish); + doExecute(channel, engine, logger, plan, options, onFinish); }); test('success', async (t) => { @@ -222,6 +221,22 @@ test('exception: autoinstall error', async (t) => { ); }); +test('kill: timeout', async (t) => { + const plan = createPlan({ + id: 'x', + expression: 'export default [(s) => { while(true) { } }]', + }); + + const options = { + attemptTimeoutMs: 100, + }; + + const { reason } = await execute(plan, options); + t.is(reason.reason, 'kill'); + t.is(reason.error_type, 'TimeoutError'); + t.is(reason.error_message, 'Workflow failed to return within 100ms'); +}); + test.todo('crash: workflow validation error'); test.todo('fail: adaptor error'); test.todo('crash: import error'); diff --git a/packages/ws-worker/test/util/convert-attempt.test.ts b/packages/ws-worker/test/util/convert-attempt.test.ts index 9e8f19adf..1aa06a967 100644 --- a/packages/ws-worker/test/util/convert-attempt.test.ts +++ b/packages/ws-worker/test/util/convert-attempt.test.ts @@ -64,7 +64,7 @@ test('convert a single job with options', (t) => { edges: [], options: { sanitize: 'obfuscate', - timeout: 10, + attemptTimeoutMs: 10, }, }; const { plan, options } = convertAttempt(attempt as Attempt); @@ -507,3 +507,55 @@ test('convert edge condition always', (t) => { t.false(job.next.b.hasOwnProperty('condition')); }); + +test('convert random options', (t) => { + const attempt: Partial = { + id: 'w', + options: { + a: 1, + b: 2, + c: 3, + }, + }; + const { options } = convertAttempt(attempt as Attempt); + + t.deepEqual(options, { a: 1, b: 2, c: 3 }); +}); + +test('convert options, mapping timeout', (t) => { + const attempt: Partial = { + id: 'w', + options: { + timeout: 123, + }, + }; + const { options } = convertAttempt(attempt as Attempt); + + t.deepEqual(options, { attemptTimeoutMs: 123 }); +}); + +test('convert options, mapping runTimeout', (t) => { + const attempt: Partial = { + id: 'w', + options: { + runTimeout: 123, + }, + }; + const { options } = convertAttempt(attempt as Attempt); + + t.deepEqual(options, { attemptTimeoutMs: 123 }); +}); + +test('convert options, preferring runTimeout', (t) => { + const attempt: Partial = { + id: 'w', + options: { + runTimeout: 123, + attemptTimeoutMs: 456, + timeout: 789, + }, + }; + const { options } = convertAttempt(attempt as Attempt); + + t.deepEqual(options, { attemptTimeoutMs: 123 }); +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7e6157906..03bcddc98 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -383,16 +383,10 @@ importers: '@openfn/runtime': specifier: workspace:* version: link:../runtime - workerpool: - specifier: ^6.5.1 - version: 6.5.1 devDependencies: '@types/node': specifier: ^18.15.13 version: 18.15.13 - '@types/workerpool': - specifier: ^6.4.4 - version: 6.4.4 ava: specifier: 5.3.1 version: 5.3.1 @@ -412,6 +406,8 @@ importers: specifier: ^5.1.6 version: 5.1.6 + packages/engine-multi/tmp/a/b/c: {} + packages/engine-multi/tmp/repo: {} packages/lightning-mock: @@ -1921,12 +1917,6 @@ packages: resolution: {integrity: sha512-ONpcZAEYlbPx4EtJwfTyCDQJGUpKf4sEcuySdCVjK5Fj/3vHp5HII1fqa1/+qrsLnpYELCQTfVW/awsGJePoIg==} dev: true - /@types/workerpool@6.4.4: - resolution: {integrity: sha512-rpYFug3QyKzQ7+y/x8BCTEseMorTyr9DiY3ao5KxzWJPtFyx/HL0SSLtJlRjUSpBeaMd/zn7hnLaWOb8WRFnnQ==} - dependencies: - '@types/node': 18.15.13 - dev: true - /@types/wrap-ansi@3.0.0: resolution: {integrity: sha512-ltIpx+kM7g/MLRZfkbL7EsCEjfzCcScLpkg37eXEtx5kmrAKBkTJwd1GIAjDSL8wTpM6Hzn5YO4pSb91BEwu1g==} dev: false @@ -7821,10 +7811,6 @@ packages: resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==} dev: false - /workerpool@6.5.1: - resolution: {integrity: sha512-Fs4dNYcsdpYSAfVxhnl1L5zTksjvOJxtC5hzMNl+1t9B8hTJTdKDyZ5ju7ztgPy+ft9tBFXoOlDNiOT9WUXZlA==} - dev: false - /wrap-ansi@6.2.0: resolution: {integrity: sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==} engines: {node: '>=8'}