From 4352e97b354429ecab7eb9d564d0afa59c9d6ce7 Mon Sep 17 00:00:00 2001 From: Nicolas Humbert Date: Fri, 8 Nov 2024 21:12:42 +0100 Subject: [PATCH] CLDSRV-574 implement KMS health check --- lib/kms/Cache.js | 65 +++++++++++ lib/kms/wrapper.js | 58 +++++++--- lib/utilities/healthcheckHandler.js | 7 +- tests/unit/encryption/checkHealth.js | 132 ++++++++++++++++++++++ tests/unit/encryption/healthCheckCache.js | 132 ++++++++++++++++++++++ 5 files changed, 374 insertions(+), 20 deletions(-) create mode 100644 lib/kms/Cache.js create mode 100644 tests/unit/encryption/checkHealth.js create mode 100644 tests/unit/encryption/healthCheckCache.js diff --git a/lib/kms/Cache.js b/lib/kms/Cache.js new file mode 100644 index 0000000000..c0ef71942a --- /dev/null +++ b/lib/kms/Cache.js @@ -0,0 +1,65 @@ +class Cache { + constructor() { + this.lastChecked = null; + this.result = null; + } + + /** + * Retrieves the cached result with the last checked timestamp. + * @returns {object|null} An object containing the result and lastChecked, or null if not set. + */ + getResult() { + if (!this.result) { + return null; + } + + return Object.assign({}, this.result, { + lastChecked: this.lastChecked ? new Date(this.lastChecked).toISOString() : null, + }); + } + + /** + * Retrieves the last checked timestamp. + * @returns {number|null} The timestamp of the last check or null if never checked. + */ + getLastChecked() { + return this.lastChecked; + } + + /** + * Updates the cache with a new result and timestamp. + * @param {object} result - The result to cache. + * @returns {undefined} + */ + setResult(result) { + this.lastChecked = Date.now(); + this.result = result; + } + + /** + * Determines if the cache should be refreshed based on the last checked time. + * @param {number} duration - Duration in milliseconds for cache validity. + * @returns {boolean} true if the cache should be refreshed, else false. + */ + shouldRefresh(duration = 1 * 60 * 60 * 1000) { // Default: 1 hour + if (!this.lastChecked) { + return true; + } + + const now = Date.now(); + const elapsed = now - this.lastChecked; + const jitter = Math.floor(Math.random() * 15 * 60 * 1000); // Up to 15 minutes + return elapsed > (duration - jitter); + } + + /** + * Clears the cache. + * @returns {undefined} + */ + clear() { + this.lastChecked = null; + this.result = null; + } +} + +module.exports = Cache; diff --git a/lib/kms/wrapper.js b/lib/kms/wrapper.js index 4a927f9d94..51f8823de7 100644 --- a/lib/kms/wrapper.js +++ b/lib/kms/wrapper.js @@ -8,6 +8,8 @@ const inMemory = require('./in_memory/backend').backend; const file = require('./file/backend'); const KMIPClient = require('arsenal').network.kmipClient; const Common = require('./common'); +const Cache = require('./Cache'); +const cache = new Cache(); let scalityKMS; let scalityKMSImpl; try { @@ -306,21 +308,47 @@ class KMS { [implName]: { code: 200, message: 'OK' }, }); } - return client.healthcheck(log, err => { - const respBody = {}; - if (err) { - respBody[implName] = { - error: err.description, - code: err.code, - }; - } else { - respBody[implName] = { - code: 200, - message: 'OK', - }; - } - return cb(null, respBody); - }); + + const cachedResult = cache.getResult(); + logger.debug('current KMS cache state', { result: cachedResult }); + + const shouldRefreshCache = cache.shouldRefresh(); + + if (shouldRefreshCache) { + logger.debug('health check for KMS backend'); + return client.healthcheck(log, err => { + let res; + if (err) { + res = { + // The following response makes sure that if KMS is down, + // cloudserver health check is still healthy. + // Simply including an error code in the response won't cause the health check to fail. + // Instead, the healthCheck logic detects errors by checking for the "error" field. + code: err.code, + message: 'KMS health check failed', + description: err.description, + }; + logger.warn('KMS health check failed', { errorCode: err.code, error: err.description }); + } else { + res = { + code: 200, + message: 'OK', + }; + logger.info('KMS health check succeeded', { res }); + } + + cache.setResult(res); + const updatedResult = cache.getResult(); + logger.debug('updated KMS cache:', { result: updatedResult }); + + const respBody = { [implName]: updatedResult }; + return cb(null, respBody); + }); + } + + // Use cached healthcheck result if within a 1-hour window + logger.debug('using cached KMS health check', { cachedResult }); + return cb(null, { [implName]: cachedResult }); } } diff --git a/lib/utilities/healthcheckHandler.js b/lib/utilities/healthcheckHandler.js index 5bb06c53b2..4b6340e028 100644 --- a/lib/utilities/healthcheckHandler.js +++ b/lib/utilities/healthcheckHandler.js @@ -4,6 +4,7 @@ const { data } = require('../data/wrapper'); const vault = require('../auth/vault'); const metadata = require('../metadata/wrapper'); const async = require('async'); +const kms = require('../kms/wrapper'); // current function utility is minimal, but will be expanded function isHealthy() { @@ -40,15 +41,11 @@ function writeResponse(res, error, log, results, cb) { * @return {undefined} */ function clientCheck(flightCheckOnStartUp, log, cb) { - // FIXME S3C-4833 KMS healthchecks have been disabled: - // - they should be reworked to avoid blocking all requests, - // including unencrypted requests - // - they should not prevent Cloudserver from starting up const clients = [ data, metadata, vault, - // kms, + kms, ]; const clientTasks = []; clients.forEach(client => { diff --git a/tests/unit/encryption/checkHealth.js b/tests/unit/encryption/checkHealth.js new file mode 100644 index 0000000000..e6b5d90c64 --- /dev/null +++ b/tests/unit/encryption/checkHealth.js @@ -0,0 +1,132 @@ +const assert = require('assert'); +const sinon = require('sinon'); +const { errors } = require('arsenal'); + +const Cache = require('../../../lib/kms/Cache'); +const { DummyRequestLogger } = require('../helpers'); +const memBackend = require('../../../lib/kms/in_memory/backend'); +const kms = require('../../../lib/kms/wrapper'); + +const log = new DummyRequestLogger(); + +describe('KMS.checkHealth', () => { + let setResultSpy; + let shouldRefreshStub; + let clock; + + beforeEach(() => { + clock = sinon.useFakeTimers({ + now: 1625077800000, + toFake: ['Date'], + }); + + setResultSpy = sinon.spy(Cache.prototype, 'setResult'); + shouldRefreshStub = sinon.stub(Cache.prototype, 'shouldRefresh').returns(true); + + delete memBackend.backend.healthcheck; + }); + + afterEach(() => { + sinon.restore(); + if (clock) { + clock.restore(); + } + }); + + it('should return OK when kms backend does not have healthcheck method', done => { + kms.checkHealth(log, (err, result) => { + assert.ifError(err); + assert.deepStrictEqual(result, { + memoryKms: { code: 200, message: 'OK' }, + }); + + assert(shouldRefreshStub.notCalled, 'shouldRefresh should not be called'); + assert(setResultSpy.notCalled, 'setResult should not be called'); + + done(); + }); + }); + + it('should return OK when healthcheck succeeds', done => { + memBackend.backend.healthcheck = sinon.stub().callsFake((log, cb) => cb(null)); + + kms.checkHealth(log, (err, result) => { + assert.ifError(err); + + const expectedLastChecked = new Date(clock.now).toISOString(); + + assert.deepStrictEqual(result, { + memoryKms: { code: 200, message: 'OK', lastChecked: expectedLastChecked }, + }); + + assert(shouldRefreshStub.calledOnce, 'shouldRefresh should be called once'); + + assert(setResultSpy.calledOnceWithExactly({ + code: 200, + message: 'OK', + })); + + done(); + }); + }); + + it('should return failure message when healthcheck fails', done => { + memBackend.backend.healthcheck = sinon.stub().callsFake((log, cb) => cb(errors.InternalError)); + + kms.checkHealth(log, (err, result) => { + assert.ifError(err); + + const expectedLastChecked = new Date(clock.now).toISOString(); + + assert.deepStrictEqual(result, { + memoryKms: { + code: 500, + message: 'KMS health check failed', + description: 'We encountered an internal error. Please try again.', + lastChecked: expectedLastChecked, + }, + }); + + assert(shouldRefreshStub.calledOnce, 'shouldRefresh should be called once'); + + assert(setResultSpy.calledOnceWithExactly({ + code: 500, + message: 'KMS health check failed', + description: 'We encountered an internal error. Please try again.', + })); + + done(); + }); + }); + + it('should use cached result when not refreshing', done => { + memBackend.backend.healthcheck = sinon.stub().callsFake((log, cb) => cb(null)); + // first call to populate the cache + kms.checkHealth(log, err => { + assert.ifError(err); + shouldRefreshStub.returns(false); + + // second call should use the cached result + kms.checkHealth(log, (err, result) => { + assert.ifError(err); + + const expectedLastChecked = new Date(clock.now).toISOString(); + assert.deepStrictEqual(result, { + memoryKms: { + code: 200, + message: 'OK', + lastChecked: expectedLastChecked, + }, + }); + + // once each call + assert.strictEqual(shouldRefreshStub.callCount, 2, 'shouldRefresh should be called twice'); + + // only the first call + assert.strictEqual(setResultSpy.callCount, 1, 'setResult should be called once'); + + done(); + }); + }); + }); +}); diff --git a/tests/unit/encryption/healthCheckCache.js b/tests/unit/encryption/healthCheckCache.js new file mode 100644 index 0000000000..4d21232e8f --- /dev/null +++ b/tests/unit/encryption/healthCheckCache.js @@ -0,0 +1,132 @@ +const assert = require('assert'); +const sinon = require('sinon'); +const Cache = require('../../../lib/kms/Cache'); + +describe('Cache Class', () => { + let cache; + let sandbox; + + beforeEach(() => { + cache = new Cache(); + sandbox = sinon.createSandbox(); + }); + + afterEach(() => { + sandbox.restore(); + }); + + describe('getResult()', () => { + it('should return null when no result is set', () => { + assert.strictEqual(cache.getResult(), null); + }); + + it('should return the cached result after setResult is called', () => { + const fakeTimestamp = 1625077800000; + sandbox.stub(Date, 'now').returns(fakeTimestamp); + const result = { data: 'test' }; + cache.setResult(result); + const expected = Object.assign({}, result, { + lastChecked: new Date(fakeTimestamp).toISOString(), + }); + assert.deepStrictEqual(cache.getResult(), expected); + }); + }); + + describe('getLastChecked()', () => { + it('should return null when cache has never been set', () => { + assert.strictEqual(cache.getLastChecked(), null); + }); + + it('should return the timestamp after setResult is called', () => { + const fakeTimestamp = 1625077800000; + sandbox.stub(Date, 'now').returns(fakeTimestamp); + cache.setResult({ data: 'test' }); + assert.strictEqual(cache.getLastChecked(), fakeTimestamp); + }); + }); + + describe('setResult()', () => { + it('should set the result and update lastChecked', () => { + const fakeTimestamp = 1625077800000; + sandbox.stub(Date, 'now').returns(fakeTimestamp); + + const result = { data: 'test' }; + cache.setResult(result); + + const expectedResult = Object.assign({}, result, { + lastChecked: new Date(fakeTimestamp).toISOString(), + }); + assert.deepStrictEqual(cache.getResult(), expectedResult); + assert.strictEqual(cache.getLastChecked(), fakeTimestamp); + }); + }); + + describe('shouldRefresh()', () => { + it('should return true if cache has never been set', () => { + assert.strictEqual(cache.shouldRefresh(), true); + }); + + it('should return false if elapsed time is less than duration minus maximum jitter', () => { + const fakeNow = 1625077800000; + const fakeLastChecked = fakeNow - (45 * 60 * 1000); // 45 minutes ago + sandbox.stub(Date, 'now').returns(fakeNow); + sandbox.stub(Math, 'random').returns(0); + cache.lastChecked = fakeLastChecked; + + // elapsed = 45 minutes, duration - jitter = 60 minutes + // 45 < 60 => shouldRefresh = false + assert.strictEqual(cache.shouldRefresh(), false); + }); + + it('should return true if elapsed time is greater than duration minus maximum jitter', () => { + const fakeNow = 1625077800000; + const fakeLastChecked = fakeNow - (61 * 60 * 1000); // 61 minutes ago + sandbox.stub(Date, 'now').returns(fakeNow); + sandbox.stub(Math, 'random').returns(0); + cache.lastChecked = fakeLastChecked; + + // elapsed = 61 minutes, duration - jitter = 60 minutes + // 61 > 60 => shouldRefresh = true + assert.strictEqual(cache.shouldRefresh(), true); + }); + + it('should use custom duration if provided', () => { + const customDuration = 6 * 60 * 60 * 1000; // 6 hours in milliseconds + const fakeNow = 1625077800000; + sandbox.stub(Date, 'now').returns(fakeNow); + + // Elapsed time = 5 hours + const fakeLastChecked1 = fakeNow - (5 * 60 * 60 * 1000); + cache.lastChecked = fakeLastChecked1; + + sandbox.stub(Math, 'random').returns(0); + + // 5 hours < 6 hours => shouldRefresh = false + assert.strictEqual( + cache.shouldRefresh(customDuration), + false, + 'Cache should not refresh within custom duration' + ); + + // Elapsed time = 7 hours + const fakeLastChecked2 = fakeNow - (7 * 60 * 60 * 1000); + cache.lastChecked = fakeLastChecked2; + + // 7 hours > 6 hours => shouldRefresh = true + assert.strictEqual( + cache.shouldRefresh(customDuration), + true, + 'Cache should refresh after custom duration' + ); + }); + }); + + describe('clear()', () => { + it('should reset lastChecked and result to null', () => { + cache.setResult({ data: 'test' }); + cache.clear(); + assert.strictEqual(cache.getResult(), null); + assert.strictEqual(cache.getLastChecked(), null); + }); + }); +});