Skip to content

Commit

Permalink
CLDSRV-574 implement KMS health check
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolas2bert committed Nov 21, 2024
1 parent 712b474 commit 86149db
Show file tree
Hide file tree
Showing 5 changed files with 377 additions and 20 deletions.
66 changes: 66 additions & 0 deletions lib/kms/Cache.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
class Cache {
constructor() {
this.lastChecked = null;
this.result = null;
}

/**
* Retrieves the cached result with the last checked timestamp.
* @returns {object|null} An object containing the result and lastChecked, or null if not set.
*/
getResult() {
if (!this.result) {
return null;
}

return {
...this.result,
lastChecked: this.lastChecked ? new Date(this.lastChecked).toISOString() : null,
};
}

/**
* Retrieves the last checked timestamp.
* @returns {number|null} The timestamp of the last check or null if never checked.
*/
getLastChecked() {
return this.lastChecked;
}

/**
* Updates the cache with a new result and timestamp.
* @param {object} result - The result to cache.
* @returns {undefined}
*/
setResult(result) {
this.lastChecked = Date.now();
this.result = result;
}

/**
* Determines if the cache should be refreshed based on the last checked time.
* @param {number} duration - Duration in milliseconds for cache validity.
* @returns {boolean} true if the cache should be refreshed, else false.
*/
shouldRefresh(duration = 1 * 60 * 60 * 1000) { // Default: 1 hour
if (!this.lastChecked) {
return true;
}

const now = Date.now();
const elapsed = now - this.lastChecked;
const jitter = Math.floor(Math.random() * 15 * 60 * 1000); // Up to 15 minutes
return elapsed > (duration - jitter);
}

/**
* Clears the cache.
* @returns {undefined}
*/
clear() {
this.lastChecked = null;
this.result = null;
}
}

module.exports = Cache;
58 changes: 43 additions & 15 deletions lib/kms/wrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ const KMIPClient = require('arsenal').network.kmipClient;
const { KmsAWSClient } = require('arsenal').network;
const Common = require('./common');
const vault = require('../auth/vault');
const Cache = require('./Cache');
const cache = new Cache();
let scalityKMS;
let scalityKMSImpl;
try {
Expand Down Expand Up @@ -360,21 +362,47 @@ class KMS {
[implName]: { code: 200, message: 'OK' },
});
}
return client.healthcheck(log, err => {
const respBody = {};
if (err) {
respBody[implName] = {
error: err.description,
code: err.code,
};
} else {
respBody[implName] = {
code: 200,
message: 'OK',
};
}
return cb(null, respBody);
});

const cachedResult = cache.getResult();
logger.debug('current KMS cache state', { result: cachedResult });

const shouldRefreshCache = cache.shouldRefresh();

if (shouldRefreshCache) {
logger.debug('health check for KMS backend');
return client.healthcheck(log, (err) => {
let res;
if (err) {
res = {
// The following response makes sure that if KMS is down,
// cloudserver health check is still healthy.
// Simply including an error code in the response won't cause the health check to fail.
// Instead, the healthCheck logic detects errors by checking for the "error" field.
code: err.code,
message: 'KMS health check failed',
description: err.description,
};
logger.warn('KMS health check failed', { errorCode: err.code, error: err.description });
} else {
res = {
code: 200,
message: 'OK',
};
logger.info('KMS health check succeeded', { res });
}

cache.setResult(res);
const updatedResult = cache.getResult();
logger.debug('updated KMS cache:', { result: updatedResult });

const respBody = { [implName]: updatedResult };
return cb(null, respBody);
});
}

// Use cached healthcheck result if within a 1-hour window
logger.debug('using cached KMS health check', { cachedResult });
return cb(null, { [implName]: cachedResult });
}
}

Expand Down
7 changes: 2 additions & 5 deletions lib/utilities/healthcheckHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const { data } = require('../data/wrapper');
const vault = require('../auth/vault');
const metadata = require('../metadata/wrapper');
const async = require('async');
const kms = require('../kms/wrapper');

// current function utility is minimal, but will be expanded
function isHealthy() {
Expand Down Expand Up @@ -40,15 +41,11 @@ function writeResponse(res, error, log, results, cb) {
* @return {undefined}
*/
function clientCheck(flightCheckOnStartUp, log, cb) {
// FIXME S3C-4833 KMS healthchecks have been disabled:
// - they should be reworked to avoid blocking all requests,
// including unencrypted requests
// - they should not prevent Cloudserver from starting up
const clients = [
data,
metadata,
vault,
// kms,
kms,
];
const clientTasks = [];
clients.forEach(client => {
Expand Down
132 changes: 132 additions & 0 deletions tests/unit/encryption/checkHealth.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
const assert = require('assert');
const sinon = require('sinon');
const { errors } = require('arsenal');

const Cache = require('../../../lib/kms/Cache');
const { DummyRequestLogger } = require('../helpers');
const memBackend = require('../../../lib/kms/in_memory/backend');
const kms = require('../../../lib/kms/wrapper');

const log = new DummyRequestLogger();

describe('KMS.checkHealth', () => {
let setResultSpy;
let shouldRefreshStub;
let clock;

beforeEach(() => {
clock = sinon.useFakeTimers({
now: 1625077800000,
toFake: ['Date'],
});

setResultSpy = sinon.spy(Cache.prototype, 'setResult');
shouldRefreshStub = sinon.stub(Cache.prototype, 'shouldRefresh').returns(true);

delete memBackend.backend.healthcheck;
});

afterEach(() => {
sinon.restore();
if (clock) {
clock.restore();
}
});

it('should return OK when kms backend does not have healthcheck method', (done) => {
kms.checkHealth(log, (err, result) => {
assert.ifError(err);
assert.deepStrictEqual(result, {
['memoryKms']: { code: 200, message: 'OK' },
});

assert(shouldRefreshStub.notCalled, 'shouldRefresh should not be called');
assert(setResultSpy.notCalled, 'setResult should not be called');

done();
});
});

it('should return OK when healthcheck succeeds', (done) => {
memBackend.backend.healthcheck = sinon.stub().callsFake((log, cb) => cb(null));

kms.checkHealth(log, (err, result) => {
assert.ifError(err);

const expectedLastChecked = new Date(clock.now).toISOString();

assert.deepStrictEqual(result, {
['memoryKms']: { code: 200, message: 'OK', lastChecked: expectedLastChecked },
});

assert(shouldRefreshStub.calledOnce, 'shouldRefresh should be called once');

assert(setResultSpy.calledOnceWithExactly({
code: 200,
message: 'OK',
}));

done();
});
});

it('should return failure message when healthcheck fails', (done) => {
memBackend.backend.healthcheck = sinon.stub().callsFake((log, cb) => cb(errors.InternalError));

kms.checkHealth(log, (err, result) => {
assert.ifError(err);

const expectedLastChecked = new Date(clock.now).toISOString();

assert.deepStrictEqual(result, {
['memoryKms']: {
code: 500,
message: 'KMS health check failed',
description: 'We encountered an internal error. Please try again.',
lastChecked: expectedLastChecked,
},
});

assert(shouldRefreshStub.calledOnce, 'shouldRefresh should be called once');

assert(setResultSpy.calledOnceWithExactly({
code: 500,
message: 'KMS health check failed',
description: 'We encountered an internal error. Please try again.',
}));

done();
});
});

it('should use cached result when not refreshing', (done) => {
memBackend.backend.healthcheck = sinon.stub().callsFake((log, cb) => cb(null));
// first call to populate the cache
kms.checkHealth(log, err => {
assert.ifError(err);
shouldRefreshStub.returns(false);

// second call should use the cached result
kms.checkHealth(log, (err, result) => {
assert.ifError(err);

const expectedLastChecked = new Date(clock.now).toISOString();
assert.deepStrictEqual(result, {
['memoryKms']: {
code: 200,
message: 'OK',
lastChecked: expectedLastChecked,
},
});

// once each call
assert.strictEqual(shouldRefreshStub.callCount, 2, 'shouldRefresh should be called twice');

// only the first call
assert.strictEqual(setResultSpy.callCount, 1, 'setResult should be called once');

done();
});
});
});
});
Loading

0 comments on commit 86149db

Please sign in to comment.