From 99811fb505ad51d08e4d348a67a5290ff876daab Mon Sep 17 00:00:00 2001 From: Kerem Kazan Date: Mon, 14 Oct 2024 13:26:41 -0400 Subject: [PATCH] adding a timeout to stream metadata health checks to increase failure visibility (#1251) In the event of a hanging request, the health check failure is not registered by the stream metadata service, but it is registered by the load balancer. This causes the service to get terminated, but the service is unable to detect the failure and log the error. This PR allows us to capture and log the error so that we can respond to it better. --- packages/stream-metadata/src/environment.ts | 3 +++ packages/stream-metadata/src/routes/health.ts | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/packages/stream-metadata/src/environment.ts b/packages/stream-metadata/src/environment.ts index d702755ef..6b27efa8c 100644 --- a/packages/stream-metadata/src/environment.ts +++ b/packages/stream-metadata/src/environment.ts @@ -74,6 +74,9 @@ function makeConfig() { tracingEnabled: envMain.TRACING_ENABLED, profilingEnabled: envMain.PROFILING_ENABLED, }, + healthCheck: { + timeout: 5000, // 5 seconds + }, } } diff --git a/packages/stream-metadata/src/routes/health.ts b/packages/stream-metadata/src/routes/health.ts index d14d0cb60..2db1915c8 100644 --- a/packages/stream-metadata/src/routes/health.ts +++ b/packages/stream-metadata/src/routes/health.ts @@ -1,17 +1,28 @@ import { FastifyReply, FastifyRequest } from 'fastify' import { getRiverRegistry } from '../evmRpcClient' +import { config } from '../environment' export async function checkHealth(request: FastifyRequest, reply: FastifyReply) { const logger = request.log.child({ name: checkHealth.name }) // Do a health check on the river registry try { - await getRiverRegistry().getAllNodes() + logger.info('Running riverRegistry health check') + await Promise.race([ + getRiverRegistry().getAllNodes(), + new Promise((_, reject) => + setTimeout( + () => reject(new 
Error('Timed out waiting for the riverRegistry check')), + config.healthCheck.timeout, + ), + ), + ]) + logger.info('Health check passed') // healthy return reply.code(200).send({ status: 'ok' }) } catch (error) { // unhealthy - logger.error(error, 'Failed to get river registry') + logger.error(error, 'Health check failed') return reply.code(500).send({ status: 'error' }) } }