Skip to content

Commit

Permalink
v2 liveness probe to cover Solana indexing flakiness (#3295)
Browse files Browse the repository at this point in the history
### Description

Will restart the pod only if both of the following hold:
- the relayer's last known Solana message nonce is lower than the
validators' latest checkpoint index from 5 mins ago
- the relayer's last known Solana message nonce hasn't increased in the
last 5 mins (to avoid a case where the relayer has just spun up and is
trying to catch up to the validators)

Fixes hyperlane-xyz/issues#1102

### Drive-by changes

<!--
Are there any minor or drive-by changes also included?
-->

### Related issues

<!--
- Fixes #[issue number here]
-->

### Backward compatibility

<!--
Are these changes backward compatible? Are there any infrastructure
implications, e.g. changes that would prohibit deploying older commits
using this infra tooling?

Yes/No
-->

### Testing

<!--
What kind of testing have these changes undergone?

None/Manual/Unit Tests
-->
  • Loading branch information
tkporter authored Feb 22, 2024
1 parent fa48113 commit 4141ba9
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 2 deletions.
4 changes: 3 additions & 1 deletion rust/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ RUN apt-get update && \
apt-get install -y \
openssl \
ca-certificates \
tini && \
tini \
curl \
jq && \
rm -rf /var/lib/apt/lists/*

WORKDIR /app
Expand Down
50 changes: 50 additions & 0 deletions rust/helm/hyperlane-agent/templates/relayer-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,56 @@ spec:
ports:
- name: metrics
containerPort: {{ .Values.hyperlane.metrics.port }}
{{- if .Values.hyperlane.relayer.livenessProbe }}
{{/*
 * Liveness probe that fails (so the pod gets restarted) only when the relayer's Solana index is BOTH
 * behind the validators AND not increasing.
 *
 * It's probably better to not have this hardcoded into the helm chart and to instead parametrize this -
 * however passing in multi-line strings is a total pain in Helm, especially with our setup of using
 * CLI args to set values. So for now, we'll just hardcode this in the chart and enable it only if
 * a liveness probe is asked for.
 */}}
livenessProbe:
  exec:
    command:
      - /bin/sh
      - -c
      - |
        # Requires curl and jq to be available in the container image.
        # Weird logging practices courtesy of https://stackoverflow.com/a/75257695
        # otherwise they get swallowed: probe output is written to PID 1's stdout so that it
        # shows up in the container logs.

        # Make a query to see the difference between the relayer's Solana index and the highest validator Solana index.
        # We look at the highest validator index from 5 mins ago to give the relayer some time to catch up.
        RELAYER_CMP_VALIDATOR_QUERY=$(curl 'http://prometheus-server.monitoring.svc.cluster.local/api/v1/query' --data-urlencode 'query=max by (origin) (hyperlane_last_known_message_nonce{phase="processor_loop", hyperlane_deployment="mainnet2", origin="solana"}) - max by (origin) (label_replace(hyperlane_latest_checkpoint{hyperlane_deployment="mainnet2", chain="solana"} offset 5m, "origin", "$1", "chain", "(.*)"))')
        echo "Liveness probe: relayer Solana index - validator Solana index: $RELAYER_CMP_VALIDATOR_QUERY" > /proc/1/fd/1

        # If the value is negative, the relayer is behind the validators. This env var
        # will be empty if the relayer is not behind the validators, so the presence of
        # this env var is a signal that the relayer is unhealthy.
        # NOTE(review): Prometheus returns sample values as JSON strings, so `. < "0"` is a
        # string comparison in jq; it relies on "-" sorting before "0". Confirm this is intended.
        # printf with a quoted variable is used instead of an unquoted echo so the JSON reaches
        # jq unmodified (no word splitting, globbing or escape processing by the shell).
        RELAYER_CMP_VALIDATOR_NEGATIVE=$(printf '%s\n' "$RELAYER_CMP_VALIDATOR_QUERY" | jq -r '.data.result[0].value[1] | select(. < "0")')
        echo "Liveness probe: relayer Solana index - validator Solana negative value: $RELAYER_CMP_VALIDATOR_NEGATIVE" > /proc/1/fd/1

        # Make a query to see if the relayer's Solana index is increasing over the last 5 mins.
        RELAYER_SOLANA_INDEX_DERIV_QUERY=$(curl 'http://prometheus-server.monitoring.svc.cluster.local/api/v1/query' --data-urlencode 'query=deriv(hyperlane_last_known_message_nonce{phase="processor_loop", hyperlane_deployment="mainnet2", origin="solana", remote="any"}[5m])')
        # Log the raw query response (the variable assigned just above).
        echo "Liveness probe: relayer Solana index deriv: $RELAYER_SOLANA_INDEX_DERIV_QUERY" > /proc/1/fd/1

        # This env var will be empty if the value is increasing, so the presence of this
        # env var is a signal that the relayer is unhealthy.
        # NOTE(review): same string-comparison caveat as above for `. < "0.0001"`.
        RELAYER_SOLANA_INDEX_DERIV_VALUE_NOT_INCREASING=$(printf '%s\n' "$RELAYER_SOLANA_INDEX_DERIV_QUERY" | jq -r '.data.result[0].value[1] | select(. < "0.0001")')
        echo "Liveness probe: relayer Solana index deriv value not increasing: $RELAYER_SOLANA_INDEX_DERIV_VALUE_NOT_INCREASING" > /proc/1/fd/1

        # If either is empty, the relayer is healthy because the relayer is not behind the validators, or the relayer
        # is currently catching up to the validators
        if [ -z "$RELAYER_CMP_VALIDATOR_NEGATIVE" ] || [ -z "$RELAYER_SOLANA_INDEX_DERIV_VALUE_NOT_INCREASING" ]; then
          echo "Liveness probe: Relayer is healthy" > /proc/1/fd/1
          exit 0
        else
          echo "Liveness probe: Relayer is unhealthy" > /proc/1/fd/1
          exit 1
        fi
  # Give the relayer 5 minutes after startup before the first probe, then probe every minute.
  initialDelaySeconds: 300
  periodSeconds: 60
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
2 changes: 1 addition & 1 deletion typescript/infra/config/environments/mainnet2/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ const hyperlane: RootAgentConfig = {
connectionType: AgentConnectionType.HttpFallback,
docker: {
repo,
tag: '49a581b-20240203-151524',
tag: '42d50c6-20240221-113013',
},
blacklist: [
...releaseCandidateHelloworldMatchingList,
Expand Down
8 changes: 8 additions & 0 deletions typescript/infra/src/agents/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,19 @@ export class RelayerHelmManager extends OmniscientAgentHelmManager {
}

async helmValues(): Promise<HelmRootAgentValues> {
// Only use the liveness probe for the mainnet2 Hyperlane context
// and if solana is a relayer chain.
let livenessProbe =
this.context === Contexts.Hyperlane &&
this.environment === 'mainnet2' &&
this.config.contextChainNames.relayer.includes('solana');

const values = await super.helmValues();
values.hyperlane.relayer = {
enabled: true,
aws: this.config.requiresAwsCredentials,
config: await this.config.buildConfig(),
livenessProbe,
};

const signers = await this.config.signers();
Expand Down
1 change: 1 addition & 0 deletions typescript/infra/src/config/agent/relayer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ export interface RelayerConfig
// Relayer-specific Helm values; they live under `.hyperlane.relayer` in the values file.
export interface HelmRelayerValues extends HelmStatefulSetValues {
  // Relayer configuration handed to the chart, when one is provided.
  config?: RelayerConfig;
  // AWS flag for the relayer (the Helm manager fills it from the agent config's `requiresAwsCredentials`).
  aws: boolean;
  // When truthy, the relayer statefulset template renders its liveness probe.
  livenessProbe?: boolean;
}

Expand Down

0 comments on commit 4141ba9

Please sign in to comment.