From 62ee73ad7083a2754deb889284fb4e23aa222588 Mon Sep 17 00:00:00 2001 From: Julian Ventura Date: Wed, 4 Dec 2024 10:09:58 -0300 Subject: [PATCH 1/2] Add quorum reached and task responded latency gauges --- aggregator/pkg/aggregator.go | 17 + .../aligned/aggregator_batcher.json | 292 +++++++++++++++++- metrics/metrics.go | 20 ++ 3 files changed, 323 insertions(+), 6 deletions(-) diff --git a/aggregator/pkg/aggregator.go b/aggregator/pkg/aggregator.go index 467b3b52b..daffcb04b 100644 --- a/aggregator/pkg/aggregator.go +++ b/aggregator/pkg/aggregator.go @@ -67,6 +67,9 @@ type Aggregator struct { // Stores the TaskResponse for each batch by batchIdentifierHash batchDataByIdentifierHash map[[32]byte]BatchData + // Stores the start time for each batch of the aggregator by task index + batchStartTimeByIdx map[uint32]time.Time + // This task index is to communicate with the local BLS // Service. // Note: In case of a reboot it can start from 0 again @@ -78,6 +81,7 @@ type Aggregator struct { // - batchCreatedBlockByIdx // - batchDataByIdentifierHash // - nextBatchIndex + // - batchStartTimeByIdx taskMutex *sync.Mutex // Mutex to protect ethereum wallet @@ -124,6 +128,7 @@ func NewAggregator(aggregatorConfig config.AggregatorConfig) (*Aggregator, error batchesIdxByIdentifierHash := make(map[[32]byte]uint32) batchDataByIdentifierHash := make(map[[32]byte]BatchData) batchCreatedBlockByIdx := make(map[uint32]uint64) + batchStartTimeByIdx := make(map[uint32]time.Time) chainioConfig := sdkclients.BuildAllConfig{ EthHttpUrl: aggregatorConfig.BaseConfig.EthRpcUrl, @@ -172,6 +177,7 @@ func NewAggregator(aggregatorConfig config.AggregatorConfig) (*Aggregator, error batchesIdxByIdentifierHash: batchesIdxByIdentifierHash, batchDataByIdentifierHash: batchDataByIdentifierHash, batchCreatedBlockByIdx: batchCreatedBlockByIdx, + batchStartTimeByIdx: batchStartTimeByIdx, nextBatchIndex: nextBatchIndex, taskMutex: &sync.Mutex{}, walletMutex: &sync.Mutex{}, @@ -233,6 +239,7 @@ func (agg 
*Aggregator) handleBlsAggServiceResponse(blsAggServiceResp blsagg.BlsA batchIdentifierHash := agg.batchesIdentifierHashByIdx[blsAggServiceResp.TaskIndex] batchData := agg.batchDataByIdentifierHash[batchIdentifierHash] taskCreatedBlock := agg.batchCreatedBlockByIdx[blsAggServiceResp.TaskIndex] + taskCreatedAt := agg.batchStartTimeByIdx[blsAggServiceResp.TaskIndex] agg.taskMutex.Unlock() agg.AggregatorConfig.BaseConfig.Logger.Info("- Unlocked Resources: Fetching task data") @@ -266,6 +273,9 @@ func (agg *Aggregator) handleBlsAggServiceResponse(blsAggServiceResp blsagg.BlsA agg.telemetry.LogQuorumReached(batchData.BatchMerkleRoot) + // Only observe quorum reached if successful + agg.metrics.ObserveTaskQuorumReached(time.Since(taskCreatedAt)) + agg.logger.Info("Threshold reached", "taskIndex", blsAggServiceResp.TaskIndex, "batchIdentifierHash", "0x"+hex.EncodeToString(batchIdentifierHash[:])) @@ -320,6 +330,8 @@ func (agg *Aggregator) sendAggregatedResponse(batchIdentifierHash [32]byte, batc agg.metrics.IncBumpedGasPriceForAggregatedResponse() agg.telemetry.BumpedTaskGasPrice(batchMerkleRoot, bumpedGasPrice.String()) } + + startTime := time.Now() receipt, err := agg.avsWriter.SendAggregatedResponse( batchIdentifierHash, batchMerkleRoot, @@ -338,6 +350,9 @@ func (agg *Aggregator) sendAggregatedResponse(batchIdentifierHash [32]byte, batc return nil, err } + // We only send the latency metric if the response is successful + agg.metrics.ObserveLatencyForRespondToTask(time.Since(startTime)) + agg.walletMutex.Unlock() agg.logger.Infof("- Unlocked Wallet Resources: Sending aggregated response for batch %s", hex.EncodeToString(batchIdentifierHash[:])) @@ -383,6 +398,7 @@ func (agg *Aggregator) AddNewTask(batchMerkleRoot [32]byte, senderAddress [20]by BatchMerkleRoot: batchMerkleRoot, SenderAddress: senderAddress, } + agg.batchStartTimeByIdx[batchIndex] = time.Now() agg.logger.Info( "Task Info added in aggregator:", "Task", batchIndex, @@ -447,6 +463,7 @@ func (agg *Aggregator) 
ClearTasksFromMaps() { delete(agg.batchCreatedBlockByIdx, i) delete(agg.batchesIdentifierHashByIdx, i) delete(agg.batchDataByIdentifierHash, batchIdentifierHash) + delete(agg.batchStartTimeByIdx, i) } else { agg.logger.Warn("Task not found in maps", "taskIndex", i) } diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 929ff29ea..5440cb9d7 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 4, + "id": 2, "links": [], "liveNow": false, "panels": [ @@ -153,7 +153,6 @@ }, { "datasource": { - "default": true, "type": "prometheus", "uid": "prometheus" }, @@ -2451,7 +2450,32 @@ ] } }, - "overrides": [] + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] }, "gridPos": { "h": 7, @@ -2625,6 +2649,262 @@ } ], "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 61 + }, + "id": 43, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "quantile_over_time(0.95, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "Latency q95", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "quantile_over_time(0.50, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "Latency q50" + } + ], + "title": "Respond to task latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 69 + }, + "id": 44, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "quantile_over_time(0.95, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "Latency q95", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "quantile_over_time(0.50, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "Latency q50" + } + ], + "title": "Quorum reached latency", + "type": 
"timeseries" } ], "refresh": "5s", @@ -2635,13 +2915,13 @@ "list": [] }, "time": { - "from": "now-30m", + "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 9, + "version": 14, "weekStart": "" -} \ No newline at end of file +} diff --git a/metrics/metrics.go b/metrics/metrics.go index dda2f7a04..ba0c187fc 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -21,6 +21,8 @@ type Metrics struct { aggregatorGasCostPaidForBatcherTotal prometheus.Gauge aggregatorNumTimesPaidForBatcher prometheus.Counter numBumpedGasPriceForAggregatedResponse prometheus.Counter + aggregatorRespondToTaskLatency prometheus.Gauge + aggregatorTaskQuorumReachedLatency prometheus.Gauge } const alignedNamespace = "aligned" @@ -59,6 +61,16 @@ func NewMetrics(ipPortAddress string, reg prometheus.Registerer, logger logging. Name: "respond_to_task_gas_price_bumped_count", Help: "Number of times gas price was bumped while sending aggregated response", }), + aggregatorRespondToTaskLatency: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Namespace: alignedNamespace, + Name: "aggregator_respond_to_task_latency", + Help: "Latency of last call to respondToTask on Aligned Service Manager", + }), + aggregatorTaskQuorumReachedLatency: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Namespace: alignedNamespace, + Name: "aggregator_task_quorum_reached_latency", + Help: "Time it takes for a task to reach quorum", + }), } } @@ -116,3 +128,11 @@ func (m *Metrics) AddAggregatorGasPaidForBatcher(value float64) { func (m *Metrics) IncBumpedGasPriceForAggregatedResponse() { m.numBumpedGasPriceForAggregatedResponse.Inc() } + +func (m *Metrics) ObserveLatencyForRespondToTask(elapsed time.Duration) { + m.aggregatorRespondToTaskLatency.Set(elapsed.Seconds()) +} + +func (m *Metrics) ObserveTaskQuorumReached(elapsed time.Duration) { + m.aggregatorTaskQuorumReachedLatency.Set(elapsed.Seconds()) +} From 
a9db04cdaefe252296518921c0e2f5a674b07639 Mon Sep 17 00:00:00 2001 From: Julian Ventura Date: Wed, 4 Dec 2024 17:19:52 -0300 Subject: [PATCH 2/2] Modify dashboard --- .../aligned/aggregator_batcher.json | 98 ++++--------------- 1 file changed, 18 insertions(+), 80 deletions(-) diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 5440cb9d7..52de76921 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -2655,6 +2655,7 @@ "type": "prometheus", "uid": "prometheus" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -2704,22 +2705,10 @@ "value": 80 } ] - } + }, + "unit": "s" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}" - }, - "properties": [ - { - "id": "displayName", - "value": "Latency" - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -2733,7 +2722,7 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom", + "placement": "right", "showLegend": false }, "tooltip": { @@ -2742,40 +2731,21 @@ } }, "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "quantile_over_time(0.95, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])", - "format": "time_series", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "interval": "", - "legendFormat": "{{label_name}}", - "range": true, - "refId": "Latency q95", - "useBackend": false - }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "editorMode": "code", - "expr": "quantile_over_time(0.50, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])", + "expr": 
"aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}", "hide": false, "instant": false, - "legendFormat": "__auto", + "legendFormat": "Latest latency", "range": true, - "refId": "Latency q50" + "refId": "Latency" } ], - "title": "Respond to task latency", + "title": "Latest respond to task latency", "type": "timeseries" }, { @@ -2834,20 +2804,7 @@ ] } }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}" - }, - "properties": [ - { - "id": "displayName", - "value": "Latency" - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -2861,7 +2818,7 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom", + "placement": "right", "showLegend": false }, "tooltip": { @@ -2870,44 +2827,25 @@ } }, "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "quantile_over_time(0.95, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])", - "format": "time_series", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "interval": "", - "legendFormat": "{{label_name}}", - "range": true, - "refId": "Latency q95", - "useBackend": false - }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "editorMode": "code", - "expr": "quantile_over_time(0.50, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])", + "expr": "aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}", "hide": false, "instant": false, - "legendFormat": "__auto", + "legendFormat": "Latest latency", "range": true, - "refId": "Latency q50" + "refId": "A" } ], - "title": "Quorum reached latency", + "title": "Latest quorum reached latency", "type": "timeseries" } ], - "refresh": "5s", + "refresh": "", "schemaVersion": 38, "style": "dark", "tags": [], @@ -2915,13 +2853,13 @@ "list": [] }, 
"time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 14, + "version": 19, "weekStart": "" }