From 62ee73ad7083a2754deb889284fb4e23aa222588 Mon Sep 17 00:00:00 2001
From: Julian Ventura <julian.ventura@lambdaclass.com>
Date: Wed, 4 Dec 2024 10:09:58 -0300
Subject: [PATCH 1/2] Add quorum reached and task responded latency gauges

---
 aggregator/pkg/aggregator.go                  |  17 +
 .../aligned/aggregator_batcher.json           | 292 +++++++++++++++++-
 metrics/metrics.go                            |  20 ++
 3 files changed, 323 insertions(+), 6 deletions(-)

diff --git a/aggregator/pkg/aggregator.go b/aggregator/pkg/aggregator.go
index 467b3b52b..daffcb04b 100644
--- a/aggregator/pkg/aggregator.go
+++ b/aggregator/pkg/aggregator.go
@@ -67,6 +67,9 @@ type Aggregator struct {
 	// Stores the TaskResponse for each batch by batchIdentifierHash
 	batchDataByIdentifierHash map[[32]byte]BatchData
 
+	// Stores the time at which each batch was added to the aggregator, keyed by task index
+	batchStartTimeByIdx map[uint32]time.Time
+
 	// This task index is to communicate with the local BLS
 	// Service.
 	// Note: In case of a reboot it can start from 0 again
@@ -78,6 +81,7 @@ type Aggregator struct {
 	// - batchCreatedBlockByIdx
 	// - batchDataByIdentifierHash
 	// - nextBatchIndex
+	// - batchStartTimeByIdx
 	taskMutex *sync.Mutex
 
 	// Mutex to protect ethereum wallet
@@ -124,6 +128,7 @@ func NewAggregator(aggregatorConfig config.AggregatorConfig) (*Aggregator, error
 	batchesIdxByIdentifierHash := make(map[[32]byte]uint32)
 	batchDataByIdentifierHash := make(map[[32]byte]BatchData)
 	batchCreatedBlockByIdx := make(map[uint32]uint64)
+	batchStartTimeByIdx := make(map[uint32]time.Time)
 
 	chainioConfig := sdkclients.BuildAllConfig{
 		EthHttpUrl:                 aggregatorConfig.BaseConfig.EthRpcUrl,
@@ -172,6 +177,7 @@ func NewAggregator(aggregatorConfig config.AggregatorConfig) (*Aggregator, error
 		batchesIdxByIdentifierHash: batchesIdxByIdentifierHash,
 		batchDataByIdentifierHash:  batchDataByIdentifierHash,
 		batchCreatedBlockByIdx:     batchCreatedBlockByIdx,
+		batchStartTimeByIdx:        batchStartTimeByIdx,
 		nextBatchIndex:             nextBatchIndex,
 		taskMutex:                  &sync.Mutex{},
 		walletMutex:                &sync.Mutex{},
@@ -233,6 +239,7 @@ func (agg *Aggregator) handleBlsAggServiceResponse(blsAggServiceResp blsagg.BlsA
 	batchIdentifierHash := agg.batchesIdentifierHashByIdx[blsAggServiceResp.TaskIndex]
 	batchData := agg.batchDataByIdentifierHash[batchIdentifierHash]
 	taskCreatedBlock := agg.batchCreatedBlockByIdx[blsAggServiceResp.TaskIndex]
+	taskCreatedAt := agg.batchStartTimeByIdx[blsAggServiceResp.TaskIndex]
 	agg.taskMutex.Unlock()
 	agg.AggregatorConfig.BaseConfig.Logger.Info("- Unlocked Resources: Fetching task data")
 
@@ -266,6 +273,9 @@ func (agg *Aggregator) handleBlsAggServiceResponse(blsAggServiceResp blsagg.BlsA
 
 	agg.telemetry.LogQuorumReached(batchData.BatchMerkleRoot)
 
+	// Only observe quorum reached if successful
+	agg.metrics.ObserveTaskQuorumReached(time.Since(taskCreatedAt))
+
 	agg.logger.Info("Threshold reached", "taskIndex", blsAggServiceResp.TaskIndex,
 		"batchIdentifierHash", "0x"+hex.EncodeToString(batchIdentifierHash[:]))
 
@@ -320,6 +330,8 @@ func (agg *Aggregator) sendAggregatedResponse(batchIdentifierHash [32]byte, batc
 		agg.metrics.IncBumpedGasPriceForAggregatedResponse()
 		agg.telemetry.BumpedTaskGasPrice(batchMerkleRoot, bumpedGasPrice.String())
 	}
+
+	startTime := time.Now()
 	receipt, err := agg.avsWriter.SendAggregatedResponse(
 		batchIdentifierHash,
 		batchMerkleRoot,
@@ -338,6 +350,9 @@ func (agg *Aggregator) sendAggregatedResponse(batchIdentifierHash [32]byte, batc
 		return nil, err
 	}
 
+	// We only send the latency metric if the response is successful
+	agg.metrics.ObserveLatencyForRespondToTask(time.Since(startTime))
+
 	agg.walletMutex.Unlock()
 	agg.logger.Infof("- Unlocked Wallet Resources: Sending aggregated response for batch %s", hex.EncodeToString(batchIdentifierHash[:]))
 
@@ -383,6 +398,7 @@ func (agg *Aggregator) AddNewTask(batchMerkleRoot [32]byte, senderAddress [20]by
 		BatchMerkleRoot: batchMerkleRoot,
 		SenderAddress:   senderAddress,
 	}
+	agg.batchStartTimeByIdx[batchIndex] = time.Now()
 	agg.logger.Info(
 		"Task Info added in aggregator:",
 		"Task", batchIndex,
@@ -447,6 +463,7 @@ func (agg *Aggregator) ClearTasksFromMaps() {
 				delete(agg.batchCreatedBlockByIdx, i)
 				delete(agg.batchesIdentifierHashByIdx, i)
 				delete(agg.batchDataByIdentifierHash, batchIdentifierHash)
+				delete(agg.batchStartTimeByIdx, i)
 			} else {
 				agg.logger.Warn("Task not found in maps", "taskIndex", i)
 			}
diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json
index 929ff29ea..5440cb9d7 100644
--- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json
+++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json
@@ -18,7 +18,7 @@
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
-  "id": 4,
+  "id": 2,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -153,7 +153,6 @@
     },
     {
       "datasource": {
-        "default": true,
         "type": "prometheus",
         "uid": "prometheus"
       },
@@ -2451,7 +2450,32 @@
             ]
           }
         },
-        "overrides": []
+        "overrides": [
+          {
+            "__systemRef": "hideSeriesFrom",
+            "matcher": {
+              "id": "byNames",
+              "options": {
+                "mode": "exclude",
+                "names": [
+                  "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}"
+                ],
+                "prefix": "All except:",
+                "readOnly": true
+              }
+            },
+            "properties": [
+              {
+                "id": "custom.hideFrom",
+                "value": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": true
+                }
+              }
+            ]
+          }
+        ]
       },
       "gridPos": {
         "h": 7,
@@ -2625,6 +2649,262 @@
         }
       ],
       "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}"
+            },
+            "properties": [
+              {
+                "id": "displayName",
+                "value": "Latency"
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 61
+      },
+      "id": 43,
+      "interval": "1s",
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "exemplar": false,
+          "expr": "quantile_over_time(0.95, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])",
+          "format": "time_series",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "{{label_name}}",
+          "range": true,
+          "refId": "Latency q95",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "quantile_over_time(0.50, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "Latency q50"
+        }
+      ],
+      "title": "Respond to task latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}"
+            },
+            "properties": [
+              {
+                "id": "displayName",
+                "value": "Latency"
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 69
+      },
+      "id": 44,
+      "interval": "1s",
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "exemplar": false,
+          "expr": "quantile_over_time(0.95, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])",
+          "format": "time_series",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "{{label_name}}",
+          "range": true,
+          "refId": "Latency q95",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "quantile_over_time(0.50, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "Latency q50"
+        }
+      ],
+      "title": "Quorum reached latency",
+      "type": "timeseries"
     }
   ],
   "refresh": "5s",
@@ -2635,13 +2915,13 @@
     "list": []
   },
   "time": {
-    "from": "now-30m",
+    "from": "now-15m",
     "to": "now"
   },
   "timepicker": {},
   "timezone": "browser",
   "title": "System Data",
   "uid": "aggregator",
-  "version": 9,
+  "version": 14,
   "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/metrics/metrics.go b/metrics/metrics.go
index dda2f7a04..ba0c187fc 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -21,6 +21,8 @@ type Metrics struct {
 	aggregatorGasCostPaidForBatcherTotal   prometheus.Gauge
 	aggregatorNumTimesPaidForBatcher       prometheus.Counter
 	numBumpedGasPriceForAggregatedResponse prometheus.Counter
+	aggregatorRespondToTaskLatency         prometheus.Gauge
+	aggregatorTaskQuorumReachedLatency     prometheus.Gauge
 }
 
 const alignedNamespace = "aligned"
@@ -59,6 +61,16 @@ func NewMetrics(ipPortAddress string, reg prometheus.Registerer, logger logging.
 			Name:      "respond_to_task_gas_price_bumped_count",
 			Help:      "Number of times gas price was bumped while sending aggregated response",
 		}),
+		aggregatorRespondToTaskLatency: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
+			Namespace: alignedNamespace,
+			Name:      "aggregator_respond_to_task_latency",
+			Help:      "Latency of last call to respondToTask on Aligned Service Manager",
+		}),
+		aggregatorTaskQuorumReachedLatency: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
+			Namespace: alignedNamespace,
+			Name:      "aggregator_task_quorum_reached_latency",
+			Help:      "Time it takes for a task to reach quorum",
+		}),
 	}
 }
 
@@ -116,3 +128,11 @@ func (m *Metrics) AddAggregatorGasPaidForBatcher(value float64) {
 func (m *Metrics) IncBumpedGasPriceForAggregatedResponse() {
 	m.numBumpedGasPriceForAggregatedResponse.Inc()
 }
+
+func (m *Metrics) ObserveLatencyForRespondToTask(elapsed time.Duration) {
+	m.aggregatorRespondToTaskLatency.Set(elapsed.Seconds())
+}
+
+func (m *Metrics) ObserveTaskQuorumReached(elapsed time.Duration) {
+	m.aggregatorTaskQuorumReachedLatency.Set(elapsed.Seconds())
+}

From a9db04cdaefe252296518921c0e2f5a674b07639 Mon Sep 17 00:00:00 2001
From: Julian Ventura <julian.ventura@lambdaclass.com>
Date: Wed, 4 Dec 2024 17:19:52 -0300
Subject: [PATCH 2/2] Modify dashboard to plot latest latency gauges instead of quantiles

---
 .../aligned/aggregator_batcher.json           | 98 ++++---------------
 1 file changed, 18 insertions(+), 80 deletions(-)

diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json
index 5440cb9d7..52de76921 100644
--- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json
+++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json
@@ -2655,6 +2655,7 @@
         "type": "prometheus",
         "uid": "prometheus"
       },
+      "description": "",
       "fieldConfig": {
         "defaults": {
           "color": {
@@ -2704,22 +2705,10 @@
                 "value": 80
               }
             ]
-          }
+          },
+          "unit": "s"
         },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}"
-            },
-            "properties": [
-              {
-                "id": "displayName",
-                "value": "Latency"
-              }
-            ]
-          }
-        ]
+        "overrides": []
       },
       "gridPos": {
         "h": 8,
@@ -2733,7 +2722,7 @@
         "legend": {
           "calcs": [],
           "displayMode": "list",
-          "placement": "bottom",
+          "placement": "right",
           "showLegend": false
         },
         "tooltip": {
@@ -2742,40 +2731,21 @@
         }
       },
       "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "disableTextWrap": false,
-          "editorMode": "code",
-          "exemplar": false,
-          "expr": "quantile_over_time(0.95, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])",
-          "format": "time_series",
-          "fullMetaSearch": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "interval": "",
-          "legendFormat": "{{label_name}}",
-          "range": true,
-          "refId": "Latency q95",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "quantile_over_time(0.50, aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}[1m])",
+          "expr": "aligned_aggregator_respond_to_task_latency{bot=\"aggregator\"}",
           "hide": false,
           "instant": false,
-          "legendFormat": "__auto",
+          "legendFormat": "Latest latency",
           "range": true,
-          "refId": "Latency q50"
+          "refId": "Latency"
         }
       ],
-      "title": "Respond to task latency",
+      "title": "Latest respond to task latency",
       "type": "timeseries"
     },
     {
@@ -2834,20 +2804,7 @@
             ]
           }
         },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "{bot=\"aggregator\", instance=\"host.docker.internal:9091\", job=\"aligned-aggregator\"}"
-            },
-            "properties": [
-              {
-                "id": "displayName",
-                "value": "Latency"
-              }
-            ]
-          }
-        ]
+        "overrides": []
       },
       "gridPos": {
         "h": 8,
@@ -2861,7 +2818,7 @@
         "legend": {
           "calcs": [],
           "displayMode": "list",
-          "placement": "bottom",
+          "placement": "right",
           "showLegend": false
         },
         "tooltip": {
@@ -2870,44 +2827,25 @@
         }
       },
       "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "disableTextWrap": false,
-          "editorMode": "code",
-          "exemplar": false,
-          "expr": "quantile_over_time(0.95, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])",
-          "format": "time_series",
-          "fullMetaSearch": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "interval": "",
-          "legendFormat": "{{label_name}}",
-          "range": true,
-          "refId": "Latency q95",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "quantile_over_time(0.50, aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}[1m])",
+          "expr": "aligned_aggregator_task_quorum_reached_latency{bot=\"aggregator\"}",
           "hide": false,
           "instant": false,
-          "legendFormat": "__auto",
+          "legendFormat": "Latest latency",
           "range": true,
-          "refId": "Latency q50"
+          "refId": "A"
         }
       ],
-      "title": "Quorum reached latency",
+      "title": "Latest quorum reached latency",
       "type": "timeseries"
     }
   ],
-  "refresh": "5s",
+  "refresh": "",
   "schemaVersion": 38,
   "style": "dark",
   "tags": [],
@@ -2915,13 +2853,13 @@
     "list": []
   },
   "time": {
-    "from": "now-15m",
+    "from": "now-30m",
     "to": "now"
   },
   "timepicker": {},
   "timezone": "browser",
   "title": "System Data",
   "uid": "aggregator",
-  "version": 14,
+  "version": 19,
   "weekStart": ""
 }