Skip to content

Commit

Permalink
wip: rq metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Daichi Sakaue <[email protected]>
  • Loading branch information
yokaze committed Nov 30, 2023
1 parent 20eadbd commit 6cc71a3
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ CKE exposes the following metrics with the Prometheus format at `/metrics` REST
| node_reboot_status | The reboot status of a node. | Gauge | `node`, `status` |
| operation_phase | 1 if CKE is operating in the phase specified by the `phase` label. | Gauge | `phase` |
| operation_phase_timestamp_seconds | The Unix timestamp when `operation_phase` was last updated. | Gauge | |
| reboot_queue_enabled | True (=1) if reboot queue is enabled. | Gauge | |
| reboot_queue_entries | The number of reboot queue entries remaining. | Gauge | |
| reboot_queue_items | The number of reboot queue entries remaining per status. | Gauge | `status` |
| reboot_queue_running | True (=1) if reboot queue is enabled and the queue is not empty. | Gauge | |
| sabakan_integration_successful | True (=1) if sabakan-integration satisfies constraints. | Gauge | |
| sabakan_integration_timestamp_seconds | The Unix timestamp when `sabakan_integration_successful` was last updated. | Gauge | |
| sabakan_workers | The number of worker nodes for each role. | Gauge | `role` |
Expand Down
29 changes: 29 additions & 0 deletions metrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type metricGroup struct {
// This abstraction is for mock test.
type storage interface {
	// IsSabakanDisabled returns a boolean flag and an error.
	// NOTE(review): presumably true means sabakan integration is turned off — confirm against the implementation.
	IsSabakanDisabled(ctx context.Context) (bool, error)

	// IsRebootQueueDisabled returns true when the reboot queue is disabled;
	// the node metrics collector inverts it to derive the "enabled" gauge.
	IsRebootQueueDisabled(ctx context.Context) (bool, error)

	// GetRebootsEntries returns the reboot queue entries.
	GetRebootsEntries(ctx context.Context) ([]*cke.RebootQueueEntry, error)

	// GetCluster returns the cluster definition.
	GetCluster(ctx context.Context) (*cke.Cluster, error)
}
Expand Down Expand Up @@ -131,15 +132,25 @@ type nodeMetricsCollector struct {
var _ prometheus.Collector = &nodeMetricsCollector{}

// Describe sends the descriptor of every metric emitted by this collector
// to ch, in a fixed order.
func (c nodeMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		rebootQueueEnabled,
		rebootQueueEntries,
		rebootQueueItems,
		rebootQueueRunning,
		nodeRebootStatus,
	}
	for _, desc := range descs {
		ch <- desc
	}
}

func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()

rqDisabled, err := c.storage.IsRebootQueueDisabled(ctx)
if err != nil {
log.Error("failed to get if reboot queue is enabled", map[string]interface{}{
log.FnError: err,
})
return
}

rqEntries, err := c.storage.GetRebootsEntries(ctx)
if err != nil {
log.Error("failed to get reboots entries", map[string]interface{}{
Expand All @@ -148,6 +159,14 @@ func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
return
}

var rqEnabled, rqRunning float64
if !rqDisabled {
rqEnabled = 1
}
if !rqDisabled && len(rqEntries) > 0 {
rqRunning = 1
}

cluster, err := c.storage.GetCluster(ctx)
if err != nil {
log.Error("failed to get cluster", map[string]interface{}{
Expand All @@ -158,11 +177,21 @@ func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
itemCounts := cke.CountRebootQueueEntries(rqEntries)
nodeStatus := cke.BuildNodeRebootStatus(cluster.Nodes, rqEntries)

ch <- prometheus.MustNewConstMetric(
rebootQueueEnabled,
prometheus.GaugeValue,
rqEnabled,
)
ch <- prometheus.MustNewConstMetric(
rebootQueueEntries,
prometheus.GaugeValue,
float64(len(rqEntries)),
)
ch <- prometheus.MustNewConstMetric(
rebootQueueRunning,
prometheus.GaugeValue,
rqRunning,
)
for status, count := range itemCounts {
ch <- prometheus.MustNewConstMetric(
rebootQueueItems,
Expand Down
14 changes: 14 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ var operationPhaseTimestampSeconds = prometheus.NewGauge(
},
)

// rebootQueueEnabled describes the reboot_queue_enabled metric.
// Per its help text, the value is 1 when the reboot queue is enabled.
var rebootQueueEnabled = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "reboot_queue_enabled"),
"1 if reboot queue is enabled.",
nil, // no variable labels
nil, // no constant labels
)

var rebootQueueEntries = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "reboot_queue_entries"),
"The number of reboot queue entries remaining.",
Expand All @@ -47,6 +54,13 @@ var rebootQueueItems = prometheus.NewDesc(
nil,
)

// rebootQueueRunning describes the reboot_queue_running metric.
// Per its help text, the value is 1 when the reboot queue is enabled and
// still holds at least one entry.
var rebootQueueRunning = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "reboot_queue_running"),
"1 if reboot queue is enabled and the queue is not empty.",
nil, // no variable labels
nil, // no constant labels
)

var nodeRebootStatus = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_reboot_status"),
"The reboot status of a node.",
Expand Down

0 comments on commit 6cc71a3

Please sign in to comment.