Skip to content

Commit

Permalink
Add repair-queue metrics
Browse files Browse the repository at this point in the history
Signed-off-by: morimoto-cybozu <[email protected]>
  • Loading branch information
morimoto-cybozu committed Dec 26, 2023
1 parent 4ccf1dc commit 3105433
Show file tree
Hide file tree
Showing 8 changed files with 432 additions and 17 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve

## [Unreleased]

### Added

- Implement repair queue in [#692](https://github.com/cybozu-go/cke/pull/692)

## [1.27.1]

### Added
Expand Down
31 changes: 17 additions & 14 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,23 @@ Metrics

CKE exposes the following metrics with the Prometheus format at `/metrics` REST API endpoint. All these metrics are prefixed with `cke_`.

| Name | Description | Type | Labels |
| ------------------------------------- | -------------------------------------------------------------------------- | ----- | ---------------- |
| leader | True (=1) if this server is the leader of CKE. | Gauge | |
| node_reboot_status | The reboot status of a node. | Gauge | `node`, `status` |
| operation_phase | 1 if CKE is operating in the phase specified by the `phase` label. | Gauge | `phase` |
| operation_phase_timestamp_seconds | The Unix timestamp when `operation_phase` was last updated. | Gauge | |
| reboot_queue_enabled | True (=1) if reboot queue is enabled. | Gauge | |
| reboot_queue_entries | The number of reboot queue entries remaining. | Gauge | |
| reboot_queue_items | The number reboot queue entries remaining per status. | Gauge | `status` |
| reboot_queue_running | True (=1) if reboot queue is running. | Gauge | |
| sabakan_integration_successful | True (=1) if sabakan-integration satisfies constraints. | Gauge | |
| sabakan_integration_timestamp_seconds | The Unix timestamp when `sabakan_integration_successful` was last updated. | Gauge | |
| sabakan_workers | The number of worker nodes for each role. | Gauge | `role` |
| sabakan_unused_machines | The number of unused machines. | Gauge | |
| Name | Description | Type | Labels |
| ------------------------------------- | -------------------------------------------------------------------------- | ----- | ------------------- |
| leader | True (=1) if this server is the leader of CKE. | Gauge | |
| node_reboot_status | The reboot status of a node. | Gauge | `node`, `status` |
| machine_repair_status | The repair status of a machine. | Gauge | `address`, `status` |
| operation_phase | 1 if CKE is operating in the phase specified by the `phase` label. | Gauge | `phase` |
| operation_phase_timestamp_seconds | The Unix timestamp when `operation_phase` was last updated. | Gauge | |
| reboot_queue_enabled | True (=1) if reboot queue is enabled. | Gauge | |
| reboot_queue_entries | The number of reboot queue entries remaining. | Gauge | |
| reboot_queue_items | The number of reboot queue entries remaining per status. | Gauge | `status` |
| reboot_queue_running | True (=1) if reboot queue is running. | Gauge | |
| repair_queue_enabled | True (=1) if repair queue is enabled. | Gauge | |
| repair_queue_items | The number of repair queue entries remaining per status. | Gauge | `status` |
| sabakan_integration_successful | True (=1) if sabakan-integration satisfies constraints. | Gauge | |
| sabakan_integration_timestamp_seconds | The Unix timestamp when `sabakan_integration_successful` was last updated. | Gauge | |
| sabakan_workers | The number of worker nodes for each role. | Gauge | `role` |
| sabakan_unused_machines | The number of unused machines. | Gauge | |

All metrics but `leader` are available only when the server is the leader of CKE.
`sabakan_*` metrics are available only when [Sabakan integration](sabakan-integration.md) is enabled.
Expand Down
79 changes: 77 additions & 2 deletions metrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ type storage interface {
IsRebootQueueDisabled(ctx context.Context) (bool, error)
IsRebootQueueRunning(ctx context.Context) (bool, error)
GetRebootsEntries(ctx context.Context) ([]*cke.RebootQueueEntry, error)
IsRepairQueueDisabled(ctx context.Context) (bool, error)
GetRepairsEntries(ctx context.Context) ([]*cke.RepairQueueEntry, error)
GetCluster(ctx context.Context) (*cke.Cluster, error)
}

Expand All @@ -58,9 +60,9 @@ func NewCollector(storage storage) prometheus.Collector {
collectors: []prometheus.Collector{operationPhase, operationPhaseTimestampSeconds},
isAvailable: isOperationPhaseAvailable,
},
"reboot": {
"node": {
collectors: []prometheus.Collector{nodeMetricsCollector{storage}},
isAvailable: isRebootAvailable,
isAvailable: isNodeAvailable,
},
"sabakan_integration": {
collectors: []prometheus.Collector{sabakanIntegrationSuccessful, sabakanIntegrationTimestampSeconds, sabakanWorkers, sabakanUnusedMachines},
Expand Down Expand Up @@ -138,9 +140,18 @@ func (c nodeMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- rebootQueueItems
ch <- rebootQueueRunning
ch <- nodeRebootStatus

ch <- repairQueueEnabled
ch <- repairQueueItems
ch <- machineRepairStatus
}

// Collect implements prometheus.Collector. It runs the reboot-queue
// collection first and the repair-queue collection second, each writing its
// metrics to ch.
func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
	steps := []func(chan<- prometheus.Metric){
		c.collectReboot,
		c.collectRepair,
	}
	for _, step := range steps {
		step(ch)
	}
}

func (c nodeMetricsCollector) collectReboot(ch chan<- prometheus.Metric) {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()

Expand Down Expand Up @@ -225,3 +236,67 @@ func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
}
}
}

// collectRepair sends the repair-queue metrics to ch: the enabled flag of the
// queue, the number of queue entries per status, and the per-machine repair
// status.
//
// All storage reads share a single 3-second timeout. If any read fails, the
// error is logged and no repair metric is emitted at all.
func (c nodeMetricsCollector) collectRepair(ch chan<- prometheus.Metric) {
	const storageTimeout = 3 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), storageTimeout)
	defer cancel()

	// logFailure records a storage error together with a fixed message.
	logFailure := func(msg string, cause error) {
		log.Error(msg, map[string]interface{}{
			log.FnError: cause,
		})
	}

	queueDisabled, err := c.storage.IsRepairQueueDisabled(ctx)
	if err != nil {
		logFailure("failed to get if repair queue is enabled", err)
		return
	}

	queueEntries, err := c.storage.GetRepairsEntries(ctx)
	if err != nil {
		logFailure("failed to get repairs entries", err)
		return
	}

	clusterConfig, err := c.storage.GetCluster(ctx)
	if err != nil {
		logFailure("failed to get cluster", err)
		return
	}

	// Everything was read successfully; derive the values to publish.
	enabledValue := 1.0
	if queueDisabled {
		enabledValue = 0
	}
	countsByStatus := cke.CountRepairQueueEntries(queueEntries)
	statusByMachine := cke.BuildMachineRepairStatus(clusterConfig.Nodes, queueEntries)

	// emit sends one gauge sample for desc with the given label values.
	emit := func(desc *prometheus.Desc, value float64, labelValues ...string) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...)
	}

	emit(repairQueueEnabled, enabledValue)

	for status, n := range countsByStatus {
		emit(repairQueueItems, float64(n), status)
	}

	for address, flags := range statusByMachine {
		for status, active := range flags {
			sample := 0.0
			if active {
				sample = 1
			}
			emit(machineRepairStatus, sample, address, status)
		}
	}
}
21 changes: 21 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,27 @@ var nodeRebootStatus = prometheus.NewDesc(
nil,
)

// repairQueueEnabled describes the "repair_queue_enabled" metric in the
// package namespace. It has no variable labels and no constant labels.
var repairQueueEnabled = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "repair_queue_enabled"),
"1 if repair queue is enabled.",
nil,
nil,
)

// repairQueueItems describes the "repair_queue_items" metric in the package
// namespace. Each sample carries a single variable label, "status".
var repairQueueItems = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "repair_queue_items"),
"The number of repair queue entries remaining per status.",
[]string{"status"},
nil,
)

// machineRepairStatus describes the "machine_repair_status" metric in the
// package namespace. Each sample carries two variable labels, in this order:
// "address" and "status".
var machineRepairStatus = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "machine_repair_status"),
"The repair status of a machine.",
[]string{"address", "status"},
nil,
)

var sabakanIntegrationSuccessful = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: namespace,
Expand Down
2 changes: 1 addition & 1 deletion metrics/updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func isOperationPhaseAvailable(_ context.Context, _ storage) (bool, error) {
return isLeader, nil
}

func isRebootAvailable(_ context.Context, _ storage) (bool, error) {
// isNodeAvailable reports the current value of isLeader and never returns an
// error. Both the context and the storage arguments are ignored.
func isNodeAvailable(_ context.Context, _ storage) (bool, error) {
return isLeader, nil
}

Expand Down
Loading

0 comments on commit 3105433

Please sign in to comment.