Skip to content

Commit

Permalink
Merge pull request #660 from cybozu-go/fix-node-metrics
Browse files Browse the repository at this point in the history
Fix `cke_node_reboot_status` metrics
  • Loading branch information
zoetrope authored Sep 12, 2023
2 parents c798e29 + 336e07d commit e5a1269
Show file tree
Hide file tree
Showing 10 changed files with 174 additions and 154 deletions.
75 changes: 71 additions & 4 deletions metrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"github.com/cybozu-go/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
v3 "go.etcd.io/etcd/client/v3"
)

type logger struct{}
Expand Down Expand Up @@ -40,10 +39,13 @@ type metricGroup struct {
// This abstraction exists so that tests can substitute a mock.
type storage interface {
// IsSabakanDisabled returns a boolean flag and an error.
// NOTE(review): presumably reports whether sabakan integration is turned off — confirm against the implementation.
IsSabakanDisabled(context.Context) (bool, error)
// GetRebootsEntries returns reboot queue entries; nodeMetricsCollector.Collect
// uses the result to build the reboot queue metrics.
GetRebootsEntries(ctx context.Context) ([]*cke.RebootQueueEntry, error)
// GetCluster returns the cluster; nodeMetricsCollector.Collect reads its
// Nodes field to build the per-node reboot status metric.
GetCluster(ctx context.Context) (*cke.Cluster, error)
}

// NewCollector returns a new prometheus.Collector.
func NewCollector(client *v3.Client) prometheus.Collector {
func NewCollector(storage storage) prometheus.Collector {

return &collector{
metrics: map[string]metricGroup{
"leader": {
Expand All @@ -55,15 +57,15 @@ func NewCollector(client *v3.Client) prometheus.Collector {
isAvailable: isOperationPhaseAvailable,
},
"reboot": {
collectors: []prometheus.Collector{rebootQueueEntries, rebootQueueItems, nodeRebootStatus},
collectors: []prometheus.Collector{nodeMetricsCollector{storage}},
isAvailable: isRebootAvailable,
},
"sabakan_integration": {
collectors: []prometheus.Collector{sabakanIntegrationSuccessful, sabakanIntegrationTimestampSeconds, sabakanWorkers, sabakanUnusedMachines},
isAvailable: isSabakanIntegrationAvailable,
},
},
storage: &cke.Storage{Client: client},
storage: storage,
}
}

Expand Down Expand Up @@ -120,3 +122,68 @@ func (c collector) Collect(ch chan<- prometheus.Metric) {
}
wg.Wait()
}

// nodeMetricsCollector is a prometheus.Collector for the reboot queue and
// per-node reboot status metrics. It keeps no metric state of its own; each
// Collect call reads from the storage it was constructed with.
type nodeMetricsCollector struct {
	// storage is the backend queried on every Collect call.
	storage storage
}

// Compile-time check that *nodeMetricsCollector satisfies prometheus.Collector.
var _ prometheus.Collector = (*nodeMetricsCollector)(nil)

// Describe sends the descriptors of the three reboot-related metrics to ch,
// in the order: rebootQueueEntries, rebootQueueItems, nodeRebootStatus.
func (c nodeMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		rebootQueueEntries,
		rebootQueueItems,
		nodeRebootStatus,
	}
	for _, d := range descs {
		ch <- d
	}
}

// Collect fetches the reboot queue entries and the cluster from storage and
// sends the reboot metrics to ch as constant gauge metrics built from that
// snapshot.
//
// Both storage calls share one context with a 3-second timeout. If either
// call fails, the error is logged and nothing is sent to ch for this call.
func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	entries, err := c.storage.GetRebootsEntries(ctx)
	if err != nil {
		log.Error("failed to get reboots entries", map[string]interface{}{log.FnError: err})
		return
	}

	cluster, err := c.storage.GetCluster(ctx)
	if err != nil {
		log.Error("failed to get cluster", map[string]interface{}{log.FnError: err})
		return
	}

	countsByStatus := cke.CountRebootQueueEntries(entries)
	statusByNode := cke.BuildNodeRebootStatus(cluster.Nodes, entries)

	// Total number of entries currently in the reboot queue.
	ch <- prometheus.MustNewConstMetric(rebootQueueEntries, prometheus.GaugeValue, float64(len(entries)))

	// One sample per status label.
	for status, n := range countsByStatus {
		ch <- prometheus.MustNewConstMetric(rebootQueueItems, prometheus.GaugeValue, float64(n), status)
	}

	// One sample per (node, status) pair: 1 when the flag is true, 0 otherwise.
	for node, flags := range statusByNode {
		for status, on := range flags {
			var v float64
			if on {
				v = 1
			}
			ch <- prometheus.MustNewConstMetric(nodeRebootStatus, prometheus.GaugeValue, v, node, status)
		}
	}
}
32 changes: 14 additions & 18 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,25 @@ var operationPhaseTimestampSeconds = prometheus.NewGauge(
},
)

var rebootQueueEntries = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "reboot_queue_entries",
Help: "The number of reboot queue entries remaining.",
},
var rebootQueueEntries = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "reboot_queue_entries"),
"The number of reboot queue entries remaining.",
nil,
nil,
)

var rebootQueueItems = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "reboot_queue_items",
Help: "The number of reboot queue entries remaining per status.",
},
var rebootQueueItems = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "reboot_queue_items"),
"The number of reboot queue entries remaining per status.",
[]string{"status"},
nil,
)

var nodeRebootStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "node_reboot_status",
Help: "The reboot status of a node.",
}, []string{"node", "status"},
var nodeRebootStatus = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_reboot_status"),
"The reboot status of a node.",
[]string{"node", "status"},
nil,
)

var sabakanIntegrationSuccessful = prometheus.NewGauge(
Expand Down
30 changes: 0 additions & 30 deletions metrics/updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,36 +39,6 @@ func isOperationPhaseAvailable(_ context.Context, _ storage) (bool, error) {
return isLeader, nil
}

// UpdateRebootQueueEntries sets the "reboot_queue_entries" gauge to numEntries.
func UpdateRebootQueueEntries(numEntries int) {
	value := float64(numEntries)
	rebootQueueEntries.Set(value)
}

// UpdateRebootQueueItems sets the "reboot_queue_items" gauge for every status
// present in counts. Statuses that are not keys of counts are left untouched.
func UpdateRebootQueueItems(counts map[string]int) {
	for status, n := range counts {
		labels := map[string]string{"status": status}
		rebootQueueItems.With(labels).Set(float64(n))
	}
}

// UpdateNodeRebootStatus sets the "node_reboot_status" gauge for every
// (node, status) pair in nodeStatus: 1 when the flag is true, 0 when false.
// Pairs that do not appear in nodeStatus are left untouched.
func UpdateNodeRebootStatus(nodeStatus map[string]map[string]bool) {
	for node, flags := range nodeStatus {
		for status, on := range flags {
			labels := map[string]string{
				"node":   node,
				"status": status,
			}
			gauge := nodeRebootStatus.With(labels)
			if on {
				gauge.Set(1)
			} else {
				gauge.Set(0)
			}
		}
	}
}

// isRebootAvailable returns the current value of isLeader (declared outside
// this function) together with a nil error. Both the context and the storage
// arguments are ignored.
func isRebootAvailable(_ context.Context, _ storage) (bool, error) {
return isLeader, nil
}
Expand Down
Loading

0 comments on commit e5a1269

Please sign in to comment.