diff --git a/metrics/collector.go b/metrics/collector.go index a711562a9..f00b2aef7 100644 --- a/metrics/collector.go +++ b/metrics/collector.go @@ -40,10 +40,14 @@ type metricGroup struct { // This abstraction is for mock test. type storage interface { IsSabakanDisabled(context.Context) (bool, error) + GetRebootsEntries(ctx context.Context) ([]*cke.RebootQueueEntry, error) + GetCluster(ctx context.Context) (*cke.Cluster, error) } // NewCollector returns a new prometheus.Collector. func NewCollector(client *v3.Client) prometheus.Collector { + + storage := &cke.Storage{Client: client} return &collector{ metrics: map[string]metricGroup{ "leader": { @@ -55,7 +59,7 @@ func NewCollector(client *v3.Client) prometheus.Collector { isAvailable: isOperationPhaseAvailable, }, "reboot": { - collectors: []prometheus.Collector{rebootQueueEntries, rebootQueueItems, nodeRebootStatus}, + collectors: []prometheus.Collector{nodeMetricsCollector{storage}}, isAvailable: isRebootAvailable, }, "sabakan_integration": { @@ -63,7 +67,7 @@ func NewCollector(client *v3.Client) prometheus.Collector { isAvailable: isSabakanIntegrationAvailable, }, }, - storage: &cke.Storage{Client: client}, + storage: storage, } } @@ -120,3 +124,60 @@ func (c collector) Collect(ch chan<- prometheus.Metric) { } wg.Wait() } + +type nodeMetricsCollector struct { + storage storage +} + +var _ prometheus.Collector = &nodeMetricsCollector{} + +func (c nodeMetricsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- rebootQueueEntries + ch <- rebootQueueItems + ch <- nodeRebootStatus +} + +func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + rqEntries, err := c.storage.GetRebootsEntries(ctx) + if err != nil { + return + } + cluster, err := c.storage.GetCluster(ctx) + if err != nil { + return + } + itemCounts := cke.CountRebootQueueEntries(rqEntries) + nodeStatus := cke.BuildNodeRebootStatus(cluster.Nodes, rqEntries) + + ch <- prometheus.MustNewConstMetric( + rebootQueueEntries, + prometheus.GaugeValue, + float64(len(rqEntries)), + ) + for status, count := range itemCounts { + ch <- prometheus.MustNewConstMetric( + rebootQueueItems, + prometheus.GaugeValue, + float64(count), + status, + ) + } + for node, statuses := range nodeStatus { + for status, matches := range statuses { + value := float64(0) + if matches { + value = 1 + } + ch <- prometheus.MustNewConstMetric( + nodeRebootStatus, + prometheus.GaugeValue, + value, + node, + status, + ) + } + } +} diff --git a/metrics/metrics.go b/metrics/metrics.go index 7aa387ed5..e9531e23e 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -33,29 +33,25 @@ var operationPhaseTimestampSeconds = prometheus.NewGauge( }, ) -var rebootQueueEntries = prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "reboot_queue_entries", - Help: "The number of reboot queue entries remaining.", - }, +var rebootQueueEntries = prometheus.NewDesc( + "reboot_queue_entries", + "The number of reboot queue entries remaining.", + nil, + nil, ) -var rebootQueueItems = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "reboot_queue_items", - Help: "The number of reboot queue entries remaining per status.", - }, +var rebootQueueItems = prometheus.NewDesc( + "reboot_queue_items", + "The number of reboot queue entries remaining per status.", []string{"status"}, + nil, ) -var nodeRebootStatus = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "node_reboot_status", - Help: "The reboot status of a node.", - }, []string{"node", "status"}, +var nodeRebootStatus = prometheus.NewDesc( + "node_reboot_status", + "The reboot status of a node.", + []string{"node", "status"}, + nil, ) var sabakanIntegrationSuccessful = prometheus.NewGauge( diff --git a/metrics/updater.go b/metrics/updater.go index 1579732ce..e805b04d9 100644 --- a/metrics/updater.go +++ b/metrics/updater.go @@ -39,36 +39,6 @@ func isOperationPhaseAvailable(_ context.Context, _ storage) (bool, error) { return isLeader, nil } -// UpdateRebootQueueEntries updates "reboot_queue_entries". -func UpdateRebootQueueEntries(numEntries int) { - rebootQueueEntries.Set(float64(numEntries)) -} - -// UpdateRebootQueueItems updates "reboot_queue_items". -func UpdateRebootQueueItems(counts map[string]int) { - for status, count := range counts { - rebootQueueItems.With(map[string]string{ - "status": status, - }).Set(float64(count)) - } -} - -// UpdateNodeRebootStatus updates "node_reboot_status". -func UpdateNodeRebootStatus(nodeStatus map[string]map[string]bool) { - for node, statuses := range nodeStatus { - for status, matches := range statuses { - value := float64(0) - if matches { - value = 1 - } - nodeRebootStatus.With(map[string]string{ - "node": node, - "status": status, - }).Set(value) - } - } -} - func isRebootAvailable(_ context.Context, _ storage) (bool, error) { return isLeader, nil } diff --git a/metrics/updater_test.go b/metrics/updater_test.go index 4a67f5b83..9b10bc57b 100644 --- a/metrics/updater_test.go +++ b/metrics/updater_test.go @@ -274,8 +274,6 @@ func testUpdateRebootQueueEntries(t *testing.T) { collector, _ := newTestCollector() handler := GetHandler(collector) - UpdateRebootQueueEntries(tt.input) - w := httptest.NewRecorder() req := httptest.NewRequest("GET", "/metrics", nil) handler.ServeHTTP(w, req) @@ -343,8 +341,6 @@ func testUpdateRebootQueueItems(t *testing.T) { collector, _ := newTestCollector() handler := GetHandler(collector) - UpdateRebootQueueItems(tt.input) - w := httptest.NewRecorder() req := httptest.NewRequest("GET", "/metrics", nil) handler.ServeHTTP(w, req) @@ -396,8 +392,6 @@ func testUpdateNodeRebootStatus(t *testing.T) { collector, _ := newTestCollector() handler := GetHandler(collector) - UpdateNodeRebootStatus(input) - w := httptest.NewRecorder() req := httptest.NewRequest("GET", "/metrics", nil) handler.ServeHTTP(w, req) @@ -597,6 +591,14 @@ func (s *testStorage) IsSabakanDisabled(_ context.Context) (bool, error) { return !s.sabakanEnabled, nil } +func (s *testStorage) GetRebootsEntries(ctx context.Context) ([]*cke.RebootQueueEntry, error) { + return nil, nil +} + +func (s *testStorage) GetCluster(ctx context.Context) (*cke.Cluster, error) { + return nil, nil +} + func labelToMap(labelPair []*dto.LabelPair) map[string]string { res := make(map[string]string) for _, l := range labelPair { diff --git a/mtest/reboot_test.go b/mtest/reboot_test.go index 827952def..30488232e 100644 --- a/mtest/reboot_test.go +++ b/mtest/reboot_test.go @@ -94,7 +94,6 @@ func testRebootOperations() { // - RebootDrainTimeoutOp // - RebootUncordonOp // - RebootDequeueOp - // - RebootRecalcMetricsOp cluster := getCluster() for i := 0; i < 3; i++ { diff --git a/op/reboot.go b/op/reboot.go index 2a9533072..f86ef94e6 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -9,7 +9,6 @@ import ( "time" "github.com/cybozu-go/cke" - "github.com/cybozu-go/cke/metrics" "github.com/cybozu-go/log" "github.com/cybozu-go/well" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -524,64 +523,6 @@ func (c rebootDequeueCommand) Command() cke.Command { } } -// - -type rebootRecalcMetricsOp struct { - finished bool -} - -// RebootRecalcMetricsOp returns an Operator to racalc metrics. -func RebootRecalcMetricsOp() cke.Operator { - return &rebootRecalcMetricsOp{} -} - -func (o *rebootRecalcMetricsOp) Name() string { - return "reboot-recalc-metrics" -} - -func (o *rebootRecalcMetricsOp) NextCommand() cke.Commander { - if o.finished { - return nil - } - - o.finished = true - return rebootRecalcMetricsCommand{} -} - -func (o *rebootRecalcMetricsOp) Targets() []string { - return []string{} -} - -type rebootRecalcMetricsCommand struct { -} - -func (c rebootRecalcMetricsCommand) Run(ctx context.Context, inf cke.Infrastructure, _ string) error { - rqEntries, err := inf.Storage().GetRebootsEntries(ctx) - if err != nil { - return err - } - cluster, err := inf.Storage().GetCluster(ctx) - if err != nil { - return err - } - - metrics.UpdateRebootQueueEntries(len(rqEntries)) - itemCounts := cke.CountRebootQueueEntries(rqEntries) - metrics.UpdateRebootQueueItems(itemCounts) - nodeStatus := cke.BuildNodeRebootStatus(cluster.Nodes, rqEntries) - metrics.UpdateNodeRebootStatus(nodeStatus) - - return nil -} - -func (c rebootRecalcMetricsCommand) Command() cke.Command { - return cke.Command{ - Name: "rebootRecalcMetricsCommand", - } -} - -// - func listProtectedNamespaces(ctx context.Context, cs *kubernetes.Clientset, ls *metav1.LabelSelector) (map[string]bool, error) { selector, err := metav1.LabelSelectorAsSelector(ls) if err != nil { diff --git a/sabakan/generator.go b/sabakan/generator.go index 4ddf51941..b28128406 100644 --- a/sabakan/generator.go +++ b/sabakan/generator.go @@ -52,6 +52,8 @@ func MachineToNode(m *Machine, tmpl *cke.Node) *cke.Node { n.Labels["cke.cybozu.com/rack"] = strconv.Itoa(m.Spec.Rack) n.Labels["cke.cybozu.com/index-in-rack"] = strconv.Itoa(m.Spec.IndexInRack) n.Labels["cke.cybozu.com/role"] = m.Spec.Role + n.Labels["cke.cybozu.com/retire-date"] = m.Spec.RetireDate.Format("2006-01") + n.Labels["cke.cybozu.com/register-date"] = m.Spec.RegisterDate.Format("2006-01") n.Labels["node-role.kubernetes.io/"+m.Spec.Role] = "true" if n.ControlPlane { n.Labels["node-role.kubernetes.io/master"] = "true" diff --git a/server/control.go b/server/control.go index e216134b3..c76e52292 100644 --- a/server/control.go +++ b/server/control.go @@ -314,9 +314,6 @@ func (c Controller) runOnce(ctx context.Context, leaderKey string, tick <-chan t if err != nil { return err } - metrics.UpdateRebootQueueEntries(len(rqEntries)) - itemCounts := cke.CountRebootQueueEntries(rqEntries) - metrics.UpdateRebootQueueItems(itemCounts) rqEntries = cke.DedupRebootQueueEntries(rqEntries) if len(rqEntries) > 0 { diff --git a/server/strategy.go b/server/strategy.go index b7d2f1072..f6e8fc80a 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -719,7 +719,6 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp } if len(ops) > 0 { phaseReboot = true - ops = append(ops, op.RebootRecalcMetricsOp()) } return ops, phaseReboot diff --git a/server/strategy_test.go b/server/strategy_test.go index e10dfe09d..6a517c4d3 100644 --- a/server/strategy_test.go +++ b/server/strategy_test.go @@ -1207,7 +1207,7 @@ func TestDecideOps(t *testing.T) { d.Status.Kubernetes.MasterEndpointSlice.Endpoints[2].Conditions.Ready = &endpointReady d.Status.Kubernetes.EtcdEndpointSlice.Endpoints[2].Conditions.Ready = &endpointReady }), - ExpectedOps: []opData{{"reboot-drain-start", 1}, {"reboot-recalc-metrics", 0}}, + ExpectedOps: []opData{{"reboot-drain-start", 1}}, }, { Name: "EndpointsWithCancelledRebootEntry", @@ -1224,7 +1224,7 @@ func TestDecideOps(t *testing.T) { Status: cke.RebootStatusCancelled, }, }), - ExpectedOps: []opData{{"reboot-dequeue", 1}, {"reboot-recalc-metrics", 0}}, + ExpectedOps: []opData{{"reboot-dequeue", 1}}, }, { Name: "UserResourceAdd", @@ -2065,7 +2065,6 @@ func TestDecideOps(t *testing.T) { }), ExpectedOps: []opData{ {"reboot-drain-start", 1}, - {"reboot-recalc-metrics", 0}, }, }, { @@ -2102,7 +2101,6 @@ func TestDecideOps(t *testing.T) { }), ExpectedOps: []opData{ {"reboot-reboot", 1}, - {"reboot-recalc-metrics", 0}, }, }, { @@ -2122,7 +2120,6 @@ func TestDecideOps(t *testing.T) { }), ExpectedOps: []opData{ {"reboot-drain-timeout", 1}, - {"reboot-recalc-metrics", 0}, }, }, { @@ -2142,7 +2139,6 @@ func TestDecideOps(t *testing.T) { }), ExpectedOps: []opData{ {"reboot-dequeue", 1}, - {"reboot-recalc-metrics", 0}, }, }, { @@ -2181,7 +2177,6 @@ func TestDecideOps(t *testing.T) { }), ExpectedOps: []opData{ {"reboot-dequeue", 1}, - {"reboot-recalc-metrics", 0}, }, }, }