Skip to content

Commit

Permalink
controller: fix OCL status reporting
Browse files Browse the repository at this point in the history
  • Loading branch information
djoshy committed Aug 26, 2024
1 parent 15c4928 commit bcc4dd9
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 32 deletions.
4 changes: 4 additions & 0 deletions pkg/controller/common/layered_node_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ func (l *LayeredNodeState) isDesiredImageEqualToBuild(mosc *mcfgv1alpha1.Machine
return l.isImageAnnotationEqualToBuild(daemonconsts.DesiredImageAnnotationKey, mosc)
}

// IsCurrentImageEqualToBuild returns true if the node's current-image
// annotation matches the image produced by the build for the given
// MachineOSConfig. It mirrors isDesiredImageEqualToBuild above, but compares
// the current (rather than desired) image annotation.
func (l *LayeredNodeState) IsCurrentImageEqualToBuild(mosc *mcfgv1alpha1.MachineOSConfig) bool {
	return l.isImageAnnotationEqualToBuild(daemonconsts.CurrentImageAnnotationKey, mosc)
}

func (l *LayeredNodeState) isDesiredMachineConfigEqualToBuild(mosb *mcfgv1alpha1.MachineOSBuild) bool {
return l.node.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] == mosb.Spec.DesiredConfig.Name

Expand Down
15 changes: 8 additions & 7 deletions pkg/controller/node/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {

// Specifically log when a node has completed an update so the MCC logs are a useful central aggregate of state changes
if oldNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] != oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] &&
isNodeDone(curNode) {
isNodeDone(curNode, false) {
ctrl.logPoolNode(pool, curNode, "Completed update to %s", curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey])
changed = true
} else {
Expand Down Expand Up @@ -1010,10 +1010,10 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
}

klog.V(4).Infof("Continuing updates for layered pool %s", pool.Name)
} else {
klog.V(4).Infof("Pool %s is not layered", pool.Name)
}

klog.V(4).Infof("Pool %s is not layered", pool.Name)

nodes, err := ctrl.getNodesForPool(pool)
if err != nil {
if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil {
Expand Down Expand Up @@ -1166,7 +1166,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1alpha1.MachineOSConfig,
if mosb == nil {
if lns.IsDesiredEqualToPool(pool, layered) {
// If the node's desired annotations match the pool, return directly without updating the node.
klog.Infof("no update is needed")
klog.V(4).Infof("Pool %s: node %s: no update is needed", pool.Name, nodeName)
return nil

}
Expand All @@ -1175,7 +1175,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1alpha1.MachineOSConfig,
} else {
if lns.IsDesiredEqualToBuild(mosc, mosb) {
// If the node's desired annotations match the pool, return directly without updating the node.
klog.Infof("no update is needed")
klog.V(4).Infof("Pool %s: node %s: no update is needed", pool.Name, nodeName)
return nil
}
// ensure this is happening. it might not be.
Expand Down Expand Up @@ -1207,7 +1207,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1alpha1.MachineOSConfig,
func getAllCandidateMachines(layered bool, config *mcfgv1alpha1.MachineOSConfig, build *mcfgv1alpha1.MachineOSBuild, pool *mcfgv1.MachineConfigPool, nodesInPool []*corev1.Node, maxUnavailable int) ([]*corev1.Node, uint) {
unavail := getUnavailableMachines(nodesInPool, pool, layered, build)
if len(unavail) >= maxUnavailable {
klog.Infof("No nodes available for updates")
klog.V(4).Infof("Pool %s: No nodes available for updates", pool.Name)
return nil, 0
}
capacity := maxUnavailable - len(unavail)
Expand All @@ -1226,7 +1226,7 @@ func getAllCandidateMachines(layered bool, config *mcfgv1alpha1.MachineOSConfig,
} else {
if lns.IsDesiredEqualToBuild(config, build) {
// If the node's desired annotations match the pool, return directly without updating the node.
klog.Infof("no update is needed")
klog.V(4).Infof("Pool %s: layered node %s: no update is needed", pool.Name, node.Name)
continue
}
}
Expand All @@ -1235,6 +1235,7 @@ func getAllCandidateMachines(layered bool, config *mcfgv1alpha1.MachineOSConfig,
klog.V(4).Infof("node %s skipped during candidate selection as it is currently unscheduled", node.Name)
continue
}
klog.V(4).Infof("Pool %s: selected candidate node %s", pool.Name, node.Name)
nodes = append(nodes, node)
}
// Nodes which are failing to target this config also count against
Expand Down
16 changes: 10 additions & 6 deletions pkg/controller/node/node_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -725,9 +725,9 @@ func TestGetCandidateMachines(t *testing.T) {
helpers.NewNodeBuilder("node-7").WithEqualConfigsAndImages(machineConfigV1, imageV1).WithNodeReady().Node(),
helpers.NewNodeBuilder("node-8").WithEqualConfigsAndImages(machineConfigV1, imageV1).WithNodeReady().Node(),
},
expected: []string{"node-4", "node-5"},
otherCandidates: []string{"node-6"},
capacity: 2,
expected: []string{"node-4"},
otherCandidates: []string{"node-5", "node-6"},
capacity: 1,
layeredPool: true,
}, {
// Targets https://issues.redhat.com/browse/OCPBUGS-24705.
Expand All @@ -752,9 +752,9 @@ func TestGetCandidateMachines(t *testing.T) {
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
Node(),
},
expected: nil,
otherCandidates: nil,
capacity: 0,
expected: []string{"node-1"},
otherCandidates: []string{"node-2"},
capacity: 1,
layeredPool: true,
}, {
// Targets https://issues.redhat.com/browse/OCPBUGS-24705.
Expand Down Expand Up @@ -799,11 +799,15 @@ func TestGetCandidateMachines(t *testing.T) {
pool := pb.MachineConfigPool()

// TODO: Double check that mosb, mosc should be nil here and layered should be false
// NOTE: By doing this, we end up ignoring all the layered checks (only MC diffs will be done to
// determine if a node is available and ready for an update). This will need to be reworked.
got := getCandidateMachines(pool, nil, nil, test.nodes, test.progress, false)
nodeNames := getNamesFromNodes(got)
assert.Equal(t, test.expected, nodeNames)

// TODO: Double check that mosb, mosc should be nil here and layered should be false
// NOTE: By doing this, we end up ignoring all the layered checks (only MC diffs will be done to
// determine if a node is available and ready for an update). This will need to be reworked.
allCandidates, capacity := getAllCandidateMachines(false, nil, nil, pool, test.nodes, test.progress)
assert.Equal(t, test.capacity, capacity)
var otherCandidates []string
Expand Down
50 changes: 36 additions & 14 deletions pkg/controller/node/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ func isNodeManaged(node *corev1.Node) bool {
}

// isNodeDone returns true if the current == desired and the MCD has marked done.
func isNodeDone(node *corev1.Node) bool {
func isNodeDone(node *corev1.Node, layered bool) bool {
if node.Annotations == nil {
return false
}
Expand All @@ -361,12 +361,37 @@ func isNodeDone(node *corev1.Node) bool {
return false
}

if layered {
// The MachineConfig annotations are loaded on boot-up by the daemon which
// isn't currently done for the image annotations, so the comparisons here
// are a bit more nuanced.
cimage, cok := node.Annotations[daemonconsts.CurrentImageAnnotationKey]
dimage, dok := node.Annotations[daemonconsts.DesiredImageAnnotationKey]

// If desired image is not set, but the pool is layered, this node can
// be considered ready for an update. This is the very first time node
// is being opted into layering.
if !dok {
return true
}

// If we're here, we know that a desired image annotation exists.
// If the current image annotation does not exist, it means that the node is
// not "done", as it is doing its very first update as a layered node.
if !cok {
return false
}
// Current image annotation exists; compare with desired values to determine if the node is done
return cconfig == dconfig && cimage == dimage && isNodeMCDState(node, daemonconsts.MachineConfigDaemonStateDone)

}

return cconfig == dconfig && isNodeMCDState(node, daemonconsts.MachineConfigDaemonStateDone)
}

// isNodeDoneAt checks whether a node is fully updated to a targetConfig
func isNodeDoneAt(node *corev1.Node, pool *mcfgv1.MachineConfigPool) bool {
return isNodeDone(node) && node.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] == pool.Spec.Configuration.Name
// isNodeDoneAt reports whether the node has finished its update (per
// isNodeDone, honoring layered-pool semantics) AND its current MachineConfig
// annotation matches the pool's target configuration.
func isNodeDoneAt(node *corev1.Node, pool *mcfgv1.MachineConfigPool, layered bool) bool {
	if !isNodeDone(node, layered) {
		return false
	}
	current := node.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey]
	return current == pool.Spec.Configuration.Name
}

// isNodeMCDState checks the MCD state against the state parameter
Expand Down Expand Up @@ -397,7 +422,8 @@ func getUpdatedMachines(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node, mo
lns := ctrlcommon.NewLayeredNodeState(node)
if mosb != nil && mosc != nil {
mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
if layered && mosbState.IsBuildSuccess() && mosb.Spec.DesiredConfig.Name == pool.Spec.Configuration.Name {
// It seems that pool image annotations are no longer used, so node-specific checks are required here
if layered && mosbState.IsBuildSuccess() && mosb.Spec.DesiredConfig.Name == pool.Spec.Configuration.Name && isNodeDoneAt(node, pool, layered) && lns.IsCurrentImageEqualToBuild(mosc) {
updated = append(updated, node)
}
} else if lns.IsDoneAt(pool, layered) {
Expand Down Expand Up @@ -450,13 +476,13 @@ func isNodeReady(node *corev1.Node) bool {

// isNodeUnavailable is a helper function for getUnavailableMachines
// See the docs of getUnavailableMachines for more info
func isNodeUnavailable(node *corev1.Node) bool {
func isNodeUnavailable(node *corev1.Node, layered bool) bool {
// Unready nodes are unavailable
if !isNodeReady(node) {
return true
}
// Ready nodes are not unavailable
if isNodeDone(node) {
// If the node is working towards a new image/MC, it is not available
if isNodeDone(node, layered) {
return false
}
// Now we know the node isn't ready - the current config must not
Expand All @@ -480,17 +506,13 @@ func getUnavailableMachines(nodes []*corev1.Node, pool *mcfgv1.MachineConfigPool
mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
// if node is unavail, desiredConfigs match, and the build is a success, then we are unavail.
// not sure on this one honestly
if layered && isNodeUnavailable(node) && mosb.Spec.DesiredConfig.Name == pool.Status.Configuration.Name && mosbState.IsBuildSuccess() {
unavail = append(unavail, node)
}
} else {
lns := ctrlcommon.NewLayeredNodeState(node)
if lns.IsUnavailable(pool, layered) {
if layered && isNodeUnavailable(node, layered) && mosb.Spec.DesiredConfig.Name == pool.Spec.Configuration.Name && mosbState.IsBuildSuccess() {
unavail = append(unavail, node)
}
} else if isNodeUnavailable(node, layered) {
unavail = append(unavail, node)
}
}

return unavail
}

Expand Down
13 changes: 8 additions & 5 deletions pkg/controller/node/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ func TestGetUnavailableMachines(t *testing.T) {
helpers.NewNodeBuilder("node-3").WithEqualConfigs(machineConfigV0).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigs(machineConfigV0).WithNodeReady().Node(),
},
unavail: []string{"node-0", "node-2"},
unavail: []string{"node-0", "node-2", "node-3"},
layeredPoolWithImage: true,
}, {
name: "Mismatched unlayered node and layered pool with image unavailable",
Expand All @@ -502,9 +502,9 @@ func TestGetUnavailableMachines(t *testing.T) {
helpers.NewNodeBuilder("node-1").WithEqualConfigsAndImages(machineConfigV1, imageV1).Node(),
helpers.NewNodeBuilder("node-2").WithEqualConfigsAndImages(machineConfigV1, imageV1).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-3").WithEqualConfigs(machineConfigV0).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigs(machineConfigV0).WithNodeReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigsAndImages(machineConfigV0, imageV1).WithNodeReady().Node(),
},
unavail: []string{"node-3"},
unavail: []string{"node-0", "node-2", "node-3"},
layeredPool: true,
}, {
name: "Mismatched layered node and unlayered pool",
Expand All @@ -515,7 +515,7 @@ func TestGetUnavailableMachines(t *testing.T) {
helpers.NewNodeBuilder("node-3").WithEqualConfigs(machineConfigV0).WithEqualImages(imageV1).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigs(machineConfigV0).WithEqualImages(imageV1).WithNodeReady().Node(),
},
unavail: []string{"node-0"},
unavail: []string{"node-0", "node-2", "node-3"},
}, {
// Targets https://issues.redhat.com/browse/OCPBUGS-24705.
name: "nodes working toward layered should not be considered available",
Expand Down Expand Up @@ -550,11 +550,13 @@ func TestGetUnavailableMachines(t *testing.T) {
// Need to set WithNodeReady() on all nodes to avoid short-circuiting.
helpers.NewNodeBuilder("node-0").
WithEqualConfigs(machineConfigV0).
WithDesiredImage(imageV0).WithCurrentImage(imageV0).
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
WithNodeReady().
Node(),
helpers.NewNodeBuilder("node-1").
WithEqualConfigs(machineConfigV0).
WithDesiredImage(imageV0).WithCurrentImage(imageV0).
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
WithNodeReady().
Node(),
Expand All @@ -566,11 +568,12 @@ func TestGetUnavailableMachines(t *testing.T) {
Node(),
helpers.NewNodeBuilder("node-3").
WithEqualConfigs(machineConfigV0).
WithDesiredImage(imageV1).WithCurrentImage("").
WithDesiredImage(imageV1).WithCurrentImage(imageV0).
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
WithNodeReady().
Node(),
},
layeredPool: true,
layeredPoolWithImage: true,
unavail: []string{"node-2", "node-3"},
},
Expand Down

0 comments on commit bcc4dd9

Please sign in to comment.