Skip to content

Commit

Permalink
feat(nodeResources): add GPU support
Browse files (browse the repository at this point in the history)
  • Loading branch information
DexterYan committed Dec 19, 2024
1 parent fcfdc63 commit 763b64c
Show file tree
Hide file tree
Showing 17 changed files with 578 additions and 17 deletions.
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.replicated.com_analyzers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,10 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceName:
type: string
selector:
properties:
matchLabel:
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.replicated.com_preflights.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,10 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceName:
type: string
selector:
properties:
matchLabel:
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.replicated.com_supportbundles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,10 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceName:
type: string
selector:
properties:
matchLabel:
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.sh_analyzers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,10 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceName:
type: string
selector:
properties:
matchExpressions:
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.sh_preflights.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,10 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceName:
type: string
selector:
properties:
matchExpressions:
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.sh_supportbundles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1270,6 +1270,10 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceName:
type: string
selector:
properties:
matchExpressions:
Expand Down
389 changes: 387 additions & 2 deletions pkg/analyze/files/nodes.json

Large diffs are not rendered by default.

60 changes: 46 additions & 14 deletions pkg/analyze/node_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta

for _, outcome := range analyzer.Outcomes {
if outcome.Fail != nil {
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Fail.When, matchingNodes)
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Fail.When, matchingNodes, analyzer.Filters)

if err != nil {
return nil, errors.Wrap(err, "failed to parse when")
}
Expand All @@ -100,7 +101,7 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta
return result, nil
}
} else if outcome.Warn != nil {
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Warn.When, matchingNodes)
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Warn.When, matchingNodes, analyzer.Filters)
if err != nil {
return nil, errors.Wrap(err, "failed to parse when")
}
Expand All @@ -116,7 +117,7 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta
return result, nil
}
} else if outcome.Pass != nil {
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Pass.When, matchingNodes)
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Pass.When, matchingNodes, analyzer.Filters)
if err != nil {
return nil, errors.Wrap(err, "failed to parse when")
}
Expand All @@ -137,7 +138,7 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta
return result, nil
}

func compareNodeResourceConditionalToActual(conditional string, matchingNodes []corev1.Node) (res bool, err error) {
func compareNodeResourceConditionalToActual(conditional string, matchingNodes []corev1.Node, filters *troubleshootv1beta2.NodeResourceFilters) (res bool, err error) {
res = false
err = nil

Expand Down Expand Up @@ -190,18 +191,23 @@ func compareNodeResourceConditionalToActual(conditional string, matchingNodes []

function := match[1]
property := match[2]
resourceName := ""

if filters != nil && filters.ResourceName != "" {
resourceName = filters.ResourceName
}

var actualValue interface{}

switch function {
case "count":
actualValue = len(matchingNodes)
case "min":
actualValue = findMin(matchingNodes, property)
actualValue = findMin(matchingNodes, property, resourceName)
case "max":
actualValue = findMax(matchingNodes, property)
actualValue = findMax(matchingNodes, property, resourceName)
case "sum":
actualValue = findSum(matchingNodes, property)
actualValue = findSum(matchingNodes, property, resourceName)
case "nodeCondition":
operatorChecker := regexp.MustCompile(`={1,3}`)
if !operatorChecker.MatchString(operator) {
Expand Down Expand Up @@ -311,7 +317,7 @@ func compareNodeResourceConditionalToActual(conditional string, matchingNodes []
return
}

func getQuantity(node corev1.Node, property string) *resource.Quantity {
func getQuantity(node corev1.Node, property string, resourceName string) *resource.Quantity {
switch property {
case "cpuCapacity":
return node.Status.Capacity.Cpu()
Expand All @@ -329,27 +335,33 @@ func getQuantity(node corev1.Node, property string) *resource.Quantity {
return node.Status.Capacity.StorageEphemeral()
case "ephemeralStorageAllocatable":
return node.Status.Allocatable.StorageEphemeral()
case "resourceAllocatable":
allocatable, ok := node.Status.Allocatable[corev1.ResourceName(resourceName)]
if !ok {
return nil
}
return &allocatable
}
return nil
}

func findSum(nodes []corev1.Node, property string) *resource.Quantity {
func findSum(nodes []corev1.Node, property string, resourceName string) *resource.Quantity {
sum := resource.Quantity{}

for _, node := range nodes {
if quant := getQuantity(node, property); quant != nil {
if quant := getQuantity(node, property, resourceName); quant != nil {
sum.Add(*quant)
}
}

return &sum
}

func findMin(nodes []corev1.Node, property string) *resource.Quantity {
func findMin(nodes []corev1.Node, property string, resourceName string) *resource.Quantity {
var min *resource.Quantity

for _, node := range nodes {
if quant := getQuantity(node, property); quant != nil {
if quant := getQuantity(node, property, resourceName); quant != nil {
if min == nil {
min = quant
} else if quant.Cmp(*min) == -1 {
Expand All @@ -361,11 +373,11 @@ func findMin(nodes []corev1.Node, property string) *resource.Quantity {
return min
}

func findMax(nodes []corev1.Node, property string) *resource.Quantity {
func findMax(nodes []corev1.Node, property string, resourceName string) *resource.Quantity {
var max *resource.Quantity

for _, node := range nodes {
if quant := getQuantity(node, property); quant != nil {
if quant := getQuantity(node, property, resourceName); quant != nil {
if max == nil {
max = quant
} else if quant.Cmp(*max) == 1 {
Expand All @@ -382,6 +394,26 @@ func nodeMatchesFilters(node corev1.Node, filters *troubleshootv1beta2.NodeResou
return true, nil
}

if filters.ResourceName != "" {
if filters.ResourceAllocatable != "" {
parsed, err := resource.ParseQuantity(filters.ResourceAllocatable)
if err != nil {
return false, errors.Wrap(err, "failed to parse resource allocatable")
}

allocatable, ok := node.Status.Allocatable[corev1.ResourceName(filters.ResourceName)]
if !ok {
// Resource not found on the node
return false, nil
}

// Compare the allocatable value with the parsed value
if allocatable.Cmp(parsed) == -1 {
return false, nil
}
}
}

// all filters must pass for this to pass
if filters.Selector != nil {
selector, err := metav1.LabelSelectorAsSelector(
Expand Down
82 changes: 81 additions & 1 deletion pkg/analyze/node_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func Test_compareNodeResourceConditionalToActual(t *testing.T) {
"ephemeral-storage": resource.MustParse("19316009748"),
"memory": resource.MustParse("16Ki"),
"pods": resource.MustParse("14"),
"nvidia.com/gpu": resource.MustParse("1"),
},
},
},
Expand All @@ -57,6 +58,7 @@ func Test_compareNodeResourceConditionalToActual(t *testing.T) {
"ephemeral-storage": resource.MustParse("12316009748"),
"memory": resource.MustParse("7848976Ki"),
"pods": resource.MustParse("12"),
"nvidia.com/gpu": resource.MustParse("1"),
},
},
},
Expand All @@ -65,11 +67,46 @@ func Test_compareNodeResourceConditionalToActual(t *testing.T) {
tests := []struct {
name string
conditional string
filters *troubleshootv1beta2.NodeResourceFilters
totalNodeCount int
matchingNodes []corev1.Node
expected bool
isError bool
}{
{
name: "GPU min(resourceAllocatable) == 1 (true)",
conditional: "min(resourceAllocatable) == 1",
filters: &troubleshootv1beta2.NodeResourceFilters{
ResourceName: "nvidia.com/gpu",
},
matchingNodes: nodeData,
totalNodeCount: len(nodeData),
expected: true,
isError: false,
},
{
name: "GPU max(resourceAllocatable) > 1 (false)",
conditional: "max(resourceAllocatable) > 1",
filters: &troubleshootv1beta2.NodeResourceFilters{
ResourceName: "nvidia.com/gpu",
},
matchingNodes: nodeData,
totalNodeCount: 0,
expected: false,
isError: false,
},
{
name: "GPU count() == 2 (true)",
conditional: "count() == 2",
filters: &troubleshootv1beta2.NodeResourceFilters{
ResourceName: "nvidia.com/gpu",
ResourceAllocatable: "1",
},
matchingNodes: nodeData,
totalNodeCount: len(nodeData),
expected: true,
isError: false,
},
{
name: "=",
conditional: "= 2",
Expand Down Expand Up @@ -372,7 +409,7 @@ func Test_compareNodeResourceConditionalToActual(t *testing.T) {
t.Run(test.name, func(t *testing.T) {
req := require.New(t)

actual, err := compareNodeResourceConditionalToActual(test.conditional, test.matchingNodes)
actual, err := compareNodeResourceConditionalToActual(test.conditional, test.matchingNodes, test.filters)
if test.isError {
req.Error(err)
} else {
Expand Down Expand Up @@ -404,6 +441,7 @@ func Test_nodeMatchesFilters(t *testing.T) {
"hugepages-2Mi": resource.MustParse("0"),
"memory": resource.MustParse("7951376Ki"),
"pods": resource.MustParse("29"),
"nvidia.com/gpu": resource.MustParse("1"),
},
Allocatable: corev1.ResourceList{
"attachable-volumes-aws-ebs": resource.MustParse("25"),
Expand All @@ -413,6 +451,7 @@ func Test_nodeMatchesFilters(t *testing.T) {
"hugepages-2Mi": resource.MustParse("0"),
"memory": resource.MustParse("7848976Ki"),
"pods": resource.MustParse("29"),
"nvidia.com/gpu": resource.MustParse("1"),
},
},
}
Expand Down Expand Up @@ -626,6 +665,15 @@ func Test_nodeMatchesFilters(t *testing.T) {
},
expectResult: false,
},
{
name: "true when gpu is available",
node: node,
filters: &troubleshootv1beta2.NodeResourceFilters{
ResourceName: "nvidia.com/gpu",
ResourceAllocatable: "1",
},
expectResult: true,
},
}

for _, test := range tests {
Expand Down Expand Up @@ -1244,6 +1292,38 @@ func Test_analyzeNodeResources(t *testing.T) {
IconURI: "https://troubleshoot.sh/images/analyzer-icons/node-resources.svg?w=16&h=18",
},
},

{
name: "1 GPU in nodes", // validate that the pass outcome is reported when a node satisfies the GPU resource filter
analyzer: &troubleshootv1beta2.NodeResources{
AnalyzeMeta: troubleshootv1beta2.AnalyzeMeta{
CheckName: "GPU filter",
},
Outcomes: []*troubleshootv1beta2.Outcome{
{
Pass: &troubleshootv1beta2.SingleOutcome{
When: "count() >= 1",
Message: "There is a node with at least 1 GPU",
URI: "",
},
},
},
Filters: &troubleshootv1beta2.NodeResourceFilters{
ResourceName: "nvidia.com/gpu",
ResourceAllocatable: "1",
},
},
want: &AnalyzeResult{
IsPass: true,
IsFail: false,
IsWarn: false,
Title: "GPU filter",
Message: "There is a node with at least 1 GPU",
URI: "",
IconKey: "kubernetes_node_resources",
IconURI: "https://troubleshoot.sh/images/analyzer-icons/node-resources.svg?w=16&h=18",
},
},
}

getExampleNodeContents := func(nodeName string) ([]byte, error) {
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/troubleshoot/v1beta1/analyzer_shared.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ type NodeResourceFilters struct {
EphemeralStorageCapacity string `json:"ephemeralStorageCapacity,omitempty" yaml:"ephemeralStorageCapacity,omitempty"`
EphemeralStorageAllocatable string `json:"ephemeralStorageAllocatable,omitempty" yaml:"ephemeralStorageAllocatable,omitempty"`
Selector *NodeResourceSelectors `json:"selector,omitempty" yaml:"selector,omitempty"`
ResourceName string `json:"resourceName,omitempty" yaml:"resourceName,omitempty"`
ResourceAllocatable string `json:"resourceAllocatable,omitempty" yaml:"resourceAllocatable,omitempty"`
}

type NodeResourceSelectors struct {
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/troubleshoot/v1beta2/analyzer_shared.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ type NodeResourceFilters struct {
EphemeralStorageCapacity string `json:"ephemeralStorageCapacity,omitempty" yaml:"ephemeralStorageCapacity,omitempty"`
EphemeralStorageAllocatable string `json:"ephemeralStorageAllocatable,omitempty" yaml:"ephemeralStorageAllocatable,omitempty"`
Selector *NodeResourceSelectors `json:"selector,omitempty" yaml:"selector,omitempty"`
ResourceName string `json:"resourceName,omitempty" yaml:"resourceName,omitempty"`
ResourceAllocatable string `json:"resourceAllocatable,omitempty" yaml:"resourceAllocatable,omitempty"`
}

type NodeResourceSelectors struct {
Expand Down
6 changes: 6 additions & 0 deletions schemas/analyzer-troubleshoot-v1beta1.json
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,12 @@
"podCapacity": {
"type": "string"
},
"resourceAllocatable": {
"type": "string"
},
"resourceName": {
"type": "string"
},
"selector": {
"type": "object",
"properties": {
Expand Down
Loading

0 comments on commit 763b64c

Please sign in to comment.