Add StepOutOfMemory as reason for BuildRun failure
SaschaSchwarze0 committed May 6, 2024
1 parent ede2927 commit 8c86adf
Showing 5 changed files with 96 additions and 18 deletions.
1 change: 1 addition & 0 deletions docs/buildrun.md
@@ -391,6 +391,7 @@ The following table illustrates the different states a BuildRun can have under i
| False | BuildRunAmbiguousBuild | Yes | The defined `BuildRun` uses both `spec.build.name` and `spec.build.spec`. Only one of them is allowed at the same time. |
| False | BuildRunBuildFieldOverrideForbidden | Yes | The defined `BuildRun` uses an override (e.g. `timeout`, `paramValues`, `output`, or `env`) in combination with `spec.build.spec`, which is not allowed. Use the `spec.build.spec` to directly specify the respective value. |
| False | PodEvicted | Yes | The BuildRun Pod was evicted from the node it was running on. See [API-initiated Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/) and [Node-pressure Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/) for more information. |
| False | StepOutOfMemory | Yes | The BuildRun Pod failed because a step ran out of memory. |

**Note**: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/tektoncd/pipeline/blob/main/docs/taskruns.md#monitoring-execution-status) for populating the BuildRun ones, with some exceptions.
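By way of illustration, and assuming a step container named `step-build-and-push` in the `default` namespace (both names are made up for this sketch), a BuildRun that fails this way reports roughly the following status; the message format follows the condition handling introduced in this change:

```yaml
# Hypothetical BuildRun status after a step was OOM-killed (pod, container, and namespace names are illustrative).
status:
  conditions:
    - type: Succeeded
      status: "False"
      reason: StepOutOfMemory
      message: "buildrun step step-build-and-push failed due to out-of-memory, for detailed information: kubectl --namespace default logs buildrun-oom-xyz-pod --container=step-build-and-push"
  failureDetails:
    location:
      pod: buildrun-oom-xyz-pod
      container: step-build-and-push
```

The reason alone can be read with, for example, `kubectl get buildrun <name> -o jsonpath='{.status.conditions[?(@.type=="Succeeded")].reason}'`.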

3 changes: 3 additions & 0 deletions pkg/apis/build/v1beta1/buildrun_types.go
@@ -106,6 +106,9 @@ const (
// BuildRunStatePodEvicted indicates that the pod got evicted
// for some reason, for example because it ran out of ephemeral storage.
BuildRunStatePodEvicted = "PodEvicted"

// BuildRunStateStepOutOfMemory indicates that a step failed because it ran out of memory.
BuildRunStateStepOutOfMemory = "StepOutOfMemory"
)

// SourceResult holds the results emitted from the different sources
28 changes: 18 additions & 10 deletions pkg/reconciler/buildrun/resources/conditions.go
@@ -81,7 +81,7 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie

case pipelineapi.TaskRunReasonFailed:
if taskRun.Status.CompletionTime != nil {
pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun)
pod, failedContainer, failedContainerStatus, err := extractFailedPodAndContainer(ctx, client, taskRun)
if err != nil {
// when trying to customize the Condition Message field, ensure the Message covers the case
// when a Pod is deleted.
@@ -105,19 +105,27 @@
message = pod.Status.Message
reason = buildv1beta1.BuildRunStatePodEvicted
if failedContainer != nil {
//nolint:staticcheck // SA1019 we want to give users some time to adopt failureDetails
buildRun.Status.FailureDetails.Location.Container = failedContainer.Name
}
} else if failedContainer != nil {
//nolint:staticcheck // SA1019 we want to give users some time to adopt failureDetails
buildRun.Status.FailureDetails.Location.Container = failedContainer.Name
message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)

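// An OOM-killed step shows up as a terminated container whose termination
// reason is "OOMKilled"; surface it with a dedicated BuildRun failure reason.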
if failedContainerStatus != nil && failedContainerStatus.State.Terminated != nil && failedContainerStatus.State.Terminated.Reason == "OOMKilled" {
reason = buildv1beta1.BuildRunStateStepOutOfMemory
message = fmt.Sprintf("buildrun step %s failed due to out-of-memory, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)
} else {
message = fmt.Sprintf("buildrun step %s failed, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)
}
} else {
message = fmt.Sprintf("buildrun failed due to an unexpected error in pod %s: for detailed information: kubectl --namespace %s logs %s --all-containers",
pod.Name,
63 changes: 63 additions & 0 deletions pkg/reconciler/buildrun/resources/conditions_test.go
@@ -242,6 +242,7 @@ var _ = Describe("Conditions", func() {
},
},
Status: corev1.PodStatus{
Phase: corev1.PodFailed,
Reason: "Evicted",
},
}
@@ -281,6 +281,68 @@
).To(Equal(build.BuildRunStatePodEvicted))
})

It("updates BuildRun condition when TaskRun fails and container is out of memory", func() {
// Generate a pod whose status indicates an out-of-memory failure
failedTaskRunOOMContainer := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "evilpod",
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "evilpod-container",
},
},
},
Status: corev1.PodStatus{
Phase: corev1.PodFailed,
ContainerStatuses: []corev1.ContainerStatus{{
Name: "evilpod-container",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "OOMKilled",
},
},
}},
},
}

// stub a GET API call to return the created pod
getClientStub := func(_ context.Context, nn types.NamespacedName, object crc.Object, _ ...crc.GetOption) error {
switch object := object.(type) {
case *corev1.Pod:
failedTaskRunOOMContainer.DeepCopyInto(object)
return nil
}
return k8serrors.NewNotFound(schema.GroupResource{}, nn.Name)
}

// fake the calls with the above stub
client.GetCalls(getClientStub)

// Now we need to create a fake failed taskrun so that it hits the code
fakeTRCondition := &apis.Condition{
Type: apis.ConditionSucceeded,
Reason: "Failed",
Message: "not relevant",
}

// We call the function with all the info
Expect(resources.UpdateBuildRunUsingTaskRunCondition(
context.TODO(),
client,
br,
tr,
fakeTRCondition,
)).To(BeNil())

// Finally, check the output of the buildRun
Expect(br.Status.GetCondition(
build.Succeeded).Reason,
).To(Equal(build.BuildRunStateStepOutOfMemory))
})

It("updates a BuildRun condition when the related TaskRun fails and pod containers are not available", func() {

taskRunGeneratedPod := corev1.Pod{
19 changes: 11 additions & 8 deletions pkg/reconciler/buildrun/resources/failure_details.go
@@ -64,31 +64,34 @@ func extractFailureReasonAndMessage(taskRun *pipelineapi.TaskRun) (errorReason s
return errorReason, errorMessage
}

func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, error) {
func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, *corev1.ContainerStatus, error) {
var pod corev1.Pod
if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
ctxlog.Error(ctx, err, "failed to get pod for failure extraction", namespace, taskRun.Namespace, name, taskRun.Status.PodName)
return nil, nil, err
return nil, nil, nil, err
}

failures := make(map[string]struct{})
failures := make(map[string]*corev1.ContainerStatus)
// Collect the status of every container that terminated with a non-zero exit code
for _, containerStatus := range pod.Status.ContainerStatuses {
for i := range pod.Status.ContainerStatuses {
containerStatus := pod.Status.ContainerStatuses[i]
if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
failures[containerStatus.Name] = struct{}{}
failures[containerStatus.Name] = &containerStatus
}
}

// Find the first container that has a failure status
var failedContainer *corev1.Container
var failedContainerStatus *corev1.ContainerStatus
for i, container := range pod.Spec.Containers {
if _, has := failures[container.Name]; has {
if containerStatus, has := failures[container.Name]; has {
failedContainer = &pod.Spec.Containers[i]
failedContainerStatus = containerStatus
break
}
}

return &pod, failedContainer, nil
return &pod, failedContainer, failedContainerStatus, nil
}

func extractFailureDetails(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (failure *buildv1beta1.FailureDetails) {
@@ -97,7 +100,7 @@ func extractFailureDetails(ctx context.Context, client client.Client, taskRun *p
failure.Reason, failure.Message = extractFailureReasonAndMessage(taskRun)

failure.Location = &buildv1beta1.Location{Pod: taskRun.Status.PodName}
pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun)
pod, container, _, _ := extractFailedPodAndContainer(ctx, client, taskRun)

if pod != nil && container != nil {
failure.Location.Pod = pod.Name
