Add StepOutOfMemory as reason for BuildRun failure
SaschaSchwarze0 committed May 6, 2024
1 parent ede2927 commit 8c86adf
Showing 5 changed files with 96 additions and 18 deletions.
1 change: 1 addition & 0 deletions docs/buildrun.md
@@ -391,6 +391,7 @@ The following table illustrates the different states a BuildRun can have under i
| False | BuildRunAmbiguousBuild | Yes | The defined `BuildRun` uses both `spec.build.name` and `spec.build.spec`. Only one of them is allowed at the same time. |
| False | BuildRunBuildFieldOverrideForbidden | Yes | The defined `BuildRun` uses an override (e.g. `timeout`, `paramValues`, `output`, or `env`) in combination with `spec.build.spec`, which is not allowed. Use the `spec.build.spec` to directly specify the respective value. |
| False | PodEvicted | Yes | The BuildRun Pod was evicted from the node it was running on. See [API-initiated Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/) and [Node-pressure Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/) for more information. |
| False | StepOutOfMemory | Yes | The BuildRun Pod failed because a step ran out of memory. |

**Note**: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/tektoncd/pipeline/blob/main/docs/taskruns.md#monitoring-execution-status) for populating the BuildRun ones, with some exceptions.
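By way of illustration, and assuming a step container named `step-build-and-push` in the `default` namespace (both names are made up for this sketch), a BuildRun that fails this way reports roughly the following status; the message format follows the condition handling introduced in this change:

```yaml
# Hypothetical BuildRun status after a step was OOM-killed (pod, container, and namespace names are illustrative).
status:
  conditions:
    - type: Succeeded
      status: "False"
      reason: StepOutOfMemory
      message: "buildrun step step-build-and-push failed due to out-of-memory, for detailed information: kubectl --namespace default logs buildrun-oom-xyz-pod --container=step-build-and-push"
  failureDetails:
    location:
      pod: buildrun-oom-xyz-pod
      container: step-build-and-push
```

The reason alone can be read with, for example, `kubectl get buildrun <name> -o jsonpath='{.status.conditions[?(@.type=="Succeeded")].reason}'`.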

3 changes: 3 additions & 0 deletions pkg/apis/build/v1beta1/buildrun_types.go
@@ -106,6 +106,9 @@ const (
// BuildRunStatePodEvicted indicates that the pod got evicted
// for some reason, for example because it ran out of ephemeral storage.
BuildRunStatePodEvicted = "PodEvicted"

// BuildRunStateStepOutOfMemory indicates that a step failed because it ran out of memory.
BuildRunStateStepOutOfMemory = "StepOutOfMemory"
)

// SourceResult holds the results emitted from the different sources
28 changes: 18 additions & 10 deletions pkg/reconciler/buildrun/resources/conditions.go
@@ -81,7 +81,7 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie

case pipelineapi.TaskRunReasonFailed:
if taskRun.Status.CompletionTime != nil {
pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun)
pod, failedContainer, failedContainerStatus, err := extractFailedPodAndContainer(ctx, client, taskRun)
if err != nil {
// when trying to customize the Condition Message field, ensure the Message covers the case
// when a Pod is deleted.
@@ -105,19 +105,27 @@
message = pod.Status.Message
reason = buildv1beta1.BuildRunStatePodEvicted
if failedContainer != nil {
//nolint:staticcheck // SA1019 we want to give users some time to adopt failureDetails
buildRun.Status.FailureDetails.Location.Container = failedContainer.Name
}
} else if failedContainer != nil {
//nolint:staticcheck // SA1019 we want to give users some time to adopt failureDetails
buildRun.Status.FailureDetails.Location.Container = failedContainer.Name
message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)

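// An OOM-killed step shows up as a terminated container whose termination
// reason is "OOMKilled"; surface it with a dedicated BuildRun failure reason.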
if failedContainerStatus != nil && failedContainerStatus.State.Terminated != nil && failedContainerStatus.State.Terminated.Reason == "OOMKilled" {
reason = buildv1beta1.BuildRunStateStepOutOfMemory
message = fmt.Sprintf("buildrun step %s failed due to out-of-memory, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)
} else {
message = fmt.Sprintf("buildrun step %s failed, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)
}
} else {
message = fmt.Sprintf("buildrun failed due to an unexpected error in pod %s: for detailed information: kubectl --namespace %s logs %s --all-containers",
pod.Name,
63 changes: 63 additions & 0 deletions pkg/reconciler/buildrun/resources/conditions_test.go
@@ -242,6 +242,7 @@ var _ = Describe("Conditions", func() {
},
},
Status: corev1.PodStatus{
Phase: corev1.PodFailed,
Reason: "Evicted",
},
}
@@ -281,6 +281,68 @@
).To(Equal(build.BuildRunStatePodEvicted))
})

It("updates BuildRun condition when TaskRun fails and container is out of memory", func() {
// Generate a pod whose status indicates an out-of-memory failure
failedTaskRunOOMContainer := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "evilpod",
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "evilpod-container",
},
},
},
Status: corev1.PodStatus{
Phase: corev1.PodFailed,
ContainerStatuses: []corev1.ContainerStatus{{
Name: "evilpod-container",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "OOMKilled",
},
},
}},
},
}

// stub a GET API call to return the created pod
getClientStub := func(_ context.Context, nn types.NamespacedName, object crc.Object, _ ...crc.GetOption) error {
switch object := object.(type) {
case *corev1.Pod:
failedTaskRunOOMContainer.DeepCopyInto(object)
return nil
}
return k8serrors.NewNotFound(schema.GroupResource{}, nn.Name)
}

// fake the calls with the above stub
client.GetCalls(getClientStub)

// Now we need to create a fake failed taskrun so that it hits the code
fakeTRCondition := &apis.Condition{
Type: apis.ConditionSucceeded,
Reason: "Failed",
Message: "not relevant",
}

// We call the function with all the info
Expect(resources.UpdateBuildRunUsingTaskRunCondition(
context.TODO(),
client,
br,
tr,
fakeTRCondition,
)).To(BeNil())

// Finally, check the output of the buildRun
Expect(br.Status.GetCondition(
build.Succeeded).Reason,
).To(Equal(build.BuildRunStateStepOutOfMemory))
})

It("updates a BuildRun condition when the related TaskRun fails and pod containers are not available", func() {

taskRunGeneratedPod := corev1.Pod{
19 changes: 11 additions & 8 deletions pkg/reconciler/buildrun/resources/failure_details.go
@@ -64,31 +64,34 @@ func extractFailureReasonAndMessage(taskRun *pipelineapi.TaskRun) (errorReason s
return errorReason, errorMessage
}

func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, error) {
func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, *corev1.ContainerStatus, error) {
var pod corev1.Pod
if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
ctxlog.Error(ctx, err, "failed to get pod for failure extraction", namespace, taskRun.Namespace, name, taskRun.Status.PodName)
return nil, nil, err
return nil, nil, nil, err
}

failures := make(map[string]struct{})
failures := make(map[string]*corev1.ContainerStatus)
// Collect the status of every container that terminated with a non-zero exit code
for _, containerStatus := range pod.Status.ContainerStatuses {
for i := range pod.Status.ContainerStatuses {
containerStatus := pod.Status.ContainerStatuses[i]
if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
failures[containerStatus.Name] = struct{}{}
failures[containerStatus.Name] = &containerStatus
}
}

// Find the first container that has a failure status
var failedContainer *corev1.Container
var failedContainerStatus *corev1.ContainerStatus
for i, container := range pod.Spec.Containers {
if _, has := failures[container.Name]; has {
if containerStatus, has := failures[container.Name]; has {
failedContainer = &pod.Spec.Containers[i]
failedContainerStatus = containerStatus
break
}
}

return &pod, failedContainer, nil
return &pod, failedContainer, failedContainerStatus, nil
}

func extractFailureDetails(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (failure *buildv1beta1.FailureDetails) {
@@ -97,7 +100,7 @@ func extractFailureDetails(ctx context.Context, client client.Client, taskRun *p
failure.Reason, failure.Message = extractFailureReasonAndMessage(taskRun)

failure.Location = &buildv1beta1.Location{Pod: taskRun.Status.PodName}
pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun)
pod, container, _, _ := extractFailedPodAndContainer(ctx, client, taskRun)

if pod != nil && container != nil {
failure.Location.Pod = pod.Name
