Add StepOutOfMemory as reason for BuildRun failure #1588

Merged
1 change: 1 addition & 0 deletions docs/buildrun.md
@@ -391,6 +391,7 @@ The following table illustrates the different states a BuildRun can have under i
| False | BuildRunAmbiguousBuild | Yes | The defined `BuildRun` uses both `spec.build.name` and `spec.build.spec`. Only one of them is allowed at the same time. |
| False | BuildRunBuildFieldOverrideForbidden | Yes | The defined `BuildRun` uses an override (e.g. `timeout`, `paramValues`, `output`, or `env`) in combination with `spec.build.spec`, which is not allowed. Use the `spec.build.spec` to directly specify the respective value. |
| False | PodEvicted | Yes | The BuildRun Pod was evicted from the node it was running on. See [API-initiated Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/) and [Node-pressure Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/) for more information. |
| False | StepOutOfMemory | Yes | The BuildRun Pod failed because a step ran out of memory. |

**Note**: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/tektoncd/pipeline/blob/main/docs/taskruns.md#monitoring-execution-status) for populating the BuildRun ones, with some exceptions.

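For illustration, this is how a controller or CLI consumer might branch on the new reason once the condition is populated — a minimal sketch assuming the `buildv1beta1` types from this PR and an already-fetched `BuildRun` (`handleFailure` is a hypothetical helper, not part of the change):

```go
package example

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	buildv1beta1 "github.com/shipwright-io/build/pkg/apis/build/v1beta1"
)

// handleFailure inspects a failed BuildRun's Succeeded condition and
// reacts to the reason, including the StepOutOfMemory reason added here.
func handleFailure(buildRun *buildv1beta1.BuildRun) {
	condition := buildRun.Status.GetCondition(buildv1beta1.Succeeded)
	if condition == nil || condition.Status != corev1.ConditionFalse {
		return // not failed (or condition not populated yet)
	}
	switch condition.Reason {
	case buildv1beta1.BuildRunStateStepOutOfMemory:
		// A step was OOMKilled; the message carries a kubectl logs hint.
		fmt.Println("step ran out of memory:", condition.Message)
	case buildv1beta1.BuildRunStatePodEvicted:
		fmt.Println("pod was evicted:", condition.Message)
	default:
		fmt.Println("buildrun failed:", condition.Message)
	}
}
```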
3 changes: 3 additions & 0 deletions pkg/apis/build/v1beta1/buildrun_types.go
@@ -106,6 +106,9 @@ const (
// BuildRunStatePodEvicted indicates that the pod got evicted
// for some reason, probably because it ran out of ephemeral storage.
BuildRunStatePodEvicted = "PodEvicted"

// BuildRunStateStepOutOfMemory indicates that a step failed because it ran out of memory.
BuildRunStateStepOutOfMemory = "StepOutOfMemory"
)

// SourceResult holds the results emitted from the different sources
28 changes: 18 additions & 10 deletions pkg/reconciler/buildrun/resources/conditions.go
@@ -81,7 +81,7 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie

case pipelineapi.TaskRunReasonFailed:
if taskRun.Status.CompletionTime != nil {
pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun)
pod, failedContainer, failedContainerStatus, err := extractFailedPodAndContainer(ctx, client, taskRun)
if err != nil {
// when trying to customize the Condition Message field, ensure the Message covers the case
// when a Pod is deleted.
@@ -105,19 +105,27 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie
message = pod.Status.Message
reason = buildv1beta1.BuildRunStatePodEvicted
if failedContainer != nil {
//nolint:staticcheck // SA1019 we want to give users some time to adopt to failureDetails
buildRun.Status.FailureDetails.Location.Container = failedContainer.Name
}
} else if failedContainer != nil {
//nolint:staticcheck // SA1019 we want to give users some time to adopt to failureDetails
buildRun.Status.FailureDetails.Location.Container = failedContainer.Name
message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)

if failedContainerStatus != nil && failedContainerStatus.State.Terminated != nil && failedContainerStatus.State.Terminated.Reason == "OOMKilled" {
reason = buildv1beta1.BuildRunStateStepOutOfMemory
message = fmt.Sprintf("buildrun step %s failed due to out-of-memory, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)
} else {
message = fmt.Sprintf("buildrun step %s failed, for detailed information: kubectl --namespace %s logs %s --container=%s",
failedContainer.Name,
pod.Namespace,
pod.Name,
failedContainer.Name,
)
}
} else {
message = fmt.Sprintf("buildrun failed due to an unexpected error in pod %s: for detailed information: kubectl --namespace %s logs %s --all-containers",
pod.Name,
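The new branch keys off the kubelet's `OOMKilled` termination reason on the failed container's status. Pulled out as a standalone sketch (a hypothetical helper, not part of this PR — the check lives inline above):

```go
package resources

import corev1 "k8s.io/api/core/v1"

// isOOMKilled reports whether a container terminated because it exceeded
// its memory limit; "OOMKilled" is the reason the kubelet sets in that case.
func isOOMKilled(status *corev1.ContainerStatus) bool {
	return status != nil &&
		status.State.Terminated != nil &&
		status.State.Terminated.Reason == "OOMKilled"
}
```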
63 changes: 63 additions & 0 deletions pkg/reconciler/buildrun/resources/conditions_test.go
@@ -242,6 +242,7 @@ var _ = Describe("Conditions", func() {
},
},
Status: corev1.PodStatus{
Phase: corev1.PodFailed,
Reason: "Evicted",
},
}
@@ -281,6 +282,68 @@
).To(Equal(build.BuildRunStatePodEvicted))
})

It("updates BuildRun condition when TaskRun fails and container is out of memory", func() {
// Generate a pod with a container status that was terminated as out of memory
failedTaskRunOOMContainer := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "evilpod",
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "evilpod-container",
},
},
},
Status: corev1.PodStatus{
Phase: corev1.PodFailed,
ContainerStatuses: []corev1.ContainerStatus{{
Name: "evilpod-container",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "OOMKilled",
},
},
}},
},
}

// stub a GET API call to return the created pod
getClientStub := func(_ context.Context, nn types.NamespacedName, object crc.Object, _ ...crc.GetOption) error {
switch object := object.(type) {
case *corev1.Pod:
failedTaskRunOOMContainer.DeepCopyInto(object)
return nil
}
return k8serrors.NewNotFound(schema.GroupResource{}, nn.Name)
}

// fake the calls with the above stub
client.GetCalls(getClientStub)

// Now we need to create a fake failed TaskRun condition so that the failure code path is hit
fakeTRCondition := &apis.Condition{
Type: apis.ConditionSucceeded,
Reason: "Failed",
Message: "not relevant",
}

// We call the function with all the info
Expect(resources.UpdateBuildRunUsingTaskRunCondition(
context.TODO(),
client,
br,
tr,
fakeTRCondition,
)).To(BeNil())

// Finally, check the output of the buildRun
Expect(br.Status.GetCondition(
build.Succeeded).Reason,
).To(Equal(build.BuildRunStateStepOutOfMemory))
})

It("updates a BuildRun condition when the related TaskRun fails and pod containers are not available", func() {

taskRunGeneratedPod := corev1.Pod{
19 changes: 11 additions & 8 deletions pkg/reconciler/buildrun/resources/failure_details.go
@@ -64,31 +64,34 @@ func extractFailureReasonAndMessage(taskRun *pipelineapi.TaskRun) (errorReason s
return errorReason, errorMessage
}

func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, error) {
func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, *corev1.ContainerStatus, error) {
var pod corev1.Pod
if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
ctxlog.Error(ctx, err, "failed to get pod for failure extraction", namespace, taskRun.Namespace, name, taskRun.Status.PodName)
return nil, nil, err
return nil, nil, nil, err
}

failures := make(map[string]struct{})
failures := make(map[string]*corev1.ContainerStatus)
// Find the names of all containers with failure status
for _, containerStatus := range pod.Status.ContainerStatuses {
for i := range pod.Status.ContainerStatuses {
containerStatus := pod.Status.ContainerStatuses[i]
if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
failures[containerStatus.Name] = struct{}{}
failures[containerStatus.Name] = &containerStatus
}
}

// Find the first container that has a failure status
var failedContainer *corev1.Container
var failedContainerStatus *corev1.ContainerStatus
for i, container := range pod.Spec.Containers {
if _, has := failures[container.Name]; has {
if containerStatus, has := failures[container.Name]; has {
failedContainer = &pod.Spec.Containers[i]
failedContainerStatus = containerStatus
break
}
}

return &pod, failedContainer, nil
return &pod, failedContainer, failedContainerStatus, nil
}

func extractFailureDetails(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (failure *buildv1beta1.FailureDetails) {
Expand All @@ -97,7 +100,7 @@ func extractFailureDetails(ctx context.Context, client client.Client, taskRun *p
failure.Reason, failure.Message = extractFailureReasonAndMessage(taskRun)

failure.Location = &buildv1beta1.Location{Pod: taskRun.Status.PodName}
pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun)
pod, container, _, _ := extractFailedPodAndContainer(ctx, client, taskRun)

if pod != nil && container != nil {
failure.Location.Pod = pod.Name
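The switch from `for _, containerStatus := range …` to index-based iteration is needed because the map now stores `*corev1.ContainerStatus` pointers: before Go 1.22, the range variable is a single variable reused across iterations, so taking its address would leave every map entry aliasing the last element. A self-contained sketch of the pitfall (illustrative only, not code from this PR):

```go
package main

import "fmt"

func main() {
	statuses := []string{"a", "b", "c"}

	// Buggy under Go < 1.22: &s is the address of one reused loop variable.
	bad := map[int]*string{}
	for i, s := range statuses {
		bad[i] = &s
	}

	// Safe on any Go version: take a fresh copy per iteration, as the
	// updated extractFailedPodAndContainer does.
	good := map[int]*string{}
	for i := range statuses {
		s := statuses[i]
		good[i] = &s
	}

	// Go < 1.22 prints "c a"; Go >= 1.22 prints "a a" because loop
	// variables became per-iteration.
	fmt.Println(*bad[0], *good[0])
}
```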
2 changes: 1 addition & 1 deletion test/integration/buildruns_to_taskruns_test.go
@@ -187,7 +187,7 @@ var _ = Describe("Integration tests BuildRuns and TaskRuns", func() {
condition := buildRun.Status.GetCondition(v1beta1.Succeeded)
Expect(condition.Status).To(Equal(corev1.ConditionFalse))
Expect(condition.Reason).To(Equal("Failed"))
Expect(condition.Message).To(ContainSubstring("buildrun step %s failed in pod %s", "step-step-build-and-push", taskRun.Status.PodName))
Expect(condition.Message).To(ContainSubstring("buildrun step %s failed", "step-step-build-and-push"))
})
})
})