Skip to content

Commit

Permalink
cleanup: add new --stopped-only option
Browse files Browse the repository at this point in the history
The podman container cleanup process runs asynchronous and by the time
it gets the lock it is possible another podman process already did the
cleanup and then did a new init() to start it again. If the cleanup
process gets the lock there it will cause very weird things.

This can be observed in the remote start API as CI flakes.

Fixes #23754

Signed-off-by: Paul Holzinger <[email protected]>
  • Loading branch information
Luap99 committed Aug 27, 2024
1 parent bf74797 commit a89fef6
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 7 deletions.
5 changes: 5 additions & 0 deletions cmd/podman/containers/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ func init() {

flags.BoolVar(&cleanupOptions.Remove, "rm", false, "After cleanup, remove the container entirely")
flags.BoolVar(&cleanupOptions.RemoveImage, "rmi", false, "After cleanup, remove the image entirely")

stoppedOnlyFlag := "stopped-only"
flags.BoolVar(&cleanupOptions.StoppedOnly, stoppedOnlyFlag, false, "Only cleanup when the container is in the stopped state")
_ = flags.MarkHidden(stoppedOnlyFlag)

validate.AddLatestFlag(cleanupCommand, &cleanupOptions.Latest)
}

Expand Down
9 changes: 7 additions & 2 deletions libpod/container_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -785,8 +785,10 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
}

// Cleanup unmounts all mount points in container and cleans up container storage
// It also cleans up the network stack
func (c *Container) Cleanup(ctx context.Context) error {
// It also cleans up the network stack.
// onlyStopped is set by the podman container cleanup to ensure we only cleanup a stopped container,
// all other states mean another process already called cleanup before us which is fine in such cases.
func (c *Container) Cleanup(ctx context.Context, onlyStopped bool) error {
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
Expand All @@ -808,6 +810,9 @@ func (c *Container) Cleanup(ctx context.Context) error {
if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated, define.ContainerStateStopped, define.ContainerStateStopping, define.ContainerStateExited) {
return fmt.Errorf("container %s is running or paused, refusing to clean up: %w", c.ID(), define.ErrCtrStateInvalid)
}
if onlyStopped && !c.ensureState(define.ContainerStateStopped) {
return fmt.Errorf("container %s is not stopped and only cleanup for a stopped container was requested: %w", c.ID(), define.ErrCtrStateInvalid)
}

// if the container was not created in the oci runtime or was already cleaned up, then do nothing
if c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) {
Expand Down
4 changes: 2 additions & 2 deletions libpod/pod_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
}

if cleanup {
err := c.Cleanup(ctx)
err := c.Cleanup(ctx, false)
if err != nil && !errors.Is(err, define.ErrCtrStateInvalid) && !errors.Is(err, define.ErrCtrStopped) {
return err
}
Expand Down Expand Up @@ -299,7 +299,7 @@ func (p *Pod) Cleanup(ctx context.Context) (map[string]error, error) {
c := ctr
logrus.Debugf("Adding parallel job to clean up container %s", c.ID())
retChan := parallel.Enqueue(ctx, func() error {
return c.Cleanup(ctx)
return c.Cleanup(ctx, false)
})

ctrErrChan[c.ID()] = retChan
Expand Down
1 change: 1 addition & 0 deletions pkg/domain/entities/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ type ContainerCleanupOptions struct {
Latest bool
Remove bool
RemoveImage bool
StoppedOnly bool // Only cleanup if the container is stopped, i.e. was running before
}

// ContainerCleanupReport describes the response from a
Expand Down
4 changes: 2 additions & 2 deletions pkg/domain/infra/abi/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ func (ic *ContainerEngine) ContainerStop(ctx context.Context, namesOrIds []strin
}
}
} else {
if err = c.Cleanup(ctx); err != nil {
if err = c.Cleanup(ctx, false); err != nil {
// The container could still have been removed, as we unlocked
// after we stopped it.
if errors.Is(err, define.ErrNoSuchCtr) || errors.Is(err, define.ErrCtrRemoved) {
Expand Down Expand Up @@ -1320,7 +1320,7 @@ func (ic *ContainerEngine) ContainerCleanup(ctx context.Context, namesOrIds []st
report.RmErr = fmt.Errorf("failed to clean up and remove container %v: %w", ctr.ID(), err)
}
} else {
err := ctr.Cleanup(ctx)
err := ctr.Cleanup(ctx, options.StoppedOnly)
// ignore error if ctr is removed or cannot be cleaned up, likely the ctr was already restarted by another process
if err != nil && !errors.Is(err, define.ErrNoSuchCtr) && !errors.Is(err, define.ErrCtrStateInvalid) {
report.CleanErr = fmt.Errorf("failed to clean up container %v: %w", ctr.ID(), err)
Expand Down
4 changes: 3 additions & 1 deletion pkg/specgenutil/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,9 @@ func CreateExitCommandArgs(storageConfig storageTypes.StoreOptions, config *conf
command = append(command, "--module", module)
}

command = append(command, []string{"container", "cleanup"}...)
// --stopped-only is used to ensure we only cleanup stopped containers and do not race
// against other processes that did a cleanup() + init() again before we had the chance to run
command = append(command, []string{"container", "cleanup", "--stopped-only"}...)

if rm {
command = append(command, "--rm")
Expand Down

0 comments on commit a89fef6

Please sign in to comment.