Skip to content

Commit

Permalink
fix: run checkpoint GC more aggressively to ensure tensorboards are GC'd (#10017)
Browse files Browse the repository at this point in the history

Unfortunately, we can't be clever about skipping checkpoint GC when an experiment has no
checkpoints, since there may still be tensorboard files to clean up.
  • Loading branch information
stoksc authored Oct 16, 2024
1 parent a14525f commit e796b92
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
19 changes: 10 additions & 9 deletions master/internal/api_experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,15 +494,16 @@ func (a *apiServer) deleteExperiments(exps []*model.Experiment, userModel *model
log.WithError(err).Errorf("failed to delete experiment: %d", exp.ID)
return err
}
if len(checkpoints) > 0 {
if err := runCheckpointGCForCheckpoints(
a.m.rm, a.m.db, exp.JobID, exp.StartTime,
&taskSpec, exp.ID, exp.Config, checkpoints,
[]string{fullDeleteGlob}, true, agentUserGroup, userModel, nil,
); err != nil {
log.WithError(err).Errorf("failed to gc checkpoints for experiment: %d", exp.ID)
return err
}

// Unfortunately we can't be clever and skip GC when there are no checkpoints, since we can't
// know whether there are tensorboards to GC.
if err := runCheckpointGCForCheckpoints(
a.m.rm, a.m.db, exp.JobID, exp.StartTime,
&taskSpec, exp.ID, exp.Config, checkpoints,
[]string{fullDeleteGlob}, true, agentUserGroup, userModel, nil,
); err != nil {
log.WithError(err).Errorf("failed to gc checkpoints for experiment: %d", exp.ID)
return err
}

// delete jobs per experiment
Expand Down
2 changes: 1 addition & 1 deletion master/internal/experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ func (e *internalExperiment) stop() error {
"failure to delete snapshots for experiment: %d", e.Experiment.ID)
}

// May be no checkpoints to gc, if so skip
// There may be no checkpoints to GC; if so, skip. We can do this here since we don't want to GC
// tensorboards.
if len(checkpoints) > 0 {
go func() {
if err := runCheckpointGCForCheckpoints(
Expand Down

0 comments on commit e796b92

Please sign in to comment.