From e796b921ef48d404aaa05f68c5a4c418e1ea9105 Mon Sep 17 00:00:00 2001 From: Bradley Laney Date: Wed, 16 Oct 2024 17:25:56 -0400 Subject: [PATCH] fix: run checkpoint GC more aggressively to ensure tensorboards are GC'd (#10017) Unfortunately we can't be clever about skipping checkpoint GC since there still may be tensorboard files. --- master/internal/api_experiment.go | 19 ++++++++++--------- master/internal/experiment.go | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/master/internal/api_experiment.go b/master/internal/api_experiment.go index fa742a1779b..9fe3daa6c81 100644 --- a/master/internal/api_experiment.go +++ b/master/internal/api_experiment.go @@ -494,15 +494,16 @@ func (a *apiServer) deleteExperiments(exps []*model.Experiment, userModel *model log.WithError(err).Errorf("failed to delete experiment: %d", exp.ID) return err } - if len(checkpoints) > 0 { - if err := runCheckpointGCForCheckpoints( - a.m.rm, a.m.db, exp.JobID, exp.StartTime, - &taskSpec, exp.ID, exp.Config, checkpoints, - []string{fullDeleteGlob}, true, agentUserGroup, userModel, nil, - ); err != nil { - log.WithError(err).Errorf("failed to gc checkpoints for experiment: %d", exp.ID) - return err - } + + // Unfortunately we can't be clever and not run if there aren't checkpoints since we can't + // know if there are tensorboards to GC or not. + if err := runCheckpointGCForCheckpoints( + a.m.rm, a.m.db, exp.JobID, exp.StartTime, + &taskSpec, exp.ID, exp.Config, checkpoints, + []string{fullDeleteGlob}, true, agentUserGroup, userModel, nil, + ); err != nil { + log.WithError(err).Errorf("failed to gc checkpoints for experiment: %d", exp.ID) + return err } // delete jobs per experiment diff --git a/master/internal/experiment.go b/master/internal/experiment.go index c980ccdb1e5..42c9b7307ee 100644 --- a/master/internal/experiment.go +++ b/master/internal/experiment.go @@ -488,7 +488,7 @@ func (e *internalExperiment) stop() error { "failure to delete snapshots for experiment: %d", e.Experiment.ID) } - // May be no checkpoints to gc, if so skip + // May be no checkpoints to GC, if so skip. We can do this since we don't want to GC tensorboards. if len(checkpoints) > 0 { go func() { if err := runCheckpointGCForCheckpoints(