Skip to content

Commit

Permalink
Address comments
Browse files (browse the repository at this point in the history)
  • Loading branch information
jgongd committed Oct 18, 2024
1 parent 27035f4 commit 50b0ebe
Show file tree
Hide file tree
Showing 17 changed files with 346 additions and 326 deletions.
26 changes: 17 additions & 9 deletions master/internal/db/postgres_experiments_intg_test.go
Original file line number Diff line number Diff line change
@@ -439,21 +439,29 @@ func TestActiveLogPatternPolicies(t *testing.T) {
eccErrorSignal := "ECC Error"
cudaOOMSignal := "CUDA OOM"
expected := expconf.LogPoliciesConfig{
expconf.LogPolicy{RawPattern: ".*uncorrectable ECC error encountered.*", RawSignal: &eccErrorSignal},
expconf.LogPolicy{RawPattern: ".*CUDA out of memory.*", RawSignal: &cudaOOMSignal},
expconf.LogPolicy{
RawPattern: ".*uncorrectable ECC error encountered.*",
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeSignal, Signal: &eccErrorSignal}},
},
expconf.LogPolicy{
RawPattern: ".*CUDA out of memory.*",
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeSignal, Signal: &cudaOOMSignal}},
},
}

require.Equal(t, expected, policies)

activeConfig, err := db.ActiveExperimentConfig(exp.ID)
require.NoError(t, err)
activeConfig.RawLogPolicies = &expconf.LogPoliciesConfig{
expconf.LogPolicy{RawPattern: "sub", RawActions: []expconf.LogAction{{
RawCancelRetries: &expconf.LogActionCancelRetries{},
}}},
expconf.LogPolicy{RawPattern: `\d{5}$`, RawActions: []expconf.LogAction{{
RawExcludeNode: &expconf.LogActionExcludeNode{},
}}},
activeConfig.RawLogPolicies = expconf.LogPoliciesConfig{
expconf.LogPolicy{
RawPattern: "sub",
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeCancelRetries}},
},
expconf.LogPolicy{
RawPattern: `\d{5}$`,
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeExcludeNode}},
},
}

v, err := json.Marshal(activeConfig)
Expand Down
8 changes: 4 additions & 4 deletions master/internal/logpattern/logpattern.go
Original file line number Diff line number Diff line change
@@ -78,21 +78,21 @@ func (l *LogPatternPolicies) monitor(ctx context.Context,
if compiledRegex.MatchString(log.Log) {
if actions := policy.Actions(); len(actions) > 0 {
for _, a := range actions {
switch a.GetUnionMember().(type) {
case expconf.LogActionCancelRetries:
switch a.Type {
case expconf.LogActionTypeCancelRetries:
if err := addDontRetry(
ctx, model.TaskID(log.TaskID), *log.AgentID, policy.Pattern(), log.Log,
); err != nil {
return fmt.Errorf("adding don't retry: %w", err)
}

case expconf.LogActionExcludeNode:
case expconf.LogActionTypeExcludeNode:
if err := addRetryOnDifferentNode(
ctx, model.TaskID(log.TaskID), *log.AgentID, policy.Pattern(), log.Log,
); err != nil {
return fmt.Errorf("adding retry on different node: %w", err)
}
case string:
case expconf.LogActionTypeSignal:
signal := a.Signal
if signal != nil {
err = db.Bun().RunInTx(ctx, nil, func(ctx context.Context, tx bun.Tx) error {
Expand Down
Loading

0 comments on commit 50b0ebe

Please sign in to comment.