Skip to content

Commit

Permalink
chore: check experiment constraints (#10018)
Browse files Browse the repository at this point in the history
  • Loading branch information
kkunapuli authored Oct 7, 2024
1 parent f609a2d commit 5b1380c
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 58 deletions.
4 changes: 2 additions & 2 deletions master/internal/api_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ func (a *apiServer) getCommandLaunchParams(ctx context.Context, req *protoComman
}

// Check submitted config against task config policies.
valid, err := configpolicy.CheckNTSCConstraints(ctx, int(cmdSpec.Metadata.WorkspaceID), config, a.m.rm)
if !valid {
err = configpolicy.CheckNTSCConstraints(ctx, int(cmdSpec.Metadata.WorkspaceID), config, a.m.rm)
if err != nil {
return nil, nil, status.Errorf(codes.InvalidArgument, "failed constraint check: %v", err)
}

Expand Down
13 changes: 13 additions & 0 deletions master/internal/api_experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (

"github.com/determined-ai/determined/master/internal/api"
"github.com/determined-ai/determined/master/internal/authz"
"github.com/determined-ai/determined/master/internal/configpolicy"
"github.com/determined-ai/determined/master/internal/db"
"github.com/determined-ai/determined/master/internal/db/bunutils"
"github.com/determined-ai/determined/master/internal/experiment"
Expand Down Expand Up @@ -1681,6 +1682,18 @@ func (a *apiServer) CreateExperiment(
return nil, status.Errorf(codes.PermissionDenied, err.Error())
}

wkspIDs, err := workspace.WorkspaceIDsFromNames(ctx, []string{taskSpec.Workspace})
if err != nil {
return nil, status.Errorf(codes.InvalidArgument, err.Error())
}
if len(wkspIDs) != 1 {
return nil, status.Error(codes.InvalidArgument, "expected exactly one workspace")
}
err = configpolicy.CheckExperimentConstraints(ctx, int(wkspIDs[0]), activeConfig, a.m.rm)
if err != nil {
return nil, status.Errorf(codes.InvalidArgument, err.Error())
}

if req.ValidateOnly {
return &apiv1.CreateExperimentResponse{
Experiment: &experimentv1.Experiment{},
Expand Down
12 changes: 9 additions & 3 deletions master/internal/api_experiment_intg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,8 @@ func TestPutExperimentsRetainLogs(t *testing.T) {

func TestParseAndMergeContinueConfig(t *testing.T) {
// Blank config.
api, curUser, ctx := setupAPITest(t, nil)
mockRM := MockRM()
api, curUser, ctx := setupAPITest(t, nil, mockRM)
exp := createTestExp(t, api, curUser)

_, _, err := api.parseAndMergeContinueConfig(exp.ID, ``)
Expand Down Expand Up @@ -652,6 +653,7 @@ resources:
}

// No checkpoint specified anywhere.
mockRM.On("SmallerValueIsHigherPriority", mock.Anything).Return(true, nil)
resp, err := api.CreateExperiment(ctx, createReq)
require.NoError(t, err)
_, _, err = api.parseAndMergeContinueConfig(int(resp.Experiment.Id), `{}`)
Expand All @@ -671,7 +673,8 @@ searcher:

// nolint: exhaustruct
func TestCreateExperimentCheckpointStorage(t *testing.T) {
api, _, ctx := setupAPITest(t, nil)
mockRM := MockRM()
api, _, ctx := setupAPITest(t, nil, mockRM)
api.m.config.CheckpointStorage = expconf.CheckpointStorageConfig{}
defer func() {
api.m.config.CheckpointStorage = expconf.CheckpointStorageConfig{}
Expand All @@ -694,6 +697,7 @@ resources:
}

// No checkpoint specified anywhere.
mockRM.On("SmallerValueIsHigherPriority", mock.Anything).Return(true, nil)
_, err := api.CreateExperiment(ctx, createReq)
require.ErrorContains(t, err, "checkpoint_storage: type is a required property")

Expand Down Expand Up @@ -1589,7 +1593,8 @@ func TestAuthZGetExperimentLabels(t *testing.T) {
}

func TestAuthZCreateExperiment(t *testing.T) {
api, authZExp, _, curUser, ctx := setupExpAuthTest(t, nil)
mockRM := MockRM()
api, authZExp, _, curUser, ctx := setupExpAuthTest(t, nil, mockRM)
forkFrom := createTestExp(t, api, curUser)
workspaceID, projectID := createProjectAndWorkspace(ctx, t, api)

Expand Down Expand Up @@ -1682,6 +1687,7 @@ func TestAuthZCreateExperiment(t *testing.T) {
Return(nil).Once()
authZExp.On("CanEditExperiment", mock.Anything, mockUserArg, mock.Anything, mock.Anything).Return(
fmt.Errorf("canActivateExperimentError")).Once()
mockRM.On("SmallerValueIsHigherPriority", mock.Anything).Return(true, nil)
_, err := api.CreateExperiment(ctx, &apiv1.CreateExperimentRequest{
Activate: true,
Config: minExpConfToYaml(t),
Expand Down
4 changes: 4 additions & 0 deletions master/internal/api_logretention_intg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

"github.com/go-co-op/gocron/v2"
"github.com/jonboulle/clockwork"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/uptrace/bun"

Expand Down Expand Up @@ -108,6 +109,9 @@ searcher:
}

// No checkpoint specified anywhere.
mockRM := MockRM()
api.m.rm = mockRM
mockRM.On("SmallerValueIsHigherPriority", mock.Anything).Return(true, nil)
resp, err := api.CreateExperiment(ctx, createReq)
require.NoError(t, err)
require.Empty(t, resp.Warnings)
Expand Down
89 changes: 73 additions & 16 deletions master/internal/configpolicy/task_config_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,43 +36,100 @@ var (
errResourceConstraintFailure = errors.New("submitted workload failed a resource constraint")
)

// CheckNTSCConstraints returns true if the NTSC config passes constraint checks.
// CheckNTSCConstraints returns an error if the NTSC config fails constraint checks.
func CheckNTSCConstraints(
ctx context.Context,
workspaceID int,
workloadConfig model.CommandConfig,
resourceManager rm.ResourceManager,
) (bool, error) {
) error {
constraints, err := GetMergedConstraints(ctx, workspaceID, model.NTSCType)
if err != nil {
return false, err
return err
}

if constraints.ResourceConstraints != nil && constraints.ResourceConstraints.MaxSlots != nil {
if err = checkSlotsConstraint(*constraints.ResourceConstraints.MaxSlots, workloadConfig.Resources.Slots,
workloadConfig.Resources.MaxSlots); err != nil {
return err
}
}

// For each submitted constraint, check if the workload config is within allowed values.
// rm.SmallerValueIsHigherPriority only returns an error if task priority is not implemented for that resource manager.
// In that case, there is no need to check if requested priority is within limits.
smallerHigher, err := resourceManager.SmallerValueIsHigherPriority()
if err == nil && constraints.PriorityLimit != nil && workloadConfig.Resources.Priority != nil {
if !priorityWithinLimit(*workloadConfig.Resources.Priority, *constraints.PriorityLimit, smallerHigher) {
return false, fmt.Errorf("requested priority [%d] exceeds limit set by admin [%d]: %w",
*workloadConfig.Resources.Priority, *constraints.PriorityLimit, errPriorityConstraintFailure)
if err == nil {
if err = checkPriorityConstraint(smallerHigher, constraints.PriorityLimit,
workloadConfig.Resources.Priority); err != nil {
return err
}
}

return nil
}

// CheckExperimentConstraints returns an error if the NTSC config fails constraint checks.
func CheckExperimentConstraints(
ctx context.Context,
workspaceID int,
workloadConfig expconf.ExperimentConfigV0,
resourceManager rm.ResourceManager,
) error {
constraints, err := GetMergedConstraints(ctx, workspaceID, model.ExperimentType)
if err != nil {
return err
}

if constraints.ResourceConstraints != nil && constraints.ResourceConstraints.MaxSlots != nil {
if workloadConfig.Resources.MaxSlots != nil {
if *constraints.ResourceConstraints.MaxSlots < *workloadConfig.Resources.MaxSlots {
return false, fmt.Errorf("requested resources.max_slots [%d] exceeds limit set by admin [%d]: %w",
*workloadConfig.Resources.MaxSlots, *constraints.ResourceConstraints.MaxSlots, errResourceConstraintFailure)
}
// users cannot specify number of slots for an experiment
slotsRequest := 0
if err = checkSlotsConstraint(*constraints.ResourceConstraints.MaxSlots, slotsRequest,
workloadConfig.Resources().MaxSlots()); err != nil {
return err
}
if *constraints.ResourceConstraints.MaxSlots < workloadConfig.Resources.Slots {
return false, fmt.Errorf("requested resources.slots [%d] exceeds limit set by admin [%d]: %w",
workloadConfig.Resources.Slots, *constraints.ResourceConstraints.MaxSlots, errResourceConstraintFailure)
}

// For each submitted constraint, check if the workload config is within allowed values.
// rm.SmallerValueIsHigherPriority only returns an error if task priority is not implemented for that resource manager.
// In that case, there is no need to check if requested priority is within limits.
smallerHigher, err := resourceManager.SmallerValueIsHigherPriority()
if err == nil {
if err = checkPriorityConstraint(smallerHigher, constraints.PriorityLimit,
workloadConfig.Resources().Priority()); err != nil {
return err
}
}

return true, nil
return nil
}

func checkPriorityConstraint(smallerHigher bool, priorityLimit *int, priorityRequest *int) error {
if priorityLimit == nil || priorityRequest == nil {
return nil
}

if !priorityWithinLimit(*priorityRequest, *priorityLimit, smallerHigher) {
return fmt.Errorf("requested priority [%d] exceeds limit set by admin [%d]: %w",
*priorityRequest, *priorityLimit, errPriorityConstraintFailure)
}
return nil
}

func checkSlotsConstraint(slotsLimit int, slotsRequest int, maxSlotsRequest *int) error {
if slotsLimit < slotsRequest {
return fmt.Errorf("requested resources.slots [%d] exceeds limit set by admin [%d]: %w",
slotsRequest, slotsLimit, errResourceConstraintFailure)
}

if maxSlotsRequest != nil {
if slotsLimit < *maxSlotsRequest {
return fmt.Errorf("requested resources.max_slots [%d] exceeds limit set by admin [%d]: %w",
*maxSlotsRequest, slotsLimit, errResourceConstraintFailure)
}
}

return nil
}

// GetMergedConstraints retrieves Workspace and Global constraints and returns a merged result.
Expand Down
Loading

0 comments on commit 5b1380c

Please sign in to comment.