Skip to content

Commit

Permalink
Merge pull request #128 from OneideLuizSchneider/feature/new-ami-rele…
Browse files Browse the repository at this point in the history
…ase-09-22-24-64e4a2e1bd0c673c75908a455f708ba21fefd7ae

New AMI version for EKS - Auto-PR
  • Loading branch information
OneideLuizSchneider authored Sep 22, 2024
2 parents 64e4a2e + 7b1e7d6 commit 3ae9132
Show file tree
Hide file tree
Showing 13 changed files with 1,742 additions and 41 deletions.
1,596 changes: 1,596 additions & 0 deletions amazon-eks-ami/CHANGELOG.md

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions amazon-eks-ami/doc/usage/al2023.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
<!-- template-variable-table-boundary -->
| Variable | Description |
| - | - |
| `enable_accelerator` | Vendor that provides the GPU or accelerator hardware. Currently we support Neuron and NVIDIA. |
| `ami_component_description` | |
| `ami_description` | |
| `ami_name` | |
Expand All @@ -21,15 +20,17 @@
| `binary_bucket_region` | |
| `containerd_version` | |
| `creator` | |
| `enable_accelerator` | Vendor that provides the GPU or accelerator hardware. Currently we support Neuron and NVIDIA. |
| `enable_efa` | Valid options are ```true``` or ```false```. Whether or not to install the software needed to use AWS Elastic Fabric Adapter (EFA) network interfaces. |
| `enable_fips` | Install openssl and enable fips related kernel parameters |
| `encrypted` | |
| `iam_instance_profile` | The name of an IAM instance profile to launch the EC2 instance with. |
| `enable_efa` | Valid options are ```true``` or ```false```. Whether or not to install the software needed to use AWS Elastic Fabric Adapter (EFA) network interfaces. |
| `instance_type` | |
| `kms_key_id` | |
| `kubernetes_build_date` | |
| `kubernetes_version` | |
| `launch_block_device_mappings_volume_size` | |
| `nodeadm_build_image` | Image to use as a build environment for nodeadm |
| `nvidia_driver_major_version` | To be used only when ```enable_accelerator = nvidia```. Driver version to install, depends on what is available in NVIDIA repository. |
| `remote_folder` | Directory path for shell provisioner scripts on the builder instance |
| `runc_version` | |
Expand Down
3 changes: 1 addition & 2 deletions amazon-eks-ami/nodeadm/internal/aws/ecr/ecr.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"go.uber.org/zap"
"net"
"strings"
"time"

"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/ecr"
Expand All @@ -23,7 +22,7 @@ func GetAuthorizationToken(awsRegion string) (string, error) {
}
ecrClient := ecr.NewFromConfig(awsConfig)
var token *ecr.GetAuthorizationTokenOutput
err = util.RetryExponentialBackoff(3, 2*time.Second, func() error {
err = util.NewRetrier(util.WithBackoffExponential()).Retry(context.TODO(), func() error {
token, err = ecrClient.GetAuthorizationToken(context.Background(), &ecr.GetAuthorizationTokenInput{})
return err
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func (m *instanceTypeMixin) matches(instanceType string) bool {

var (
// TODO: fetch this list dynamically
nvidiaInstances = []string{"p3", "p3dn", "p4d", "p4de", "p5", "g4", "g4dn", "g5", "g6", "g6e"}
nvidiaInstances = []string{"p3", "p3dn", "p4d", "p4de", "p5", "p5e", "g4", "g4dn", "g5", "g6", "g6e"}
NvidiaInstanceTypeMixin = instanceTypeMixin{
instanceFamilies: nvidiaInstances,
apply: applyNvidia,
Expand Down
3 changes: 2 additions & 1 deletion amazon-eks-ami/nodeadm/internal/containerd/sandbox.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package containerd

import (
"context"
"fmt"
"os/exec"
"regexp"
Expand Down Expand Up @@ -44,7 +45,7 @@ func cacheSandboxImage(cfg *api.NodeConfig) error {
imageSpec := &v1.ImageSpec{Image: sandboxImage}
authConfig := &v1.AuthConfig{Auth: ecrUserToken}

return util.RetryExponentialBackoff(3, 2*time.Second, func() error {
return util.NewRetrier(util.WithBackoffExponential()).Retry(context.TODO(), func() error {
zap.L().Info("Pulling sandbox image..", zap.String("image", sandboxImage))
imageRef, err := client.PullImage(imageSpec, authConfig, nil)
if err != nil {
Expand Down
15 changes: 8 additions & 7 deletions amazon-eks-ami/nodeadm/internal/daemon/interface.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
package daemon

import "github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
import (
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
)

type Daemon interface {
// Configure configures the daemon.
Configure(*api.NodeConfig) error

// EnsureRunning ensures that the daemon is running.
// If the daemon is not running, it will be started.
// If the daemon is already running, and has been re-configured, it will be restarted.
// EnsureRunning ensures that the daemon is running by starting or
// restarting the daemon as needed, then blocking until the status of the
// daemon reflects that it is running.
// * If the daemon is not running, it will be started.
// * If the daemon is already running, and has been re-configured, it will be restarted.
EnsureRunning() error

// PostLaunch runs any additional step that needs to occur after the service
// daemon has been started
PostLaunch(*api.NodeConfig) error

// Name returns the name of the daemon.
Name() string
}
41 changes: 31 additions & 10 deletions amazon-eks-ami/nodeadm/internal/daemon/systemd.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package daemon
import (
"context"
"fmt"
"time"

"github.com/awslabs/amazon-eks-ami/nodeadm/internal/util"
"github.com/coreos/go-systemd/v22/dbus"
)

Expand All @@ -32,21 +34,24 @@ func NewDaemonManager() (DaemonManager, error) {
}

func (m *systemdDaemonManager) StartDaemon(name string) error {
unitName := getServiceUnitName(name)
_, err := m.conn.StartUnitContext(context.TODO(), unitName, ModeReplace, nil)
return err
if _, err := m.conn.StartUnitContext(context.TODO(), getServiceUnitName(name), ModeReplace, nil); err != nil {
return err
}
return m.waitForStatus(context.TODO(), name, DaemonStatusRunning)
}

func (m *systemdDaemonManager) StopDaemon(name string) error {
unitName := getServiceUnitName(name)
_, err := m.conn.StopUnitContext(context.TODO(), unitName, ModeReplace, nil)
return err
if _, err := m.conn.StopUnitContext(context.TODO(), getServiceUnitName(name), ModeReplace, nil); err != nil {
return err
}
return m.waitForStatus(context.TODO(), name, DaemonStatusStopped)
}

func (m *systemdDaemonManager) RestartDaemon(name string) error {
unitName := getServiceUnitName(name)
_, err := m.conn.RestartUnitContext(context.TODO(), unitName, ModeReplace, nil)
return err
if _, err := m.conn.RestartUnitContext(context.TODO(), getServiceUnitName(name), ModeReplace, nil); err != nil {
return err
}
return m.waitForStatus(context.TODO(), name, DaemonStatusRunning)
}

func (m *systemdDaemonManager) GetDaemonStatus(name string) (DaemonStatus, error) {
Expand All @@ -55,7 +60,7 @@ func (m *systemdDaemonManager) GetDaemonStatus(name string) (DaemonStatus, error
if err != nil {
return DaemonStatusUnknown, err
}
switch status.Value.String() {
switch status.Value.Value().(string) {
case "active":
return DaemonStatusRunning, nil
case "inactive":
Expand Down Expand Up @@ -102,3 +107,19 @@ func (m *systemdDaemonManager) Close() {
// getServiceUnitName maps a bare daemon name to the name of its systemd
// service unit by appending the ".service" suffix.
func getServiceUnitName(name string) string {
	unit := fmt.Sprintf("%s.service", name)
	return unit
}

// waitForStatus polls the status of the named daemon until it equals
// targetStatus.
//
// The poll is driven by a retrier configured to retry always with a fixed
// 250ms backoff, so there is no attempt limit here: the call returns nil once
// the target status is observed, and otherwise returns whatever the retrier
// returns when it stops. A failed status lookup is treated like any other
// failed attempt and is retried.
func (m *systemdDaemonManager) waitForStatus(ctx context.Context, name string, targetStatus DaemonStatus) error {
	// A single poll: succeed only when the daemon reports the wanted status.
	poll := func() error {
		current, err := m.GetDaemonStatus(name)
		switch {
		case err != nil:
			return err
		case current != targetStatus:
			return fmt.Errorf("%s status is not %q", name, targetStatus)
		}
		return nil
	}

	retrier := util.NewRetrier(
		util.WithRetryAlways(),
		util.WithBackoffFixed(250*time.Millisecond),
	)
	return retrier.Retry(ctx, poll)
}
12 changes: 12 additions & 0 deletions amazon-eks-ami/nodeadm/internal/kubelet/eni-max-pods.txt
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,18 @@ x2iezn.4xlarge 234
x2iezn.6xlarge 234
x2iezn.8xlarge 234
x2iezn.metal 737
x8g.12xlarge 234
x8g.16xlarge 737
x8g.24xlarge 737
x8g.2xlarge 58
x8g.48xlarge 737
x8g.4xlarge 234
x8g.8xlarge 234
x8g.large 29
x8g.medium 8
x8g.metal-24xl 737
x8g.metal-48xl 737
x8g.xlarge 58
z1d.12xlarge 737
z1d.2xlarge 58
z1d.3xlarge 234
Expand Down
79 changes: 68 additions & 11 deletions amazon-eks-ami/nodeadm/internal/util/retry.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,73 @@
package util

import "time"

func RetryExponentialBackoff(attempts int, initial time.Duration, f func() error) error {
var err error
wait := initial
for i := 0; i < attempts; i++ {
if err = f(); err == nil {
return nil
import (
"context"
"time"
)

// Retrier bundles the policy (when to keep going, how long to wait) and the
// running state of a retry loop.
type Retrier struct {
	// ConditionFn is evaluated before every attempt; attempts continue only
	// while it returns true.
	ConditionFn func(*Retrier) bool
	// BackoffFn returns the wait to store in LastWait after a failed attempt
	// has been slept off.
	BackoffFn func(*Retrier) time.Duration

	// LastErr is the error produced by the most recent attempt.
	LastErr error
	// LastWait is the duration slept after a failed attempt.
	LastWait time.Duration
	// LastIter is the zero-based index of the current attempt.
	LastIter int
}

func (r *Retrier) Retry(ctx context.Context, fn func() error) error {
for r.LastIter = 0; r.ConditionFn(r); r.LastIter++ {
if r.LastErr = fn(); r.LastErr == nil {
return r.LastErr
}
select {
case <-ctx.Done():
return ctx.Err()
default:
time.Sleep(r.LastWait)
r.LastWait = r.BackoffFn(r)
}
time.Sleep(wait)
wait *= 2
}
return err
return r.LastErr
}

// fnOpt is a functional option: a function that mutates a Retrier in place.
type fnOpt func(*Retrier)

// NewRetrier builds a Retrier with an initial wait of one second, a limit of
// five attempts and exponential backoff, then applies fnOpts in the order
// given. Because the defaults are applied first, any option supplied by the
// caller overrides them.
func NewRetrier(fnOpts ...fnOpt) *Retrier {
	r := &Retrier{LastWait: time.Second}

	// Defaults first, so that caller-supplied options win.
	WithRetryCount(5)(r)
	WithBackoffExponential()(r)

	for _, opt := range fnOpts {
		opt(r)
	}
	return r
}

// WithRetryCount returns an option that installs a ConditionFn allowing
// attempts only while the retrier's LastIter is below maxAttempts.
func WithRetryCount(maxAttempts int) fnOpt {
	underLimit := func(state *Retrier) bool {
		return state.LastIter < maxAttempts
	}
	return func(r *Retrier) {
		r.ConditionFn = underLimit
	}
}

// WithRetryAlways returns an option that installs a ConditionFn which is
// unconditionally true, i.e. the retrier itself imposes no attempt limit.
func WithRetryAlways() fnOpt {
	always := func(*Retrier) bool { return true }
	return func(r *Retrier) {
		r.ConditionFn = always
	}
}

// WithBackoffFixed returns an option that makes every wait equal to interval:
// it sets the retrier's LastWait to interval and installs a BackoffFn that
// always returns interval.
func WithBackoffFixed(interval time.Duration) fnOpt {
	constant := func(*Retrier) time.Duration { return interval }
	return func(r *Retrier) {
		r.BackoffFn = constant
		r.LastWait = interval
	}
}

// WithBackoffExponential returns an option that installs a BackoffFn which
// doubles the retrier's current LastWait. It does not modify LastWait itself.
func WithBackoffExponential() fnOpt {
	double := func(state *Retrier) time.Duration {
		return 2 * state.LastWait
	}
	return func(r *Retrier) {
		r.BackoffFn = double
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ set -o pipefail
set -o nounset
set -o errexit

BUILD_IMAGE=public.ecr.aws/eks-distro-build-tooling/golang:1.22

sudo systemctl start containerd

sudo nerdctl run \
Expand Down
6 changes: 4 additions & 2 deletions amazon-eks-ami/templates/al2023/template.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"_comment": "All template variables are enumerated here; and most variables have a default value defined in a variables.json for each kubernetes minor version",
"variables": {
"enable_accelerator": null,
"ami_component_description": null,
"ami_description": null,
"ami_name": null,
Expand All @@ -17,15 +16,17 @@
"binary_bucket_region": null,
"containerd_version": null,
"creator": null,
"enable_accelerator": null,
"enable_efa": null,
"enable_fips": null,
"encrypted": null,
"iam_instance_profile": null,
"enable_efa": null,
"instance_type": null,
"kms_key_id": null,
"kubernetes_build_date": null,
"kubernetes_version": null,
"launch_block_device_mappings_volume_size": null,
"nodeadm_build_image": null,
"nvidia_driver_major_version": null,
"remote_folder": null,
"runc_version": null,
Expand Down Expand Up @@ -211,6 +212,7 @@
"remote_folder": "{{ user `remote_folder`}}",
"script": "{{template_dir}}/provisioners/install-nodeadm.sh",
"environment_vars": [
"BUILD_IMAGE={{user `nodeadm_build_image`}}",
"PROJECT_DIR={{user `working_dir`}}/nodeadm"
]
},
Expand Down
7 changes: 4 additions & 3 deletions amazon-eks-ami/templates/al2023/variables-default.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
{
"enable_accelerator": "",
"ami_component_description": "(k8s: {{ user `kubernetes_version` }}, containerd: {{ user `containerd_version` }})",
"ami_description": "EKS-optimized Kubernetes node based on Amazon Linux 2023",
"ami_regions": "",
Expand All @@ -13,12 +12,14 @@
"binary_bucket_region": "us-west-2",
"containerd_version": "1.7.11-*",
"creator": "{{env `USER`}}",
"enable_accelerator": "",
"enable_efa": "false",
"enable_fips": "false",
"encrypted": "false",
"kms_key_id": "",
"iam_instance_profile": "",
"enable_efa": "false",
"kms_key_id": "",
"launch_block_device_mappings_volume_size": "20",
"nodeadm_build_image": "public.ecr.aws/eks-distro-build-tooling/golang:1.23",
"nvidia_driver_major_version": "555",
"remote_folder": "/tmp",
"runc_version": "*",
Expand Down
12 changes: 12 additions & 0 deletions amazon-eks-ami/templates/shared/runtime/eni-max-pods.txt
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,18 @@ x2iezn.4xlarge 234
x2iezn.6xlarge 234
x2iezn.8xlarge 234
x2iezn.metal 737
x8g.12xlarge 234
x8g.16xlarge 737
x8g.24xlarge 737
x8g.2xlarge 58
x8g.48xlarge 737
x8g.4xlarge 234
x8g.8xlarge 234
x8g.large 29
x8g.medium 8
x8g.metal-24xl 737
x8g.metal-48xl 737
x8g.xlarge 58
z1d.12xlarge 737
z1d.2xlarge 58
z1d.3xlarge 234
Expand Down

0 comments on commit 3ae9132

Please sign in to comment.