########## Cluster Creation ##########
# Script for helping set up the AWS ParallelCluster
# This is not designed to be run as a script, but rather to be run line-by-line
# in a terminal. Some parts are not re-entrant, either.
##### Basic AWS Setup #####
# On AWS Console (hedged CLI equivalents for some of these steps are sketched after this list):
# - Create new EC2 key pair "hpc-pcluster" and download the .pem file
# - Create new EBS volume (~1 GB/student, can be expanded later with some work)
# - Register an elastic IP with an hpc-pcluster=true tag
# - Register domain in Route 53
# - Set that domain's DNS to point to the elastic IP
# - Add a CSV to S3 for the user keys (first column is username, second column is public key)
# - Create a tarball with host private and public host keys and upload to S3:
# mkdir -p host-keys
# ssh-keygen -q -N "" -t rsa -b 4096 -f host-keys/ssh_host_rsa_key
# ssh-keygen -q -N "" -t ecdsa -f host-keys/ssh_host_ecdsa_key
# ssh-keygen -q -N "" -t ed25519 -f host-keys/ssh_host_ed25519_key
# cd host-keys && tar -czf ../host-keys.tar.gz * && cd ..
# - Run the CloudFormation stack at https://us-east-1.console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/create/review?stackName=pcluster-slurm-db&templateURL=https://us-east-1-aws-parallelcluster.s3.amazonaws.com/templates/1-click/serverless-database.yaml with the values:
# - Stack name: hpc-pcluster-slurm-db
# - Database cluster name: hpc-slurm-accounting-cluster
# - Sizing: 0.5 to 2
# - VPC: one generated by ParallelCluster below (out of order, I know...)
# - CIDR blocks of 10.0.200.0/24 and 10.0.201.0/24
# FUTURE TODO: don't use redundancy for the database since it is just for accounting and can be rebuilt easily (and will be half the cost)
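# CLI equivalents for some of the console steps above (a sketch, not authoritative: the
# volume size, availability zone, and gp3 type are assumptions -- adjust for your class
# size and region; the Route 53 and CloudFormation steps are easiest in the console):
#aws ec2 create-key-pair --key-name hpc-pcluster --query 'KeyMaterial' --output text > hpc-pcluster.pem && chmod 600 hpc-pcluster.pem
#aws ec2 create-volume --availability-zone us-east-1a --size 30 --volume-type gp3
#aws ec2 allocate-address --domain vpc --tag-specifications 'ResourceType=elastic-ip,Tags=[{Key=hpc-pcluster,Value=true}]'
#aws s3 cp user-keys.csv s3://mu-hpc-pcluster/user-keys.csv
#aws s3 cp host-keys.tar.gz s3://mu-hpc-pcluster/host-keys.tar.gz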
# Update these with any changed values
VENV="$PWD/hpc-aws"
export AWS_PROFILE="bushj"
export AWS_DEFAULT_REGION="us-east-1"
AMI_IMAGE_ID="rocky-88"
DB_CF_NAME="hpc-pcluster-slurm-db"
DOMAIN_NAME="mucluster.com"
USER_KEYS_S3="s3://mu-hpc-pcluster/user-keys.csv"
HOST_KEYS_S3="s3://mu-hpc-pcluster/host-keys.tar.gz"
GRAFANA_CONFIG_S3="s3://mu-hpc-pcluster/grafana.tar.gz"
EBS_VOLUME_ID="vol-02c42f64eace590fa"
GRAFANA_SG_NAME="grafana-sg" # this just needs to be unique to this VPC
CONFIG_FILE="pcluster-config.yaml"
# Allow use of spot instances (only needs to be done once for an entire AWS account)
aws iam create-service-linked-role --aws-service-name spot.amazonaws.com
##### Tool Setup #####
# Install AWS ParallelCluster tools
if ! [ -d "$VENV" ]; then
    python3 -m venv "$VENV"
fi
source "$VENV/bin/activate"
python3 -m pip install --upgrade "aws-parallelcluster"
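# Optional quick sanity check that the pcluster CLI installed into the venv:
#pcluster version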
# Install NVM and NodeJS (LTS version)
if ! [ -e "$HOME/.nvm/nvm.sh" ]; then
    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.38.0/install.sh | bash
    chmod ug+x "$HOME/.nvm/nvm.sh"
fi
# Load nvm into this shell (needed even if it was already installed) and install Node LTS
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"
nvm install --lts
##### Create Config #####
# Explanation of choices in the following:
# OS: rocky8 - free version of RHEL 8 (like what centos8 should be, exactly what Expanse uses)
#   However, no pre-built images of Rocky 8 are available, so we have to build our own
# Head Node: t3a.large - 2 vCPU, 30% baseline + burstable, 8 GB RAM, 0.0752 $/hr
#   Uses an AMD EPYC 7000 series processor similar to Expanse and Bridges-2
# Compute Node: c5ad.2xlarge - 8 vCPU, 16 GB RAM, 300 GB NVMe SSD, 0.344 $/hr (spot price 0.1634 $/hr)
#   Uses an AMD EPYC 7002 series processor exactly like Expanse and Bridges-2
#   NOTE: This is not EFA-compatible so it will be slow for MPI (but all of the EFA-compatible instances are WAY more expensive: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types)
# The "2" option at the end makes all machines public (instead of the 1/default option, which makes compute nodes private)
#   We would like the compute fleet to be in a private subnet, but that would cost $150+ for the semester
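# Optional: check the current spot price for c5ad.2xlarge (the prices quoted above drift
# over time; this query is a sketch and just grabs the most recent data point):
#aws ec2 describe-spot-price-history --instance-types c5ad.2xlarge --product-descriptions "Linux/UNIX" --start-time "$(date -u +%Y-%m-%dT%H:%M:%S)" --query 'SpotPriceHistory[0].SpotPrice' --output text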
if ! [ -e "$CONFIG_FILE" ]; then
pcluster configure --config "$CONFIG_FILE"
# Options:
# us-east-1
# hpc-pcluster
# slurm
# rocky8
# t3a.large
# 1
# compute
# 1
# c5ad.2xlarge
# 24
# y
# us-east-1a
# 2
#---------
# Creates VPC but does not actually launch the cluster itself yet
fi
##### Build Rocky 8 image #####
# Adapted from https://ciq.com/blog/how-to-use-aws-parallelcluster-3-8-0-with-rocky-linux-8/
# Create the rocky-88.yaml file as described there (a rough sketch follows), but we needed to:
# - get the AMI ID from https://rockylinux.org/cloud-images/ (correct region, arch, and not using LVM)
# - add the Image.RootVolume.Size parameter since the build was running out of room (set it to 42 GB, default was ~37 GB)
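# Minimal sketch of rocky-88.yaml under those assumptions (the ParentImage AMI ID is a
# placeholder -- use the current Rocky 8 x86_64 us-east-1 AMI -- and the build instance
# type is just a guess):
#   Build:
#     InstanceType: c5.2xlarge
#     ParentImage: ami-xxxxxxxxxxxxxxxxx
#   Image:
#     RootVolume:
#       Size: 42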
pcluster build-image --image-id "$AMI_IMAGE_ID" --image-configuration rocky-88.yaml
# Takes about an hour to build... check progress with:
#pcluster describe-image --image-id "$AMI_IMAGE_ID"
#pcluster list-images --image-status PENDING
# If the build fails, its CloudFormation stack needs to be deleted (pcluster delete-image --image-id "$AMI_IMAGE_ID") before retrying
##### Update Config #####
# yq and jq are needed below to edit the config; install them via Homebrew if missing
if ! which yq >/dev/null 2>&1; then brew install yq; fi
if ! which jq >/dev/null 2>&1; then brew install jq; fi
# Note: AMI_ID is surrounded by quotes (pcluster does not support the --output text option)
AMI_ID="$(pcluster describe-image --image-id "$AMI_IMAGE_ID" --query 'ec2AmiInfo.amiId')"
#AMI_ID="$(pcluster list-images --image-status AVAILABLE --query "images[?imageId=='$AMI_IMAGE_ID'].ec2AmiInfo.amiId")"
yq -i '.Image.CustomAmi = '"$AMI_ID" "$CONFIG_FILE"
# Set IP address
ELASTIC_IPS="$(aws ec2 describe-addresses --filters "Name=tag:hpc-pcluster,Values=true" "Name=domain,Values=vpc" --query "Addresses[?NetworkInterfaceId == null].PublicIp")"
if [ "$ELASTIC_IPS" = "[]" ]; then echo "!!! No elastic IPs available !!!";
elif [ "$(jq length <<<"$ELASTIC_IPS")" -gt 1 ]; then
# FUTURE TODO
echo "!!! Multiple elastic IPs available !!!";
ELASTIC_IP="$(jq -r '.[0]' <<<"$ELASTIC_IPS")"
yq -i '.HeadNode.Networking.ElasticIp = "'"$ELASTIC_IP"'"' "$CONFIG_FILE"
else
ELASTIC_IP="$(jq -r '.[0]' <<<"$ELASTIC_IPS")"
yq -i '.HeadNode.Networking.ElasticIp = "'"$ELASTIC_IP"'"' "$CONFIG_FILE"
fi
# Add persistent EBS volume for /home
yq -i '.SharedStorage += [{
    "MountDir": "/home",
    "Name": "home",
    "StorageType": "Ebs",
    "EbsSettings": { "VolumeId": "'"$EBS_VOLUME_ID"'" }
}]' "$CONFIG_FILE"
# Add initialization scripts, fetched from this repository's main branch (assumes the origin remote points at GitHub)
REPO="$(git remote get-url origin | sed -E -e 's~^(git@[^:]+:|https?://[^/]+/)([[:graph:]]*)\.git~\2~')"
REPO_URL="https://raw.githubusercontent.com/$REPO/main"
HN_SETUP_SCRIPT="$REPO_URL/head-node-setup.sh"
yq -i '.HeadNode.CustomActions.OnNodeStart.Sequence += [{"Script":"'"$HN_SETUP_SCRIPT"'","Args":["'"$DOMAIN_NAME"'","'"$USER_KEYS_S3"'","'"$HOST_KEYS_S3"'"]}]' "$CONFIG_FILE"
# Grant the head node's IAM role (read-only) access to a single s3://bucket/key object
function add_s3_access() {
    S3_BUCKET="$(sed -E -e "s~^s3://([^/]*)/(.*)$~\1~" <<< "$1")"
    S3_KEY="$(sed -E -e "s~^s3://([^/]*)/(.*)$~\2~" <<< "$1")"
    yq -i '.HeadNode.Iam.S3Access += [{"BucketName":"'"$S3_BUCKET"'","KeyName":"'"$S3_KEY"'"}]' "$CONFIG_FILE"
}
add_s3_access "$USER_KEYS_S3"
add_s3_access "$HOST_KEYS_S3"
add_s3_access "$GRAFANA_CONFIG_S3"
CN_SETUP_SCRIPT="$REPO_URL/compute-node-setup.sh"
yq -i '.Scheduling.SlurmQueues[0].CustomActions.OnNodeStart.Sequence += [{"Script":"'"$CN_SETUP_SCRIPT"'"}]' "$CONFIG_FILE"
# Add custom prolog/epilog scripts
HN_CONFIG_SCRIPT="$REPO_URL/head-node-config.sh"
PROLOG="$REPO_URL/50_hpc_cluster_slurm_prolog"
EPILOG="$REPO_URL/50_hpc_cluster_slurm_epilog"
yq -i '.HeadNode.CustomActions.OnNodeConfigured.Sequence += [{"Script":"'"$HN_CONFIG_SCRIPT"'","Args":["'"$PROLOG"'","'"$EPILOG"'"]}]' "$CONFIG_FILE"
# Deep-merge all other configuration changes from pcluster-config-extras.yaml
yq -i '. *d load("pcluster-config-extras.yaml")' "$CONFIG_FILE"
# Integrate Accounting
DB_URI="$(aws cloudformation describe-stacks --stack-name "$DB_CF_NAME" --query "Stacks[0].Outputs[?OutputKey=='DatabaseHost'].OutputValue" --output text)"
DB_PORT="$(aws cloudformation describe-stacks --stack-name "$DB_CF_NAME" --query "Stacks[0].Outputs[?OutputKey=='DatabasePort'].OutputValue" --output text)"
DB_USERNAME="$(aws cloudformation describe-stacks --stack-name "$DB_CF_NAME" --query "Stacks[0].Outputs[?OutputKey=='DatabaseAdminUser'].OutputValue" --output text)"
DB_SECRET_ARN="$(aws cloudformation describe-stacks --stack-name "$DB_CF_NAME" --query "Stacks[0].Outputs[?OutputKey=='DatabaseSecretArn'].OutputValue" --output text)"
DB_SEC_GROUP="$(aws cloudformation describe-stacks --stack-name "$DB_CF_NAME" --query "Stacks[0].Outputs[?OutputKey=='DatabaseClientSecurityGroup'].OutputValue" --output text)"
yq -i '.HeadNode.Networking.AdditionalSecurityGroups += ["'"$DB_SEC_GROUP"'"]' "$CONFIG_FILE"
yq -i '.Scheduling.SlurmSettings.Database.Uri = "'"$DB_URI:$DB_PORT"'"' "$CONFIG_FILE"
yq -i '.Scheduling.SlurmSettings.Database.UserName = "'"$DB_USERNAME"'"' "$CONFIG_FILE"
yq -i '.Scheduling.SlurmSettings.Database.PasswordSecretArn = "'"$DB_SECRET_ARN"'"' "$CONFIG_FILE"
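# Optional: dump all of the stack outputs at once to double-check the output key names used above:
#aws cloudformation describe-stacks --stack-name "$DB_CF_NAME" --query "Stacks[0].Outputs" --output table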
##### Setup Grafana #####
SUBNET_ID="$(yq ".HeadNode.Networking.SubnetId" "$CONFIG_FILE")"
VPC_ID="$(aws ec2 describe-subnets --subnet-ids "$SUBNET_ID" --query "Subnets[0].VpcId" --output text)"
GF_SEC_GROUP="$(aws ec2 create-security-group --group-name "$GRAFANA_SG_NAME" --description "Open HTTP/HTTPS ports" --vpc-id "$VPC_ID" --output text 2>/dev/null)"
if [ -n "$GF_SEC_GROUP" ]; then
    # newly created security group: open HTTP/HTTPS to the world
    aws ec2 authorize-security-group-ingress --group-id "$GF_SEC_GROUP" --protocol tcp --port 443 --cidr 0.0.0.0/0
    aws ec2 authorize-security-group-ingress --group-id "$GF_SEC_GROUP" --protocol tcp --port 80 --cidr 0.0.0.0/0
else
    # the security group already exists - look up its ID
    GF_SEC_GROUP="$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$GRAFANA_SG_NAME" "Name=vpc-id,Values=$VPC_ID" --query "SecurityGroups[0].GroupId" --output text)"
fi
yq -i '.HeadNode.Networking.AdditionalSecurityGroups += ["'"$GF_SEC_GROUP"'"]' "$CONFIG_FILE"
yq -i '.Tags += [{"Key":"Grafana","Value":"true"}]' "$CONFIG_FILE"
##### Create the cluster #####
pcluster create-cluster --cluster-name hpc-cluster --cluster-configuration "$CONFIG_FILE"
echo
echo "Cluster creation started. Visit CloudFormation console to monitor progress."
# Other commands:
# pcluster ssh --region us-east-1 --cluster-name hpc-cluster -i hpc-pcluster.pem
# pcluster describe-cluster --cluster-name hpc-cluster
# pcluster update-compute-fleet --cluster-name hpc-cluster --status STOP_REQUESTED
# pcluster update-cluster --cluster-name hpc-cluster --cluster-configuration "$CONFIG_FILE"
# pcluster update-compute-fleet --cluster-name hpc-cluster --status START_REQUESTED
# pcluster delete-cluster --cluster-name hpc-cluster
# Note: it takes about 5 minutes to boot an instance, so it is good to keep one instance always running
# TODO:
# job numbering after rebuild