From 7e14cdd830882788a84e23bbdf88bd2d2411b70c Mon Sep 17 00:00:00 2001 From: Phillip Dykman Date: Wed, 6 Dec 2023 17:28:15 -0800 Subject: [PATCH] Add notifications and refactor pid tracking --- Dockerfile | 10 +++++++- docker-compose.yaml | 2 +- scripts/provision_a1111.sh | 2 +- scripts/provision_invokeai.sh | 2 +- scripts/provision_kohya.sh | 2 +- scripts/send-notification.sh | 20 ++++++++++++++++ scripts/start.sh | 18 ++++---------- scripts/start_a1111.sh | 2 ++ scripts/start_invokeai.sh | 2 ++ scripts/start_kohya.sh | 3 +++ scripts/start_vs_server.sh | 2 ++ scripts/track-training-pids.sh | 43 ++++++++++++++++++++++++++++++++++ 12 files changed, 89 insertions(+), 19 deletions(-) create mode 100755 scripts/send-notification.sh create mode 100755 scripts/track-training-pids.sh diff --git a/Dockerfile b/Dockerfile index 984bf42..2f92bc8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,6 +79,7 @@ RUN curl -s https://packagecloud.io/install/repositories/ookla/speedtest-cli/scr # Remove existing SSH host keys RUN rm -f /etc/ssh/ssh_host_* +RUN mkdir -p /app/{pid,config,scripts} WORKDIR / @@ -106,7 +107,14 @@ ENV DISABLE_MODEL_DOWNLOAD=false ENV DISABLE_TRAINING_ASSET_DOWNLOAD=false ENV SHUTDOWN_AFTER_PROVISION=false ENV DISABLE_AUTOLAUNCH=false -ENV POD_TERMINATION_PID_PATTERN=train +ENV PUSHBULLET_API_TOKEN="" +ENV NOTIFY_ON_TRAINING_END=false +ENV SHUTDOWN_AFTER_TRAINING=false + +# Matches most kohya training scripts, +# eg sdxl_train_network.py, sdxl_train.py, train_network.py, etc +ENV TRAINING_PID_PATTERN="train.*\.py" + VOLUME [ "/workspace" ] EXPOSE 3000 3010 6006 8080 9090 diff --git a/docker-compose.yaml b/docker-compose.yaml index bb84990..3732793 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,7 +1,7 @@ version: "3.9" services: sd-ultimate-test: - image: sd-ultimate-test + image: h3mul/sd-ultimate:latest environment: PUBLIC_KEY: ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABgQC+siaAQHJUe14o+RntWjbpxsTlklxY2pEUSrKKuGQGgBdWD/DTdnkIbD3DjfmNy7BQSuVp/JGaqyoGAQYOzhJec766omEQpZezXC752NNKh7+Q9OZ7EL+aKuvgR8rM1PbXsEW+l7xcqB0/nZccLevpfnEIKvQpz0RhorGfGU6Gx9V8N8zI7icST9TUs/kHYoTowL6ZPMxEe54+y852Y073spemlD9BTXO7ZX/hUn0ZMA5cxeNcfK5O6zOmq+wVSie34qH6ee0AZ9ojrDqaa+9CXjyp69bY96wu+8OibfRsK73lDp2S255m5+SUFPvUh24RgDMbmyaMTtaS4XCbOMSV6IitvV++lUx7CFA5SWx3GybnK6TVT72eVCELDv0j5FP3sKHhJJ4s+AiHRNH76vVB2Ymhenp79JDy4l6w4sHIE/steow5c+Xjsrq2/YZkyw1efRjD0Gv+D5bBHHX7/8jpnNgrEPaTrnA32KKH/v/rAVcgcKhLMqiFUvEU4tpZaSM= hemul@Kaden VS_SERVER_PASSWORD: testpass diff --git a/scripts/provision_a1111.sh b/scripts/provision_a1111.sh index 9c72e1e..96c40e1 100644 --- a/scripts/provision_a1111.sh +++ b/scripts/provision_a1111.sh @@ -6,7 +6,7 @@ set -eu cd ${A1111_ROOT} -git fetch --tags +git fetch --tags > /dev/null git checkout ${A1111_VERSION} if [ -f install_complete ]; then diff --git a/scripts/provision_invokeai.sh b/scripts/provision_invokeai.sh index 69bc6a8..9ccbf1e 100644 --- a/scripts/provision_invokeai.sh +++ b/scripts/provision_invokeai.sh @@ -5,7 +5,7 @@ set -eu cd ${INVOKEAI_ROOT} -git fetch --tags +git fetch --tags > /dev/null git checkout ${INVOKEAI_VERSION} if [ -f install_complete ]; then diff --git a/scripts/provision_kohya.sh b/scripts/provision_kohya.sh index c654551..e2f11fb 100644 --- a/scripts/provision_kohya.sh +++ b/scripts/provision_kohya.sh @@ -6,7 +6,7 @@ set -eu cd ${KOHYA_ROOT} -git fetch --tags +git fetch --tags > /dev/null git checkout ${KOHYA_VERSION} if [ -f install_complete ]; then
diff --git a/scripts/send-notification.sh b/scripts/send-notification.sh
new file mode 100755
index 0000000..166f2ca
--- /dev/null
+++ b/scripts/send-notification.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Send a Pushbullet "note" push titled "sd-ultimate". Usage: send-notification.sh MESSAGE
+# Reads the API token from $PUSHBULLET_API_TOKEN; exits 1 if the message or the token is missing.
+set -eu
+
+MESSAGE="${1-}"
+if [ -z "${MESSAGE}" ]; then
+  echo "No message provided for notification, quitting early..." >&2
+  exit 1
+fi
+# ${VAR-} keeps `set -u` from aborting with "unbound variable" before the readable error below.
+if [ -z "${PUSHBULLET_API_TOKEN-}" ]; then
+  echo "No pushbullet token provided via \$PUSHBULLET_API_TOKEN variable, quitting early..." >&2
+  exit 1
+fi
+# Escape backslashes first, then double quotes, so they cannot break the JSON body built below.
+MESSAGE="${MESSAGE//\\/\\\\}"; MESSAGE="${MESSAGE//\"/\\\"}"
+curl -u "${PUSHBULLET_API_TOKEN}:" -X POST https://api.pushbullet.com/v2/pushes \
+  --header 'Content-Type: application/json' \
+  --data-binary "{\"type\":\"note\",\"title\":\"sd-ultimate\",\"body\":\"${MESSAGE}\"}"
diff --git a/scripts/start.sh b/scripts/start.sh index 4863158..4733807 100644 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -5,20 +5,11 @@ set -e # Exit the script if any statement returns a non-true return value # Function Definitions # # ---------------------------------------------------------------------------- # -# Execute script if exists -execute_script() { - local script_path=$1 - local script_msg=$2 - if [[ -f ${script_path} ]]; then - echo "${script_msg}" - bash ${script_path} - fi -} - # Setup ssh setup_ssh() { if [[ $PUBLIC_KEY ]]; then echo "Setting up SSH..." + env >> /etc/environment mkdir -p ~/.ssh echo -e "${PUBLIC_KEY}\n" >> ~/.ssh/authorized_keys chmod 700 -R ~/.ssh @@ -69,8 +60,10 @@ start_vsserver () { # Main Program # # ---------------------------------------------------------------------------- # -execute_script "/pre_start.sh" "Running pre-start script..." +setup_ssh +echo "Running pre-start script..." +pre_start.sh | tee /workspace/logs/pre_start.log if [ "${SHUTDOWN_AFTER_PROVISION}" = true ]; then echo "Provisioning complete, shutting down..." @@ -87,12 +80,9 @@ if [ "${DISABLE_AUTOLAUNCH}" != true ]; then start_invokeai.sh fi -setup_ssh start_vsserver export_env_vars -execute_script "post_start.sh" "Running post-start script..." - echo "Container is READY!"
tail -F /workspace/logs/* diff --git a/scripts/start_a1111.sh b/scripts/start_a1111.sh index fd3d810..f2776fd 100755 --- a/scripts/start_a1111.sh +++ b/scripts/start_a1111.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +set -eu + echo "Starting Stable Diffusion Web UI" cd ${A1111_ROOT} source ./venv/bin/activate diff --git a/scripts/start_invokeai.sh b/scripts/start_invokeai.sh index b09bea1..0986421 100755 --- a/scripts/start_invokeai.sh +++ b/scripts/start_invokeai.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +set -eu + echo "Starting InvokeAI..." source ${INVOKEAI_ROOT}/venv/bin/activate diff --git a/scripts/start_kohya.sh b/scripts/start_kohya.sh index 6709745..0465a26 100755 --- a/scripts/start_kohya.sh +++ b/scripts/start_kohya.sh @@ -1,3 +1,6 @@ +#!/usr/bin/env bash +set -eu + echo "Starting Kohya_ss Web UI" cd ${KOHYA_ROOT} nohup ./gui.sh --listen 0.0.0.0 --server_port 3010 --headless >> /workspace/logs/kohya_ss.log 2>&1 & diff --git a/scripts/start_vs_server.sh b/scripts/start_vs_server.sh index bb9a19b..995cfe0 100755 --- a/scripts/start_vs_server.sh +++ b/scripts/start_vs_server.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +set -eu + echo "Starting VS Server..." PASSWORD=${VS_SERVER_PASSWORD} \ nohup code-server \
diff --git a/scripts/track-training-pids.sh b/scripts/track-training-pids.sh
new file mode 100755
index 0000000..46705df
--- /dev/null
+++ b/scripts/track-training-pids.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# This script runs in the background and tracks common long-running tasks
+# which we might want to quit and/or notify the completion of.
+# Reads TRAINING_PID_PATTERN, NOTIFY_ON_TRAINING_END, SHUTDOWN_AFTER_TRAINING and RUNPOD_POD_ID.
+set -eu
+
+if [ -f /app/pid/track-training-pids.pid ];then
+  echo "Killing running instance of this script..."
+  # The recorded PID may no longer exist; a failed kill must not abort startup under `set -e`.
+  rkill "$(cat /app/pid/track-training-pids.pid)" || true
+fi
+echo $$ > /app/pid/track-training-pids.pid
+mkdir -p /app/pid/training-pids
+
+while true; do
+  # 1. locate new processes that match supported tasks (eg, kohya training scripts)
+  ps aux | grep "${TRAINING_PID_PATTERN}" | grep -v grep | while read -r line; do
+    # 2. extract PID and key info (eg model name)
+    pid=$(echo "${line}" | awk '{print $2}')
+    # dedupe by kohya output names; keep only the first match so the file name stays a single line
+    model_name=$(echo "${line}" | grep -Po "output_name=\K\S+" | head -n 1)
+    [ -z "${model_name}" ] && continue
+    # 3. Add a file to /app/pid with PID and info string
+    pid_file="/app/pid/training-pids/${model_name}.pid"
+    [ -f "${pid_file}" ] || (echo "${pid}" > "${pid_file}")
+  done
+
+  # 4. Check all currently tracked pids to see if any exited, if so quit and/or notify
+  for pid_file in /app/pid/training-pids/*.pid; do
+    # With nothing tracked the glob stays literal; skip it, otherwise `cat` fails and `set -e` ends the script.
+    [ -f "${pid_file}" ] || continue
+    model_name=$(basename "${pid_file}" .pid)
+    pid=$(cat "${pid_file}")
+    if ! ps -p "${pid}" > /dev/null; then
+      echo "Model ${model_name} finished training (PID no longer running)."
+      rm "${pid_file}"
+      # A failed notification must not stop the tracker before the optional shutdown below.
+      [ "${NOTIFY_ON_TRAINING_END}" = true ] && { send-notification.sh "Model ${model_name} finished training." || echo "Notification failed." >&2; }
+      [ "${SHUTDOWN_AFTER_TRAINING}" = true ] && runpodctl remove pod "${RUNPOD_POD_ID}"
+    fi
+  done
+  sleep 30
+done