Skip to content

Commit

Permalink
Add notifications and refactor pid tracking
Browse files Browse the repository at this point in the history
  • Loading branch information
H3mul committed Dec 7, 2023
1 parent 1ddee9b commit 11da45c
Show file tree
Hide file tree
Showing 9 changed files with 86 additions and 5 deletions.
10 changes: 9 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ RUN curl -s https://packagecloud.io/install/repositories/ookla/speedtest-cli/scr

# Remove existing SSH host keys
RUN rm -f /etc/ssh/ssh_host_*
# Create the runtime directories the scripts rely on. Each path is listed
# explicitly because the shell form of RUN executes under /bin/sh unless a
# SHELL instruction selects bash, and POSIX sh performs no brace expansion
# (it would create a single directory literally named "{pid,config,scripts}").
RUN mkdir -p /app/pid /app/config /app/scripts

WORKDIR /

Expand Down Expand Up @@ -106,7 +107,14 @@ ENV DISABLE_MODEL_DOWNLOAD=false
ENV DISABLE_TRAINING_ASSET_DOWNLOAD=false
ENV SHUTDOWN_AFTER_PROVISION=false
ENV DISABLE_AUTOLAUNCH=false
ENV POD_TERMINATION_PID_PATTERN=train
ENV PUSHBULLET_API_TOKEN=""
ENV NOTIFY_ON_TRAINING_END=false
ENV SHUTDOWN_AFTER_TRAINING=false

# Matches most kohya training scripts,
# e.g. sdxl_train_network.py, sdxl_train.py, train_network.py, etc.
ENV TRAINING_PID_PATTERN="train.*\.py"


VOLUME [ "/workspace" ]
EXPOSE 3000 3010 6006 8080 9090
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: "3.9"
services:
sd-ultimate-test:
image: sd-ultimate-test
image: h3mul/sd-ultimate:latest
environment:
PUBLIC_KEY: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC+siaAQHJUe14o+RntWjbpxsTlklxY2pEUSrKKuGQGgBdWD/DTdnkIbD3DjfmNy7BQSuVp/JGaqyoGAQYOzhJec766omEQpZezXC752NNKh7+Q9OZ7EL+aKuvgR8rM1PbXsEW+l7xcqB0/nZccLevpfnEIKvQpz0RhorGfGU6Gx9V8N8zI7icST9TUs/kHYoTowL6ZPMxEe54+y852Y073spemlD9BTXO7ZX/hUn0ZMA5cxeNcfK5O6zOmq+wVSie34qH6ee0AZ9ojrDqaa+9CXjyp69bY96wu+8OibfRsK73lDp2S255m5+SUFPvUh24RgDMbmyaMTtaS4XCbOMSV6IitvV++lUx7CFA5SWx3GybnK6TVT72eVCELDv0j5FP3sKHhJJ4s+AiHRNH76vVB2Ymhenp79JDy4l6w4sHIE/steow5c+Xjsrq2/YZkyw1efRjD0Gv+D5bBHHX7/8jpnNgrEPaTrnA32KKH/v/rAVcgcKhLMqiFUvEU4tpZaSM= hemul@Kaden
VS_SERVER_PASSWORD: testpass
Expand Down
20 changes: 20 additions & 0 deletions scripts/send-notification.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Send a Pushbullet "note" push titled "sd-ultimate" with the given message
# as its body.
#
# Usage: send-notification.sh MESSAGE
# Env:   PUSHBULLET_API_TOKEN - Pushbullet access token (required)
# Exit:  1 when the message or the token is missing; otherwise curl's status.

set -eu

MESSAGE="${1-}"

if [ -z "${MESSAGE}" ]; then
  echo "No message provided for notification, quitting early..." >&2
  exit 1
fi

# ":-" keeps `set -u` from aborting with "unbound variable" before we can
# print the explanatory message when the token is not exported at all.
if [ -z "${PUSHBULLET_API_TOKEN:-}" ]; then
  echo "No pushbullet token provided via \$PUSHBULLET_API_TOKEN variable, quitting early..." >&2
  exit 1
fi

# Escape the characters that would otherwise break out of the JSON string
# literal. Backslash must be handled first so later escapes are not doubled.
json_body=${MESSAGE//\\/\\\\}
json_body=${json_body//\"/\\\"}
json_body=${json_body//$'\n'/\\n}
json_body=${json_body//$'\r'/\\r}
json_body=${json_body//$'\t'/\\t}

# The token is the basic-auth user name; the trailing ":" means empty password.
curl -u "${PUSHBULLET_API_TOKEN}:" \
  -X POST https://api.pushbullet.com/v2/pushes \
  --header 'Content-Type: application/json' \
  --data-binary "{\"type\":\"note\",\"title\":\"sd-ultimate\",\"body\":\"${json_body}\"}"
7 changes: 4 additions & 3 deletions scripts/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ start_vsserver () {
# Main Program #
# ---------------------------------------------------------------------------- #

execute_script "/pre_start.sh" "Running pre-start script..."
setup_ssh

execute_script "/app/scripts/pre_start.sh" "Running pre-start script..."


if [ "${SHUTDOWN_AFTER_PROVISION}" = true ]; then
Expand All @@ -87,11 +89,10 @@ if [ "${DISABLE_AUTOLAUNCH}" != true ]; then
start_invokeai.sh
fi

setup_ssh
start_vsserver
export_env_vars

execute_script "post_start.sh" "Running post-start script..."
execute_script "/app/scripts/post_start.sh" "Running post-start script..."

echo "Container is READY!"

Expand Down
2 changes: 2 additions & 0 deletions scripts/start_a1111.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
set -eu

echo "Starting Stable Diffusion Web UI"
cd ${A1111_ROOT}
source ./venv/bin/activate
Expand Down
2 changes: 2 additions & 0 deletions scripts/start_invokeai.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
set -eu

echo "Starting InvokeAI..."
source ${INVOKEAI_ROOT}/venv/bin/activate

Expand Down
3 changes: 3 additions & 0 deletions scripts/start_kohya.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env bash
set -eu

echo "Starting Kohya_ss Web UI"
cd ${KOHYA_ROOT}
nohup ./gui.sh --listen 0.0.0.0 --server_port 3010 --headless >> /workspace/logs/kohya_ss.log 2>&1 &
Expand Down
2 changes: 2 additions & 0 deletions scripts/start_vs_server.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
set -eu

echo "Starting VS Server..."
PASSWORD=${VS_SERVER_PASSWORD} \
nohup code-server \
Expand Down
43 changes: 43 additions & 0 deletions scripts/track-training-pids.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# This script runs in the background and tracks common long-running tasks
# (e.g. kohya training scripts) which we might want to quit and/or notify the
# completion of.
#
# Every 30 seconds it:
#   1. scans the process list for command lines matching TRAINING_PID_PATTERN,
#   2. records the PID of each newly seen job in
#      /app/pid/training-pids/<output_name>.pid,
#   3. checks every recorded PID and, once one is gone, optionally sends a
#      notification and/or removes the pod.
#
# Env:
#   TRAINING_PID_PATTERN    grep pattern selecting training processes
#   NOTIFY_ON_TRAINING_END  "true" to call send-notification.sh on completion
#   SHUTDOWN_AFTER_TRAINING "true" to run `runpodctl remove pod` on completion
#   RUNPOD_POD_ID           pod to remove (needed only when shutting down)

set -eu

# Fall back to the image defaults so `set -u` does not abort the tracker when
# it is started from an environment that lacks these variables.
: "${TRAINING_PID_PATTERN:=train.*\.py}"
: "${NOTIFY_ON_TRAINING_END:=false}"
: "${SHUTDOWN_AFTER_TRAINING:=false}"

self_pid_file=/app/pid/track-training-pids.pid
tracked_dir=/app/pid/training-pids

# Create the directories before touching any pid file inside them.
mkdir -p "${tracked_dir}"

if [ -f "${self_pid_file}" ]; then
  echo "Killing running instance of this script..."
  # A stale pid file (previous instance already gone) must not stop us from
  # starting, so a failing rkill is tolerated.
  rkill "$(cat "${self_pid_file}")" || true
fi

echo $$ > "${self_pid_file}"

while true; do
  # 1. locate new processes that match supported tasks (eg, kohya training scripts)
  ps aux | grep -- "${TRAINING_PID_PATTERN}" | grep -v grep | while read -r _ pid cmdline; do
    # 2. extract key info (eg model name); the PID is the second `ps aux`
    #    column and was already split off by `read`.
    #    Dedupe by kohya output names; only the first match is used.
    model_name=$(printf '%s\n' "${cmdline}" | grep -Po 'output_name=\K\S+' | head -n 1) || true
    [ -z "${model_name}" ] && continue

    # A "/" in the output name would point outside the tracking directory.
    model_name=${model_name//\//_}

    # 3. Add a file to /app/pid with PID and info string
    pid_file="${tracked_dir}/${model_name}.pid"
    [ -f "${pid_file}" ] || echo "${pid}" > "${pid_file}"
  done

  # 4. Check all currently tracked pids to see if any exited, if so quit and/or notify
  for pid_file in "${tracked_dir}"/*.pid; do
    # With nothing tracked the glob stays literal; skip it instead of letting
    # `cat` fail and `set -e` terminate the tracker.
    [ -e "${pid_file}" ] || continue

    model_name=$(basename "${pid_file}" .pid)
    pid=$(cat "${pid_file}")

    if ! ps -p "${pid}" > /dev/null; then
      echo "Model ${model_name} finished training (PID no longer running)."
      rm -f "${pid_file}"

      if [ "${NOTIFY_ON_TRAINING_END}" = true ]; then
        # A failed notification must not kill the tracker (and with it the
        # shutdown step below).
        send-notification.sh "Model ${model_name} finished training." \
          || echo "Failed to send notification for model ${model_name}." >&2
      fi

      if [ "${SHUTDOWN_AFTER_TRAINING}" = true ]; then
        runpodctl remove pod "${RUNPOD_POD_ID:-}" \
          || echo "Failed to remove pod after model ${model_name} finished." >&2
      fi
    fi
  done

  sleep 30
done

0 comments on commit 11da45c

Please sign in to comment.