Skip to content

Commit

Permalink
Merge branch 'NOAA-EMC:develop' into feature/monitor_jobs_separate
Browse files Browse the repository at this point in the history
  • Loading branch information
KateFriedman-NOAA authored Nov 1, 2023
2 parents b7b2638 + 517b92f commit 202a260
Show file tree
Hide file tree
Showing 16 changed files with 201 additions and 176 deletions.
3 changes: 2 additions & 1 deletion ci/scripts/driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ for pr in ${pr_list}; do
export pslot="${case}_${pr_sha}"
rm -Rf "${STMP}/RUNDIRS/${pslot}"
set +e
export LOGFILE_PATH="${HOMEgfs}/ci/scripts/create_experiment.log"
"${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml"
ci_status=$?
set -e
Expand All @@ -169,7 +170,7 @@ for pr in ${pr_list}; do
echo "Failed to create experiment: *FAIL* ${pslot}"
echo "Experiment setup: failed at $(date) for experiment ${pslot}" || true
echo ""
cat "${HOMEgfs}/ci/scripts/"setup_*.std*
cat "${LOGFILE_PATH}"
} >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}"
"${GH}" pr edit "${pr}" --repo "${REPO_URL}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Failed"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
Expand Down
1 change: 1 addition & 0 deletions jobs/JGLOBAL_STAGE_IC
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ source "${HOMEgfs}/ush/jjob_header.sh" -e "stage_ic" -c "base stage_ic"
# shellcheck disable=SC2153
rCDUMP=${CDUMP}
[[ ${CDUMP} = "gfs" ]] && export rCDUMP="gdas"
export rCDUMP

# Execute the Script
"${HOMEgfs}/scripts/exglobal_stage_ic.sh"
Expand Down
293 changes: 143 additions & 150 deletions sorc/build_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ function _usage() {
Builds all of the global-workflow components by calling the individual build
scripts in sequence.
Usage: ${BASH_SOURCE[0]} [-a UFS_app][-c build_config][-h][-v]
Usage: ${BASH_SOURCE[0]} [-a UFS_app][-c build_config][-h][-j n][-v]
-a UFS_app:
Build a specific UFS app instead of the default
-c build_config:
Selectively build based on the provided config instead of the default config
-h:
print this help message and exit
-j:
Specify maximum number of build jobs (n)
-v:
Execute all build scripts with -v option to turn on verbose where supported
EOF
Expand All @@ -33,25 +35,25 @@ script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
cd "${script_dir}" || exit 1

_build_ufs_opt=""
_ops_opt=""
_verbose_opt=""
_partial_opt=""
_build_job_max=20
# Reset option counter in case this script is sourced
OPTIND=1
while getopts ":a:c:hov" option; do
while getopts ":a:c:j:hv" option; do
case "${option}" in
a) _build_ufs_opt+="-a ${OPTARG} ";;
c) _partial_opt+="-c ${OPTARG} ";;
h) _usage;;
o) _ops_opt+="-o";;
j) _build_job_max="${OPTARG} ";;
v) _verbose_opt="-v";;
:)
echo "[${BASH_SOURCE[0]}]: ${option} requires an argument"
usage
_usage
;;
*)
echo "[${BASH_SOURCE[0]}]: Unrecognized option: ${option}"
usage
_usage
;;
esac
done
Expand Down Expand Up @@ -105,170 +107,161 @@ ERRSCRIPT=${ERRSCRIPT:-'eval [[ $err = 0 ]]'}
# shellcheck disable=
err=0

#------------------------------------
# build gfs_utils
#------------------------------------
if [[ ${Build_gfs_utils} == 'true' ]]; then
echo " .... Building gfs_utils .... "
# shellcheck disable=SC2086,SC2248
./build_gfs_utils.sh ${_verbose_opt} > "${logs_dir}/build_gfs_utils.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building gfs_utils."
echo "The log file is in ${logs_dir}/build_gfs_utils.log"
fi
err=$((err + rc))
fi
declare -A build_jobs
declare -A build_opts

#------------------------------------
# build WW3 pre & post execs
# Check which builds to do and assign # of build jobs
#------------------------------------
if [[ ${Build_ww3_prepost} == "true" ]]; then
echo " .... Building WW3 pre and post execs .... "
# shellcheck disable=SC2086,SC2248
./build_ww3prepost.sh ${_verbose_opt} ${_build_ufs_opt} > "${logs_dir}/build_ww3_prepost.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building WW3 pre/post processing."
echo "The log file is in ${logs_dir}/build_ww3_prepost.log"
fi
err=$((err + rc))
fi

#------------------------------------
# build forecast model
#------------------------------------
# Mandatory builds, unless otherwise specified, for the UFS
big_jobs=0
if [[ ${Build_ufs_model} == 'true' ]]; then
echo " .... Building forecast model .... "
# shellcheck disable=SC2086,SC2248
./build_ufs.sh ${_verbose_opt} ${_build_ufs_opt} > "${logs_dir}/build_ufs.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building UFS model."
echo "The log file is in ${logs_dir}/build_ufs.log"
fi
err=$((err + rc))
build_jobs["ufs"]=8
big_jobs=$((big_jobs+1))
build_opts["ufs"]="${_verbose_opt} ${_build_ufs_opt}"
fi

#------------------------------------
# build GSI and EnKF - optional checkout
#------------------------------------
if [[ -d gsi_enkf.fd ]]; then
if [[ ${Build_gsi_enkf} == 'true' ]]; then
echo " .... Building gsi and enkf .... "
# shellcheck disable=SC2086,SC2248
./build_gsi_enkf.sh ${_ops_opt} ${_verbose_opt} > "${logs_dir}/build_gsi_enkf.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building gsi_enkf."
echo "The log file is in ${logs_dir}/build_gsi_enkf.log"
fi
err=$((err + rc))
fi
else
echo " .... Skip building gsi and enkf .... "
# The UPP is hardcoded to use 6 cores
if [[ ${Build_upp} == 'true' ]]; then
build_jobs["upp"]=6
build_opts["upp"]=""
fi

#------------------------------------
# build gsi utilities
#------------------------------------
if [[ -d gsi_utils.fd ]]; then
if [[ ${Build_gsi_utils} == 'true' ]]; then
echo " .... Building gsi utilities .... "
# shellcheck disable=SC2086,SC2248
./build_gsi_utils.sh ${_ops_opt} ${_verbose_opt} > "${logs_dir}/build_gsi_utils.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building gsi utilities."
echo "The log file is in ${logs_dir}/build_gsi_utils.log"
fi
err=$((err + rc))
fi
else
echo " .... Skip building gsi utilities .... "
if [[ ${Build_ufs_utils} == 'true' ]]; then
build_jobs["ufs_utils"]=3
build_opts["ufs_utils"]="${_verbose_opt}"
fi
if [[ ${Build_gfs_utils} == 'true' ]]; then
build_jobs["gfs_utils"]=1
build_opts["gfs_utils"]="${_verbose_opt}"
fi
if [[ ${Build_ww3prepost} == "true" ]]; then
build_jobs["ww3prepost"]=3
build_opts["ww3prepost"]="${_verbose_opt} ${_build_ufs_opt}"
fi

#------------------------------------
# build gdas - optional checkout
#------------------------------------
# Optional DA builds
if [[ -d gdas.cd ]]; then
if [[ ${Build_gdas} == 'true' ]]; then
echo " .... Building GDASApp .... "
# shellcheck disable=SC2086,SC2248
./build_gdas.sh ${_verbose_opt} > "${logs_dir}/build_gdas.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building GDASApp."
echo "The log file is in ${logs_dir}/build_gdas.log"
fi
err=$((err + rc))
fi
else
echo " .... Skip building GDASApp .... "
build_jobs["gdas"]=16
big_jobs=$((big_jobs+1))
build_opts["gdas"]="${_verbose_opt}"
fi
if [[ -d gsi_enkf.fd ]]; then
build_jobs["gsi_enkf"]=8
big_jobs=$((big_jobs+1))
build_opts["gsi_enkf"]="${_verbose_opt}"
fi
if [[ -d gsi_utils.fd ]]; then
build_jobs["gsi_utils"]=2
build_opts["gsi_utils"]="${_verbose_opt}"
fi

#------------------------------------
# build gsi monitor
#------------------------------------
if [[ -d gsi_monitor.fd ]]; then
if [[ ${Build_gsi_monitor} == 'true' ]]; then
echo " .... Building gsi monitor .... "
# shellcheck disable=SC2086,SC2248
./build_gsi_monitor.sh ${_ops_opt} ${_verbose_opt} > "${logs_dir}/build_gsi_monitor.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building gsi monitor."
echo "The log file is in ${logs_dir}/build_gsi_monitor.log"
fi
err=$((err + rc))
fi
else
echo " .... Skip building gsi monitor .... "
build_jobs["gsi_monitor"]=1
build_opts["gsi_monitor"]="${_verbose_opt}"
fi

#------------------------------------
# build UPP
#------------------------------------
if [[ ${Build_upp} == 'true' ]]; then
echo " .... Building UPP .... "
# shellcheck disable=SC2086,SC2248
./build_upp.sh ${_ops_opt} ${_verbose_opt} > "${logs_dir}/build_upp.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building UPP."
echo "The log file is in ${logs_dir}/build_upp.log"
fi
err=$((err + rc))
fi
# Clamp every build's core count to the -j limit. While walking the builds,
# also total up the (clamped) cores requested and assemble a comma-separated
# list of build names for reporting.
build_list=""
requested_cpus=0
for build in "${!build_jobs[@]}"; do
  # Prefix the separator only once the list already holds a name
  build_list="${build_list:+${build_list}, }${build}"

  # No single build may ask for more cores than the overall limit
  if (( build_jobs[${build}] > _build_job_max )); then
    build_jobs[${build}]=${_build_job_max}
  fi

  requested_cpus=$(( build_jobs[${build}] + requested_cpus ))
done

#------------------------------------
# build ufs_utils
#------------------------------------
if [[ ${Build_ufs_utils} == 'true' ]]; then
echo " .... Building ufs_utils .... "
# shellcheck disable=SC2086,SC2248
./build_ufs_utils.sh ${_verbose_opt} > "${logs_dir}/build_ufs_utils.log" 2>&1
# shellcheck disable=
rc=$?
if (( rc != 0 )) ; then
echo "Fatal error in building ufs_utils."
echo "The log file is in ${logs_dir}/build_ufs_utils.log"
fi
err=$((err + rc))
echo "Building ${build_list}"

# If the builds collectively ask for fewer cores than the -j limit, hand the
# spare cores out in equal (integer) shares to the large builds:
# gdas, ufs, and gsi_enkf.
if [[ ${big_jobs} -gt 0 && ${requested_cpus} -lt ${_build_job_max} ]]; then
  # Spare cores per large build; integer division drops any remainder
  extra_cores=$(( (_build_job_max - requested_cpus) / big_jobs ))
  for build in "${!build_jobs[@]}"; do
    case "${build}" in
      gdas | ufs | gsi_enkf)
        build_jobs[${build}]=$(( build_jobs[${build}] + extra_cores ))
        ;;
      *)
        # Small builds keep their assigned core count
        ;;
    esac
  done
fi

procs_in_use=0
declare -A build_ids

#######################################
# Make one pass over the requested builds and launch, in the background,
# every build that has not been started yet and still fits within the
# processor budget.
# Globals:
#   build_jobs     (read)  build name -> number of cores assigned to it
#   build_opts     (read)  build name -> space-separated extra options
#   build_ids      (write) build name -> PID of the launched build script
#   procs_in_use   (write) incremented by the cores of each build launched
#   _build_job_max (read)  maximum number of cores to use at once
#   logs_dir       (read)  directory receiving build_<name>.log
# Outputs:
#   Writes "Starting build_<name>.sh" to stdout for each build launched
#######################################
function _start_ready_builds() {
  local build cores
  for build in "${!build_jobs[@]}"; do
    # "${arr[key]+0}" expands to "0" only when the key exists, so this skips
    # builds that already have a recorded PID
    if [[ -n "${build_ids[${build}]+0}" ]]; then
      continue
    fi

    # Look the core count up with the *expanded* key. Inside $(( )) a bare
    # build_jobs[build] would use the literal key "build" of the associative
    # array (unset, i.e. 0) and the capacity check would never hold a job back.
    cores=${build_jobs[${build}]}
    if (( cores + procs_in_use > _build_job_max )); then
      continue
    fi

    # Options are stored as a single space-separated string and have to reach
    # the build script as separate arguments, hence the deliberately unquoted
    # expansion. The upp build is launched without -j.
    # shellcheck disable=SC2086,SC2248
    if [[ "${build}" != "upp" ]]; then
      "./build_${build}.sh" -j "${cores}" ${build_opts[${build}]:-} > \
        "${logs_dir}/build_${build}.log" 2>&1 &
    else
      "./build_${build}.sh" ${build_opts[${build}]:-} > \
        "${logs_dir}/build_${build}.log" 2>&1 &
    fi
    build_ids["${build}"]=$!
    echo "Starting build_${build}.sh"
    procs_in_use=$(( procs_in_use + cores ))
  done
}

builds_started=0
# Keep polling until every requested build has been launched
while [[ ${builds_started} -lt ${#build_jobs[@]} ]]; do
  _start_ready_builds

  # Recount how many builds have been started and how many cores are held by
  # builds that are still running, so cores of finished builds are released
  builds_started=0
  procs_in_use=0
  for build in "${!build_jobs[@]}"; do
    # Has the build started?
    if [[ -n "${build_ids[${build}]+0}" ]]; then
      builds_started=$(( builds_started + 1 ))
      # Is the build still running?
      if ps -p "${build_ids[${build}]}" > /dev/null; then
        procs_in_use=$(( procs_in_use + build_jobs[${build}] ))
      fi
    fi
  done

  sleep 5s
done

# Poll the launched builds until none remain. Each build found to have exited
# is reaped with `wait`, its exit status is added to the running error total,
# its outcome is reported, and it is dropped from both bookkeeping arrays.
errs=0
until [[ ${#build_jobs[@]} -eq 0 ]]; do
  for build in "${!build_jobs[@]}"; do
    # Only builds with a recorded PID can be checked
    [[ -n "${build_ids[${build}]+0}" ]] || continue

    # Still running: look again on the next sweep
    if ps -p "${build_ids[${build}]}" > /dev/null; then
      continue
    fi

    # The process is gone; collect its exit status
    wait "${build_ids[${build}]}"
    build_stat=$?
    errs=$(( errs + build_stat ))
    case "${build_stat}" in
      0) echo "build_${build}.sh completed successfully!" ;;
      *) echo "build_${build}.sh failed with status ${build_stat}!" ;;
    esac

    # Forget the finished build so the outer loop can terminate
    unset 'build_ids[${build}]' 'build_jobs[${build}]'
  done

  sleep 5s
done

#------------------------------------
# Exception Handling
#------------------------------------
if (( err != 0 )); then
if (( errs != 0 )); then
cat << EOF
BUILD ERROR: One or more components failed to build
Check the associated build log(s) for details.
Expand Down
3 changes: 2 additions & 1 deletion sorc/build_gdas.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
set -eux

OPTIND=1
while getopts ":dov" option; do
while getopts ":j:dv" option; do
case "${option}" in
d) export BUILD_TYPE="DEBUG";;
j) export BUILD_JOBS=${OPTARG};;
v) export BUILD_VERBOSE="YES";;
:)
echo "[${BASH_SOURCE[0]}]: ${option} requires an argument"
Expand Down
Loading

0 comments on commit 202a260

Please sign in to comment.