From 9956b4f7270c266715dfc251280de00c0cdbc090 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Mar 2026 23:36:49 -0400 Subject: [PATCH 01/29] ci: replace nick-fields/retry with plain run step; deprioritize v100 nodes The JS action wrapper gets SIGKILL'd on Frontier login nodes under memory pressure, falsely failing the Build step even when build.sh succeeds. retry_build() inside build.sh already handles 2-attempt retry with rm -rf build between attempts. Also move gpu-v100 to last in Phoenix GPU partition priority so SLURM prefers newer GPU nodes (a100/h100/l40s/h200) over the aging V100s that have had recurring driver issues. --- .github/workflows/phoenix/submit-job.sh | 2 +- .github/workflows/test.yml | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh index caa6bd2175..06cc45d797 100755 --- a/.github/workflows/phoenix/submit-job.sh +++ b/.github/workflows/phoenix/submit-job.sh @@ -45,7 +45,7 @@ if [ "$job_type" = "bench" ]; then sbatch_time="#SBATCH -t 04:00:00" else sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s,gpu-h200 +#SBATCH -p gpu-a100,gpu-h100,gpu-l40s,gpu-h200,gpu-v100 #SBATCH --ntasks-per-node=4 # Number of cores per node required #SBATCH -G2\ " diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9ce6dda24c..afeca4b0a3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -242,13 +242,8 @@ jobs: - name: Build if: matrix.cluster != 'phoenix' - uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3 - with: - max_attempts: 2 - retry_wait_seconds: 60 - timeout_minutes: 60 - command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - on_retry_command: rm -rf build + timeout-minutes: 60 + run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - name: Submit SLURM 
Test Job if: matrix.cluster == 'phoenix' From 2fb93db10e806d93427cc304e649c526d10990d5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Mar 2026 23:40:07 -0400 Subject: [PATCH 02/29] ci: remove unused RETRY_VALIDATE_CMD from retry_build --- .github/scripts/retry-build.sh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh index 38ac08b217..2fdb38bb8d 100755 --- a/.github/scripts/retry-build.sh +++ b/.github/scripts/retry-build.sh @@ -1,32 +1,15 @@ #!/bin/bash # Provides retry_build(): 2-attempt loop. # On failure of attempt 1, nukes the entire build directory before attempt 2. -# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry. # Usage: source .github/scripts/retry-build.sh # retry_build ./mfc.sh build -j 8 --gpu acc retry_build() { - local validate_cmd="${RETRY_VALIDATE_CMD:-}" local max_attempts=2 local attempt=1 while [ $attempt -le $max_attempts ]; do echo "Build attempt $attempt of $max_attempts..." if "$@"; then - if [ -n "$validate_cmd" ]; then - if ! eval "$validate_cmd"; then - echo "Post-build validation failed on attempt $attempt." - if [ $attempt -lt $max_attempts ]; then - echo " Nuking build directory before retry..." - rm -rf build 2>/dev/null || true - sleep 5 - attempt=$((attempt + 1)) - continue - else - echo "Validation still failing after $max_attempts attempts." - return 1 - fi - fi - fi echo "Build succeeded on attempt $attempt." 
return 0 fi From 24185f8760d8caf2c0b7def57f3f9bce87eccaab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Mar 2026 23:45:22 -0400 Subject: [PATCH 03/29] ci: shared sinfo-based GPU partition selection for tests and benchmarks Extract partition selection into select-gpu-partition.sh so both test jobs (submit-job.sh) and benchmark jobs (run_parallel_benchmarks.sh) use the same sinfo-based logic with a consistent priority order: gpu-rtx6000 -> gpu-l40s -> gpu-v100 -> gpu-h200 -> gpu-h100 -> gpu-a100 Tests now dynamically pick the best available partition rather than submitting to a static multi-partition list, matching the benchmark approach. Bench still exports BENCH_GPU_PARTITION so PR and master land on the same GPU type for fair comparisons. --- .github/scripts/run_parallel_benchmarks.sh | 20 ++------------- .github/scripts/select-gpu-partition.sh | 29 ++++++++++++++++++++++ .github/workflows/phoenix/submit-job.sh | 22 ++++++++-------- 3 files changed, 42 insertions(+), 29 deletions(-) create mode 100644 .github/scripts/select-gpu-partition.sh diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 8c562b911e..5cb491adf4 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -24,24 +24,8 @@ echo "==========================================" # both parallel jobs so PR and master always land on the same GPU type. if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then echo "Selecting Phoenix GPU partition for benchmark consistency..." - # Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave - # large modern nodes (h200, h100, a100) free for production workloads. - # rtx6000 has the most nodes and gives the most consistent baselines. 
- BENCH_GPU_PARTITION="" - for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do - # || true: grep -c exits 1 on zero matches (or when sinfo returns no output - # for an unknown partition); suppress so set -euo pipefail doesn't abort. - idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true) - if [ "${idle:-0}" -gt 0 ]; then - BENCH_GPU_PARTITION="$part" - echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)" - break - fi - done - if [ -z "$BENCH_GPU_PARTITION" ]; then - echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)" - BENCH_GPU_PARTITION="gpu-rtx6000" - fi + source "${SCRIPT_DIR}/select-gpu-partition.sh" + BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION" export BENCH_GPU_PARTITION fi diff --git a/.github/scripts/select-gpu-partition.sh b/.github/scripts/select-gpu-partition.sh new file mode 100644 index 0000000000..ef8873f80b --- /dev/null +++ b/.github/scripts/select-gpu-partition.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Select the best available Phoenix GPU partition using sinfo. +# Sources into caller: exports SELECTED_GPU_PARTITION. +# +# Priority order prefers smaller/older nodes to leave modern GPUs free +# for production workloads. Falls back to gpu-rtx6000 if nothing is idle. 
+# +# Usage: source .github/scripts/select-gpu-partition.sh + +_GPU_PARTITION_PRIORITY="gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100" +_GPU_PARTITION_FALLBACK="gpu-rtx6000" + +SELECTED_GPU_PARTITION="" +for _part in $_GPU_PARTITION_PRIORITY; do + _idle=$(sinfo -p "$_part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true) + if [ "${_idle:-0}" -gt 0 ]; then + SELECTED_GPU_PARTITION="$_part" + echo "Selected GPU partition: $SELECTED_GPU_PARTITION ($_idle idle/mix nodes)" + break + fi +done + +if [ -z "$SELECTED_GPU_PARTITION" ]; then + echo "WARNING: No idle GPU partition found; falling back to $_GPU_PARTITION_FALLBACK (may queue)" + SELECTED_GPU_PARTITION="$_GPU_PARTITION_FALLBACK" +fi + +export SELECTED_GPU_PARTITION +unset _GPU_PARTITION_PRIORITY _GPU_PARTITION_FALLBACK _part _idle diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh index 06cc45d797..3e104ff2ad 100755 --- a/.github/workflows/phoenix/submit-job.sh +++ b/.github/workflows/phoenix/submit-job.sh @@ -35,22 +35,22 @@ sbatch_cpu_opts="\ " if [ "$job_type" = "bench" ]; then - bench_partition="${BENCH_GPU_PARTITION:-gpu-rtx6000}" - echo "Submitting bench GPU job to partition: $bench_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-})" - sbatch_gpu_opts="\ -#SBATCH -p $bench_partition -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ -" + # BENCH_GPU_PARTITION is pre-selected by run_parallel_benchmarks.sh so both + # PR and master jobs land on the same GPU type for a fair comparison. 
+ gpu_partition="${BENCH_GPU_PARTITION:-gpu-rtx6000}" + echo "Submitting bench GPU job to partition: $gpu_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-})" sbatch_time="#SBATCH -t 04:00:00" else - sbatch_gpu_opts="\ -#SBATCH -p gpu-a100,gpu-h100,gpu-l40s,gpu-h200,gpu-v100 + source "$(dirname "${BASH_SOURCE[0]}")/../../scripts/select-gpu-partition.sh" + gpu_partition="$SELECTED_GPU_PARTITION" + sbatch_time="#SBATCH -t 03:00:00" +fi + +sbatch_gpu_opts="\ +#SBATCH -p $gpu_partition #SBATCH --ntasks-per-node=4 # Number of cores per node required #SBATCH -G2\ " - sbatch_time="#SBATCH -t 03:00:00" -fi if [ "$2" = "cpu" ]; then sbatch_device_opts="$sbatch_cpu_opts" From 946161d5291a084881e7a87fe03164a4b94af402 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Mar 2026 23:51:32 -0400 Subject: [PATCH 04/29] bench: use sinfo-based GPU partition selection as bench baseline Make bench jobs use sinfo-based GPU partition selection (via select-gpu-partition.sh) as a baseline, then override with BENCH_GPU_PARTITION only when run_parallel_benchmarks.sh has pre-selected a partition for PR/master consistency. Previously bench jobs fell back to a hardcoded gpu-rtx6000 when BENCH_GPU_PARTITION was unset. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/phoenix/submit-job.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh index 3e104ff2ad..175e2fa881 100755 --- a/.github/workflows/phoenix/submit-job.sh +++ b/.github/workflows/phoenix/submit-job.sh @@ -34,15 +34,18 @@ sbatch_cpu_opts="\ #SBATCH --mem-per-cpu=2G # Memory per core\ " +source "$(dirname "${BASH_SOURCE[0]}")/../../scripts/select-gpu-partition.sh" +gpu_partition="$SELECTED_GPU_PARTITION" + if [ "$job_type" = "bench" ]; then - # BENCH_GPU_PARTITION is pre-selected by run_parallel_benchmarks.sh so both + # If run_parallel_benchmarks.sh pre-selected a partition, use it so both # PR and master jobs land on the same GPU type for a fair comparison. - gpu_partition="${BENCH_GPU_PARTITION:-gpu-rtx6000}" - echo "Submitting bench GPU job to partition: $gpu_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-})" + if [ -n "${BENCH_GPU_PARTITION:-}" ]; then + gpu_partition="$BENCH_GPU_PARTITION" + echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)" + fi sbatch_time="#SBATCH -t 04:00:00" else - source "$(dirname "${BASH_SOURCE[0]}")/../../scripts/select-gpu-partition.sh" - gpu_partition="$SELECTED_GPU_PARTITION" sbatch_time="#SBATCH -t 03:00:00" fi From 238428797941056ca1101bf6bb0f1c473e226d77 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Mar 2026 23:53:21 -0400 Subject: [PATCH 05/29] bench: require 2 idle/mix nodes for parallel benchmark GPU partition selection For parallel benchmarks (PR + master), both jobs need a GPU node concurrently, so require at least 2 idle/mix nodes before selecting a partition. Add GPU_PARTITION_MIN_NODES parameter to select-gpu-partition.sh (defaults to 1 for single-job test use). 
Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/run_parallel_benchmarks.sh | 3 ++- .github/scripts/select-gpu-partition.sh | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 5cb491adf4..901a6edebe 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -24,7 +24,8 @@ echo "==========================================" # both parallel jobs so PR and master always land on the same GPU type. if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then echo "Selecting Phoenix GPU partition for benchmark consistency..." - source "${SCRIPT_DIR}/select-gpu-partition.sh" + # Require 2 nodes so both PR and master jobs can run concurrently. + GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh" BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION" export BENCH_GPU_PARTITION fi diff --git a/.github/scripts/select-gpu-partition.sh b/.github/scripts/select-gpu-partition.sh index ef8873f80b..8ff6e5d107 100644 --- a/.github/scripts/select-gpu-partition.sh +++ b/.github/scripts/select-gpu-partition.sh @@ -5,15 +5,19 @@ # Priority order prefers smaller/older nodes to leave modern GPUs free # for production workloads. Falls back to gpu-rtx6000 if nothing is idle. # +# Optional: set GPU_PARTITION_MIN_NODES before sourcing to require a minimum +# number of idle/mix nodes (e.g. GPU_PARTITION_MIN_NODES=2 for parallel bench jobs). 
+# # Usage: source .github/scripts/select-gpu-partition.sh _GPU_PARTITION_PRIORITY="gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100" _GPU_PARTITION_FALLBACK="gpu-rtx6000" +_GPU_PARTITION_MIN_NODES="${GPU_PARTITION_MIN_NODES:-1}" SELECTED_GPU_PARTITION="" for _part in $_GPU_PARTITION_PRIORITY; do _idle=$(sinfo -p "$_part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true) - if [ "${_idle:-0}" -gt 0 ]; then + if [ "${_idle:-0}" -ge "$_GPU_PARTITION_MIN_NODES" ]; then SELECTED_GPU_PARTITION="$_part" echo "Selected GPU partition: $SELECTED_GPU_PARTITION ($_idle idle/mix nodes)" break @@ -26,4 +30,4 @@ if [ -z "$SELECTED_GPU_PARTITION" ]; then fi export SELECTED_GPU_PARTITION -unset _GPU_PARTITION_PRIORITY _GPU_PARTITION_FALLBACK _part _idle +unset _GPU_PARTITION_PRIORITY _GPU_PARTITION_FALLBACK _GPU_PARTITION_MIN_NODES _part _idle From 60bcfaaae69af3e55256d8d9d992fc0ead449ab1 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Mar 2026 23:59:35 -0400 Subject: [PATCH 06/29] ci: restore RETRY_VALIDATE_CMD support in retry_build phoenix/test.sh relies on RETRY_VALIDATE_CMD to smoke-test the freshly built syscheck binary and trigger a rebuild on failure, catching architecture mismatches (SIGILL) from binaries compiled on a different compute node. Mistakenly removed in the previous commit as 'unused'. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/retry-build.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh index 2fdb38bb8d..a0b6ce8cfe 100755 --- a/.github/scripts/retry-build.sh +++ b/.github/scripts/retry-build.sh @@ -1,15 +1,35 @@ #!/bin/bash # Provides retry_build(): 2-attempt loop. # On failure of attempt 1, nukes the entire build directory before attempt 2. +# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero +# exit triggers the same nuke-and-retry, catching e.g. 
SIGILL from binaries +# compiled on a different CPU architecture. # Usage: source .github/scripts/retry-build.sh # retry_build ./mfc.sh build -j 8 --gpu acc +# RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8 retry_build() { local max_attempts=2 + local validate_cmd="${RETRY_VALIDATE_CMD:-}" local attempt=1 while [ $attempt -le $max_attempts ]; do echo "Build attempt $attempt of $max_attempts..." if "$@"; then + if [ -n "$validate_cmd" ]; then + if ! eval "$validate_cmd"; then + echo "Post-build validation failed on attempt $attempt." + if [ $attempt -lt $max_attempts ]; then + echo " Nuking build directory before retry..." + rm -rf build 2>/dev/null || true + sleep 5 + attempt=$((attempt + 1)) + continue + else + echo "Validation still failing after $max_attempts attempts." + return 1 + fi + fi + fi echo "Build succeeded on attempt $attempt." return 0 fi From a608a9f642fbe4f427c902a228729da802796bca Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 00:04:07 -0400 Subject: [PATCH 07/29] ci: exclude dead GPU node atl1-1-03-002-29-0 (cuInit error 999) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/phoenix/submit-job.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh index 175e2fa881..809a4c0038 100755 --- a/.github/workflows/phoenix/submit-job.sh +++ b/.github/workflows/phoenix/submit-job.sh @@ -52,7 +52,8 @@ fi sbatch_gpu_opts="\ #SBATCH -p $gpu_partition #SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH -G2 +#SBATCH --exclude=atl1-1-03-002-29-0 # Known bad GPU (cuInit error 999)\ " if [ "$2" = "cpu" ]; then From 72c9c866d8d2a213550bd80bf88b26b872312e06 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 00:18:57 -0400 Subject: [PATCH 08/29] ci: unify job submission, test, and bench scripts across clusters Replace per-cluster submit/test/bench scripts with 
unified versions: - submit-slurm-job.sh: single parameterized submit+monitor script for all clusters (replaces phoenix/submit-job.sh, phoenix/submit.sh, frontier/submit.sh). Cluster config (account, QOS, partitions, time limits) is selected via a case block. Idempotent stale-job cancellation now applies to all clusters, not just Phoenix. - common/test.sh: unified test script with conditional build (skips if build/ exists from Frontier login-node build), cluster-aware GPU detection, thread counts, RDMA, and sharding. - common/bench.sh: unified bench script with conditional build, TMPDIR management (Phoenix-only), and cluster-aware bench flags. Also removes nick-fields/retry from bench.yml (frontier build.sh already uses retry_build internally) and deletes dead code (run-tests-with-retry.sh). test.yml self job: 5 conditional steps -> 2 steps (Build + Test). test.yml case-opt job: 5 conditional steps -> 3 steps. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/run-tests-with-retry.sh | 34 ---- .github/scripts/submit-slurm-job.sh | 199 ++++++++++++++++++++ .github/scripts/submit_and_monitor_bench.sh | 12 +- .github/workflows/bench.yml | 24 +-- .github/workflows/common/bench.sh | 50 +++++ .github/workflows/common/test.sh | 65 +++++++ .github/workflows/frontier/bench.sh | 12 -- .github/workflows/frontier/submit.sh | 105 ----------- .github/workflows/frontier/test.sh | 20 -- .github/workflows/frontier_amd/bench.sh | 1 - .github/workflows/frontier_amd/submit.sh | 1 - .github/workflows/frontier_amd/test.sh | 1 - .github/workflows/phoenix/bench.sh | 32 ---- .github/workflows/phoenix/submit-job.sh | 127 ------------- .github/workflows/phoenix/submit.sh | 34 ---- .github/workflows/phoenix/test.sh | 22 --- .github/workflows/test.yml | 32 +--- 17 files changed, 333 insertions(+), 438 deletions(-) delete mode 100755 .github/scripts/run-tests-with-retry.sh create mode 100755 .github/scripts/submit-slurm-job.sh create mode 100644 .github/workflows/common/bench.sh create mode 
100644 .github/workflows/common/test.sh delete mode 100644 .github/workflows/frontier/bench.sh delete mode 100644 .github/workflows/frontier/submit.sh delete mode 100644 .github/workflows/frontier/test.sh delete mode 120000 .github/workflows/frontier_amd/bench.sh delete mode 120000 .github/workflows/frontier_amd/submit.sh delete mode 120000 .github/workflows/frontier_amd/test.sh delete mode 100644 .github/workflows/phoenix/bench.sh delete mode 100755 .github/workflows/phoenix/submit-job.sh delete mode 100755 .github/workflows/phoenix/submit.sh delete mode 100644 .github/workflows/phoenix/test.sh diff --git a/.github/scripts/run-tests-with-retry.sh b/.github/scripts/run-tests-with-retry.sh deleted file mode 100755 index 18f1d05d0b..0000000000 --- a/.github/scripts/run-tests-with-retry.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Runs ./mfc.sh test with all provided arguments, then retries a small number -# of sporadic failures (up to 5). Exits non-zero on real failures. -# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...] - -# Extract flags that should carry over to retries (retries build their own -# argument list with --only, so we capture passthrough flags here). -PASSTHROUGH="" -for arg in "$@"; do - case "$arg" in - --test-all) PASSTHROUGH="$PASSTHROUGH --test-all" ;; - esac -done - -rm -f tests/failed_uuids.txt -TEST_EXIT=0 -/bin/bash mfc.sh test "$@" || TEST_EXIT=$? - -# Retry only if a small number of tests failed (sporadic failures) -if [ -s tests/failed_uuids.txt ]; then - NUM_FAILED=$(wc -l < tests/failed_uuids.txt) - if [ "$NUM_FAILED" -le 5 ]; then - FAILED=$(tr '\n' ' ' < tests/failed_uuids.txt) - echo "" - echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" - echo "" - /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $PASSTHROUGH || exit $? - else - echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." 
- exit 1 - fi -elif [ "$TEST_EXIT" -ne 0 ]; then - exit $TEST_EXIT -fi diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh new file mode 100755 index 0000000000..c201b84c82 --- /dev/null +++ b/.github/scripts/submit-slurm-job.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# Unified SLURM job submission and monitoring for all clusters. +# Submits a script as a SLURM batch job, then monitors it until completion. +# Idempotent: cancels stale jobs from previous runs before resubmission. +# +# Usage: submit-slurm-job.sh [shard] + +set -euo pipefail + +# Ignore SIGHUP to survive login node session drops +trap '' HUP + +usage() { + echo "Usage: $0 [shard]" +} + +script_path="${1:-}" +device="${2:-}" +interface="${3:-}" +cluster="${4:-}" +shard="${5:-}" + +if [ -z "$script_path" ] || [ -z "$device" ] || [ -z "$interface" ] || [ -z "$cluster" ]; then + usage + exit 1 +fi + +sbatch_script_contents=$(cat "$script_path") +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Detect job type from submitted script basename +script_basename="$(basename "$script_path" .sh)" +case "$script_basename" in + bench*) job_type="bench" ;; + *) job_type="test" ;; +esac + +# --- Cluster configuration --- +case "$cluster" in + phoenix) + compiler_flag="p" + account="gts-sbryngelson3" + job_prefix="shb" + qos="embers" + extra_sbatch="#SBATCH --requeue" + test_time="03:00:00" + bench_time="04:00:00" + gpu_partition_dynamic=true + ;; + frontier) + compiler_flag="f" + account="CFD154" + job_prefix="MFC" + qos="normal" + extra_sbatch="" + test_time="01:59:00" + bench_time="01:59:00" + gpu_partition_dynamic=false + ;; + frontier_amd) + compiler_flag="famd" + account="CFD154" + job_prefix="MFC" + qos="normal" + extra_sbatch="" + test_time="01:59:00" + bench_time="01:59:00" + gpu_partition_dynamic=false + ;; + *) + echo "ERROR: Unknown cluster '$cluster'" + exit 1 + ;; +esac + +# --- Time limit --- +if [ "$job_type" = "bench" ]; then + sbatch_time="#SBATCH -t 
$bench_time" +else + sbatch_time="#SBATCH -t $test_time" +fi + +# --- Device-specific SBATCH options --- +if [ "$device" = "cpu" ]; then + case "$cluster" in + phoenix) + sbatch_device_opts="\ +#SBATCH -p cpu-small +#SBATCH --ntasks-per-node=24 +#SBATCH --mem-per-cpu=2G" + ;; + frontier|frontier_amd) + sbatch_device_opts="#SBATCH -n 32" + ;; + esac +elif [ "$device" = "gpu" ]; then + # Determine GPU partition + if [ "$gpu_partition_dynamic" = "true" ]; then + # Use pre-selected bench partition if available, otherwise query sinfo + if [ -n "${BENCH_GPU_PARTITION:-}" ]; then + gpu_partition="$BENCH_GPU_PARTITION" + echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)" + else + source "${SCRIPT_DIR}/select-gpu-partition.sh" + gpu_partition="$SELECTED_GPU_PARTITION" + fi + fi + + case "$cluster" in + phoenix) + sbatch_device_opts="\ +#SBATCH -p $gpu_partition +#SBATCH --ntasks-per-node=4 +#SBATCH -G2 +#SBATCH --exclude=atl1-1-03-002-29-0" + ;; + frontier|frontier_amd) + sbatch_device_opts="\ +#SBATCH -n 8 +#SBATCH -p batch" + ;; + esac +else + usage + exit 1 +fi + +# --- Job slug --- +shard_suffix="" +if [ -n "$shard" ]; then + shard_suffix="-$(echo "$shard" | sed 's|/|-of-|')" +fi +job_slug="$(basename "$script_path" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-${device}-${interface}${shard_suffix}" +output_file="$job_slug.out" +id_file="${job_slug}.slurm_job_id" + +# --- Idempotency: cancel stale jobs from previous runs --- +if [ -f "$id_file" ]; then + existing_id=$(cat "$id_file") + state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true) + case "${state:-UNKNOWN}" in + RUNNING|PENDING|REQUEUED|COMPLETING) + echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission" + scancel "$existing_id" 2>/dev/null || true + ;; + *) + echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh" + ;; + esac + rm -f "$id_file" +fi + +# --- Module load mode (short 
form) --- +module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c") + +# --- Submit --- +submit_output=$(sbatch < "$id_file" +echo "Job ID written to $id_file" + +# --- Monitor --- +bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index e0a6eb7384..0887d9fb1d 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -19,13 +19,11 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Always use the PR's submit.sh so both master and PR builds benefit from the -# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is -# still resolved relative to the current directory (master/ or pr/) so the -# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs -# in the right directory regardless of which submit.sh is invoked. -PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh" -bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface" +# Always use the PR's submit-slurm-job.sh so both master and PR builds benefit +# from the run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench +# script is resolved relative to the current directory (master/ or pr/) so the +# correct branch code is benchmarked. 
+bash "${SCRIPT_DIR}/submit-slurm-job.sh" .github/workflows/common/bench.sh "$device" "$interface" "$cluster" # Verify the YAML output file was created job_slug="bench-$device-$interface" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 8a1c848493..9ccb8406b1 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -105,21 +105,15 @@ jobs: - name: Setup & Build if: matrix.build_script != '' - uses: nick-fields/retry@v3 - with: - max_attempts: 2 - retry_wait_seconds: 60 - timeout_minutes: 150 - command: | - (cd pr && ${{ matrix.build_script }}) & - pid1=$! - (cd master && ${{ matrix.build_script }}) & - pid2=$! - wait $pid1; e1=$? - wait $pid2; e2=$? - [ $e1 -eq 0 ] && [ $e2 -eq 0 ] - on_retry_command: | - rm -rf pr/build master/build + timeout-minutes: 150 + run: | + (cd pr && ${{ matrix.build_script }}) & + pid1=$! + (cd master && ${{ matrix.build_script }}) & + pid2=$! + wait $pid1; e1=$? + wait $pid2; e2=$? + [ $e1 -eq 0 ] && [ $e2 -eq 0 ] - name: Bench (Master v. PR) run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh new file mode 100644 index 0000000000..66261d0564 --- /dev/null +++ b/.github/workflows/common/bench.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Unified benchmark script for all clusters. +# Runs inside a SLURM job via submit-slurm-job.sh. +# Expects env vars: $job_device, $job_interface, $job_slug, $job_cluster + +set -e + +source .github/scripts/bench-preamble.sh + +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes +# (GNR nodes have 192 cores but nproc is too aggressive for build/bench). +n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) + +# --- Phoenix TMPDIR setup --- +if [ "$job_cluster" = "phoenix" ]; then + tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build + currentdir=$tmpbuild/run-$(( RANDOM % 900 )) + mkdir -p $tmpbuild + mkdir -p $currentdir + export TMPDIR=$currentdir +fi + +# --- Build (if not pre-built on login node) --- +# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +if [ ! -d "build" ]; then + rm -rf build + source .github/scripts/retry-build.sh + retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 +fi + +# --- Bench cluster flag --- +if [ "$job_cluster" = "phoenix" ]; then + bench_cluster="phoenix-bench" +else + bench_cluster="$job_cluster" +fi + +# --- Run benchmark --- +if [ "$job_device" = "gpu" ]; then + ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks +else + ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks +fi + +# --- Phoenix cleanup --- +if [ "$job_cluster" = "phoenix" ]; then + sleep 10 + rm -rf "$currentdir" || true + unset TMPDIR +fi diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh new file mode 100644 index 0000000000..5a0b771fb1 --- /dev/null +++ b/.github/workflows/common/test.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Unified test script for all clusters. +# Runs inside a SLURM job via submit-slurm-job.sh. +# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster + +set -e + +source .github/scripts/gpu-opts.sh +build_opts="$gpu_opts" + +# --- Build (if not pre-built on login node) --- +# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +if [ ! -d "build" ]; then + rm -rf build + source .github/scripts/retry-build.sh + + # Phoenix: smoke-test the syscheck binary to catch architecture mismatches + # (SIGILL from binaries compiled on a different compute node). 
+ validate_cmd="" + if [ "$job_cluster" = "phoenix" ]; then + validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' + fi + + RETRY_VALIDATE_CMD="$validate_cmd" \ + retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 +fi + +# --- GPU detection and thread count --- +device_opts="" +rdma_opts="" +shard_opts="" + +case "$job_cluster" in + phoenix) n_test_threads=8 ;; + *) n_test_threads=32 ;; +esac + +if [ "$job_device" = "gpu" ]; then + source .github/scripts/detect-gpus.sh + + case "$job_cluster" in + phoenix) + device_opts="-g $gpu_ids" + n_test_threads=$((ngpus * 2)) + ;; + *) + device_opts="$gpu_opts" + n_test_threads=$ngpus + ;; + esac + + # RDMA for Frontier CCE (not frontier_amd) + if [ "$job_cluster" = "frontier" ]; then + rdma_opts="--rdma-mpi" + fi +else + device_opts="--no-gpu" +fi + +# --- Sharding (Frontier only) --- +if [ -n "${job_shard:-}" ]; then + shard_opts="--shard $job_shard" +fi + +./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $rdma_opts $device_opts $shard_opts -- -c $job_cluster diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh deleted file mode 100644 index b896feb17c..0000000000 --- a/.github/workflows/frontier/bench.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -source .github/scripts/bench-preamble.sh - -# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes. -n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) - -if [ "$job_device" = "gpu" ]; then - ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks -else - ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks -fi diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh deleted file mode 100644 index 4b472cd433..0000000000 --- a/.github/workflows/frontier/submit.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -set -e - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -# Determine compiler flag from directory name -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cluster_name="$(basename "$SCRIPT_DIR")" -case "$cluster_name" in - frontier) compiler_flag="f" ;; - frontier_amd) compiler_flag="famd" ;; - *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; -esac - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]" -} - -if [ ! -z "$1" ]; then - sbatch_script_contents=`cat $1` -else - usage - exit 1 -fi - -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 32 # Number of cores required" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 8 # Number of cores required" -else - usage - exit 1 -fi - -# Select SBATCH params based on job type -if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -else - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -fi - -shard_suffix="" -if [ -n "$4" ]; then - shard_suffix="-$(echo "$4" | sed 's|/|-of-|')" -fi -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 
's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}" -output_file="$job_slug.out" - -submit_output=$(sbatch < 64 ? 64 : $(nproc) )) - -tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build -currentdir=$tmpbuild/run-$(( RANDOM % 900 )) -mkdir -p $tmpbuild -mkdir -p $currentdir - -export TMPDIR=$currentdir - -if [ "$job_device" = "gpu" ]; then - bench_opts="--mem 4" -else - bench_opts="--mem 1" -fi - -rm -rf build - -source .github/scripts/retry-build.sh -retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 - -./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks - -sleep 10 -rm -rf "$currentdir" || true - -unset TMPDIR diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh deleted file mode 100755 index 809a4c0038..0000000000 --- a/.github/workflows/phoenix/submit-job.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# Submit a SLURM job without waiting for it to complete. -# Writes the job ID to .slurm_job_id so a separate monitor step can wait. -# Idempotent: if a job for this slug is still RUNNING or PENDING, skip resubmission. 
-# -# Usage: submit-job.sh [script.sh] [cpu|gpu] [none|acc|omp] - -set -euo pipefail - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]" -} - -if [ -z "${1:-}" ]; then - usage - exit 1 -fi - -sbatch_script_contents=$(cat "$1") - -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - -sbatch_cpu_opts="\ -#SBATCH -p cpu-small # partition -#SBATCH --ntasks-per-node=24 # Number of cores per node required -#SBATCH --mem-per-cpu=2G # Memory per core\ -" - -source "$(dirname "${BASH_SOURCE[0]}")/../../scripts/select-gpu-partition.sh" -gpu_partition="$SELECTED_GPU_PARTITION" - -if [ "$job_type" = "bench" ]; then - # If run_parallel_benchmarks.sh pre-selected a partition, use it so both - # PR and master jobs land on the same GPU type for a fair comparison. - if [ -n "${BENCH_GPU_PARTITION:-}" ]; then - gpu_partition="$BENCH_GPU_PARTITION" - echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)" - fi - sbatch_time="#SBATCH -t 04:00:00" -else - sbatch_time="#SBATCH -t 03:00:00" -fi - -sbatch_gpu_opts="\ -#SBATCH -p $gpu_partition -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2 -#SBATCH --exclude=atl1-1-03-002-29-0 # Known bad GPU (cuInit error 999)\ -" - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="$sbatch_cpu_opts" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="$sbatch_gpu_opts" -else - usage - exit 1 -fi - -job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3" -output_file="$job_slug.out" -id_file="${job_slug}.slurm_job_id" - -# On rerun, cancel any existing job for this slug and submit a fresh one. -# If the job is still live (RUNNING/PENDING), scancel it first as a safety net -# in case the "Cancel SLURM Jobs" step did not fire (e.g. runner was SIGKILL'd). 
-if [ -f "$id_file" ]; then - existing_id=$(cat "$id_file") - state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true) - case "${state:-UNKNOWN}" in - RUNNING|PENDING|REQUEUED|COMPLETING) - echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission" - scancel "$existing_id" 2>/dev/null || true - ;; - *) - echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh" - ;; - esac - rm -f "$id_file" -fi - -submit_output=$(sbatch < "$id_file" -echo "Job ID written to $id_file" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh deleted file mode 100755 index 0c009bd001..0000000000 --- a/.github/workflows/phoenix/submit.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Submit a SLURM job and wait for it to complete. -# Delegates submission (with idempotency) to submit-job.sh, then monitors. -# -# Usage: submit.sh [script.sh] [cpu|gpu] [none|acc|omp] - -set -euo pipefail - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]" -} - -if [ -z "${1:-}" ]; then - usage - exit 1 -fi - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Submit (idempotent — skips resubmission if a live job already exists) -bash "$SCRIPT_DIR/submit-job.sh" "$@" - -# Derive the same job slug and file paths as submit-job.sh. -# NOTE: this sed pipeline must stay identical to the one in submit-job.sh — -# if they diverge the id-file will not be found and the monitor will fail. 
-job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3" -output_file="$job_slug.out" -id_file="${job_slug}.slurm_job_id" - -job_id=$(cat "$id_file") -bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh deleted file mode 100644 index d073c54bde..0000000000 --- a/.github/workflows/phoenix/test.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -source .github/scripts/gpu-opts.sh -build_opts="$gpu_opts" - -rm -rf build - -# Build with retry; smoke-test the freshly built syscheck binary to catch -# architecture mismatches (SIGILL from binaries compiled on a different compute node). -source .github/scripts/retry-build.sh -RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \ - retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 - -n_test_threads=8 - -if [ "$job_device" = "gpu" ]; then - source .github/scripts/detect-gpus.sh - device_opts="-g $gpu_ids" - n_test_threads=$((ngpus * 2)) -fi - -./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts ${build_opts:---no-gpu} -- -c phoenix diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index afeca4b0a3..a410047a58 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -237,27 +237,16 @@ jobs: uses: actions/checkout@v4 with: # clean: false preserves .slurm_job_id files across reruns so - # submit-job.sh can detect and cancel stale SLURM jobs on retry. + # submit-slurm-job.sh can detect and cancel stale SLURM jobs on retry. 
clean: false - - name: Build + - name: Build (login node) if: matrix.cluster != 'phoenix' timeout-minutes: 60 run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - - name: Submit SLURM Test Job - if: matrix.cluster == 'phoenix' - run: bash .github/workflows/phoenix/submit-job.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} ${{ matrix.interface }} - - - name: Monitor SLURM Test Job - if: matrix.cluster == 'phoenix' - run: | - slug="test-${{ matrix.device }}-${{ matrix.interface }}" - bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out" - - name: Test - if: matrix.cluster != 'phoenix' - run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }} + run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }} - name: Cancel SLURM Jobs if: cancelled() @@ -336,25 +325,14 @@ jobs: - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' - run: bash .github/workflows/phoenix/submit.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Pre-Build (login node) if: matrix.cluster != 'phoenix' run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} - - name: Submit Case-Optimization Tests - if: matrix.cluster == 'phoenix' - run: bash .github/workflows/phoenix/submit-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} - - - name: Monitor Case-Optimization Tests - if: matrix.cluster == 'phoenix' - run: | - slug="run-case-optimization-${{ matrix.device 
}}-${{ matrix.interface }}" - bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out" - - name: Run Case-Optimization Tests - if: matrix.cluster != 'phoenix' - run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Cancel SLURM Jobs if: cancelled() From 49d3a7bbc83d7f7d40b00971e3ec673475482dcb Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 00:24:04 -0400 Subject: [PATCH 09/29] ci: remove dead rm -rf build, add gpu_partition default - Remove no-op 'rm -rf build' inside 'if [ ! -d build ]' guard in common/test.sh and common/bench.sh. - Default gpu_partition to 'batch' before dynamic selection to prevent unbound variable error if a new cluster is added. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/submit-slurm-job.sh | 1 + .github/workflows/common/bench.sh | 1 - .github/workflows/common/test.sh | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index c201b84c82..29b46c1d69 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -95,6 +95,7 @@ if [ "$device" = "cpu" ]; then esac elif [ "$device" = "gpu" ]; then # Determine GPU partition + gpu_partition="batch" if [ "$gpu_partition_dynamic" = "true" ]; then # Use pre-selected bench partition if available, otherwise query sinfo if [ -n "${BENCH_GPU_PARTITION:-}" ]; then diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 66261d0564..f974c0787a 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -23,7 +23,6 @@ fi # --- Build (if not pre-built on login node) --- # Phoenix builds inside SLURM; Frontier 
pre-builds via build.sh on the login node. if [ ! -d "build" ]; then - rm -rf build source .github/scripts/retry-build.sh retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 fi diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 5a0b771fb1..d332297240 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -11,7 +11,6 @@ build_opts="$gpu_opts" # --- Build (if not pre-built on login node) --- # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. if [ ! -d "build" ]; then - rm -rf build source .github/scripts/retry-build.sh # Phoenix: smoke-test the syscheck binary to catch architecture mismatches From 8825dff207a9c21145af56b7211de15e7b89343a Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 00:29:31 -0400 Subject: [PATCH 10/29] ci: use strict shell mode in common/test.sh and common/bench.sh Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/bench.sh | 2 +- .github/workflows/common/test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index f974c0787a..1c77aa1993 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -3,7 +3,7 @@ # Runs inside a SLURM job via submit-slurm-job.sh. # Expects env vars: $job_device, $job_interface, $job_slug, $job_cluster -set -e +set -euo pipefail source .github/scripts/bench-preamble.sh diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index d332297240..0c13d25e19 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -3,7 +3,7 @@ # Runs inside a SLURM job via submit-slurm-job.sh. 
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster -set -e +set -euo pipefail source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" From c68d6d6676ebdf3d8d3ee956be9f47189351983c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 01:34:17 -0400 Subject: [PATCH 11/29] ci: validate existing Phoenix build against node ISA before reuse On Phoenix, test.yml uses clean:false so build/ can persist across reruns. If the prior run built on a different CPU microarchitecture, the stale binaries would SIGILL. Run syscheck on any existing build and nuke build/ on failure so the rebuild block fires. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/test.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 0c13d25e19..6a5e52e0a0 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -10,6 +10,16 @@ build_opts="$gpu_opts" # --- Build (if not pre-built on login node) --- # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# Phoenix: validate an existing build against this node's CPU ISA. +# A stale build/ from a prior run on a different microarchitecture would SIGILL. +if [ "$job_cluster" = "phoenix" ] && [ -d "build" ]; then + syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1) + if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then + echo "syscheck failed on existing build — nuking build/" + rm -rf build + fi +fi + if [ ! 
-d "build" ]; then source .github/scripts/retry-build.sh From 7f70c2e6a9bad5dc100d236eddc2cb07ded43a82 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 01:36:49 -0400 Subject: [PATCH 12/29] ci: always nuke build/ on Phoenix to avoid stale ISA mismatches Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/test.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 6a5e52e0a0..dcc28ee3f6 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -10,14 +10,10 @@ build_opts="$gpu_opts" # --- Build (if not pre-built on login node) --- # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. -# Phoenix: validate an existing build against this node's CPU ISA. -# A stale build/ from a prior run on a different microarchitecture would SIGILL. -if [ "$job_cluster" = "phoenix" ] && [ -d "build" ]; then - syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1) - if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then - echo "syscheck failed on existing build — nuking build/" - rm -rf build - fi +# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh +# to avoid SIGILL from stale binaries compiled on a different microarchitecture. +if [ "$job_cluster" = "phoenix" ]; then + rm -rf build fi if [ ! -d "build" ]; then From a3c37ceb855830db280e6c41d885356a029b9a4f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 01:41:12 -0400 Subject: [PATCH 13/29] ci: nuke stale Phoenix bench builds; trap TMPDIR cleanup on exit Same ISA mismatch fix as test.sh: always rm -rf build on Phoenix. Also add trap EXIT for TMPDIR cleanup so early failures don't leak temp directories under /storage/project. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/bench.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 1c77aa1993..44c2991359 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -18,10 +18,16 @@ if [ "$job_cluster" = "phoenix" ]; then mkdir -p $tmpbuild mkdir -p $currentdir export TMPDIR=$currentdir + trap 'rm -rf "$currentdir" || true' EXIT fi # --- Build (if not pre-built on login node) --- # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). +if [ "$job_cluster" = "phoenix" ]; then + rm -rf build +fi + if [ ! -d "build" ]; then source .github/scripts/retry-build.sh retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 @@ -41,9 +47,8 @@ else ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks fi -# --- Phoenix cleanup --- +# --- Phoenix cleanup (trap EXIT handles rm -rf "$currentdir") --- if [ "$job_cluster" = "phoenix" ]; then sleep 10 - rm -rf "$currentdir" || true unset TMPDIR fi From 5efa8273c83f9f927449d2520b66f526fae2d4d0 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 01:46:52 -0400 Subject: [PATCH 14/29] ci: widen TMPDIR random range to reduce collision risk Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 44c2991359..b1bff6a75e 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -14,7 +14,7 @@ n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) # --- Phoenix TMPDIR setup --- if [ "$job_cluster" = "phoenix" ]; then tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build - currentdir=$tmpbuild/run-$(( RANDOM % 900 )) + currentdir=$tmpbuild/run-$(( RANDOM % 9000 )) mkdir -p $tmpbuild mkdir -p $currentdir export TMPDIR=$currentdir From b7e8e75586da3865dbb9d1ecf35b3dbc9c4044c3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 01:52:41 -0400 Subject: [PATCH 15/29] ci: remove no-op -j flag from mfc.sh bench invocations bench.py spawns ./mfc.sh run as a subprocess without forwarding -j, so the flag was silently ignored. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/bench.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index b1bff6a75e..2584e62de0 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -42,9 +42,9 @@ fi # --- Run benchmark --- if [ "$job_device" = "gpu" ]; then - ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks + ./mfc.sh bench --mem 4 -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks else - ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks + ./mfc.sh bench --mem 1 -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks fi # --- Phoenix cleanup (trap EXIT handles rm -rf "$currentdir") --- From 5a46a3e8970c2fbbfa28ab3536cc6e4bf4efbbbf Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 01:53:47 -0400 Subject: [PATCH 16/29] ci: add space after SBATCH -o for consistency with SLURM docs Co-Authored-By: Claude Opus 4.6 --- .github/scripts/submit-slurm-job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index 29b46c1d69..a4ba1dfc0f 100755 --- 
a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -164,7 +164,7 @@ ${sbatch_device_opts} ${sbatch_time} #SBATCH --qos=${qos} ${extra_sbatch} -#SBATCH -o${output_file} +#SBATCH -o ${output_file} set -e set -x From 61b84cfdf1a75a4811f4577af409f8f6abf18182 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 10:11:06 -0400 Subject: [PATCH 17/29] ci: fix bench master job failing to find common/bench.sh submit_and_monitor_bench.sh cd's into master/ before calling submit-slurm-job.sh, which reads the bench script via cat. Since master branch doesn't have common/bench.sh yet, the cat fails. Fix by resolving the bench script path from the PR tree (absolute path) so it works regardless of cwd. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/submit_and_monitor_bench.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 0887d9fb1d..62a377bb26 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -19,11 +19,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Always use the PR's submit-slurm-job.sh so both master and PR builds benefit -# from the run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench -# script is resolved relative to the current directory (master/ or pr/) so the -# correct branch code is benchmarked. -bash "${SCRIPT_DIR}/submit-slurm-job.sh" .github/workflows/common/bench.sh "$device" "$interface" "$cluster" +# Use the PR's submit-slurm-job.sh and bench script for both master and PR jobs. +# The bench script must come from the PR tree (master may not have common/bench.sh +# yet), and the script only orchestrates build+bench — the actual MFC code under +# test is the cwd's checkout (master/ or pr/). 
+PR_BENCH_SCRIPT="$(cd "${SCRIPT_DIR}/../workflows/common" && pwd)/bench.sh" +bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster" # Verify the YAML output file was created job_slug="bench-$device-$interface" From df60f822f755e1fc27f62a3b0370d791bf37b1cd Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 15:49:29 -0400 Subject: [PATCH 18/29] ci: drop gpu-rtx6000 from partition list (too slow for test time limit) RTX 6000 nodes can't finish the full test suite within the 3-hour SLURM wall time. Use gpu-l40s as the new fallback. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/select-gpu-partition.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/scripts/select-gpu-partition.sh b/.github/scripts/select-gpu-partition.sh index 8ff6e5d107..7f19ee04c0 100644 --- a/.github/scripts/select-gpu-partition.sh +++ b/.github/scripts/select-gpu-partition.sh @@ -3,15 +3,16 @@ # Sources into caller: exports SELECTED_GPU_PARTITION. # # Priority order prefers smaller/older nodes to leave modern GPUs free -# for production workloads. Falls back to gpu-rtx6000 if nothing is idle. +# for production workloads. Falls back to gpu-l40s if nothing is idle. +# RTX 6000 nodes are excluded (too slow for the test suite time limit). # # Optional: set GPU_PARTITION_MIN_NODES before sourcing to require a minimum # number of idle/mix nodes (e.g. GPU_PARTITION_MIN_NODES=2 for parallel bench jobs). 
# # Usage: source .github/scripts/select-gpu-partition.sh -_GPU_PARTITION_PRIORITY="gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100" -_GPU_PARTITION_FALLBACK="gpu-rtx6000" +_GPU_PARTITION_PRIORITY="gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100" +_GPU_PARTITION_FALLBACK="gpu-l40s" _GPU_PARTITION_MIN_NODES="${GPU_PARTITION_MIN_NODES:-1}" SELECTED_GPU_PARTITION="" From 1c4bd73c02dc2ed894d649c12f932bacd09e059f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 15:50:24 -0400 Subject: [PATCH 19/29] ci: deprioritize gpu-v100 to last in partition selection Co-Authored-By: Claude Opus 4.6 --- .github/scripts/select-gpu-partition.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/select-gpu-partition.sh b/.github/scripts/select-gpu-partition.sh index 7f19ee04c0..1ebd6906fe 100644 --- a/.github/scripts/select-gpu-partition.sh +++ b/.github/scripts/select-gpu-partition.sh @@ -11,7 +11,7 @@ # # Usage: source .github/scripts/select-gpu-partition.sh -_GPU_PARTITION_PRIORITY="gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100" +_GPU_PARTITION_PRIORITY="gpu-l40s gpu-h200 gpu-h100 gpu-a100 gpu-v100" _GPU_PARTITION_FALLBACK="gpu-l40s" _GPU_PARTITION_MIN_NODES="${GPU_PARTITION_MIN_NODES:-1}" From b51f214469a1296bcfd2ef39678317454a67b06e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 15:54:59 -0400 Subject: [PATCH 20/29] ci: pass build_opts (GPU interface flag) to live test command The dry-run build uses build_opts but the live test command didn't. CMake caches the config, but passing it explicitly is safer. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index dcc28ee3f6..a003552bf0 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -67,4 +67,4 @@ if [ -n "${job_shard:-}" ]; then shard_opts="--shard $job_shard" fi -./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $rdma_opts $device_opts $shard_opts -- -c $job_cluster +./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster From 698bd2e7f642c4f2fa6ffb24523109ea5b6e558e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 16:00:45 -0400 Subject: [PATCH 21/29] ci: fix wait + set -e race that orphans parallel jobs Under set -e, 'wait $pid' returning non-zero aborts the script before the exit code is captured, leaving the second parallel job unmonitored. Use 'wait $pid || exit=$?' so both jobs are always waited on. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/run_parallel_benchmarks.sh | 10 +++++----- .github/workflows/bench.yml | 10 +++++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 901a6edebe..b6a6034c3c 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -42,12 +42,13 @@ echo "Master job started in background (PID: $master_pid)" echo "Waiting for both jobs to complete..." -# Wait and capture exit codes reliably +# Wait and capture exit codes reliably. +# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure +# (which would orphan the second job). pr_exit=0 master_exit=0 -wait "$pr_pid" -pr_exit=$? +wait "$pr_pid" || pr_exit=$? 
if [ "$pr_exit" -ne 0 ]; then echo "PR job exited with code: $pr_exit" echo "Last 50 lines of PR job log:" @@ -56,8 +57,7 @@ else echo "PR job completed successfully" fi -wait "$master_pid" -master_exit=$? +wait "$master_pid" || master_exit=$? if [ "$master_exit" -ne 0 ]; then echo "Master job exited with code: $master_exit" echo "Last 50 lines of master job log:" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 9ccb8406b1..7ce02c1e3f 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -111,9 +111,13 @@ jobs: pid1=$! (cd master && ${{ matrix.build_script }}) & pid2=$! - wait $pid1; e1=$? - wait $pid2; e2=$? - [ $e1 -eq 0 ] && [ $e2 -eq 0 ] + e1=0; e2=0 + wait $pid1 || e1=$? + wait $pid2 || e2=$? + if [ $e1 -ne 0 ] || [ $e2 -ne 0 ]; then + echo "Build failures: pr=$e1 master=$e2" + exit 1 + fi - name: Bench (Master v. PR) run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} From 1d4d79f4e5b61608d3d53b24feb99bdd1d6c168f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 16:02:36 -0400 Subject: [PATCH 22/29] ci: fix stale comments from incremental refactoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - select-gpu-partition.sh: priority comment said 'smaller/older' but list is now L40S/H200/H100/A100/V100 - submit-slurm-job.sh: 'Idempotent' → 'Rerun-safe' (it always submits a new job) - bench.sh: n_jobs only used for build, not bench Co-Authored-By: Claude Opus 4.6 --- .github/scripts/select-gpu-partition.sh | 5 +++-- .github/scripts/submit-slurm-job.sh | 2 +- .github/workflows/common/bench.sh | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/scripts/select-gpu-partition.sh b/.github/scripts/select-gpu-partition.sh index 1ebd6906fe..c812c000a9 100644 --- a/.github/scripts/select-gpu-partition.sh +++ b/.github/scripts/select-gpu-partition.sh @@ -2,8 
+2,9 @@ # Select the best available Phoenix GPU partition using sinfo. # Sources into caller: exports SELECTED_GPU_PARTITION. # -# Priority order prefers smaller/older nodes to leave modern GPUs free -# for production workloads. Falls back to gpu-l40s if nothing is idle. +# Priority order prefers partitions most likely to have availability. +# V100 is last due to slower performance near the test time limit. +# Falls back to gpu-l40s if no partition meets the idle node threshold. # RTX 6000 nodes are excluded (too slow for the test suite time limit). # # Optional: set GPU_PARTITION_MIN_NODES before sourcing to require a minimum diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index a4ba1dfc0f..084d8988eb 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -1,7 +1,7 @@ #!/bin/bash # Unified SLURM job submission and monitoring for all clusters. # Submits a script as a SLURM batch job, then monitors it until completion. -# Idempotent: cancels stale jobs from previous runs before resubmission. +# Rerun-safe: cancels stale jobs from previous runs before resubmission. # # Usage: submit-slurm-job.sh [shard] diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 2584e62de0..3251f7baca 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -8,7 +8,7 @@ set -euo pipefail source .github/scripts/bench-preamble.sh # Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes -# (GNR nodes have 192 cores but nproc is too aggressive for build/bench). +# (GNR nodes have 192 cores but nproc is too aggressive for build). n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) # --- Phoenix TMPDIR setup --- From 6cba9352e97be4ec28f2d2ca294a4143787acab8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 16:03:09 -0400 Subject: [PATCH 23/29] ci: fix duplicate --gpu flag on Frontier GPU test commands For non-Phoenix GPU jobs, both device_opts and build_opts resolved to the same --gpu flag. Let build_opts carry it; device_opts is only for cluster-specific runtime flags like -g (Phoenix GPU IDs). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/common/test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index a003552bf0..746c54f5d1 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -49,7 +49,8 @@ if [ "$job_device" = "gpu" ]; then n_test_threads=$((ngpus * 2)) ;; *) - device_opts="$gpu_opts" + # Frontier: --gpu flag is already in $build_opts; no extra device opts needed + device_opts="" n_test_threads=$ngpus ;; esac From 4915884ab6f783177c32f629e7c6d6eebd4cfe44 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 18:22:04 -0400 Subject: [PATCH 24/29] ci: remove stale .out file before SLURM submission With clean:false, old SLURM job epilogs can write to the .out file after our stale-job check completes. The monitor tail then picks up this stale output (including errors from dead nodes) and reports it as if it came from the new job. Removing the .out file before submission ensures a clean output stream. 
Co-Authored-By: Claude Opus 4.6 --- .github/scripts/submit-slurm-job.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index 084d8988eb..6f5ab5f366 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -151,6 +151,11 @@ if [ -f "$id_file" ]; then rm -f "$id_file" fi +# Remove stale output file so the monitor doesn't pick up old content +# (a previous SLURM job's epilog can write to the .out file after our +# stale-job check, polluting the new job's output stream). +rm -f "$output_file" + # --- Module load mode (short form) --- module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c") From e1e0f424f53b20f88d3bb3435b200de4d9c0d32e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 23:12:36 -0400 Subject: [PATCH 25/29] ci: cap case-optimization build jobs at 8 to match prebuild Co-Authored-By: Claude Opus 4.6 --- .github/scripts/run_case_optimization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh index 167505ece3..922d0a9012 100755 --- a/.github/scripts/run_case_optimization.sh +++ b/.github/scripts/run_case_optimization.sh @@ -44,7 +44,7 @@ for case in "${benchmarks[@]}"; do rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data" # Build + run with --case-optimization, small grid, 10 timesteps - if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then + if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then # Validate output if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then echo "PASS: $case_name" From 97ecc2353a4a7aef5c1d3ba70042efffa2ea8a25 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 10 Mar 2026 23:35:13 -0400 Subject: [PATCH 26/29] ci: submit case-opt 
prebuild to CPU partition (no GPU needed for compilation) Co-Authored-By: Claude Opus 4.6 --- .github/scripts/prebuild-case-optimization.sh | 15 ++++++++++++--- .github/workflows/test.yml | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 130f523c07..581630f742 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -1,14 +1,15 @@ #!/bin/bash # Pre-builds all benchmark cases with --case-optimization. +# No GPU hardware needed — compilation only. # Can run in two modes: # 1. Direct (Frontier login nodes): pass cluster/device/interface as args -# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh +# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh # Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>] set -e -# Support both positional args (direct invocation) and env vars (SLURM via submit.sh) +# Support both positional args (direct invocation) and env vars (SLURM) cluster="${1:-${job_cluster:-phoenix}}" job_device="${2:-$job_device}" job_interface="${3:-$job_interface}" @@ -24,7 +25,15 @@ esac rm -rf build . ./mfc.sh load -c "$flag" -m g -source .github/scripts/gpu-opts.sh
+case "$job_interface" in + acc) gpu_opts="--gpu acc" ;; + omp) gpu_opts="--gpu mp" ;; + *) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;; +esac for case in benchmarks/*/case.py; do echo "=== Pre-building: $case ===" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a410047a58..80bb3af468 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -325,7 +325,7 @@ jobs: - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' - run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }} - name: Pre-Build (login node) if: matrix.cluster != 'phoenix' From 17934983abb33b9d50cd2545890c553a1a36184d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Mar 2026 08:06:24 -0400 Subject: [PATCH 27/29] ci: add CPU architecture to build cache key to prevent SIGILL GitHub Actions runners have different CPU microarchitectures. MFC compiles with -march=native, so cached binaries from one runner can contain instructions illegal on another. Adding the GCC-detected -march target to the cache key ensures each ISA gets its own cache. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 80bb3af468..8e32d06315 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,11 +98,19 @@ jobs: - name: Clone uses: actions/checkout@v4 + - name: CPU Architecture Hash + id: cpu + run: | + # Include CPU model in cache key so binaries compiled with -march=native + # on one runner aren't reused on a runner with a different ISA (→ SIGILL). 
+ arch=$(gcc -march=native -Q --help=target 2>/dev/null | grep -m1 '^\s*-march=' | awk '{print $2}') + echo "arch=${arch:-unknown}" >> "$GITHUB_OUTPUT" + - name: Restore Build Cache uses: actions/cache@v4 with: path: build - key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }} + key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ steps.cpu.outputs.arch }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }} - name: Setup MacOS if: matrix.os == 'macos' From dbe6a5e6f91fddf6b49dc669e0a39e8210546bca Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Mar 2026 08:43:14 -0400 Subject: [PATCH 28/29] ci: remove build cache from GitHub runners (prevents SIGILL, negligible speedup) Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8e32d06315..a52a5967d1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,20 +98,6 @@ jobs: - name: Clone uses: actions/checkout@v4 - - name: CPU Architecture Hash - id: cpu - run: | - # Include CPU model in cache key so binaries compiled with -march=native - # on one runner aren't reused on a runner with a different ISA (→ SIGILL). 
- arch=$(gcc -march=native -Q --help=target 2>/dev/null | grep -m1 '^\s*-march=' | awk '{print $2}') - echo "arch=${arch:-unknown}" >> "$GITHUB_OUTPUT" - - - name: Restore Build Cache - uses: actions/cache@v4 - with: - path: build - key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ steps.cpu.outputs.arch }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }} - - name: Setup MacOS if: matrix.os == 'macos' run: | From fde5fc475cc56f3c5896a0a536819b1d9ea8b89b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Mar 2026 09:03:25 -0400 Subject: [PATCH 29/29] ci: switch Frontier to service partition and develop QOS Co-Authored-By: Claude Opus 4.6 --- .github/scripts/submit-slurm-job.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index 6f5ab5f366..eb6702cfbe 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -51,7 +51,7 @@ case "$cluster" in compiler_flag="f" account="CFD154" job_prefix="MFC" - qos="normal" + qos="develop" extra_sbatch="" test_time="01:59:00" bench_time="01:59:00" @@ -61,7 +61,7 @@ case "$cluster" in compiler_flag="famd" account="CFD154" job_prefix="MFC" - qos="normal" + qos="develop" extra_sbatch="" test_time="01:59:00" bench_time="01:59:00" @@ -90,7 +90,9 @@ if [ "$device" = "cpu" ]; then #SBATCH --mem-per-cpu=2G" ;; frontier|frontier_amd) - sbatch_device_opts="#SBATCH -n 32" + sbatch_device_opts="\ +#SBATCH -n 32 +#SBATCH -p service" ;; esac elif [ "$device" = "gpu" ]; then @@ -118,7 +120,7 @@ elif [ "$device" = "gpu" ]; then frontier|frontier_amd) sbatch_device_opts="\ #SBATCH -n 8 -#SBATCH -p batch" +#SBATCH -p service" ;; esac else