Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
9956b4f
ci: replace nick-fields/retry with plain run step; deprioritize v100 …
Mar 10, 2026
2fb93db
ci: remove unused RETRY_VALIDATE_CMD from retry_build
Mar 10, 2026
24185f8
ci: shared sinfo-based GPU partition selection for tests and benchmarks
Mar 10, 2026
946161d
bench: update Phoenix tmpbuild path to project storage
Mar 10, 2026
2384287
bench: require 2 idle/mix nodes for parallel benchmark GPU partition …
Mar 10, 2026
60bcfaa
ci: restore RETRY_VALIDATE_CMD support in retry_build
Mar 10, 2026
a608a9f
ci: exclude dead GPU node atl1-1-03-002-29-0 (cuInit error 999)
Mar 10, 2026
72c9c86
ci: unify job submission, test, and bench scripts across clusters
Mar 10, 2026
49d3a7b
ci: remove dead rm -rf build, add gpu_partition default
Mar 10, 2026
8825dff
ci: use strict shell mode in common/test.sh and common/bench.sh
Mar 10, 2026
c68d6d6
ci: validate existing Phoenix build against node ISA before reuse
Mar 10, 2026
7f70c2e
ci: always nuke build/ on Phoenix to avoid stale ISA mismatches
Mar 10, 2026
a3c37ce
ci: nuke stale Phoenix bench builds; trap TMPDIR cleanup on exit
Mar 10, 2026
5efa827
ci: widen TMPDIR random range to reduce collision risk
Mar 10, 2026
b7e8e75
ci: remove no-op -j flag from mfc.sh bench invocations
Mar 10, 2026
5a46a3e
ci: add space after SBATCH -o for consistency with SLURM docs
Mar 10, 2026
61b84cf
ci: fix bench master job failing to find common/bench.sh
sbryngelson Mar 10, 2026
df60f82
ci: drop gpu-rtx6000 from partition list (too slow for test time limit)
sbryngelson Mar 10, 2026
1c4bd73
ci: deprioritize gpu-v100 to last in partition selection
sbryngelson Mar 10, 2026
b51f214
ci: pass build_opts (GPU interface flag) to live test command
sbryngelson Mar 10, 2026
698bd2e
ci: fix wait + set -e race that orphans parallel jobs
sbryngelson Mar 10, 2026
1d4d79f
ci: fix stale comments from incremental refactoring
sbryngelson Mar 10, 2026
6cba935
ci: fix duplicate --gpu flag on Frontier GPU test commands
sbryngelson Mar 10, 2026
4915884
ci: remove stale .out file before SLURM submission
sbryngelson Mar 10, 2026
e1e0f42
ci: cap case-optimization build jobs at 8 to match prebuild
sbryngelson Mar 11, 2026
97ecc23
ci: submit case-opt prebuild to CPU partition (no GPU needed for comp…
sbryngelson Mar 11, 2026
1793498
ci: add CPU architecture to build cache key to prevent SIGILL
sbryngelson Mar 11, 2026
dbe6a5e
ci: remove build cache from GitHub runners (prevents SIGILL, negligib…
sbryngelson Mar 11, 2026
fde5fc4
ci: switch Frontier to service partition and develop QOS
sbryngelson Mar 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# No GPU hardware needed — compilation only.
# Can run in two modes:
# 1. Direct (Frontier login nodes): pass cluster/device/interface as args
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Support both positional args (direct invocation) and env vars (SLURM via submit.sh)
# Support both positional args (direct invocation) and env vars (SLURM)
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"
Expand All @@ -24,7 +25,15 @@ esac
rm -rf build

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh

# Set GPU build flags from interface — this is always a GPU build.
# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted
# to a CPU SLURM partition (no GPU hardware needed for compilation).
case "$job_interface" in
acc) gpu_opts="--gpu acc" ;;
omp) gpu_opts="--gpu mp" ;;
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
esac

for case in benchmarks/*/case.py; do
echo "=== Pre-building: $case ==="
Expand Down
7 changes: 5 additions & 2 deletions .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/bin/bash
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero
# exit triggers the same nuke-and-retry, catching e.g. SIGILL from binaries
# compiled on a different CPU architecture.
# Usage: source .github/scripts/retry-build.sh
# retry_build ./mfc.sh build -j 8 --gpu acc
# RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8

retry_build() {
local validate_cmd="${RETRY_VALIDATE_CMD:-}"

This comment was marked as off-topic.

local max_attempts=2
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
Expand Down
34 changes: 0 additions & 34 deletions .github/scripts/run-tests-with-retry.sh

This file was deleted.

2 changes: 1 addition & 1 deletion .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ for case in "${benchmarks[@]}"; do
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
# Validate output
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
Expand Down
31 changes: 8 additions & 23 deletions .github/scripts/run_parallel_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,9 @@ echo "=========================================="
# both parallel jobs so PR and master always land on the same GPU type.
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
echo "Selecting Phoenix GPU partition for benchmark consistency..."
# Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave
# large modern nodes (h200, h100, a100) free for production workloads.
# rtx6000 has the most nodes and gives the most consistent baselines.
BENCH_GPU_PARTITION=""
for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
# || true: grep -c exits 1 on zero matches (or when sinfo returns no output
# for an unknown partition); suppress so set -euo pipefail doesn't abort.
idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
if [ "${idle:-0}" -gt 0 ]; then
BENCH_GPU_PARTITION="$part"
echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)"
break
fi
done
if [ -z "$BENCH_GPU_PARTITION" ]; then
echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
BENCH_GPU_PARTITION="gpu-rtx6000"
fi
# Require 2 nodes so both PR and master jobs can run concurrently.
GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh"
BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
export BENCH_GPU_PARTITION
fi

Expand All @@ -57,12 +42,13 @@ echo "Master job started in background (PID: $master_pid)"

echo "Waiting for both jobs to complete..."

# Wait and capture exit codes reliably
# Wait and capture exit codes reliably.
# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
# (which would orphan the second job).
pr_exit=0
master_exit=0

wait "$pr_pid"
pr_exit=$?
wait "$pr_pid" || pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
echo "PR job exited with code: $pr_exit"
echo "Last 50 lines of PR job log:"
Expand All @@ -71,8 +57,7 @@ else
echo "PR job completed successfully"
fi

wait "$master_pid"
master_exit=$?
wait "$master_pid" || master_exit=$?
if [ "$master_exit" -ne 0 ]; then
echo "Master job exited with code: $master_exit"
echo "Last 50 lines of master job log:"
Expand Down
35 changes: 35 additions & 0 deletions .github/scripts/select-gpu-partition.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Pick the best currently-available Phoenix GPU partition via sinfo and
# export the choice as SELECTED_GPU_PARTITION for the sourcing script.
#
# Candidates are ordered by expected availability; gpu-v100 sits last
# (slow enough to brush the test time limit) and gpu-rtx6000 is omitted
# entirely (too slow for the suite). When no candidate has enough
# idle/mix nodes we fall back to gpu-l40s and accept possible queueing.
#
# Optional knob: GPU_PARTITION_MIN_NODES — minimum idle/mix node count a
# partition must have to be chosen (default 1; parallel bench jobs use 2).
#
# Usage: source .github/scripts/select-gpu-partition.sh

_gpu_partition_candidates=(gpu-l40s gpu-h200 gpu-h100 gpu-a100 gpu-v100)
_gpu_partition_fallback="gpu-l40s"
_gpu_partition_min="${GPU_PARTITION_MIN_NODES:-1}"

SELECTED_GPU_PARTITION=""
for _cand in "${_gpu_partition_candidates[@]}"; do
    # grep -c exits non-zero on zero matches (and sinfo may print nothing for
    # an unknown partition); `|| true` keeps set -euo pipefail callers alive.
    _avail=$(sinfo -p "$_cand" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
    if [ "${_avail:-0}" -ge "$_gpu_partition_min" ]; then
        SELECTED_GPU_PARTITION="$_cand"
        echo "Selected GPU partition: $SELECTED_GPU_PARTITION ($_avail idle/mix nodes)"
        break
    fi
done

if [ -z "$SELECTED_GPU_PARTITION" ]; then
    echo "WARNING: No idle GPU partition found; falling back to $_gpu_partition_fallback (may queue)"
    SELECTED_GPU_PARTITION="$_gpu_partition_fallback"
fi

export SELECTED_GPU_PARTITION
# Scrub our temporaries from the caller's environment (this file is sourced).
unset _gpu_partition_candidates _gpu_partition_fallback _gpu_partition_min _cand _avail
207 changes: 207 additions & 0 deletions .github/scripts/submit-slurm-job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/bin/bash
# Unified SLURM job submission and monitoring for all clusters.
# Submits a script as a SLURM batch job, then monitors it until completion.
# Rerun-safe: cancels stale jobs from previous runs before resubmission.
#
# Usage: submit-slurm-job.sh <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]

set -euo pipefail

# Keep monitoring alive if the login-node session drops: ignore SIGHUP.
trap '' HUP

usage() {
    echo "Usage: $0 <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]"
}

# Positional arguments; shard (e.g. "1/4") is optional.
script_path="${1:-}"
device="${2:-}"
interface="${3:-}"
cluster="${4:-}"
shard="${5:-}"

# The first four arguments are mandatory.
for _required in "$script_path" "$device" "$interface" "$cluster"; do
    if [ -z "$_required" ]; then
        usage
        exit 1
    fi
done

# Read the payload script up front; it is inlined into the sbatch heredoc later.
sbatch_script_contents=$(cat "$script_path")
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Scripts named bench* are benchmark jobs; everything else runs as a test job.
script_basename="$(basename "$script_path" .sh)"
if [[ "$script_basename" == bench* ]]; then
    job_type="bench"
else
    job_type="test"
fi

# --- Cluster configuration ---
# Per-cluster knobs: compiler toolchain flag for mfc.sh load, SLURM
# account/QOS, job-name prefix, extra SBATCH directives, and time limits.
case "$cluster" in
    phoenix)
        compiler_flag="p"
        account="gts-sbryngelson3"
        job_prefix="shb"
        qos="embers"
        extra_sbatch="#SBATCH --requeue"
        test_time="03:00:00"
        bench_time="04:00:00"
        gpu_partition_dynamic=true
        ;;
    frontier|frontier_amd)
        # The two Frontier variants differ only in the compiler toolchain.
        if [ "$cluster" = "frontier_amd" ]; then
            compiler_flag="famd"
        else
            compiler_flag="f"
        fi
        account="CFD154"
        job_prefix="MFC"
        qos="develop"
        extra_sbatch=""
        test_time="01:59:00"
        bench_time="01:59:00"
        gpu_partition_dynamic=false
        ;;
    *)
        echo "ERROR: Unknown cluster '$cluster'"
        exit 1
        ;;
esac

# --- Time limit: benchmark jobs get the longer window ---
case "$job_type" in
    bench) sbatch_time="#SBATCH -t $bench_time" ;;
    *)     sbatch_time="#SBATCH -t $test_time" ;;
esac

# --- Device-specific SBATCH options ---
# Builds sbatch_device_opts, a multi-line string of #SBATCH directives
# chosen by device and cluster. The directive lines start at column 0 so
# they land flush-left inside the generated batch script.
case "$device" in
    cpu)
        case "$cluster" in
            phoenix)
                sbatch_device_opts="\
#SBATCH -p cpu-small
#SBATCH --ntasks-per-node=24
#SBATCH --mem-per-cpu=2G"
                ;;
            frontier|frontier_amd)
                sbatch_device_opts="\
#SBATCH -n 32
#SBATCH -p service"
                ;;
        esac
        ;;
    gpu)
        # Determine the GPU partition to target.
        gpu_partition="batch"
        if [ "$gpu_partition_dynamic" = "true" ]; then
            if [ -n "${BENCH_GPU_PARTITION:-}" ]; then
                # Parallel benchmarks pre-select a partition so PR and master
                # jobs land on the same GPU type.
                gpu_partition="$BENCH_GPU_PARTITION"
                echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)"
            else
                source "${SCRIPT_DIR}/select-gpu-partition.sh"
                gpu_partition="$SELECTED_GPU_PARTITION"
            fi
        fi

        case "$cluster" in
            phoenix)
                sbatch_device_opts="\
#SBATCH -p $gpu_partition
#SBATCH --ntasks-per-node=4
#SBATCH -G2
#SBATCH --exclude=atl1-1-03-002-29-0"
                ;;
            frontier|frontier_amd)
                sbatch_device_opts="\
#SBATCH -n 8
#SBATCH -p service"
                ;;
        esac
        ;;
    *)
        usage
        exit 1
        ;;
esac

# --- Job slug: sanitized script name + device/interface (+ optional shard) ---
shard_suffix=""
if [ -n "$shard" ]; then
    # First "/" becomes "-of-", e.g. "1/4" -> "-1-of-4".
    shard_suffix="-${shard/\//-of-}"
fi
slug_base="$(basename "$script_path")"
slug_base="${slug_base%.sh}"                 # drop a trailing .sh
slug_base="${slug_base//[^a-zA-Z0-9]/-}"     # keep only SLURM-safe characters
job_slug="${slug_base}-${device}-${interface}${shard_suffix}"
output_file="$job_slug.out"
id_file="${job_slug}.slurm_job_id"

# --- Idempotency: cancel stale jobs from previous runs ---
if [ -f "$id_file" ]; then
    existing_id=$(<"$id_file")
    state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
    case "${state:-UNKNOWN}" in
        RUNNING|PENDING|REQUEUED|COMPLETING)
            echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission"
            scancel "$existing_id" 2>/dev/null || true
            ;;
        *)
            echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh"
            ;;
    esac
    rm -f "$id_file"
fi

# Remove stale output file so the monitor doesn't pick up old content
# (a previous SLURM job's epilog can write to the .out file after our
# stale-job check, polluting the new job's output stream).
rm -f "$output_file"

# --- Module load mode (short form): g = GPU modules, c = CPU modules ---
if [ "$device" = "gpu" ]; then
    module_mode="g"
else
    module_mode="c"
fi

# --- Submit ---
# The payload script is inlined into the generated batch script via heredoc;
# job_* assignments pass device/interface/shard/cluster through to it.
# Unescaped $vars expand now (submission side); \$ forms expand inside the job.
#
# `|| true`: under set -euo pipefail a failed sbatch would otherwise abort the
# script here, making the "ERROR: Failed to submit job" diagnostic below
# unreachable.
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J ${job_prefix}-${job_slug}
#SBATCH --account=${account}
#SBATCH -N 1
${sbatch_device_opts}
${sbatch_time}
#SBATCH --qos=${qos}
${extra_sbatch}
#SBATCH -o ${output_file}

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$device"
job_interface="$interface"
job_shard="$shard"
job_cluster="$cluster"

. ./mfc.sh load -c $compiler_flag -m $module_mode

$sbatch_script_contents

EOT
) || true

# Extract the job ID from sbatch's "Submitted batch job <id>" line.
# head -n1 guards against stray numbers elsewhere in the output producing a
# multiline id; `|| true` keeps a no-match grep from killing the script under
# set -euo pipefail before the error message below can print.
job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1 || true)
if [ -z "$job_id" ]; then
echo "ERROR: Failed to submit job. sbatch output:"
echo "$submit_output"
exit 1
fi

echo "Submitted batch job $job_id"
echo "$job_id" > "$id_file"
echo "Job ID written to $id_file"

# --- Monitor until completion; our exit status is the monitor's ---
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
Loading
Loading