diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 130f523c07..581630f742 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -1,14 +1,15 @@ #!/bin/bash # Pre-builds all benchmark cases with --case-optimization. +# No GPU hardware needed — compilation only. # Can run in two modes: # 1. Direct (Frontier login nodes): pass cluster/device/interface as args -# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh +# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh # Usage: bash prebuild-case-optimization.sh [ ] set -e -# Support both positional args (direct invocation) and env vars (SLURM via submit.sh) +# Support both positional args (direct invocation) and env vars (SLURM) cluster="${1:-${job_cluster:-phoenix}}" job_device="${2:-$job_device}" job_interface="${3:-$job_interface}" @@ -24,7 +25,15 @@ esac rm -rf build . ./mfc.sh load -c "$flag" -m g -source .github/scripts/gpu-opts.sh + +# Set GPU build flags from interface — this is always a GPU build. +# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted +# to a CPU SLURM partition (no GPU hardware needed for compilation). +case "$job_interface" in + acc) gpu_opts="--gpu acc" ;; + omp) gpu_opts="--gpu mp" ;; + *) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;; +esac for case in benchmarks/*/case.py; do echo "=== Pre-building: $case ===" diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh index 38ac08b217..a0b6ce8cfe 100755 --- a/.github/scripts/retry-build.sh +++ b/.github/scripts/retry-build.sh @@ -1,13 +1,16 @@ #!/bin/bash # Provides retry_build(): 2-attempt loop. # On failure of attempt 1, nukes the entire build directory before attempt 2. -# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry. +# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero +# exit triggers the same nuke-and-retry, catching e.g. SIGILL from binaries +# compiled on a different CPU architecture. # Usage: source .github/scripts/retry-build.sh # retry_build ./mfc.sh build -j 8 --gpu acc +# RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8 retry_build() { - local validate_cmd="${RETRY_VALIDATE_CMD:-}" local max_attempts=2 + local validate_cmd="${RETRY_VALIDATE_CMD:-}" local attempt=1 while [ $attempt -le $max_attempts ]; do echo "Build attempt $attempt of $max_attempts..." diff --git a/.github/scripts/run-tests-with-retry.sh b/.github/scripts/run-tests-with-retry.sh deleted file mode 100755 index 18f1d05d0b..0000000000 --- a/.github/scripts/run-tests-with-retry.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Runs ./mfc.sh test with all provided arguments, then retries a small number -# of sporadic failures (up to 5). Exits non-zero on real failures. -# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...] - -# Extract flags that should carry over to retries (retries build their own -# argument list with --only, so we capture passthrough flags here). -PASSTHROUGH="" -for arg in "$@"; do - case "$arg" in - --test-all) PASSTHROUGH="$PASSTHROUGH --test-all" ;; - esac -done - -rm -f tests/failed_uuids.txt -TEST_EXIT=0 -/bin/bash mfc.sh test "$@" || TEST_EXIT=$? - -# Retry only if a small number of tests failed (sporadic failures) -if [ -s tests/failed_uuids.txt ]; then - NUM_FAILED=$(wc -l < tests/failed_uuids.txt) - if [ "$NUM_FAILED" -le 5 ]; then - FAILED=$(tr '\n' ' ' < tests/failed_uuids.txt) - echo "" - echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" - echo "" - /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $PASSTHROUGH || exit $? - else - echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." - exit 1 - fi -elif [ "$TEST_EXIT" -ne 0 ]; then - exit $TEST_EXIT -fi diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh index 167505ece3..922d0a9012 100755 --- a/.github/scripts/run_case_optimization.sh +++ b/.github/scripts/run_case_optimization.sh @@ -44,7 +44,7 @@ for case in "${benchmarks[@]}"; do rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data" # Build + run with --case-optimization, small grid, 10 timesteps - if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then + if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then # Validate output if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then echo "PASS: $case_name" diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index 8c562b911e..b6a6034c3c 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -24,24 +24,9 @@ echo "==========================================" # both parallel jobs so PR and master always land on the same GPU type. if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then echo "Selecting Phoenix GPU partition for benchmark consistency..." - # Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave - # large modern nodes (h200, h100, a100) free for production workloads. - # rtx6000 has the most nodes and gives the most consistent baselines. - BENCH_GPU_PARTITION="" - for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do - # || true: grep -c exits 1 on zero matches (or when sinfo returns no output - # for an unknown partition); suppress so set -euo pipefail doesn't abort. - idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true) - if [ "${idle:-0}" -gt 0 ]; then - BENCH_GPU_PARTITION="$part" - echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)" - break - fi - done - if [ -z "$BENCH_GPU_PARTITION" ]; then - echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)" - BENCH_GPU_PARTITION="gpu-rtx6000" - fi + # Require 2 nodes so both PR and master jobs can run concurrently. + GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh" + BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION" export BENCH_GPU_PARTITION fi @@ -57,12 +42,13 @@ echo "Master job started in background (PID: $master_pid)" echo "Waiting for both jobs to complete..." -# Wait and capture exit codes reliably +# Wait and capture exit codes reliably. +# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure +# (which would orphan the second job). pr_exit=0 master_exit=0 -wait "$pr_pid" -pr_exit=$? +wait "$pr_pid" || pr_exit=$? if [ "$pr_exit" -ne 0 ]; then echo "PR job exited with code: $pr_exit" echo "Last 50 lines of PR job log:" @@ -71,8 +57,7 @@ else echo "PR job completed successfully" fi -wait "$master_pid" -master_exit=$? +wait "$master_pid" || master_exit=$? if [ "$master_exit" -ne 0 ]; then echo "Master job exited with code: $master_exit" echo "Last 50 lines of master job log:" diff --git a/.github/scripts/select-gpu-partition.sh b/.github/scripts/select-gpu-partition.sh new file mode 100644 index 0000000000..c812c000a9 --- /dev/null +++ b/.github/scripts/select-gpu-partition.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Select the best available Phoenix GPU partition using sinfo. +# Sources into caller: exports SELECTED_GPU_PARTITION. +# +# Priority order prefers partitions most likely to have availability. +# V100 is last due to slower performance near the test time limit. +# Falls back to gpu-l40s if no partition meets the idle node threshold. +# RTX 6000 nodes are excluded (too slow for the test suite time limit). +# +# Optional: set GPU_PARTITION_MIN_NODES before sourcing to require a minimum +# number of idle/mix nodes (e.g. GPU_PARTITION_MIN_NODES=2 for parallel bench jobs). +# +# Usage: source .github/scripts/select-gpu-partition.sh + +_GPU_PARTITION_PRIORITY="gpu-l40s gpu-h200 gpu-h100 gpu-a100 gpu-v100" +_GPU_PARTITION_FALLBACK="gpu-l40s" +_GPU_PARTITION_MIN_NODES="${GPU_PARTITION_MIN_NODES:-1}" + +SELECTED_GPU_PARTITION="" +for _part in $_GPU_PARTITION_PRIORITY; do + _idle=$(sinfo -p "$_part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true) + if [ "${_idle:-0}" -ge "$_GPU_PARTITION_MIN_NODES" ]; then + SELECTED_GPU_PARTITION="$_part" + echo "Selected GPU partition: $SELECTED_GPU_PARTITION ($_idle idle/mix nodes)" + break + fi +done + +if [ -z "$SELECTED_GPU_PARTITION" ]; then + echo "WARNING: No idle GPU partition found; falling back to $_GPU_PARTITION_FALLBACK (may queue)" + SELECTED_GPU_PARTITION="$_GPU_PARTITION_FALLBACK" +fi + +export SELECTED_GPU_PARTITION +unset _GPU_PARTITION_PRIORITY _GPU_PARTITION_FALLBACK _GPU_PARTITION_MIN_NODES _part _idle diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh new file mode 100755 index 0000000000..eb6702cfbe --- /dev/null +++ b/.github/scripts/submit-slurm-job.sh @@ -0,0 +1,207 @@ +#!/bin/bash +# Unified SLURM job submission and monitoring for all clusters. +# Submits a script as a SLURM batch job, then monitors it until completion. +# Rerun-safe: cancels stale jobs from previous runs before resubmission. +# +# Usage: submit-slurm-job.sh [shard] + +set -euo pipefail + +# Ignore SIGHUP to survive login node session drops +trap '' HUP + +usage() { + echo "Usage: $0 [shard]" +} + +script_path="${1:-}" +device="${2:-}" +interface="${3:-}" +cluster="${4:-}" +shard="${5:-}" + +if [ -z "$script_path" ] || [ -z "$device" ] || [ -z "$interface" ] || [ -z "$cluster" ]; then + usage + exit 1 +fi + +sbatch_script_contents=$(cat "$script_path") +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Detect job type from submitted script basename +script_basename="$(basename "$script_path" .sh)" +case "$script_basename" in + bench*) job_type="bench" ;; + *) job_type="test" ;; +esac + +# --- Cluster configuration --- +case "$cluster" in + phoenix) + compiler_flag="p" + account="gts-sbryngelson3" + job_prefix="shb" + qos="embers" + extra_sbatch="#SBATCH --requeue" + test_time="03:00:00" + bench_time="04:00:00" + gpu_partition_dynamic=true + ;; + frontier) + compiler_flag="f" + account="CFD154" + job_prefix="MFC" + qos="develop" + extra_sbatch="" + test_time="01:59:00" + bench_time="01:59:00" + gpu_partition_dynamic=false + ;; + frontier_amd) + compiler_flag="famd" + account="CFD154" + job_prefix="MFC" + qos="develop" + extra_sbatch="" + test_time="01:59:00" + bench_time="01:59:00" + gpu_partition_dynamic=false + ;; + *) + echo "ERROR: Unknown cluster '$cluster'" + exit 1 + ;; +esac + +# --- Time limit --- +if [ "$job_type" = "bench" ]; then + sbatch_time="#SBATCH -t $bench_time" +else + sbatch_time="#SBATCH -t $test_time" +fi + +# --- Device-specific SBATCH options --- +if [ "$device" = "cpu" ]; then + case "$cluster" in + phoenix) + sbatch_device_opts="\ +#SBATCH -p cpu-small +#SBATCH --ntasks-per-node=24 +#SBATCH --mem-per-cpu=2G" + ;; + frontier|frontier_amd) + sbatch_device_opts="\ +#SBATCH -n 32 +#SBATCH -p service" + ;; + esac +elif [ "$device" = "gpu" ]; then + # Determine GPU partition + gpu_partition="batch" + if [ "$gpu_partition_dynamic" = "true" ]; then + # Use pre-selected bench partition if available, otherwise query sinfo + if [ -n "${BENCH_GPU_PARTITION:-}" ]; then + gpu_partition="$BENCH_GPU_PARTITION" + echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)" + else + source "${SCRIPT_DIR}/select-gpu-partition.sh" + gpu_partition="$SELECTED_GPU_PARTITION" + fi + fi + + case "$cluster" in + phoenix) + sbatch_device_opts="\ +#SBATCH -p $gpu_partition +#SBATCH --ntasks-per-node=4 +#SBATCH -G2 +#SBATCH --exclude=atl1-1-03-002-29-0" + ;; + frontier|frontier_amd) + sbatch_device_opts="\ +#SBATCH -n 8 +#SBATCH -p service" + ;; + esac +else + usage + exit 1 +fi + +# --- Job slug --- +shard_suffix="" +if [ -n "$shard" ]; then + shard_suffix="-$(echo "$shard" | sed 's|/|-of-|')" +fi +job_slug="$(basename "$script_path" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-${device}-${interface}${shard_suffix}" +output_file="$job_slug.out" +id_file="${job_slug}.slurm_job_id" + +# --- Idempotency: cancel stale jobs from previous runs --- +if [ -f "$id_file" ]; then + existing_id=$(cat "$id_file") + state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true) + case "${state:-UNKNOWN}" in + RUNNING|PENDING|REQUEUED|COMPLETING) + echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission" + scancel "$existing_id" 2>/dev/null || true + ;; + *) + echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh" + ;; + esac + rm -f "$id_file" +fi + +# Remove stale output file so the monitor doesn't pick up old content +# (a previous SLURM job's epilog can write to the .out file after our +# stale-job check, polluting the new job's output stream). +rm -f "$output_file" + +# --- Module load mode (short form) --- +module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c") + +# --- Submit --- +submit_output=$(sbatch < "$id_file" +echo "Job ID written to $id_file" + +# --- Monitor --- +bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index e0a6eb7384..62a377bb26 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -19,13 +19,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Always use the PR's submit.sh so both master and PR builds benefit from the -# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is -# still resolved relative to the current directory (master/ or pr/) so the -# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs -# in the right directory regardless of which submit.sh is invoked. -PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh" -bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface" +# Use the PR's submit-slurm-job.sh and bench script for both master and PR jobs. +# The bench script must come from the PR tree (master may not have common/bench.sh +# yet), and the script only orchestrates build+bench — the actual MFC code under +# test is the cwd's checkout (master/ or pr/). +PR_BENCH_SCRIPT="$(cd "${SCRIPT_DIR}/../workflows/common" && pwd)/bench.sh" +bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster" # Verify the YAML output file was created job_slug="bench-$device-$interface" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 8a1c848493..7ce02c1e3f 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -105,21 +105,19 @@ jobs: - name: Setup & Build if: matrix.build_script != '' - uses: nick-fields/retry@v3 - with: - max_attempts: 2 - retry_wait_seconds: 60 - timeout_minutes: 150 - command: | - (cd pr && ${{ matrix.build_script }}) & - pid1=$! - (cd master && ${{ matrix.build_script }}) & - pid2=$! - wait $pid1; e1=$? - wait $pid2; e2=$? - [ $e1 -eq 0 ] && [ $e2 -eq 0 ] - on_retry_command: | - rm -rf pr/build master/build + timeout-minutes: 150 + run: | + (cd pr && ${{ matrix.build_script }}) & + pid1=$! + (cd master && ${{ matrix.build_script }}) & + pid2=$! + e1=0; e2=0 + wait $pid1 || e1=$? + wait $pid2 || e2=$? + if [ $e1 -ne 0 ] || [ $e2 -ne 0 ]; then + echo "Build failures: pr=$e1 master=$e2" + exit 1 + fi - name: Bench (Master v. PR) run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh new file mode 100644 index 0000000000..3251f7baca --- /dev/null +++ b/.github/workflows/common/bench.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Unified benchmark script for all clusters. +# Runs inside a SLURM job via submit-slurm-job.sh. +# Expects env vars: $job_device, $job_interface, $job_slug, $job_cluster + +set -euo pipefail + +source .github/scripts/bench-preamble.sh + +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes +# (GNR nodes have 192 cores but nproc is too aggressive for build). +n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) + +# --- Phoenix TMPDIR setup --- +if [ "$job_cluster" = "phoenix" ]; then + tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build + currentdir=$tmpbuild/run-$(( RANDOM % 9000 )) + mkdir -p $tmpbuild + mkdir -p $currentdir + export TMPDIR=$currentdir + trap 'rm -rf "$currentdir" || true' EXIT +fi + +# --- Build (if not pre-built on login node) --- +# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). +if [ "$job_cluster" = "phoenix" ]; then + rm -rf build +fi + +if [ ! -d "build" ]; then + source .github/scripts/retry-build.sh + retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 +fi + +# --- Bench cluster flag --- +if [ "$job_cluster" = "phoenix" ]; then + bench_cluster="phoenix-bench" +else + bench_cluster="$job_cluster" +fi + +# --- Run benchmark --- +if [ "$job_device" = "gpu" ]; then + ./mfc.sh bench --mem 4 -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks +else + ./mfc.sh bench --mem 1 -o "$job_slug.yaml" -- -c $bench_cluster $device_opts -n $n_ranks +fi + +# --- Phoenix cleanup (trap EXIT handles rm -rf "$currentdir") --- +if [ "$job_cluster" = "phoenix" ]; then + sleep 10 + unset TMPDIR +fi diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh new file mode 100644 index 0000000000..746c54f5d1 --- /dev/null +++ b/.github/workflows/common/test.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Unified test script for all clusters. +# Runs inside a SLURM job via submit-slurm-job.sh. +# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster + +set -euo pipefail + +source .github/scripts/gpu-opts.sh +build_opts="$gpu_opts" + +# --- Build (if not pre-built on login node) --- +# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh +# to avoid SIGILL from stale binaries compiled on a different microarchitecture. +if [ "$job_cluster" = "phoenix" ]; then + rm -rf build +fi + +if [ ! -d "build" ]; then + source .github/scripts/retry-build.sh + + # Phoenix: smoke-test the syscheck binary to catch architecture mismatches + # (SIGILL from binaries compiled on a different compute node). + validate_cmd="" + if [ "$job_cluster" = "phoenix" ]; then + validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' + fi + + RETRY_VALIDATE_CMD="$validate_cmd" \ + retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 +fi + +# --- GPU detection and thread count --- +device_opts="" +rdma_opts="" +shard_opts="" + +case "$job_cluster" in + phoenix) n_test_threads=8 ;; + *) n_test_threads=32 ;; +esac + +if [ "$job_device" = "gpu" ]; then + source .github/scripts/detect-gpus.sh + + case "$job_cluster" in + phoenix) + device_opts="-g $gpu_ids" + n_test_threads=$((ngpus * 2)) + ;; + *) + # Frontier: --gpu flag is already in $build_opts; no extra device opts needed + device_opts="" + n_test_threads=$ngpus + ;; + esac + + # RDMA for Frontier CCE (not frontier_amd) + if [ "$job_cluster" = "frontier" ]; then + rdma_opts="--rdma-mpi" + fi +else + device_opts="--no-gpu" +fi + +# --- Sharding (Frontier only) --- +if [ -n "${job_shard:-}" ]; then + shard_opts="--shard $job_shard" +fi + +./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh deleted file mode 100644 index b896feb17c..0000000000 --- a/.github/workflows/frontier/bench.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -source .github/scripts/bench-preamble.sh - -# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes. -n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) - -if [ "$job_device" = "gpu" ]; then - ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks -else - ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks -fi diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh deleted file mode 100644 index 4b472cd433..0000000000 --- a/.github/workflows/frontier/submit.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -set -e - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -# Determine compiler flag from directory name -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cluster_name="$(basename "$SCRIPT_DIR")" -case "$cluster_name" in - frontier) compiler_flag="f" ;; - frontier_amd) compiler_flag="famd" ;; - *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; -esac - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]" -} - -if [ ! -z "$1" ]; then - sbatch_script_contents=`cat $1` -else - usage - exit 1 -fi - -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 32 # Number of cores required" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 8 # Number of cores required" -else - usage - exit 1 -fi - -# Select SBATCH params based on job type -if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -else - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -fi - -shard_suffix="" -if [ -n "$4" ]; then - shard_suffix="-$(echo "$4" | sed 's|/|-of-|')" -fi -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}" -output_file="$job_slug.out" - -submit_output=$(sbatch < 64 ? 64 : $(nproc) )) - -tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build -currentdir=$tmpbuild/run-$(( RANDOM % 900 )) -mkdir -p $tmpbuild -mkdir -p $currentdir - -export TMPDIR=$currentdir - -if [ "$job_device" = "gpu" ]; then - bench_opts="--mem 4" -else - bench_opts="--mem 1" -fi - -rm -rf build - -source .github/scripts/retry-build.sh -retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 - -./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks - -sleep 10 -rm -rf "$currentdir" || true - -unset TMPDIR diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh deleted file mode 100755 index caa6bd2175..0000000000 --- a/.github/workflows/phoenix/submit-job.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -# Submit a SLURM job without waiting for it to complete. -# Writes the job ID to .slurm_job_id so a separate monitor step can wait. -# Idempotent: if a job for this slug is still RUNNING or PENDING, skip resubmission. -# -# Usage: submit-job.sh [script.sh] [cpu|gpu] [none|acc|omp] - -set -euo pipefail - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]" -} - -if [ -z "${1:-}" ]; then - usage - exit 1 -fi - -sbatch_script_contents=$(cat "$1") - -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - -sbatch_cpu_opts="\ -#SBATCH -p cpu-small # partition -#SBATCH --ntasks-per-node=24 # Number of cores per node required -#SBATCH --mem-per-cpu=2G # Memory per core\ -" - -if [ "$job_type" = "bench" ]; then - bench_partition="${BENCH_GPU_PARTITION:-gpu-rtx6000}" - echo "Submitting bench GPU job to partition: $bench_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-})" - sbatch_gpu_opts="\ -#SBATCH -p $bench_partition -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ -" - sbatch_time="#SBATCH -t 04:00:00" -else - sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s,gpu-h200 -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ -" - sbatch_time="#SBATCH -t 03:00:00" -fi - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="$sbatch_cpu_opts" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="$sbatch_gpu_opts" -else - usage - exit 1 -fi - -job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3" -output_file="$job_slug.out" -id_file="${job_slug}.slurm_job_id" - -# On rerun, cancel any existing job for this slug and submit a fresh one. -# If the job is still live (RUNNING/PENDING), scancel it first as a safety net -# in case the "Cancel SLURM Jobs" step did not fire (e.g. runner was SIGKILL'd). -if [ -f "$id_file" ]; then - existing_id=$(cat "$id_file") - state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true) - case "${state:-UNKNOWN}" in - RUNNING|PENDING|REQUEUED|COMPLETING) - echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission" - scancel "$existing_id" 2>/dev/null || true - ;; - *) - echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh" - ;; - esac - rm -f "$id_file" -fi - -submit_output=$(sbatch < "$id_file" -echo "Job ID written to $id_file" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh deleted file mode 100755 index 0c009bd001..0000000000 --- a/.github/workflows/phoenix/submit.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Submit a SLURM job and wait for it to complete. -# Delegates submission (with idempotency) to submit-job.sh, then monitors. -# -# Usage: submit.sh [script.sh] [cpu|gpu] [none|acc|omp] - -set -euo pipefail - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]" -} - -if [ -z "${1:-}" ]; then - usage - exit 1 -fi - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Submit (idempotent — skips resubmission if a live job already exists) -bash "$SCRIPT_DIR/submit-job.sh" "$@" - -# Derive the same job slug and file paths as submit-job.sh. -# NOTE: this sed pipeline must stay identical to the one in submit-job.sh — -# if they diverge the id-file will not be found and the monitor will fail. -job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3" -output_file="$job_slug.out" -id_file="${job_slug}.slurm_job_id" - -job_id=$(cat "$id_file") -bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh deleted file mode 100644 index d073c54bde..0000000000 --- a/.github/workflows/phoenix/test.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -source .github/scripts/gpu-opts.sh -build_opts="$gpu_opts" - -rm -rf build - -# Build with retry; smoke-test the freshly built syscheck binary to catch -# architecture mismatches (SIGILL from binaries compiled on a different compute node). -source .github/scripts/retry-build.sh -RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \ - retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 - -n_test_threads=8 - -if [ "$job_device" = "gpu" ]; then - source .github/scripts/detect-gpus.sh - device_opts="-g $gpu_ids" - n_test_threads=$((ngpus * 2)) -fi - -./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts ${build_opts:---no-gpu} -- -c phoenix diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9ce6dda24c..a52a5967d1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,12 +98,6 @@ jobs: - name: Clone uses: actions/checkout@v4 - - name: Restore Build Cache - uses: actions/cache@v4 - with: - path: build - key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }} - - name: Setup MacOS if: matrix.os == 'macos' run: | @@ -237,32 +231,16 @@ jobs: uses: actions/checkout@v4 with: # clean: false preserves .slurm_job_id files across reruns so - # submit-job.sh can detect and cancel stale SLURM jobs on retry. + # submit-slurm-job.sh can detect and cancel stale SLURM jobs on retry. clean: false - - name: Build + - name: Build (login node) if: matrix.cluster != 'phoenix' - uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3 - with: - max_attempts: 2 - retry_wait_seconds: 60 - timeout_minutes: 60 - command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - on_retry_command: rm -rf build - - - name: Submit SLURM Test Job - if: matrix.cluster == 'phoenix' - run: bash .github/workflows/phoenix/submit-job.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} ${{ matrix.interface }} - - - name: Monitor SLURM Test Job - if: matrix.cluster == 'phoenix' - run: | - slug="test-${{ matrix.device }}-${{ matrix.interface }}" - bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out" + timeout-minutes: 60 + run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - name: Test - if: matrix.cluster != 'phoenix' - run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }} + run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }} - name: Cancel SLURM Jobs if: cancelled() @@ -341,25 +319,14 @@ jobs: - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' - run: bash .github/workflows/phoenix/submit.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }} - name: Pre-Build (login node) if: matrix.cluster != 'phoenix' run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} - - name: Submit Case-Optimization Tests - if: matrix.cluster == 'phoenix' - run: bash .github/workflows/phoenix/submit-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} - - - name: Monitor Case-Optimization Tests - if: matrix.cluster == 'phoenix' - run: | - slug="run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}" - bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out" - - name: Run Case-Optimization Tests - if: matrix.cluster != 'phoenix' - run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Cancel SLURM Jobs if: cancelled()