Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
9956b4f
ci: replace nick-fields/retry with plain run step; deprioritize v100 …
Mar 10, 2026
2fb93db
ci: remove unused RETRY_VALIDATE_CMD from retry_build
Mar 10, 2026
24185f8
ci: shared sinfo-based GPU partition selection for tests and benchmarks
Mar 10, 2026
946161d
bench: update Phoenix tmpbuild path to project storage
Mar 10, 2026
2384287
bench: require 2 idle/mix nodes for parallel benchmark GPU partition …
Mar 10, 2026
60bcfaa
ci: restore RETRY_VALIDATE_CMD support in retry_build
Mar 10, 2026
a608a9f
ci: exclude dead GPU node atl1-1-03-002-29-0 (cuInit error 999)
Mar 10, 2026
72c9c86
ci: unify job submission, test, and bench scripts across clusters
Mar 10, 2026
49d3a7b
ci: remove dead rm -rf build, add gpu_partition default
Mar 10, 2026
8825dff
ci: use strict shell mode in common/test.sh and common/bench.sh
Mar 10, 2026
c68d6d6
ci: validate existing Phoenix build against node ISA before reuse
Mar 10, 2026
7f70c2e
ci: always nuke build/ on Phoenix to avoid stale ISA mismatches
Mar 10, 2026
a3c37ce
ci: nuke stale Phoenix bench builds; trap TMPDIR cleanup on exit
Mar 10, 2026
5efa827
ci: widen TMPDIR random range to reduce collision risk
Mar 10, 2026
b7e8e75
ci: remove no-op -j flag from mfc.sh bench invocations
Mar 10, 2026
5a46a3e
ci: add space after SBATCH -o for consistency with SLURM docs
Mar 10, 2026
61b84cf
ci: fix bench master job failing to find common/bench.sh
sbryngelson Mar 10, 2026
df60f82
ci: drop gpu-rtx6000 from partition list (too slow for test time limit)
sbryngelson Mar 10, 2026
1c4bd73
ci: deprioritize gpu-v100 to last in partition selection
sbryngelson Mar 10, 2026
b51f214
ci: pass build_opts (GPU interface flag) to live test command
sbryngelson Mar 10, 2026
698bd2e
ci: fix wait + set -e race that orphans parallel jobs
sbryngelson Mar 10, 2026
1d4d79f
ci: fix stale comments from incremental refactoring
sbryngelson Mar 10, 2026
6cba935
ci: fix duplicate --gpu flag on Frontier GPU test commands
sbryngelson Mar 10, 2026
4915884
ci: remove stale .out file before SLURM submission
sbryngelson Mar 10, 2026
e1e0f42
ci: cap case-optimization build jobs at 8 to match prebuild
sbryngelson Mar 11, 2026
97ecc23
ci: submit case-opt prebuild to CPU partition (no GPU needed for comp…
sbryngelson Mar 11, 2026
1793498
ci: add CPU architecture to build cache key to prevent SIGILL
sbryngelson Mar 11, 2026
dbe6a5e
ci: remove build cache from GitHub runners (prevents SIGILL, negligib…
sbryngelson Mar 11, 2026
fde5fc4
ci: switch Frontier to service partition and develop QOS
sbryngelson Mar 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# No GPU hardware needed — compilation only.
# Can run in two modes:
# 1. Direct (Frontier login nodes): pass cluster/device/interface as args
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Support both positional args (direct invocation) and env vars (SLURM via submit.sh)
# Support both positional args (direct invocation) and env vars (SLURM)
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"
Expand All @@ -24,7 +25,15 @@ esac
rm -rf build

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh

# Set GPU build flags from interface — this is always a GPU build.
# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted
# to a CPU SLURM partition (no GPU hardware needed for compilation).
case "$job_interface" in
acc) gpu_opts="--gpu acc" ;;
omp) gpu_opts="--gpu mp" ;;
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
esac

for case in benchmarks/*/case.py; do
echo "=== Pre-building: $case ==="
Expand Down
7 changes: 5 additions & 2 deletions .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/bin/bash
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero
# exit triggers the same nuke-and-retry, catching e.g. SIGILL from binaries
# compiled on a different CPU architecture.
# Usage: source .github/scripts/retry-build.sh
# retry_build ./mfc.sh build -j 8 --gpu acc
# RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8

retry_build() {
local validate_cmd="${RETRY_VALIDATE_CMD:-}"

This comment was marked as off-topic.

local max_attempts=2
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
Expand Down
34 changes: 0 additions & 34 deletions .github/scripts/run-tests-with-retry.sh

This file was deleted.

2 changes: 1 addition & 1 deletion .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ for case in "${benchmarks[@]}"; do
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
# Validate output
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
Expand Down
31 changes: 8 additions & 23 deletions .github/scripts/run_parallel_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,9 @@ echo "=========================================="
# both parallel jobs so PR and master always land on the same GPU type.
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
echo "Selecting Phoenix GPU partition for benchmark consistency..."
# Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave
# large modern nodes (h200, h100, a100) free for production workloads.
# rtx6000 has the most nodes and gives the most consistent baselines.
BENCH_GPU_PARTITION=""
for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
# || true: grep -c exits 1 on zero matches (or when sinfo returns no output
# for an unknown partition); suppress so set -euo pipefail doesn't abort.
idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
if [ "${idle:-0}" -gt 0 ]; then
BENCH_GPU_PARTITION="$part"
echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)"
break
fi
done
if [ -z "$BENCH_GPU_PARTITION" ]; then
echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
BENCH_GPU_PARTITION="gpu-rtx6000"
fi
# Require 2 nodes so both PR and master jobs can run concurrently.
GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh"
BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
export BENCH_GPU_PARTITION
fi

Expand All @@ -57,12 +42,13 @@ echo "Master job started in background (PID: $master_pid)"

echo "Waiting for both jobs to complete..."

# Wait and capture exit codes reliably
# Wait and capture exit codes reliably.
# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
# (which would orphan the second job).
pr_exit=0
master_exit=0

wait "$pr_pid"
pr_exit=$?
wait "$pr_pid" || pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
echo "PR job exited with code: $pr_exit"
echo "Last 50 lines of PR job log:"
Expand All @@ -71,8 +57,7 @@ else
echo "PR job completed successfully"
fi

wait "$master_pid"
master_exit=$?
wait "$master_pid" || master_exit=$?
if [ "$master_exit" -ne 0 ]; then
echo "Master job exited with code: $master_exit"
echo "Last 50 lines of master job log:"
Expand Down
35 changes: 35 additions & 0 deletions .github/scripts/select-gpu-partition.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Pick the best currently-available Phoenix GPU partition via sinfo and
# export the choice as SELECTED_GPU_PARTITION for the sourcing script.
#
# Candidates are ordered by expected availability; gpu-v100 sits last
# (slow enough to brush the test time limit) and gpu-rtx6000 is omitted
# entirely (too slow for the suite). When no candidate has enough
# idle/mix nodes we fall back to gpu-l40s and accept possible queueing.
#
# Optional knob: GPU_PARTITION_MIN_NODES — minimum idle/mix node count a
# partition must have to be chosen (default 1; parallel bench jobs use 2).
#
# Usage: source .github/scripts/select-gpu-partition.sh

_gpu_partition_candidates=(gpu-l40s gpu-h200 gpu-h100 gpu-a100 gpu-v100)
_gpu_partition_fallback="gpu-l40s"
_gpu_partition_min="${GPU_PARTITION_MIN_NODES:-1}"

SELECTED_GPU_PARTITION=""
for _cand in "${_gpu_partition_candidates[@]}"; do
    # grep -c exits non-zero on zero matches (and sinfo may print nothing for
    # an unknown partition); `|| true` keeps set -euo pipefail callers alive.
    _avail=$(sinfo -p "$_cand" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
    if [ "${_avail:-0}" -ge "$_gpu_partition_min" ]; then
        SELECTED_GPU_PARTITION="$_cand"
        echo "Selected GPU partition: $SELECTED_GPU_PARTITION ($_avail idle/mix nodes)"
        break
    fi
done

if [ -z "$SELECTED_GPU_PARTITION" ]; then
    echo "WARNING: No idle GPU partition found; falling back to $_gpu_partition_fallback (may queue)"
    SELECTED_GPU_PARTITION="$_gpu_partition_fallback"
fi

export SELECTED_GPU_PARTITION
# Scrub our temporaries from the caller's environment (this file is sourced).
unset _gpu_partition_candidates _gpu_partition_fallback _gpu_partition_min _cand _avail
207 changes: 207 additions & 0 deletions .github/scripts/submit-slurm-job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/bin/bash
# Unified SLURM job submission and monitoring for all clusters.
# Submits a script as a SLURM batch job, then monitors it until completion.
# Rerun-safe: cancels stale jobs from previous runs before resubmission.
#
# Usage: submit-slurm-job.sh <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]

set -euo pipefail

# Keep monitoring alive if the login-node session drops: ignore SIGHUP.
trap '' HUP

usage() {
    echo "Usage: $0 <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]"
}

# Positional arguments; shard (e.g. "1/4") is optional.
script_path="${1:-}"
device="${2:-}"
interface="${3:-}"
cluster="${4:-}"
shard="${5:-}"

# The first four arguments are mandatory.
for _required in "$script_path" "$device" "$interface" "$cluster"; do
    if [ -z "$_required" ]; then
        usage
        exit 1
    fi
done

# Read the payload script up front; it is inlined into the sbatch heredoc later.
sbatch_script_contents=$(cat "$script_path")
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Scripts named bench* are benchmark jobs; everything else runs as a test job.
script_basename="$(basename "$script_path" .sh)"
if [[ "$script_basename" == bench* ]]; then
    job_type="bench"
else
    job_type="test"
fi

# --- Cluster configuration ---
# Per-cluster knobs: compiler toolchain flag for mfc.sh load, SLURM
# account/QOS, job-name prefix, extra SBATCH directives, and time limits.
case "$cluster" in
    phoenix)
        compiler_flag="p"
        account="gts-sbryngelson3"
        job_prefix="shb"
        qos="embers"
        extra_sbatch="#SBATCH --requeue"
        test_time="03:00:00"
        bench_time="04:00:00"
        gpu_partition_dynamic=true
        ;;
    frontier|frontier_amd)
        # The two Frontier variants differ only in the compiler toolchain.
        if [ "$cluster" = "frontier_amd" ]; then
            compiler_flag="famd"
        else
            compiler_flag="f"
        fi
        account="CFD154"
        job_prefix="MFC"
        qos="develop"
        extra_sbatch=""
        test_time="01:59:00"
        bench_time="01:59:00"
        gpu_partition_dynamic=false
        ;;
    *)
        echo "ERROR: Unknown cluster '$cluster'"
        exit 1
        ;;
esac

# --- Time limit: benchmark jobs get the longer window ---
case "$job_type" in
    bench) sbatch_time="#SBATCH -t $bench_time" ;;
    *)     sbatch_time="#SBATCH -t $test_time" ;;
esac

# --- Device-specific SBATCH options ---
# Builds sbatch_device_opts, a multi-line string of #SBATCH directives
# chosen by device and cluster. The directive lines start at column 0 so
# they land flush-left inside the generated batch script.
case "$device" in
    cpu)
        case "$cluster" in
            phoenix)
                sbatch_device_opts="\
#SBATCH -p cpu-small
#SBATCH --ntasks-per-node=24
#SBATCH --mem-per-cpu=2G"
                ;;
            frontier|frontier_amd)
                sbatch_device_opts="\
#SBATCH -n 32
#SBATCH -p service"
                ;;
        esac
        ;;
    gpu)
        # Determine the GPU partition to target.
        gpu_partition="batch"
        if [ "$gpu_partition_dynamic" = "true" ]; then
            if [ -n "${BENCH_GPU_PARTITION:-}" ]; then
                # Parallel benchmarks pre-select a partition so PR and master
                # jobs land on the same GPU type.
                gpu_partition="$BENCH_GPU_PARTITION"
                echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)"
            else
                source "${SCRIPT_DIR}/select-gpu-partition.sh"
                gpu_partition="$SELECTED_GPU_PARTITION"
            fi
        fi

        case "$cluster" in
            phoenix)
                sbatch_device_opts="\
#SBATCH -p $gpu_partition
#SBATCH --ntasks-per-node=4
#SBATCH -G2
#SBATCH --exclude=atl1-1-03-002-29-0"
                ;;
            frontier|frontier_amd)
                sbatch_device_opts="\
#SBATCH -n 8
#SBATCH -p service"
                ;;
        esac
        ;;
    *)
        usage
        exit 1
        ;;
esac

# --- Job slug: sanitized script name + device/interface (+ optional shard) ---
shard_suffix=""
if [ -n "$shard" ]; then
    # First "/" becomes "-of-", e.g. "1/4" -> "-1-of-4".
    shard_suffix="-${shard/\//-of-}"
fi
slug_base="$(basename "$script_path")"
slug_base="${slug_base%.sh}"                 # drop a trailing .sh
slug_base="${slug_base//[^a-zA-Z0-9]/-}"     # keep only SLURM-safe characters
job_slug="${slug_base}-${device}-${interface}${shard_suffix}"
output_file="$job_slug.out"
id_file="${job_slug}.slurm_job_id"

# --- Idempotency: cancel stale jobs from previous runs ---
if [ -f "$id_file" ]; then
    existing_id=$(<"$id_file")
    state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
    case "${state:-UNKNOWN}" in
        RUNNING|PENDING|REQUEUED|COMPLETING)
            echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission"
            scancel "$existing_id" 2>/dev/null || true
            ;;
        *)
            echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh"
            ;;
    esac
    rm -f "$id_file"
fi

# Remove stale output file so the monitor doesn't pick up old content
# (a previous SLURM job's epilog can write to the .out file after our
# stale-job check, polluting the new job's output stream).
rm -f "$output_file"

# --- Module load mode (short form): g = GPU modules, c = CPU modules ---
if [ "$device" = "gpu" ]; then
    module_mode="g"
else
    module_mode="c"
fi

# --- Submit ---
# The payload script is inlined into the generated batch script via heredoc;
# job_* assignments pass device/interface/shard/cluster through to it.
# Unescaped $vars expand now (submission side); \$ forms expand inside the job.
#
# `|| true`: under set -euo pipefail a failed sbatch would otherwise abort the
# script here, making the "ERROR: Failed to submit job" diagnostic below
# unreachable.
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J ${job_prefix}-${job_slug}
#SBATCH --account=${account}
#SBATCH -N 1
${sbatch_device_opts}
${sbatch_time}
#SBATCH --qos=${qos}
${extra_sbatch}
#SBATCH -o ${output_file}

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$device"
job_interface="$interface"
job_shard="$shard"
job_cluster="$cluster"

. ./mfc.sh load -c $compiler_flag -m $module_mode

$sbatch_script_contents

EOT
) || true

# Extract the job ID from sbatch's "Submitted batch job <id>" line.
# head -n1 guards against stray numbers elsewhere in the output producing a
# multiline id; `|| true` keeps a no-match grep from killing the script under
# set -euo pipefail before the error message below can print.
job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1 || true)
if [ -z "$job_id" ]; then
echo "ERROR: Failed to submit job. sbatch output:"
echo "$submit_output"
exit 1
fi

echo "Submitted batch job $job_id"
echo "$job_id" > "$id_file"
echo "Job ID written to $id_file"

# --- Monitor until completion; our exit status is the monitor's ---
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
Loading
Loading