diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78fdffa9a..a8457b65f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2716,3 +2716,36 @@ dsv4-fp4-mi355x-sglang-agentic: # async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 # probe to validate the ROCm DP+EP path. + +dsv4-fp4-mi355x-atom-disagg: + #TODO: (srok), temporary dev img. will update + image: rocm/atom-dev:nightly_202606071111-Jasen-fix_dockerfile + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P1D DP+TP8 + # TODO: (srok), spot check + - conc-list: [ 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh new file mode 100644 index 000000000..75c727c6c --- /dev/null +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# ATOM/mooncake-specific environment setup for multi-node disaggregated serving. +# +# Sourced by server_atom.sh in place of env.sh (which is SGLang/MoRI-specific). +# +# REQUIRED ENVIRONMENT VARIABLES: +# IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,...) +# Set by runner or auto-detected from hostname. 
+ +set -x + +export PYTHONUNBUFFERED=1 +export PYTHONDONTWRITEBYTECODE=1 + +# ============================================================================= +# IBDEVICES detection (same as env.sh) +# ============================================================================= + +if [[ -z "$IBDEVICES" ]]; then + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)" + else + # ATOM uses mooncake proxy_ip/handshake_port for KV transfer — IBDEVICES is + # not passed as a server argument (unlike SGLang --disaggregation-ib-device). + # Log a warning but do not fail; mooncake will use its own RDMA device selection. + echo "[WARN] Unable to detect RDMA devices via ibv_devinfo; IBDEVICES unset (non-fatal for ATOM/mooncake)" >&2 + fi +else + echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" +fi +export IBDEVICES + +# ============================================================================= +# ATOM/mooncake-specific environment +# ============================================================================= + +# mooncake RDMA KV transfer library path +export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} + +# ATOM MoE gather/scatter interleave optimization +export ATOM_MOE_GU_ITLV=1 + +# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) + +# aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) +export AITER_LOG_LEVEL=WARNING + +# Disable bf16->fp8 MoE bound (matches reference script) +export AITER_BF16_FP8_MOE_BOUND=0 + +# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf) +# No env var needed; documented here for reference. 
+ +set +x + +echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake" diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 5e8e67606..2ac2a7f28 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -25,8 +25,11 @@ echo "" # at runtime, but the CWD remains the submit-time directory (amd_utils/). if [[ "$ENGINE" == "vllm-disagg" ]]; then MODELS_YAML="$(pwd)/models_vllm.yaml" +elif [[ "$ENGINE" == "atom-disagg" ]]; then + MODELS_YAML="$(pwd)/models_atom.yaml" else + # Every other engine (e.g. sglang-disagg) keeps using the SGLang models.yaml MODELS_YAML="$(pwd)/models.yaml" fi if [[ ! -f "$MODELS_YAML" ]]; then @@ -402,6 +402,20 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} -e PYTHONPYCACHEPREFIX=/tmp/pycache ) +elif [[ "$ENGINE" == "atom-disagg" ]]; then + DOCKER_ENV_ENGINE=( + -e ATOM_WS_PATH=${WS_PATH} + -e PREFILL_PORT=${PREFILL_PORT:-8010} + -e DECODE_PORT=${DECODE_PORT:-8020} + -e ROUTER_PORT=${ROUTER_PORT:-30000} + -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301} + -e MEM_FRACTION=${MEM_FRACTION:-0.85} + -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8} + -e BLOCK_SIZE=${BLOCK_SIZE:-16} + -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256} + -e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-} + -e IBDEVICES=${IBDEVICES:-} + ) else DOCKER_ENV_ENGINE=( -e SGLANG_WS_PATH=${WS_PATH} @@ -425,6 +439,83 @@ echo \"Rank \$SLURM_PROCID on \$(hostname)\" eval \"\$DOCKER_CMD_DETECT\" echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" +# Enable out-of-tree RDMA library mounts for atom-disagg (mooncake requires host RDMA stack) +RDMA_MOUNTS=() +if [[ "$ENGINE" == "atom-disagg" ]]; then + +# When the container base OS differs from the host (e.g. Ubuntu 24.04 image +# on a 22.04 host), the container's bundled libibverbs/libionic may be +# ABI-incompatible with the host kernel drivers. 
Detect the NIC type and +# bind-mount the host's out-of-tree RDMA userspace libraries into the +# container so the RDMA stack always matches the running kernel. +_detect_nic_type() { + if [[ -n \"\${MORI_NIC_TYPE:-}\" ]]; then echo \"\$MORI_NIC_TYPE\"; return; fi + local bnxt=0 mlx5=0 ionic=0 + if [[ -d /sys/class/infiniband ]]; then + for dev in /sys/class/infiniband/*; do + local name; name=\$(basename \"\$dev\") + case \"\$name\" in + bnxt_re*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; + *) + local drv; drv=\$(basename \"\$(readlink -f \"\$dev/device/driver\" 2>/dev/null)\" 2>/dev/null || true) + case \"\$drv\" in bnxt*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; esac ;; + esac + done + fi + if (( bnxt >= mlx5 && bnxt >= ionic && bnxt > 0 )); then echo bnxt + elif (( ionic >= mlx5 && ionic > 0 )); then echo ionic + else echo mlx5; fi +} + +_find_host_ibverbs() { + for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do + local r; r=\$(readlink -f \"\$c\" 2>/dev/null || true) + if [[ -f \"\$r\" ]]; then echo \"\$r\"; return; fi + done +} + +_NIC_TYPE=\$(_detect_nic_type) +echo \"[rdma] NIC type: \${_NIC_TYPE} on \$(hostname)\" + +if [[ \"\$_NIC_TYPE\" == \"ionic\" || \"\$_NIC_TYPE\" == \"bnxt\" ]]; then + _host_ibv=\$(_find_host_ibverbs) + if [[ -n \"\$_host_ibv\" ]]; then + RDMA_MOUNTS+=(-v \"\$_host_ibv:/lib/x86_64-linux-gnu/libibverbs.so.1\") + fi +fi + +if [[ \"\$_NIC_TYPE\" == \"ionic\" ]]; then + for _dir in /usr/local/lib /usr/lib/x86_64-linux-gnu; do + for _lib in \"\$_dir\"/libionic*.so; do + [[ -f \"\$_lib\" ]] || continue + _real=\$(readlink -f \"\$_lib\") + [[ -f \"\$_real\" ]] && RDMA_MOUNTS+=(-v \"\$_real:\$_real\") + RDMA_MOUNTS+=(-v \"\$_lib:/usr/lib/x86_64-linux-gnu/\$(basename \"\$_lib\")\") + done + done + if [[ -d /usr/lib/x86_64-linux-gnu/libibverbs ]]; then + for _lib in /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav*.so; do + [[ -f \"\$_lib\" 
]] && RDMA_MOUNTS+=(-v \"\$_lib:\$_lib\") + done + fi + [[ -d /etc/libibverbs.d ]] && RDMA_MOUNTS+=(-v /etc/libibverbs.d:/etc/libibverbs.d:ro) +elif [[ \"\$_NIC_TYPE\" == \"bnxt\" ]]; then + for _lib in /usr/local/lib/libbnxt_re-rdmav*.so; do + [[ -f \"\$_lib\" ]] && RDMA_MOUNTS+=(-v \"\$_lib:/usr/lib/x86_64-linux-gnu/libibverbs/\$(basename \"\$_lib\")\") + done + for _lib in /usr/local/lib/libbnxt_re.so; do + [[ -f \"\$_lib\" ]] && RDMA_MOUNTS+=(-v \"\$_lib:/usr/lib/x86_64-linux-gnu/\$(basename \"\$_lib\")\") + done + [[ -d /etc/libibverbs.d ]] && RDMA_MOUNTS+=(-v /etc/libibverbs.d:/etc/libibverbs.d:ro) +fi + +if [[ \${#RDMA_MOUNTS[@]} -gt 0 ]]; then + echo \"[rdma] bind-mounts: \${RDMA_MOUNTS[*]}\" +else + echo \"[rdma] no out-of-tree RDMA mounts needed\" +fi +fi # end: if ENGINE == atom-disagg + # Pre-clean (idempotent) \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true @@ -490,6 +581,7 @@ fi -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ ${EXTRA_DOCKER_MOUNTS:-} \ + \${RDMA_MOUNTS[@]+"\${RDMA_MOUNTS[@]}"} \ ${DOCKER_ENV_COMMON[*]} \ ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml new file mode 100644 index 000000000..9066b4d7a --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -0,0 +1,72 @@ +# Model-specific SGLang server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. 
+# +# Schema: +# : +# base_flags: str # Common flags for both prefill and decode +# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0 +# dp_flags: str # Appended when DP is enabled (prefill or decode) +# prefill: +# mem_fraction_static: float +# disable_radix_cache: bool +# dp: # Config when data-parallel attention is enabled +# max_running_requests: int +# chunked_prefill_size: str # Can be integer or bash arithmetic expression +# cuda_graph_bs: str # Space-separated values +# no_dp: # Config when data-parallel attention is disabled +# max_running_requests: int +# chunked_prefill_size: int +# cuda_graph_bs_range: str # "start-end" expanded via seq +# decode: +# mem_fraction_static: float +# prefill_round_robin_balance: bool +# dp: +# max_running_requests: int +# chunked_prefill_size: str +# cuda_graph_bs_range: str +# ep_only: # Config when EP is enabled but DP is disabled +# max_running_requests: int +# chunked_prefill_size: int +# cuda_graph_bs_range: str +# no_dp: +# max_running_requests: int +# chunked_prefill_size: int +# cuda_graph_bs_range: str + +DeepSeek-V4-Pro: + # ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS + # directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by + # server_atom.sh; they are kept here for documentation and potential future use. 
+ base_flags: "" + mtp_flags: "" + dp_flags: "" + prefill: + mem_fraction_static: 0.85 + disable_radix_cache: true + dp: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: false + dp: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 5c441a793..b62ca5816 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,19 +1,23 @@ #!/bin/bash -# Dual-Engine Disaggregated Server Dispatcher +# Multi-Engine Disaggregated Server Dispatcher # ============================================================================= # Dispatches to the engine-specific server launcher based on ENGINE env var. 
# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) # ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) +# ENGINE=atom-disagg -> server_atom.sh (ATOM + mooncake) # ============================================================================= ENGINE="${ENGINE:-sglang-disagg}" -WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-${ATOM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}}" export WS_PATH ENGINE echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" if [[ "$ENGINE" == "vllm-disagg" ]]; then source "$WS_PATH/server_vllm.sh" +elif [[ "$ENGINE" == "atom-disagg" ]]; then + export ATOM_WS_PATH="$WS_PATH" + source "$WS_PATH/server_atom.sh" else source "$WS_PATH/server_sglang.sh" fi diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh new file mode 100644 index 000000000..9bbebfd6a --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -0,0 +1,336 @@ +#!/bin/bash +# ATOM Disaggregated Server Launcher +# ============================================================================= +# Uses atom.entrypoints.openai_server with mooncake RDMA KV transfer. +# Mirrors server_sglang.sh topology (dynamic xP/yD) but adapts to ATOM's +# explicit kv-transfer-config and atomesh router. 
+# +# Key differences from server_sglang.sh: +# - Engine: atom.entrypoints.openai_server (not sglang.launch_server) +# - KV transfer: mooncake (--kv-transfer-config JSON) +# - Router: atomesh (not sglang_router) +# - Prefill port: $PREFILL_PORT (default 8010) / Decode port: $DECODE_PORT (default 8020) +# - Router port: $ROUTER_PORT (default 8000) +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" +yD="${yD:-1}" + +IPADDRS="${IPADDRS:-localhost}" + +# Parallelism +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" + +# ATOM server ports (different from SGLang which uses 8000 for all) +PREFILL_PORT="${PREFILL_PORT:-8010}" +DECODE_PORT="${DECODE_PORT:-8020}" +ROUTER_PORT="${ROUTER_PORT:-8000}" +HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" + +# ATOM server tuning (from reference script defaults) +MEM_FRACTION="${MEM_FRACTION:-0.85}" +KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" +BLOCK_SIZE="${BLOCK_SIZE:-16}" +MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" +EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +DRY_RUN="${DRY_RUN:-0}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +# ============================================================================= +# Dependencies and Environment Setup +# 
============================================================================= + +source $ATOM_WS_PATH/setup_deps.sh +source $ATOM_WS_PATH/env_atom.sh + +host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') +if [[ -z "$host_ip" ]]; then + host_ip=$(hostname -I 2>/dev/null | awk '{print $1}') +fi +host_name=$(hostname) + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= + +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill IP list and atomesh --prefill args +PREFILL_ARGS="" +PREFILL_IPS=() +for i in $(seq 0 $((xP - 1))); do + idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_IPS[$i]="${IP_ARRAY[$idx]}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$idx]}:${PREFILL_PORT}" +done + +# Build decode IP list and atomesh --decode args +DECODE_ARGS="" +DECODE_IPS=() +for i in $(seq 0 $((yD - 1))); do + idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_IPS[$i]="${IP_ARRAY[$idx]}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}" +done + +echo "Prefill IPs : ${PREFILL_IPS[*]}" +echo "Decode IPs : ${DECODE_IPS[*]}" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $ATOM_WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# 
Node Role Assignment +# +# Role mapping (same as server_sglang.sh): +# rank 0 -> prefill node 0 + router +# rank 1 .. (NODE_OFFSET-1) -> remaining prefill nodes +# rank NODE_OFFSET .. -> decode nodes +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + # ────────────────────────────────────────────────────────────────────────── + # Node 0: prefill server (producer) + atomesh router + # ────────────────────────────────────────────────────────────────────────── + echo "NODE INFO =======================================" + echo "${host_name}:${host_ip} is Prefill Node 0 + Router" + echo "Prefill TP=${PREFILL_TP_SIZE}, Decode TP=${DECODE_TP_SIZE}" + echo "Prefill servers: ${PREFILL_ARGS}" + echo "Decode servers: ${DECODE_ARGS}" + echo "================================================" + + PREFILL_CMD="python3 -m atom.entrypoints.openai_server \ + --model ${MODEL_DIR}/${MODEL_NAME} \ + --host 0.0.0.0 --server-port ${PREFILL_PORT} \ + --trust-remote-code \ + -tp ${PREFILL_TP_SIZE} \ + --enable-dp-attention \ + --kv_cache_dtype ${KV_CACHE_DTYPE} \ + --block-size ${BLOCK_SIZE} \ + --gpu-memory-utilization ${MEM_FRACTION} \ + --max-num-seqs ${MAX_NUM_SEQS} \ + --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ + ${EXTRA_SERVER_ARGS}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill0_pid=$! + fi + + # Wait for all prefill and decode servers to be ready + echo "Waiting for all servers to be up..." 
+ # Prefill and decode servers listen on different ports, so each role is probed on its own port. + PREFILL_IP_CSV=$(IFS=,; echo "${PREFILL_IPS[*]}") + DECODE_IP_CSV=$(IFS=,; echo "${DECODE_IPS[*]}") + BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier --node-ips ${PREFILL_IP_CSV} --node-ports ${PREFILL_PORT} --wait-for-all-ports --timeout 1800 \ + && python3 $ATOM_WS_PATH/sync.py barrier --node-ips ${DECODE_IP_CSV} --node-ports ${DECODE_PORT} --wait-for-all-ports --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + echo "All servers up. Starting atomesh router..." + + ROUTER_CMD="/usr/local/bin/atomesh launch \ + --host 0.0.0.0 --port ${ROUTER_PORT} \ + --pd-disaggregation \ + ${PREFILL_ARGS} \ + ${DECODE_ARGS} \ + --policy random \ + --backend atom \ + --log-level info \ + --disable-health-check \ + --disable-circuit-breaker \ + --prometheus-port 29100" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_router_${host_name}.log" + set -x + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + set +x + proxy_pid=$! + + # Wait for router to accept connections + HEALTH_BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 3000" + eval "$HEALTH_BARRIER_CMD" + echo "Router is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + + cd $ATOM_WS_PATH + + BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy results + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing router and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill 
$proxy_pid + kill $prefill0_pid + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + # ────────────────────────────────────────────────────────────────────────── + # Prefill nodes 1..N (kv_producer) + # ────────────────────────────────────────────────────────────────────────── + echo "${host_name}:${host_ip} is Prefill Node (rank ${NODE_RANK})" + + # Determine which prefill worker this node belongs to, and its headnode IP + prefill_worker_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_IP="${PREFILL_IPS[$prefill_worker_idx]}" + + PREFILL_CMD="python3 -m atom.entrypoints.openai_server \ + --model ${MODEL_DIR}/${MODEL_NAME} \ + --host 0.0.0.0 --server-port ${PREFILL_PORT} \ + --trust-remote-code \ + -tp ${PREFILL_TP_SIZE} \ + --enable-dp-attention \ + --kv_cache_dtype ${KV_CACHE_DTYPE} \ + --block-size ${BLOCK_SIZE} \ + --gpu-memory-utilization ${MEM_FRACTION} \ + --max-num-seqs ${MAX_NUM_SEQS} \ + --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ + ${EXTRA_SERVER_ARGS}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for router to be up..." + BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 3600" + if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BARRIER_CMD"; else eval "$BARRIER_CMD"; fi + + echo "Waiting until router closes..." 
+ WAIT_CMD="python3 $ATOM_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $WAIT_CMD"; else eval "$WAIT_CMD"; fi + + echo "Killing prefill server (rank ${NODE_RANK})" + if [[ "$DRY_RUN" -eq 0 ]]; then kill $prefill_pid; fi + +else + # ────────────────────────────────────────────────────────────────────────── + # Decode nodes (kv_consumer) + # ────────────────────────────────────────────────────────────────────────── + RANK=$((NODE_RANK - NODE_OFFSET)) + echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})" + + DECODE_CMD="python3 -m atom.entrypoints.openai_server \ + --model ${MODEL_DIR}/${MODEL_NAME} \ + --host 0.0.0.0 --server-port ${DECODE_PORT} \ + --trust-remote-code \ + -tp ${DECODE_TP_SIZE} \ + --enable-dp-attention \ + --kv_cache_dtype ${KV_CACHE_DTYPE} \ + --block-size ${BLOCK_SIZE} \ + --gpu-memory-utilization ${MEM_FRACTION} \ + --max-num-seqs ${MAX_NUM_SEQS} \ + --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ + --cudagraph-capture-sizes '[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]' \ + ${EXTRA_SERVER_ARGS}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + set +x + decode_pid=$! + fi + + echo "Waiting for router to be up..." + BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 3600" + if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BARRIER_CMD"; else eval "$BARRIER_CMD"; fi + + echo "Waiting until router closes..." 
+ WAIT_CMD="python3 $ATOM_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $WAIT_CMD"; else eval "$WAIT_CMD"; fi + + echo "Killing decode server (rank ${RANK})" + if [[ "$DRY_RUN" -eq 0 ]]; then kill $decode_pid; fi +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh new file mode 100644 index 000000000..d17d1a323 --- /dev/null +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by 
a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index d62e6bc4b..62c38b45a 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -77,7 +77,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then fi SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]] || [[ "$FRAMEWORK" == "atom-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node/fixed_seq_len"