diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 78fdffa9a..a8457b65f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2716,3 +2716,36 @@ dsv4-fp4-mi355x-sglang-agentic:
 # async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
 # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
 # probe to validate the ROCm DP+EP path.
+
+dsv4-fp4-mi355x-atom-disagg:
+  # TODO(srok): temporary dev image; will be updated.
+  image: rocm/atom-dev:nightly_202606071111-Jasen-fix_dockerfile
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D: one prefill worker and one decode worker, each TP8 with DP attention.
+      # TODO(srok): spot check only (single concurrency point).
+      - conc-list: [ 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh
new file mode 100644
index 000000000..75c727c6c
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/env_atom.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# ATOM/mooncake-specific environment setup for multi-node disaggregated serving.
+#
+# Sourced by server_atom.sh in place of env.sh (which is SGLang/MoRI-specific).
+#
+# ENVIRONMENT VARIABLES (optional):
+#   IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,...)
+#               Set by the runner, else auto-detected via ibv_devinfo; only a warning is logged if neither works.
+
+set -x  # trace the exports below; tracing is switched off again at the end of this file
+
+export PYTHONUNBUFFERED=1
+export PYTHONDONTWRITEBYTECODE=1
+
+# =============================================================================
+# IBDEVICES detection (same as env.sh)
+# =============================================================================
+
+if [[ -z "$IBDEVICES" ]]; then
+    DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
+    if [[ -n "$DETECTED" ]]; then
+        export IBDEVICES="$DETECTED"
+        echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
+    else
+        # ATOM uses mooncake proxy_ip/handshake_port for KV transfer — IBDEVICES is
+        # not passed as a server argument (unlike SGLang --disaggregation-ib-device).
+        # Log a warning but do not fail; mooncake will use its own RDMA device selection.
+        echo "[WARN] Unable to detect RDMA devices via ibv_devinfo; IBDEVICES unset (non-fatal for ATOM/mooncake)" >&2
+    fi
+else
+    echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
+fi
+export IBDEVICES
+
+# =============================================================================
+# ATOM/mooncake-specific environment
+# =============================================================================
+
+# mooncake RDMA KV transfer library path
+export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-}
+
+# ATOM MoE gather/scatter interleave optimization
+export ATOM_MOE_GU_ITLV=1
+
+# NOTE(review): ATOM_HOST_IP is not exported here or in server_atom.sh; the per-node host_ip is passed as proxy_ip in --kv-transfer-config instead.
+
+# aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting)
+export AITER_LOG_LEVEL=WARNING
+
+# Disable bf16->fp8 MoE bound (matches reference script)
+export AITER_BF16_FP8_MOE_BOUND=0
+
+# NOTE(review): no stale-ATOM-cache cleanup is performed here, and server_atom.sh contains
+# no rm -rf step either; add one there if the cache must be cleared on startup.
+
+set +x  # this file is sourced, so xtrace is also left off in the calling shell
+
+echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES  LD_LIBRARY_PATH includes mooncake"
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 5e8e67606..2ac2a7f28 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -25,8 +25,11 @@ echo ""
 # at runtime, but the CWD remains the submit-time directory (amd_utils/).
+# vllm-disagg and atom-disagg have their own model tables; any other engine (e.g. sglang-disagg) uses models.yaml.
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     MODELS_YAML="$(pwd)/models_vllm.yaml"
+elif [[ "$ENGINE" == "atom-disagg" ]]; then
+    MODELS_YAML="$(pwd)/models_atom.yaml"
 else
     MODELS_YAML="$(pwd)/models.yaml"
 fi
 
 if [[ ! -f "$MODELS_YAML" ]]; then
@@ -402,6 +402,20 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
         -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
         -e PYTHONPYCACHEPREFIX=/tmp/pycache
     )
+elif [[ "$ENGINE" == "atom-disagg" ]]; then  # container env read by server_atom.sh; ROUTER_PORT defaults to 30000 here (server_atom.sh alone falls back to 8000)
+    DOCKER_ENV_ENGINE=(
+        -e ATOM_WS_PATH=${WS_PATH}
+        -e PREFILL_PORT=${PREFILL_PORT:-8010}
+        -e DECODE_PORT=${DECODE_PORT:-8020}
+        -e ROUTER_PORT=${ROUTER_PORT:-30000}
+        -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301}
+        -e MEM_FRACTION=${MEM_FRACTION:-0.85}
+        -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8}
+        -e BLOCK_SIZE=${BLOCK_SIZE:-16}
+        -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
+        -e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-}
+        -e IBDEVICES=${IBDEVICES:-}
+    )
 else
     DOCKER_ENV_ENGINE=(
         -e SGLANG_WS_PATH=${WS_PATH}
@@ -425,6 +439,83 @@ echo \"Rank \$SLURM_PROCID on \$(hostname)\"
 eval \"\$DOCKER_CMD_DETECT\"
 echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\"
 
+# Enable out-of-tree RDMA library mounts for atom-disagg (mooncake requires host RDMA stack)
+RDMA_MOUNTS=()
+if [[ \"$ENGINE\" == \"atom-disagg\" ]]; then
+
+# When the container base OS differs from the host (e.g. Ubuntu 24.04 image
+# on a 22.04 host), the container's bundled libibverbs/libionic may be
+# ABI-incompatible with the host kernel drivers.  Detect the NIC type and
+# bind-mount the host's out-of-tree RDMA userspace libraries into the
+# container so the RDMA stack always matches the running kernel.
+_detect_nic_type() {  # prints bnxt, ionic or mlx5; a non-empty MORI_NIC_TYPE overrides detection
+    if [[ -n \"\${MORI_NIC_TYPE:-}\" ]]; then echo \"\$MORI_NIC_TYPE\"; return; fi
+    local bnxt=0 mlx5=0 ionic=0
+    if [[ -d /sys/class/infiniband ]]; then
+        for dev in /sys/class/infiniband/*; do
+            local name; name=\$(basename \"\$dev\")
+            case \"\$name\" in
+                bnxt_re*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;;
+                *)
+                    local drv; drv=\$(basename \"\$(readlink -f \"\$dev/device/driver\" 2>/dev/null)\" 2>/dev/null || true)
+                    case \"\$drv\" in bnxt*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; esac ;;
+            esac
+        done
+    fi
+    if (( bnxt >= mlx5 && bnxt >= ionic && bnxt > 0 )); then echo bnxt
+    elif (( ionic >= mlx5 && ionic > 0 )); then echo ionic
+    else echo mlx5; fi
+}
+
+_find_host_ibverbs() {  # prints the resolved path of the first host libibverbs.so.1 found, or nothing
+    for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do
+        local r; r=\$(readlink -f \"\$c\" 2>/dev/null || true)
+        if [[ -f \"\$r\" ]]; then echo \"\$r\"; return; fi
+    done
+}
+
+_NIC_TYPE=\$(_detect_nic_type)
+echo \"[rdma] NIC type: \${_NIC_TYPE} on \$(hostname)\"
+
+if [[ \"\$_NIC_TYPE\" == \"ionic\" || \"\$_NIC_TYPE\" == \"bnxt\" ]]; then
+    _host_ibv=\$(_find_host_ibverbs)
+    if [[ -n \"\$_host_ibv\" ]]; then
+        RDMA_MOUNTS+=(-v \"\$_host_ibv:/lib/x86_64-linux-gnu/libibverbs.so.1\")
+    fi
+fi
+
+if [[ \"\$_NIC_TYPE\" == \"ionic\" ]]; then
+    for _dir in /usr/local/lib /usr/lib/x86_64-linux-gnu; do
+        for _lib in \"\$_dir\"/libionic*.so; do
+            [[ -f \"\$_lib\" ]] || continue
+            _real=\$(readlink -f \"\$_lib\")
+            [[ -f \"\$_real\" ]] && RDMA_MOUNTS+=(-v \"\$_real:\$_real\")
+            RDMA_MOUNTS+=(-v \"\$_lib:/usr/lib/x86_64-linux-gnu/\$(basename \"\$_lib\")\")
+        done
+    done
+    if [[ -d /usr/lib/x86_64-linux-gnu/libibverbs ]]; then
+        for _lib in /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav*.so; do
+            [[ -f \"\$_lib\" ]] && RDMA_MOUNTS+=(-v \"\$_lib:\$_lib\")
+        done
+    fi
+    [[ -d /etc/libibverbs.d ]] && RDMA_MOUNTS+=(-v /etc/libibverbs.d:/etc/libibverbs.d:ro)
+elif [[ \"\$_NIC_TYPE\" == \"bnxt\" ]]; then
+    for _lib in /usr/local/lib/libbnxt_re-rdmav*.so; do
+        [[ -f \"\$_lib\" ]] && RDMA_MOUNTS+=(-v \"\$_lib:/usr/lib/x86_64-linux-gnu/libibverbs/\$(basename \"\$_lib\")\")
+    done
+    for _lib in /usr/local/lib/libbnxt_re.so; do
+        [[ -f \"\$_lib\" ]] && RDMA_MOUNTS+=(-v \"\$_lib:/usr/lib/x86_64-linux-gnu/\$(basename \"\$_lib\")\")
+    done
+    [[ -d /etc/libibverbs.d ]] && RDMA_MOUNTS+=(-v /etc/libibverbs.d:/etc/libibverbs.d:ro)
+fi
+
+if [[ \${#RDMA_MOUNTS[@]} -gt 0 ]]; then
+    echo \"[rdma] bind-mounts: \${RDMA_MOUNTS[*]}\"
+else
+    echo \"[rdma] no out-of-tree RDMA mounts needed\"
+fi
+fi  # end: if ENGINE == atom-disagg
+
 # Pre-clean (idempotent)
 \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true
 \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true
@@ -490,6 +581,7 @@ fi
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
     -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
     ${EXTRA_DOCKER_MOUNTS:-} \
+    \${RDMA_MOUNTS[@]+\"\${RDMA_MOUNTS[@]}\"} \
     ${DOCKER_ENV_COMMON[*]} \
     ${DOCKER_ENV_ENGINE[*]} \
     --name \"$DOCKER_CONT_NAME\" \
diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
new file mode 100644
index 000000000..9066b4d7a
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -0,0 +1,72 @@
+# Model-specific server configurations for ATOM (atom-disagg) disaggregated inference.
+#
+# Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR).
+#
+# To add a new model: add a new top-level entry following the same schema.
+# No script changes are required.
+#
+# Schema:
+#   <model-name>:
+#     base_flags: str          # Common flags for both prefill and decode
+#     mtp_flags: str           # Appended to decode when DECODE_MTP_SIZE > 0
+#     dp_flags: str            # Appended when DP is enabled (prefill or decode)
+#     prefill:
+#       mem_fraction_static: float
+#       disable_radix_cache: bool
+#       dp:                              # Config when data-parallel attention is enabled
+#         max_running_requests: int
+#         chunked_prefill_size: str      # Can be integer or bash arithmetic expression
+#         cuda_graph_bs: str             # Space-separated values
+#       no_dp:                           # Config when data-parallel attention is disabled
+#         max_running_requests: int
+#         chunked_prefill_size: int
+#         cuda_graph_bs_range: str       # "start-end" expanded via seq
+#     decode:
+#       mem_fraction_static: float
+#       prefill_round_robin_balance: bool
+#       dp:
+#         max_running_requests: int
+#         chunked_prefill_size: str
+#         cuda_graph_bs_range: str
+#       ep_only:                         # Config when EP is enabled but DP is disabled
+#         max_running_requests: int
+#         chunked_prefill_size: int
+#         cuda_graph_bs_range: str
+#       no_dp:
+#         max_running_requests: int
+#         chunked_prefill_size: int
+#         cuda_graph_bs_range: str
+
+DeepSeek-V4-Pro:
+  # ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS
+  # directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by
+  # server_atom.sh; they are kept here for documentation and potential future use.
+  base_flags: ""
+  mtp_flags: ""
+  dp_flags: ""
+  prefill:
+    mem_fraction_static: 0.85
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: false
+    dp:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 5c441a793..b62ca5816 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -1,19 +1,23 @@
 #!/bin/bash
-# Dual-Engine Disaggregated Server Dispatcher
+# Multi-Engine Disaggregated Server Dispatcher
 # =============================================================================
 # Dispatches to the engine-specific server launcher based on ENGINE env var.
 #   ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI)
 #   ENGINE=vllm-disagg             -> server_vllm.sh  (vLLM + Nixl/MoRI-IO)
+#   ENGINE=atom-disagg             -> server_atom.sh  (ATOM + mooncake)
 # =============================================================================
 
 ENGINE="${ENGINE:-sglang-disagg}"
-WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}"
+WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-${ATOM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}}"
 export WS_PATH ENGINE
 
 echo "[DISPATCHER] ENGINE=$ENGINE  WS_PATH=$WS_PATH"
 
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     source "$WS_PATH/server_vllm.sh"
+elif [[ "$ENGINE" == "atom-disagg" ]]; then
+    export ATOM_WS_PATH="$WS_PATH"  # server_atom.sh locates setup_deps.sh, env_atom.sh, sync.py and bench.sh via ATOM_WS_PATH
+    source "$WS_PATH/server_atom.sh"
 else
     source "$WS_PATH/server_sglang.sh"
 fi
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
new file mode 100644
index 000000000..9bbebfd6a
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -0,0 +1,336 @@
+#!/bin/bash
+# ATOM Disaggregated Server Launcher
+# =============================================================================
+# Uses atom.entrypoints.openai_server with mooncake RDMA KV transfer.
+# Mirrors server_sglang.sh topology (dynamic xP/yD) but adapts to ATOM's
+# explicit kv-transfer-config and atomesh router.
+#
+# Key differences from server_sglang.sh:
+#   - Engine: atom.entrypoints.openai_server  (not sglang.launch_server)
+#   - KV transfer: mooncake (--kv-transfer-config JSON)
+#   - Router: atomesh  (not sglang_router)
+#   - Prefill port: $PREFILL_PORT (default 8010) / Decode port: $DECODE_PORT (default 8020)
+#   - Router port: $ROUTER_PORT (default 8000 when unset; job.slurm passes 30000 by default)
+# =============================================================================
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+NODE0_ADDR="${NODE0_ADDR:-localhost}"
+NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}"
+MODEL_NAME="${MODEL_NAME:-}"
+
+xP="${xP:-1}"
+yD="${yD:-1}"
+
+IPADDRS="${IPADDRS:-localhost}"
+
+# Parallelism
+PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
+DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
+
+# ATOM server ports (different from SGLang which uses 8000 for all)
+PREFILL_PORT="${PREFILL_PORT:-8010}"
+DECODE_PORT="${DECODE_PORT:-8020}"
+ROUTER_PORT="${ROUTER_PORT:-8000}"
+HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"
+
+# ATOM server tuning (from reference script defaults)
+MEM_FRACTION="${MEM_FRACTION:-0.85}"
+KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
+BLOCK_SIZE="${BLOCK_SIZE:-16}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
+EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}"
+
+# Benchmark Configuration
+BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
+BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
+BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
+BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+
+DRY_RUN="${DRY_RUN:-0}"
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+
+source $ATOM_WS_PATH/setup_deps.sh
+source $ATOM_WS_PATH/env_atom.sh
+
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}')
+if [[ -z "$host_ip" ]]; then
+    host_ip=$(hostname -I 2>/dev/null | awk '{print $1}')
+fi
+host_name=$(hostname)
+
+# =============================================================================
+# Cluster Topology Configuration
+# =============================================================================
+
+IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
+
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
+
+# Build prefill IP list and atomesh --prefill args
+PREFILL_ARGS=""
+PREFILL_IPS=()
+for i in $(seq 0 $((xP - 1))); do
+    idx=$((i * PREFILL_NODES_PER_WORKER))
+    PREFILL_IPS[$i]="${IP_ARRAY[$idx]}"
+    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$idx]}:${PREFILL_PORT}"
+done
+
+# Build decode IP list and atomesh --decode args
+DECODE_ARGS=""
+DECODE_IPS=()
+for i in $(seq 0 $((yD - 1))); do
+    idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
+    DECODE_IPS[$i]="${IP_ARRAY[$idx]}"
+    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}"
+done
+
+echo "Prefill IPs : ${PREFILL_IPS[*]}"
+echo "Decode  IPs : ${DECODE_IPS[*]}"
+
+# =============================================================================
+# Container Synchronization
+# =============================================================================
+
+echo "Waiting at the container creation barrier on $host_name"
+python3 $ATOM_WS_PATH/sync.py barrier \
+    --local-ip ${host_ip} \
+    --local-port 5000 \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports 5000 \
+    --wait-for-all-ports \
+    --timeout 300
+
+# =============================================================================
+# Node Role Assignment
+#
+# Role mapping (same as server_sglang.sh):
+#   rank 0                          -> prefill node 0 + router
+#   rank 1 .. (NODE_OFFSET-1)       -> remaining prefill nodes
+#   rank NODE_OFFSET ..             -> decode nodes
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    # ──────────────────────────────────────────────────────────────────────────
+    # Node 0: prefill server (producer) + atomesh router
+    # ──────────────────────────────────────────────────────────────────────────
+    echo "NODE INFO ======================================="
+    echo "${host_name}:${host_ip} is Prefill Node 0 + Router"
+    echo "Prefill TP=${PREFILL_TP_SIZE}, Decode TP=${DECODE_TP_SIZE}"
+    echo "Prefill servers: ${PREFILL_ARGS}"
+    echo "Decode  servers: ${DECODE_ARGS}"
+    echo "================================================"
+
+    PREFILL_CMD="python3 -m atom.entrypoints.openai_server \
+        --model ${MODEL_DIR}/${MODEL_NAME} \
+        --host 0.0.0.0 --server-port ${PREFILL_PORT} \
+        --trust-remote-code \
+        -tp ${PREFILL_TP_SIZE} \
+        --enable-dp-attention \
+        --kv_cache_dtype ${KV_CACHE_DTYPE} \
+        --block-size ${BLOCK_SIZE} \
+        --gpu-memory-utilization ${MEM_FRACTION} \
+        --max-num-seqs ${MAX_NUM_SEQS} \
+        --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
+        ${EXTRA_SERVER_ARGS}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill0_pid=$!
+    fi
+
+    # Wait for every server to listen: ranks below NODE_OFFSET serve on PREFILL_PORT and the
+    # remaining (decode) ranks on DECODE_PORT, so each group is polled on its own port.
+    echo "Waiting for all servers to be up..."
+    prefill_node_ips=$(IFS=,; echo "${IP_ARRAY[*]:0:NODE_OFFSET}")
+    decode_node_ips=$(IFS=,; echo "${IP_ARRAY[*]:NODE_OFFSET}")
+    BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \
+        --node-ips ${prefill_node_ips} --node-ports ${PREFILL_PORT} \
+        --wait-for-all-ports --timeout 1800; \
+        python3 $ATOM_WS_PATH/sync.py barrier \
+        --node-ips ${decode_node_ips} --node-ports ${DECODE_PORT} \
+        --wait-for-all-ports --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BARRIER_CMD"; else eval "$BARRIER_CMD"; fi
+    echo "All servers up. Starting atomesh router..."
+
+    ROUTER_CMD="/usr/local/bin/atomesh launch \
+        --host 0.0.0.0 --port ${ROUTER_PORT} \
+        --pd-disaggregation \
+        ${PREFILL_ARGS} \
+        ${DECODE_ARGS} \
+        --policy random \
+        --backend atom \
+        --log-level info \
+        --disable-health-check \
+        --disable-circuit-breaker \
+        --prometheus-port 29100"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $ROUTER_CMD"
+    else
+        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_router_${host_name}.log"
+        set -x
+        eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
+        set +x
+        proxy_pid=$!
+
+        # Wait for router to accept connections
+        HEALTH_BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \
+            --node-ips ${NODE0_ADDR} \
+            --node-ports ${ROUTER_PORT} \
+            --wait-for-all-ports \
+            --timeout 3000"
+        eval "$HEALTH_BARRIER_CMD"
+        echo "Router is ready for benchmarking"
+    fi
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+
+    cd $ATOM_WS_PATH
+
+    BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Copy results
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
+        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+    fi
+
+    echo "Killing router and prefill server"
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $proxy_pid
+        kill $prefill0_pid
+    fi
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    # ──────────────────────────────────────────────────────────────────────────
+    # Prefill nodes 1..N (kv_producer)
+    # ──────────────────────────────────────────────────────────────────────────
+    echo "${host_name}:${host_ip} is Prefill Node (rank ${NODE_RANK})"
+
+    # Worker index and head-node IP for this rank (informational: not referenced by the launch command below)
+    prefill_worker_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
+    PREFILL_HEADNODE_IP="${PREFILL_IPS[$prefill_worker_idx]}"
+
+    PREFILL_CMD="python3 -m atom.entrypoints.openai_server \
+        --model ${MODEL_DIR}/${MODEL_NAME} \
+        --host 0.0.0.0 --server-port ${PREFILL_PORT} \
+        --trust-remote-code \
+        -tp ${PREFILL_TP_SIZE} \
+        --enable-dp-attention \
+        --kv_cache_dtype ${KV_CACHE_DTYPE} \
+        --block-size ${BLOCK_SIZE} \
+        --gpu-memory-utilization ${MEM_FRACTION} \
+        --max-num-seqs ${MAX_NUM_SEQS} \
+        --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
+        ${EXTRA_SERVER_ARGS}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for router to be up..."
+    BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 3600"
+    if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BARRIER_CMD"; else eval "$BARRIER_CMD"; fi
+
+    echo "Waiting until router closes..."
+    WAIT_CMD="python3 $ATOM_WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+    if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $WAIT_CMD"; else eval "$WAIT_CMD"; fi
+
+    echo "Killing prefill server (rank ${NODE_RANK})"
+    if [[ "$DRY_RUN" -eq 0 ]]; then kill $prefill_pid; fi
+
+else
+    # ──────────────────────────────────────────────────────────────────────────
+    # Decode nodes (kv_consumer)
+    # ──────────────────────────────────────────────────────────────────────────
+    RANK=$((NODE_RANK - NODE_OFFSET))
+    echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})"
+
+    DECODE_CMD="python3 -m atom.entrypoints.openai_server \
+        --model ${MODEL_DIR}/${MODEL_NAME} \
+        --host 0.0.0.0 --server-port ${DECODE_PORT} \
+        --trust-remote-code \
+        -tp ${DECODE_TP_SIZE} \
+        --enable-dp-attention \
+        --kv_cache_dtype ${KV_CACHE_DTYPE} \
+        --block-size ${BLOCK_SIZE} \
+        --gpu-memory-utilization ${MEM_FRACTION} \
+        --max-num-seqs ${MAX_NUM_SEQS} \
+        --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
+        --cudagraph-capture-sizes '[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]' \
+        ${EXTRA_SERVER_ARGS}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        set -x
+        eval "$DECODE_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log &
+        set +x
+        decode_pid=$!
+    fi
+
+    echo "Waiting for router to be up..."
+    BARRIER_CMD="python3 $ATOM_WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 3600"
+    if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BARRIER_CMD"; else eval "$BARRIER_CMD"; fi
+
+    echo "Waiting until router closes..."
+    WAIT_CMD="python3 $ATOM_WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+    if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $WAIT_CMD"; else eval "$WAIT_CMD"; fi
+
+    echo "Killing decode server (rank ${RANK})"
+    if [[ "$DRY_RUN" -eq 0 ]]; then kill $decode_pid; fi
+fi
+
+echo "Script completed successfully"
+exit 0
diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
new file mode 100644
index 000000000..d17d1a323
--- /dev/null
+++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up launch-script environment variables (exported for ./submit.sh below)
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index d62e6bc4b..62c38b45a 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -77,7 +77,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     fi
 
     SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh"
-    if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then
+    if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]] || [[ "$FRAMEWORK" == "atom-disagg" ]]; then
         BENCHMARK_SUBDIR="multi_node"
     else
         BENCHMARK_SUBDIR="single_node/fixed_seq_len"