From 3867378e524a5827a6e161a807add1558ed38884 Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Wed, 3 Jun 2026 19:55:13 +0300
Subject: [PATCH 1/8] [NV] llm-d: add llm-d-vllm framework with H200 wide-EP
 and simple 1P+1D recipes

Adds the llm-d-vllm benchmark framework, with two H200 multi-node
DeepSeek-R1 fp8 P/D disagg recipes:

  dsr1-fp8-h200-llm-d-vllm
    Wide-EP shape mirroring the upstream llm-d wide-EP-lws guide:
    1 prefill instance + 1 decode instance, each spanning 2 H200
    nodes, DP=16 EP=16, ISL=2k/OSL=2k. Total 4 H200 nodes / 32 GPUs.

  dsr1-fp8-h200-llm-d-vllm-simple
    Phase 0 single-node-per-role 1P+1D shape (DP=8 EP=8 dp-attn,
    NIXL P-to-D KV transfer, no DeepEP / ibgda) for an apples-to-
    apples comparison vs Dynamo's H200 1P+1D entries.

Both recipes point at ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0,
the combined image (ghcr.io/llm-d/llm-d-cuda:v0.7.0 base + EPP +
pd-sidecar + envoy on top). EPP and pd-sidecar binaries come from
the upstream dev tags
(ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main,
 ghcr.io/llm-d/llm-d-router-disagg-sidecar-dev:main).

Configs (epp-config.yaml, envoy.yaml) are NOT baked into the image.
job.slurm bind-mounts them at /etc/epp/config.yaml and
/etc/envoy/envoy.yaml so config-only iteration does not require an
image rebuild.

EPP config mirrors the upstream well-lit-path
guides/pd-disaggregation/router/pd-disaggregation.values.yaml in
github.com/llm-d/llm-d - same plugin set (disagg-headers-handler,
always-disagg-pd-decider, disagg-profile-handler, prefill-filter,
decode-filter, prefix-cache-scorer, queue-scorer,
kv-cache-utilization-scorer, active-request-scorer,
max-score-picker), same prefill/decode profiles and weights. The
only delta vs upstream is the file-discovery plugin pointing at
/tmp/endpoints.yaml; this benchmark runs under SLURM so there is no
K8s control plane to drive endpoint discovery. The coordinator node
writes /tmp/endpoints.yaml at job start; per-recipe variants live
under benchmarks/multi_node/llm-d-recipes/.

envoy.yaml ext_proc is configured for the dev EPP. Setting
request/response body mode to FULL_DUPLEX_STREAMED and trailer +
response-header modes to SEND is required because the dev EPP does
not ack BUFFERED body mode and Envoy times out with 504s.
message_timeout is 1000s to mirror the upstream guide; per-message
generation can take many seconds.

server.sh branches on LWS_GROUP_SIZE: the cross-process DP
coordination flags (--data-parallel-hybrid-lb, size-local, address,
rpc-port, start-rank) are only set when an instance spans more than
one node. /tmp/endpoints.yaml entries carry namespace=inferencex so
that the file-discovery plugin does not drop them (it filters by
namespace; EPP runs with --pool-namespace=inferencex).

runners/launch_h200-dgxc-slurm.sh gains the llm-d-vllm dispatch hook
so the multinode benchmark template can launch this framework.

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 .github/configs/nvidia-master.yaml            |  93 +++++++
 benchmarks/llm-d/Dockerfile                   |  22 ++
 benchmarks/llm-d/README.md                    |  16 ++
 benchmarks/llm-d/envoy.yaml                   |  85 ++++++
 benchmarks/llm-d/epp-config.yaml              |  61 +++++
 .../multi_node/dsr1_fp8_h200_llm-d-vllm.sh    |  48 ++++
 .../dsr1-fp8-h200-1p1d-simple.yaml            | 101 +++++++
 .../dsr1-fp8-h200-1p1d-wideep.yaml            |  98 +++++++
 benchmarks/multi_node/llm-d/README.md         | 133 ++++++++++
 benchmarks/multi_node/llm-d/job.slurm         | 132 ++++++++++
 benchmarks/multi_node/llm-d/server.sh         | 249 ++++++++++++++++++
 benchmarks/multi_node/llm-d/submit.sh         | 107 ++++++++
 runners/launch_h200-dgxc-slurm.sh             |  83 +++++-
 13 files changed, 1227 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/llm-d/Dockerfile
 create mode 100644 benchmarks/llm-d/README.md
 create mode 100644 benchmarks/llm-d/envoy.yaml
 create mode 100644 benchmarks/llm-d/epp-config.yaml
 create mode 100755 benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
 create mode 100644 benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
 create mode 100644 benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml
 create mode 100644 benchmarks/multi_node/llm-d/README.md
 create mode 100644 benchmarks/multi_node/llm-d/job.slurm
 create mode 100755 benchmarks/multi_node/llm-d/server.sh
 create mode 100755 benchmarks/multi_node/llm-d/submit.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ad469b28e..163ee0b89 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11267,3 +11267,96 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           tp: 4
           ep: 4
           dp-attn: true
+
+
+# llm-d-vllm wide-EP P/D disagg on H200.
+#
+# Mirrors the llm-d wide-EP-lws guide reference topology:
+#   1 prefill instance, 2 H200 nodes, DP=16 (EP across 16 ranks)
+#   1 decode  instance, 2 H200 nodes, DP=16
+#   total 4 H200 nodes / 32 GPUs.
+# Phase 1 entry: a single search-space row exercising the wide-EP-lws
+# guide's reference benchmark (ISL=2k / OSL=2k). EPP scheduling, per-role
+# vLLM extra-args, and SLURM time limit all live in
+# benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml,
+# selected via additional-settings: CONFIG_FILE=...
+#
+# Note: each prefill / decode "instance" spans 2 nodes here. The
+# num-worker field in this schema describes Dynamo's worker-per-node
+# model; for llm-d it is fixed at 1 instance per role and the topology
+# is communicated via PREFILL_NODES / DECODE_NODES additional-settings.
+dsr1-fp8-h200-llm-d-vllm:
+  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-multinode
+  precision: fp8
+  framework: llm-d-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 2048
+      osl: 2048
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 2048, 1024, 512, 256, 128 ]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 16
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+          - "RANDOM_RANGE_RATIO=0.05"
+          - "CONFIG_FILE=dsr1-fp8-h200-1p1d-wideep.yaml"
+        decode:
+          num-worker: 1
+          tp: 1
+          ep: 16
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=2"
+
+# llm-d-vllm simple 1P+1D P/D disagg on H200 (Phase 0).
+#
+# Simplest possible multi-node llm-d-vllm shape:
+#   1 prefill node (DP=8 EP=8 dp-attn) + 1 decode node (DP=8 EP=8 dp-attn).
+#   Total 2 H200 nodes. No DeepEP, no NVSHMEM ibgda, no full-mesh RDMA.
+#   KV transfer prefill -> decode via NIXL point-to-point.
+#
+# Apples-to-apples shape vs Dynamo's H200 1P+1D entries (which use
+# sglang or trt; this is the same topology but with vLLM and the llm-d
+# router).
+dsr1-fp8-h200-llm-d-vllm-simple:
+  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-multinode
+  precision: fp8
+  framework: llm-d-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 4, 16, 64, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "RANDOM_RANGE_RATIO=0.05"
+          - "CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml"
+        decode:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/llm-d/Dockerfile b/benchmarks/llm-d/Dockerfile
new file mode 100644
index 000000000..0e5228136
--- /dev/null
+++ b/benchmarks/llm-d/Dockerfile
@@ -0,0 +1,22 @@
+# Combined image for the InferenceX llm-d-vllm framework.
+#
+# Base = ghcr.io/llm-d/llm-d-cuda which already ships vLLM + DeepEP +
+# NVSHMEM + GDRCopy. We add the EPP, the routing-sidecar, and Envoy on top
+# so every node in a SLURM allocation can play any role (prefill, decode,
+# or coordinator) from a single image.
+#
+# Configs (epp-config.yaml, envoy.yaml, per-topology recipes) are NOT
+# baked in. They are mounted at runtime by job.slurm so config-only
+# iteration does not require an image rebuild. See
+# benchmarks/multi_node/llm-d/job.slurm for the expected mount layout.
+
+FROM ghcr.io/llm-d/llm-d-cuda:v0.7.0
+
+COPY --from=ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main \
+       /app/epp /usr/local/bin/epp
+
+COPY --from=ghcr.io/llm-d/llm-d-router-disagg-sidecar-dev:main \
+       /app/pd-sidecar /usr/local/bin/pd-sidecar
+
+COPY --from=envoyproxy/envoy:distroless-v1.33.2 \
+     /usr/local/bin/envoy /usr/local/bin/
diff --git a/benchmarks/llm-d/README.md b/benchmarks/llm-d/README.md
new file mode 100644
index 000000000..cd6e0bf51
--- /dev/null
+++ b/benchmarks/llm-d/README.md
@@ -0,0 +1,16 @@
+# llm-d-vllm framework artifacts
+
+This directory holds the static pieces of the `llm-d-vllm` benchmark
+framework: the image build plus the configs mounted into it at runtime.
+
+| File | Purpose |
+|---|---|
+| `Dockerfile` | Combined image: vLLM (DeepEP-enabled), EPP, routing-sidecar, Envoy. One image, every node uses what its role requires. |
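+| `epp-config.yaml` | Fallback EPP scheduling config. Used when no recipe overrides it via `CONFIG_FILE`. `disagg-profile-handler` with `prefill` / `decode` profiles scored by `prefix-cache-scorer`, `queue-scorer`, `kv-cache-utilization-scorer` and `active-request-scorer`, picked by `max-score-picker`, over the file-discovery endpoint set. |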
+| `envoy.yaml` | Static Envoy: listener `:8080`, ext_proc to `127.0.0.1:9002`, ORIGINAL_DST cluster reading `x-gateway-destination-endpoint`. |
+
+The runtime pieces (per-node `server.sh`, the SLURM job script, recipe
+files, and the endpoint discovery mechanism) live under
+`benchmarks/multi_node/llm-d/` and `benchmarks/multi_node/llm-d-recipes/`.
+See the README in `benchmarks/multi_node/llm-d/` for the endpoints-file
+generation flow.
diff --git a/benchmarks/llm-d/envoy.yaml b/benchmarks/llm-d/envoy.yaml
new file mode 100644
index 000000000..20bbe60a6
--- /dev/null
+++ b/benchmarks/llm-d/envoy.yaml
@@ -0,0 +1,85 @@
+# Envoy front door for the llm-d-vllm framework.
+#
+# Listener  : 0.0.0.0:8080  (benchmark client target)
+# ext_proc  : EPP on 127.0.0.1:9002
+# Cluster   : ORIGINAL_DST, picks the address from the
+#             x-gateway-destination-endpoint header that EPP sets.
+
+static_resources:
+  listeners:
+    - name: main
+      address:
+        socket_address: { address: 0.0.0.0, port_value: 8080 }
+      filter_chains:
+        - filters:
+            - name: envoy.filters.network.http_connection_manager
+              typed_config:
+                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
+                stat_prefix: ingress_http
+                codec_type: AUTO
+                stream_idle_timeout: 0s
+                request_timeout: 0s
+                route_config:
+                  name: route
+                  virtual_hosts:
+                    - name: vh
+                      domains: ["*"]
+                      routes:
+                        - match: { prefix: "/" }
+                          route:
+                            cluster: original_dst
+                            timeout: 0s
+                http_filters:
+                  - name: envoy.filters.http.ext_proc
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
+                      grpc_service:
+                        envoy_grpc:
+                          cluster_name: epp
+                        timeout: 10s
+                      # message_timeout caps how long Envoy will wait for any
+                      # one ext_proc message ack from EPP. Generation can take
+                      # many seconds; 1000s mirrors the upstream llm-d guide.
+                      message_timeout: 1000s
+                      # FULL_DUPLEX_STREAMED for both directions: the dev EPP
+                      # (ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main)
+                      # does not ack BUFFERED body mode and Envoy times out
+                      # with 504. Trailer modes also have to be SEND for the
+                      # request lifecycle to terminate cleanly.
+                      processing_mode:
+                        request_header_mode: SEND
+                        response_header_mode: SEND
+                        request_body_mode: FULL_DUPLEX_STREAMED
+                        response_body_mode: FULL_DUPLEX_STREAMED
+                        request_trailer_mode: SEND
+                        response_trailer_mode: SEND
+                  - name: envoy.filters.http.router
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
+  clusters:
+    - name: epp
+      type: STATIC
+      connect_timeout: 1s
+      typed_extension_protocol_options:
+        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
+          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
+          explicit_http_config:
+            http2_protocol_options: {}
+      load_assignment:
+        cluster_name: epp
+        endpoints:
+          - lb_endpoints:
+              - endpoint:
+                  address:
+                    socket_address: { address: 127.0.0.1, port_value: 9002 }
+    - name: original_dst
+      type: ORIGINAL_DST
+      lb_policy: CLUSTER_PROVIDED
+      connect_timeout: 5s
+      original_dst_lb_config:
+        use_http_header: true
+        http_header_name: x-gateway-destination-endpoint
+
+admin:
+  address:
+    socket_address: { address: 0.0.0.0, port_value: 9901 }
diff --git a/benchmarks/llm-d/epp-config.yaml b/benchmarks/llm-d/epp-config.yaml
new file mode 100644
index 000000000..3ff0eea87
--- /dev/null
+++ b/benchmarks/llm-d/epp-config.yaml
@@ -0,0 +1,61 @@
+# Default EPP scheduling config (fallback when CONFIG_FILE is unset).
+#
+# Mirrors the upstream llm-d well-lit-path P/D guide:
+#   guides/pd-disaggregation/router/pd-disaggregation.values.yaml
+# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
+# weights are unchanged from upstream.
+#
+# Single delta vs upstream: file-discovery. The upstream guide assumes
+# a Kubernetes control plane drives endpoint discovery; in our SLURM
+# setup the coordinator node writes /tmp/endpoints.yaml at job start
+# (see benchmarks/multi_node/llm-d/README.md) and EPP loads it via the
+# file-discovery plugin instead.
+
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  # Endpoint discovery (replaces upstream's K8s discovery).
+  - name: file-disc
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml
+      watchFile: false
+
+  # P/D routing - identical to upstream pd-disaggregation guide.
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider
+  - type: prefill-filter
+  - type: decode-filter
+  - type: prefix-cache-scorer
+  - type: queue-scorer
+  - type: kv-cache-utilization-scorer
+  - type: active-request-scorer
+  - type: max-score-picker
+
+schedulingProfiles:
+  - name: prefill
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc
diff --git a/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
new file mode 100755
index 000000000..61978c199
--- /dev/null
+++ b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Wrapper for the DSR1-FP8 H200 llm-d-vllm benchmark.
+# Reads the topology (PREFILL_NODES, DECODE_NODES) from the environment and
+# calls benchmarks/multi_node/llm-d/submit.sh, which prints JOB_ID on stdout.
+# Same shape as benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh.
+
+set -euo pipefail
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+# Validate every variable dereferenced below. MODEL_NAME and
+# GITHUB_WORKSPACE are included so that a missing value is reported here
+# rather than as a bare "unbound variable" abort later under `set -u`.
+check_env_vars \
+    CONC_LIST ISL OSL \
+    IMAGE \
+    MODEL_PATH MODEL_NAME \
+    PREFILL_NODES DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    GITHUB_WORKSPACE
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-$(hostname)}"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/llm-d" || exit 1
+
+export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
+export MODEL_PATH MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Concurrency list passes through to bench server. Use 'x'-delimited form
+# (matches sglang-disagg wrapper convention).
+JOB_ID=$(bash ./submit.sh \
+    "$PREFILL_NODES" \
+    "$DECODE_NODES" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$RANDOM_RANGE_RATIO")
+
+if [[ -z "$JOB_ID" ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
new file mode 100644
index 000000000..e3ca415b6
--- /dev/null
+++ b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
@@ -0,0 +1,101 @@
+# DeepSeek-R1-0528 fp8 on H200, simple 1P+1D P/D disagg.
+#
+# Phase 0 starting point - the simplest possible llm-d-vllm multi-node
+# config:
+#   1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   1 decode  node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   total 2 H200 nodes / 16 GPUs.
+#
+# No DeepEP, no NVSHMEM ibgda, no full-mesh-RDMA requirement, no
+# cross-node MoE all-to-all. KV transfer between prefill and decode goes
+# through NIXL point-to-point. This mirrors the shape of the simplest
+# Dynamo H200 multi-node disagg entries (e.g. dsr1-fp8-h200-dynamo-sglang
+# 1P+1D EP=8) but with vLLM as the engine and llm-d as the router.
+#
+# Selected via additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml
+# with PREFILL_NODES=1 DECODE_NODES=1 from the wrapper.
+
+# ---- EPP scheduling config ----
+# Mirrors the upstream llm-d well-lit-path P/D guide:
+#   guides/pd-disaggregation/router/pd-disaggregation.values.yaml
+# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
+# weights are unchanged from upstream. Single delta: file-discovery
+# replaces upstream's K8s endpoint discovery, since this benchmark runs
+# under SLURM. The coordinator node writes /tmp/endpoints.yaml at job
+# start (see benchmarks/multi_node/llm-d/README.md).
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  - name: file-disc
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml
+      watchFile: false
+
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider
+  - type: prefill-filter
+  - type: decode-filter
+  - type: prefix-cache-scorer
+  - type: queue-scorer
+  - type: kv-cache-utilization-scorer
+  - type: active-request-scorer
+  - type: max-score-picker
+
+schedulingProfiles:
+  - name: prefill
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc
+
+# ---- Per-role vLLM flags ----
+# Common flags (--enable-expert-parallel, --tensor-parallel-size,
+# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in
+# server.sh. The cross-node DP coordination flags
+# (--data-parallel-hybrid-lb, --data-parallel-size-local, etc.) are NOT
+# emitted because LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1.
+prefill:
+  extra-args: >-
+    --gpu-memory-utilization 0.85
+    --kv-cache-dtype fp8
+    --max-num-batched-tokens 32768
+    --max-num-seqs 16
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}
+
+decode:
+  extra-args: >-
+    --gpu-memory-utilization 0.90
+    --kv-cache-dtype fp8
+    --max-num-batched-tokens 256
+    --max-num-seqs 256
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}
+
+# ---- SLURM resource directives ----
+slurm:
+  time_limit: "04:00:00"
diff --git a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml
new file mode 100644
index 000000000..702f66367
--- /dev/null
+++ b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml
@@ -0,0 +1,98 @@
+# DeepSeek-R1-0528 fp8 on H200, wide-EP P/D disagg.
+#
+# Mirrors guides/wide-ep-lws/modelserver/gpu/vllm/base/{prefill,decode}.yaml
+# from the llm-d wide-EP guide:
+#   1 prefill instance, 2 H200 nodes, DP=16 (EP across 16 ranks)
+#   1 decode  instance, 2 H200 nodes, DP=16
+#   total 4 H200 nodes / 32 GPUs.
+#
+# In our SLURM dispatcher, this recipe is selected via:
+#   additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-wideep.yaml
+# and PREFILL_NODES=2, DECODE_NODES=2 are passed to submit.sh by the
+# wrapper benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh.
+#
+# Three independent sections, all overridable per recipe:
+#   - top-level plugins/schedulingProfiles/dataLayer: EPP config, fed
+#     directly into --config-file on the EPP binary.
+#   - prefill / decode: per-role vLLM extra-args + env vars, appended to
+#     the vLLM launch command on each node.
+#   - slurm.time_limit: overrides TIME_LIMIT for this recipe.
+
+# ---- EPP scheduling config ----
+plugins:
+  - name: file-disc
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml
+      watchFile: false
+
+  - name: prefill-filter
+    type: prefill-filter
+  - name: decode-filter
+    type: decode-filter
+  - name: kv-cache-scorer
+    type: kv-cache-utilization-scorer
+  - name: random-picker
+    type: random-picker
+
+  - name: prefill-profile
+    type: single-profile-handler
+    parameters:
+      filter: prefill-filter
+      scorer: kv-cache-scorer
+      picker: random-picker
+  - name: decode-profile
+    type: single-profile-handler
+    parameters:
+      filter: decode-filter
+      scorer: kv-cache-scorer
+      picker: random-picker
+
+  - name: disagg-handler
+    type: disagg-profile-handler
+    parameters:
+      profiles:
+        prefill: prefill-profile
+        decode: decode-profile
+
+schedulingProfiles:
+  - name: default
+    plugins:
+      - pluginRef: disagg-handler
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc
+
+# ---- Per-role vLLM flags ----
+# Mirrored from the llm-d wide-EP-lws prefill.yaml / decode.yaml manifests.
+# Common flags (data-parallel-hybrid-lb, enable-expert-parallel,
+# kv_transfer_config, moe-backend, etc.) are set in server.sh.
+
+prefill:
+  extra-args: >-
+    --gpu-memory-utilization 0.80
+    --enable-dbo
+    --dbo-prefill-token-threshold 32
+    --enable-eplb
+    --eplb-config '{"window_size":"1000","step_interval":"3000","num_redundant_experts":"32","log_balancedness":"False"}'
+    --all2all-backend deepep_high_throughput
+  env:
+    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
+
+decode:
+  extra-args: >-
+    --gpu-memory-utilization 0.90
+    --max-num-batched-tokens 256
+    --max-num-seqs 256
+    --enable-dbo
+    --dbo-decode-token-threshold 32
+    --enable-eplb
+    --eplb-config '{"window_size":"1000","step_interval":"3000","num_redundant_experts":"32","log_balancedness":"False"}'
+    --all2all-backend deepep_low_latency
+  env:
+    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
+
+# ---- SLURM resource directives ----
+slurm:
+  time_limit: "08:00:00"
diff --git a/benchmarks/multi_node/llm-d/README.md b/benchmarks/multi_node/llm-d/README.md
new file mode 100644
index 000000000..b57333dea
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/README.md
@@ -0,0 +1,133 @@
+# llm-d-vllm multi-node SLURM scaffolding
+
+This directory holds the SLURM-side orchestration for the `llm-d-vllm`
+benchmark framework. It mirrors the AMD `sglang-disagg` pattern under
+`benchmarks/multi_node/amd_utils/` (NOT the Dynamo / srt-slurm pattern):
+InferenceX itself owns the SLURM job, no vendor multi-node tool involved.
+
+| File | Role |
+|---|---|
+| `submit.sh` | sbatch wrapper. Validates env, exports tuning vars, returns `JOB_ID`. May read `slurm.time_limit` from the recipe to override `TIME_LIMIT`. |
+| `job.slurm` | sbatch entrypoint. Allocates `PREFILL_NODES + DECODE_NODES` nodes, derives per-node IPs, runs one Docker container per node via `srun`, threads role assignment env into each. |
+| `server.sh` | Per-node entry. Reads `NODE_RANK = SLURM_PROCID`, picks role, starts vLLM (with the wide-EP / DeepEP / NIXL flag set from the llm-d wide-EP-lws guide), starts the pd-sidecar on each leader, and on the decode leader additionally writes `endpoints.yaml`, starts EPP + Envoy, runs `benchmark_serving.py`, and `scancel`s the job. |
+
+## Topology
+
+For a run with `xP` prefill nodes and `yD` decode nodes, total nodes = `xP + yD`.
+There is **no dedicated coordinator node**. The decode leader doubles as
+the coordinator (EPP + Envoy + bench), exactly like the AMD path's
+"decode rank 0" coordinator role.
+
+| Rank | Role |
+|---|---|
+| `0` | prefill leader (`LWS_WORKER_INDEX=0`, DP rank 0) + pd-sidecar |
+| `1 .. xP-1` | prefill workers |
+| `xP` | decode leader + pd-sidecar + EPP + Envoy + benchmark client |
+| `xP+1 .. xP+yD-1` | decode workers |
+
+Each instance (prefill or decode) is one vLLM engine spanning multiple
+nodes via `--data-parallel-hybrid-lb`. With `xP=2, yD=2,
+GPUS_PER_NODE=8` you get DP=16 prefill + DP=16 decode (the wide-EP
+reference). Per-rank split: `--data-parallel-size 16
+--data-parallel-size-local 8 --data-parallel-start-rank
+$((LWS_WORKER_INDEX * 8))`.
+
+## How `endpoints.yaml` is generated (file-discovery contract)
+
+The EPP runs in **no-Kubernetes mode**, using the `file-discovery` plugin
+from `llm-d-inference-scheduler` (branch `filediscovery-4`). At startup
+it reads `/tmp/endpoints.yaml`; the file lists every backend the EPP can
+route to, with role labels.
+
+The file is generated at runtime by `server.sh` on the decode leader
+(rank `PREFILL_NODES`). Because all node IPs are only known after
+`sbatch` allocates the job, the file cannot be baked into the image and
+is not part of the repo.
+
+Generation flow:
+
+1. `submit.sh` calls `sbatch -N (xP+yD)`. `sbatch` allocates nodes.
+2. `job.slurm` resolves each node's IP via `srun ip route get 1.1.1.1`,
+   slices them into `PREFILL_LEADER_IP` (= IPS[0]) and `DECODE_LEADER_IP`
+   (= IPS[PREFILL_NODES]), and passes both into the container as env
+   vars.
+3. On the decode leader, `server.sh` writes `/tmp/endpoints.yaml`
+   inside the container with one entry per leader:
+
+   ```yaml
+   endpoints:
+     - name: prefill-0
+       address: <PREFILL_LEADER_IP>
+       port: "8000"            # pd-sidecar port
+       labels:
+         llm-d.ai/role: prefill
+     - name: decode-0
+       address: <DECODE_LEADER_IP>
+       port: "8000"
+       labels:
+         llm-d.ai/role: decode
+   ```
+
+4. The EPP (started immediately after) loads the file via
+   `dataLayer.discovery.pluginRef: file-disc` (see
+   `benchmarks/llm-d/epp-config.yaml`). The plugin enumerates the
+   endpoints into the EPP datastore before the EPP starts serving
+   `ext_proc`, so Envoy never gets a request before discovery is ready.
+5. The `disagg-profile-handler` in the EPP config uses `prefill-filter`
+   and `decode-filter` to pick the right backend per request phase,
+   matching on the `llm-d.ai/role` label.
+
+### Why one entry per *leader* (not per node)
+
+In the wide-EP guide each instance is a single vLLM engine that spans
+multiple nodes via `--data-parallel-hybrid-lb`. With hybrid-lb, the
+leader pod (`LWS_WORKER_INDEX=0`) accepts external traffic and
+distributes it internally across the local DP ranks; in our LWS-free
+SLURM mapping, the prefill-leader and decode-leader are the only nodes
+addressable from outside. Adding an entry per worker would cause EPP to
+route directly to a worker, bypassing the engine's internal load
+balancing.
+
+If we later want to expose all pods of an instance (the alternative
+hybrid-lb interpretation: external LB across nodes too), we can extend
+the loop in `server.sh` to emit one entry per `IPS[i]` in the prefill
+range and one per `IPS[i]` in the decode range, all carrying the same
+role label. EPP then load-balances across them via the profile's picker.
+
+### Live reload
+
+`watchFile: false` in `epp-config.yaml`. Endpoints are static for the
+job lifetime - no reason to pay for `fsnotify` here. Set `watchFile:
+true` (and rewrite `/tmp/endpoints.yaml` from the coordinator) only if
+you want to drain or add an instance mid-run.
+
+### Validation rules (enforced by the plugin)
+
+- `address` must be a literal IPv4 address (no IPv6, no hostnames).
+- `port` is a string in `1..65535`.
+- File capped at 1 MiB.
+- Names must be unique within their namespace (all our entries use the
+  `inferencex` namespace, so they must be globally unique in the file).
+
+The IPs we collect from `ip route get 1.1.1.1` are always IPv4 on the
+H200 / B200 cluster's primary fabric; if you point at a different
+interface and it returns an IPv6 address, EPP will reject the file at
+startup.
+
+## Recipe files
+
+`benchmarks/multi_node/llm-d-recipes/<name>.yaml` is selected via
+`CONFIG_FILE=<name>.yaml` in the master config's `additional-settings`.
+Each recipe carries:
+
+- top-level `plugins:` / `schedulingProfiles:` / `dataLayer:` - fed into
+  the EPP via `--config-file`. Lets you change routing strategy without
+  rebuilding the image.
+- `prefill:` / `decode:` blocks with `extra-args` (appended to the vLLM
+  launch command on each node of that role) and `env` (exported before
+  vLLM starts).
+- `slurm.time_limit` - overrides `TIME_LIMIT` for that recipe.
+
+When `CONFIG_FILE` is unset or the file is missing, the EPP falls back
+to `/etc/epp/config.yaml` (bind-mounted by `job.slurm`), and vLLM runs
+with no extra flags beyond the wide-EP common set in `server.sh`.
diff --git a/benchmarks/multi_node/llm-d/job.slurm b/benchmarks/multi_node/llm-d/job.slurm
new file mode 100644
index 000000000..05f8812ee
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/job.slurm
@@ -0,0 +1,132 @@
+#!/bin/bash
+#SBATCH --job-name=llm-d-bench
+#SBATCH --ntasks-per-node=1
+# --output, --error, -N, -n, --time set by submit.sh
+#
+# Allocates PREFILL_NODES + DECODE_NODES nodes, derives per-node IPs, then
+# srun-runs server.sh inside one Docker container per node. NODE_RANK
+# (= SLURM_PROCID) drives role selection inside server.sh.
+
+set -euo pipefail
+
+echo "=== llm-d job start ==="
+echo "UTC: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')"
+
+# Repo root. sbatch runs a spooled copy of this script, so $0 is not inside the repo; use the submit dir (repo root), else ../../.. of $0.
+DI_REPO_DIR=$(cd "${SLURM_SUBMIT_DIR:-$(dirname "$0")/../../..}" && pwd)
+export DI_REPO_DIR
+
+ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
+echo "Allocated nodes ($TOTAL_NODES): $(echo "$ALL_NODES" | tr '\n' ' ')"
+
+if [[ "$TOTAL_NODES" -ne "$NUM_NODES" ]]; then
+    echo "Error: SLURM allocated $TOTAL_NODES nodes, expected $NUM_NODES" >&2
+    exit 1
+fi
+
+# Per-node IPs in rank order; abort if a node reports no source address.
+IPS=()
+for NODE in $ALL_NODES; do
+    IP=$(srun --nodes=1 --ntasks=1 --nodelist="$NODE" \
+         bash -c 'ip route get 1.1.1.1 | awk "/src/ {print \$7}"')
+    IPS+=("${IP:?no source IP resolved on node $NODE}")
+done
+echo "Node IPs: ${IPS[*]}"
+
+# Rank slicing:
+#   prefill leader = rank 0
+#   prefill workers = ranks 1 .. PREFILL_NODES-1
+#   decode  leader = rank PREFILL_NODES (also coordinator: EPP + Envoy + bench)
+#   decode  workers = ranks PREFILL_NODES+1 .. NUM_NODES-1
+PREFILL_LEADER_IP="${IPS[0]}"
+DECODE_LEADER_IP="${IPS[$PREFILL_NODES]}"
+
+# DP leader addresses for vLLM --data-parallel-address (rank 0 of each instance).
+PREFILL_DP_ADDR="$PREFILL_LEADER_IP"
+DECODE_DP_ADDR="$DECODE_LEADER_IP"
+
+ALL_IP_LIST=$(IFS=,; echo "${IPS[*]}")
+
+SANITIZED_USER=$(echo "${USER:-runner}" | tr -c 'a-zA-Z0-9_.-' '_')
+DOCKER_CONT_NAME="llmd_bench_${SANITIZED_USER}_${SLURM_JOB_ID}"
+export DOCKER_CONT_NAME
+export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+DOCKER_MOUNT_PATH="/workspace"
+
+cleanup() {
+    echo "[${SLURM_JOB_ID}] cleanup on $(hostname)"
+}
+trap cleanup INT TERM HUP EXIT
+
+# One docker run per node, one task per node. server.sh dispatches by NODE_RANK.
+srun \
+    --kill-on-bad-exit=1 \
+    --signal=TERM@30 \
+    --unbuffered \
+    bash -lc "
+set -euo pipefail
+echo \"Rank \$SLURM_PROCID on \$(hostname)\"
+
+sudo docker ps -aq --filter name=^llmd_bench_ | xargs -r sudo docker rm -f || true
+
+exec sudo docker run --rm \
+    --init \
+    --stop-timeout 10 \
+    --network host \
+    --ipc host \
+    --gpus all \
+    --ulimit memlock=-1 --ulimit stack=67108864 \
+    --shm-size 32G \
+    --cap-add SYS_PTRACE --cap-add IPC_LOCK --cap-add SYS_RAWIO \
+    --device /dev/infiniband \
+    --security-opt seccomp=unconfined \
+    --privileged \
+    -v ${MODEL_DIR}:/models:ro \
+    -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
+    -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
+    -v ${DI_REPO_DIR}/benchmarks/multi_node/llm-d-recipes:/etc/llmd-recipes:ro \
+    -v ${DI_REPO_DIR}/benchmarks/llm-d/epp-config.yaml:/etc/epp/config.yaml:ro \
+    -v ${DI_REPO_DIR}/benchmarks/llm-d/envoy.yaml:/etc/envoy/envoy.yaml:ro \
+    -e SLURM_JOB_ID=\$SLURM_JOB_ID \
+    -e NODE_RANK=\$SLURM_PROCID \
+    -e NUM_NODES=$NUM_NODES \
+    -e PREFILL_NODES=$PREFILL_NODES \
+    -e DECODE_NODES=$DECODE_NODES \
+    -e ALL_IPS=$ALL_IP_LIST \
+    -e PREFILL_LEADER_IP=$PREFILL_LEADER_IP \
+    -e DECODE_LEADER_IP=$DECODE_LEADER_IP \
+    -e PREFILL_DP_ADDR=$PREFILL_DP_ADDR \
+    -e DECODE_DP_ADDR=$DECODE_DP_ADDR \
+    -e MODEL_DIR=/models \
+    -e MODEL_NAME=$MODEL_NAME \
+    -e GPUS_PER_NODE=$GPUS_PER_NODE \
+    -e PREFILL_DP_SIZE=$PREFILL_DP_SIZE \
+    -e DECODE_DP_SIZE=$DECODE_DP_SIZE \
+    -e BENCH_INPUT_LEN=$BENCH_INPUT_LEN \
+    -e BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN \
+    -e BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY \
+    -e BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE \
+    -e BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO \
+    -e BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER \
+    -e BENCHMARK_LOGS_DIR=/benchmark_logs \
+    -e RUN_EVAL=$RUN_EVAL \
+    -e EVAL_ONLY=$EVAL_ONLY \
+    -e EVAL_CONC=$EVAL_CONC \
+    -e FRAMEWORK=$FRAMEWORK \
+    -e PRECISION=$PRECISION \
+    -e MODEL_PREFIX=$MODEL_PREFIX \
+    -e RUNNER_TYPE=$RUNNER_TYPE \
+    -e RESULT_FILENAME=$RESULT_FILENAME \
+    -e SPEC_DECODING=$SPEC_DECODING \
+    -e IS_MULTINODE=$IS_MULTINODE \
+    -e CONFIG_FILE=$CONFIG_FILE \
+    --name \"${DOCKER_CONT_NAME}_\$SLURM_PROCID\" \
+    \"\$DOCKER_IMAGE_NAME\" bash -lc '
+        set -o pipefail
+        ${DOCKER_MOUNT_PATH}/benchmarks/multi_node/llm-d/server.sh \
+            2>&1 | tee /benchmark_logs/slurm_job-'\"\$SLURM_JOB_ID\"'_rank_'\"\$SLURM_PROCID\"'.log
+    '
+"
+
+srun bash -c 'sudo docker ps -aq --filter name=^llmd_bench_ | xargs -r sudo docker rm -f' || true
diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
new file mode 100755
index 000000000..017ba83c4
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -0,0 +1,249 @@
+#!/usr/bin/env bash
+#
+# Per-node entrypoint for the llm-d-vllm wide-EP P/D disagg benchmark.
+# NODE_RANK is set by srun (= $SLURM_PROCID) in job.slurm.
+#
+# Roles:
+#   Rank 0                         -> prefill leader (DP rank 0)
+#   Ranks 1 .. PREFILL_NODES-1     -> prefill workers
+#   Rank PREFILL_NODES             -> decode leader (DP rank 0) + pd-sidecar
+#                                     + EPP + Envoy + benchmark client
+#                                     (the coordinator, like AMD's decode-0)
+#   Ranks PREFILL_NODES+1 ..       -> decode workers
+#
+# Each "instance" (prefill or decode) is a single vLLM engine spanning
+# PREFILL_NODES (or DECODE_NODES) nodes via --data-parallel-hybrid-lb. The
+# leader pod accepts external traffic; workers handle their local DP ranks.
+
+set -euo pipefail
+
+source /workspace/benchmarks/benchmark_lib.sh
+
+NODE_RANK="${NODE_RANK:-${SLURM_PROCID:-0}}"
+PREFILL_NODES="${PREFILL_NODES:-1}"
+DECODE_NODES="${DECODE_NODES:-1}"
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+VLLM_PORT=8200
+SIDECAR_PORT=8000
+ENVOY_PORT=8080
+EPP_GRPC_PORT=9002
+EPP_HEALTH_PORT=9003
+EPP_METRICS_PORT=9090
+
+MODEL="${MODEL_DIR}/${MODEL_NAME}"
+HOST_IP=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
+
+VLLM_LOG="/benchmark_logs/vllm_rank${NODE_RANK}.log"
+SIDECAR_LOG="/benchmark_logs/sidecar_rank${NODE_RANK}.log"
+EPP_LOG="/benchmark_logs/epp.log"
+ENVOY_LOG="/benchmark_logs/envoy.log"
+
+echo "=== rank=$NODE_RANK host=$HOST_IP model=$MODEL ==="
+
+# ----------------------------------------------------------------
+# Role assignment
+# ----------------------------------------------------------------
+if [[ "$NODE_RANK" -lt "$PREFILL_NODES" ]]; then
+    ROLE="prefill"
+    DP_SIZE="$PREFILL_DP_SIZE"
+    DP_ADDR="$PREFILL_DP_ADDR"
+    LWS_WORKER_INDEX="$NODE_RANK"
+    LWS_GROUP_SIZE="$PREFILL_NODES"
+elif [[ "$NODE_RANK" -lt $((PREFILL_NODES + DECODE_NODES)) ]]; then
+    ROLE="decode"
+    DP_SIZE="$DECODE_DP_SIZE"
+    DP_ADDR="$DECODE_DP_ADDR"
+    LWS_WORKER_INDEX=$((NODE_RANK - PREFILL_NODES))
+    LWS_GROUP_SIZE="$DECODE_NODES"
+else
+    echo "ERROR: NODE_RANK=$NODE_RANK out of range" >&2
+    exit 1
+fi
+
+DP_SIZE_LOCAL="$GPUS_PER_NODE"
+START_RANK=$((LWS_WORKER_INDEX * DP_SIZE_LOCAL))
+TP_SIZE=1
+
+echo "ROLE=$ROLE DP_SIZE=$DP_SIZE DP_ADDR=$DP_ADDR LWS_WORKER_INDEX=$LWS_WORKER_INDEX START_RANK=$START_RANK"
+
+# ----------------------------------------------------------------
+# Read role-specific extra-args and env from the recipe file (values are shlex-quoted for eval; a parse failure aborts).
+# ----------------------------------------------------------------
+ROLE_EXTRA_ARGS=""
+if [[ -n "${CONFIG_FILE:-}" ]]; then
+    RECIPE_PATH="/etc/llmd-recipes/${CONFIG_FILE}"
+    if [[ -f "$RECIPE_PATH" ]]; then
+        echo "Loading $ROLE recipe from $RECIPE_PATH"
+        RECIPE_SHELL=$(python3 - <<PY
+import shlex, yaml
+recipe = yaml.safe_load(open('${RECIPE_PATH}')) or {}
+section = recipe.get('${ROLE}', {}) or {}
+extra = (section.get('extra-args') or '').strip()
+print(f'ROLE_EXTRA_ARGS={shlex.quote(extra)}')
+for k, v in (section.get('env') or {}).items():
+    print(f'export {k}={shlex.quote(str(v))}')
+PY
+); eval "$RECIPE_SHELL"
+    else
+        echo "WARNING: CONFIG_FILE=$CONFIG_FILE but $RECIPE_PATH not found; using defaults" >&2
+    fi
+fi
+
+# ----------------------------------------------------------------
+# Wide-EP / P/D env (from the llm-d wide-EP-lws guide manifests).
+# ----------------------------------------------------------------
+export NVIDIA_GDRCOPY=enabled
+export NVSHMEM_REMOTE_TRANSPORT=ibgda
+export NVSHMEM_IB_ENABLE_IBGDA=true
+export NVSHMEM_SYMMETRIC_SIZE=16G
+export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-eth0}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
+export VLLM_SKIP_P2P_CHECK=1
+export VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1
+export VLLM_USE_DEEP_GEMM=1
+export VLLM_NIXL_SIDE_CHANNEL_HOST="$HOST_IP"
+export VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+
+# ----------------------------------------------------------------
+# Start vLLM (every node, prefill or decode)
+#
+# Flags split into:
+#   * COMMON_ARGS - always passed.
+#   * MULTINODE_DP_ARGS - only when an instance spans more than one node
+#     (LWS_GROUP_SIZE > 1, i.e. wide-EP topology). vLLM's
+#     --data-parallel-hybrid-lb and the cross-process DP coordination
+#     flags are wrong for the single-node-per-instance case where DP is
+#     contained inside one engine process.
+# ----------------------------------------------------------------
+KV_TRANSFER_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_load_failure_policy":"fail"}'
+
+COMMON_ARGS=(
+    --port "$VLLM_PORT"
+    --trust-remote-code
+    --api-server-count 1
+    --disable-access-log-for-endpoints=/health,/metrics
+    --enable-expert-parallel
+    --tensor-parallel-size "$TP_SIZE"
+    --data-parallel-size "$DP_SIZE"
+    --kv_transfer_config "$KV_TRANSFER_CONFIG"
+    --moe-backend deep_gemm
+)
+
+if [[ "$LWS_GROUP_SIZE" -gt 1 ]]; then
+    COMMON_ARGS+=(
+        --data-parallel-hybrid-lb
+        --data-parallel-size-local "$DP_SIZE_LOCAL"
+        --data-parallel-address "$DP_ADDR"
+        --data-parallel-rpc-port 5555
+        --data-parallel-start-rank "$START_RANK"
+    )
+fi
+
+echo "Starting vLLM ($ROLE) DP=$DP_SIZE local=$DP_SIZE_LOCAL start_rank=$START_RANK group_size=$LWS_GROUP_SIZE"
+# shellcheck disable=SC2086
+vllm serve "$MODEL" "${COMMON_ARGS[@]}" $ROLE_EXTRA_ARGS \
+    > "$VLLM_LOG" 2>&1 &
+VLLM_PID=$!
+
+# Only the leader of each instance accepts external requests on $VLLM_PORT.
+if [[ "$LWS_WORKER_INDEX" -eq 0 ]]; then
+    wait_for_server_ready --port "$VLLM_PORT" --server-log "$VLLM_LOG" --server-pid "$VLLM_PID"
+    echo "vLLM leader ready on rank $NODE_RANK"
+
+    # ------------------------------------------------------------
+    # Start pd-sidecar on each leader (prefill leader and decode leader).
+    # The decode-side sidecar is what EPP routes to; the prefill-side
+    # sidecar is the target the decode sidecar pulls KVs from.
+    # ------------------------------------------------------------
+    SIDECAR_CONNECTOR="nixlv2"
+    SIDECAR_FLAGS=(--port="$SIDECAR_PORT" --vllm-port="$VLLM_PORT"
+                   --kv-connector="$SIDECAR_CONNECTOR" --secure-proxy=false)
+    if [[ "$ROLE" == "decode" ]]; then
+        SIDECAR_FLAGS+=(--enable-prefiller-sampling)
+    fi
+    echo "Starting pd-sidecar ($ROLE leader): ${SIDECAR_FLAGS[*]}"
+    pd-sidecar "${SIDECAR_FLAGS[@]}" > "$SIDECAR_LOG" 2>&1 &
+    wait_for_server_ready --port "$SIDECAR_PORT" --server-log "$SIDECAR_LOG"
+    echo "pd-sidecar ready on $HOST_IP:$SIDECAR_PORT"
+fi
+
+# ----------------------------------------------------------------
+# Coordinator: decode leader runs EPP + Envoy + benchmark client.
+# ----------------------------------------------------------------
+if [[ "$ROLE" == "decode" && "$LWS_WORKER_INDEX" -eq 0 ]]; then
+
+    # Write endpoints.yaml. See benchmarks/multi_node/llm-d/README.md for
+    # the discovery contract.
+    # NOTE: endpoint 'namespace' must match EPP's --pool-namespace below
+    # (file-discovery filters endpoints by namespace; the schema default
+    # 'default' would otherwise drop every entry).
+    python3 - <<PY
+import os, yaml
+NS = 'inferencex'
+endpoints = [
+    {'name': 'prefill-0',
+     'namespace': NS,
+     'address': os.environ['PREFILL_LEADER_IP'],
+     'port': '$SIDECAR_PORT',
+     'labels': {'llm-d.ai/role': 'prefill'}},
+    {'name': 'decode-0',
+     'namespace': NS,
+     'address': os.environ['DECODE_LEADER_IP'],
+     'port': '$SIDECAR_PORT',
+     'labels': {'llm-d.ai/role': 'decode'}},
+]
+yaml.safe_dump({'endpoints': endpoints}, open('/tmp/endpoints.yaml', 'w'))
+print('endpoints.yaml:')
+print(open('/tmp/endpoints.yaml').read())
+PY
+
+    # EPP config: recipe override, else the default mounted by job.slurm
+    # at /etc/epp/config.yaml (sourced from benchmarks/llm-d/epp-config.yaml). CONFIG_FILE may be unset, so guard it for set -u.
+    if [[ -n "${CONFIG_FILE:-}" && -f "/etc/llmd-recipes/$CONFIG_FILE" ]]; then
+        EPP_CONFIG="/etc/llmd-recipes/$CONFIG_FILE"
+    else
+        EPP_CONFIG="/etc/epp/config.yaml"
+    fi
+    echo "EPP config: $EPP_CONFIG"
+
+    epp \
+        --pool-name=epp \
+        --pool-namespace=inferencex \
+        --config-file="$EPP_CONFIG" \
+        --grpc-port="$EPP_GRPC_PORT" \
+        --grpc-health-port="$EPP_HEALTH_PORT" \
+        --metrics-port="$EPP_METRICS_PORT" \
+        > "$EPP_LOG" 2>&1 &
+
+    envoy -c /etc/envoy/envoy.yaml > "$ENVOY_LOG" 2>&1 &
+
+    wait_for_server_ready --port "$ENVOY_PORT" --server-log "$ENVOY_LOG"
+
+    # Wait for the prefill leader's sidecar before starting the bench.
+    wait_for_server_ready --port "$SIDECAR_PORT" --host "$PREFILL_LEADER_IP"
+
+    # Bench against Envoy. EPP routes to decode (and decode sidecar pulls
+    # from prefill via NIXL).
+    run_benchmark_serving \
+        --model "$MODEL" \
+        --port "$ENVOY_PORT" \
+        --backend openai \
+        --input-len "$BENCH_INPUT_LEN" \
+        --output-len "$BENCH_OUTPUT_LEN" \
+        --random-range-ratio "$BENCH_RANDOM_RANGE_RATIO" \
+        --num-prompts "$((BENCH_MAX_CONCURRENCY * BENCH_NUM_PROMPTS_MULTIPLIER))" \
+        --max-concurrency "$BENCH_MAX_CONCURRENCY" \
+        --result-filename "$RESULT_FILENAME" \
+        --result-dir "$BENCHMARK_LOGS_DIR/"
+
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        run_eval --framework lm-eval --port "$ENVOY_PORT"
+        append_lm_eval_summary
+    fi
+
+    scancel "$SLURM_JOB_ID"
+else
+    # Workers (prefill workers, decode workers, prefill leader): just keep vLLM alive.
+    wait
+fi
diff --git a/benchmarks/multi_node/llm-d/submit.sh b/benchmarks/multi_node/llm-d/submit.sh
new file mode 100755
index 000000000..0f35e841c
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/submit.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+#
+# Submit a multi-node llm-d-vllm wide-EP P/D disagg benchmark job to SLURM.
+# Modeled after benchmarks/multi_node/amd_utils/submit.sh; prints JOB_ID on
+# stdout so the runner can poll for completion.
+#
+# Topology (matches the llm-d wide-EP guide reference):
+#   1 prefill instance with DP=PREFILL_NODES * GPUS_PER_NODE
+#   1 decode  instance with DP=DECODE_NODES  * GPUS_PER_NODE
+#   each instance spans PREFILL_NODES / DECODE_NODES nodes via vLLM
+#   --data-parallel-hybrid-lb. Total nodes = PREFILL_NODES + DECODE_NODES.
+
+set -euo pipefail
+
+check_env() {
+    local name="$1"
+    if [[ -z "${!name:-}" ]]; then
+        echo "Error: ${name} not set" >&2
+        exit 1
+    fi
+}
+
+check_env SLURM_ACCOUNT
+check_env SLURM_PARTITION
+check_env TIME_LIMIT
+check_env MODEL_PATH
+check_env MODEL_NAME
+check_env CONTAINER_IMAGE
+check_env RUNNER_NAME
+
+PREFILL_NODES=${1:?usage: submit.sh PREFILL_NODES DECODE_NODES ISL OSL CONCURRENCIES}
+DECODE_NODES=${2:?arg 2 DECODE_NODES is required}
+ISL=${3:?arg 3 ISL is required}
+OSL=${4:?arg 4 OSL is required}
+CONCURRENCIES=${5:?arg 5 CONCURRENCIES is required}
+REQUEST_RATE=${6:-inf}
+RANDOM_RANGE_RATIO=${7:-0.8}
+
+NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
+export MODEL_DIR=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export NUM_NODES=$NUM_NODES
+export PREFILL_NODES=$PREFILL_NODES
+export DECODE_NODES=$DECODE_NODES
+export GPUS_PER_NODE=$GPUS_PER_NODE
+export PREFILL_DP_SIZE=$((PREFILL_NODES * GPUS_PER_NODE))
+export DECODE_DP_SIZE=$((DECODE_NODES  * GPUS_PER_NODE))
+export BENCH_INPUT_LEN=$ISL
+export BENCH_OUTPUT_LEN=$OSL
+export BENCH_MAX_CONCURRENCY=$CONCURRENCIES
+export BENCH_REQUEST_RATE=$REQUEST_RATE
+export BENCH_RANDOM_RANGE_RATIO=$RANDOM_RANGE_RATIO
+export BENCH_NUM_PROMPTS_MULTIPLIER=10
+
+export RUN_EVAL="${RUN_EVAL:-false}"
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+export EVAL_CONC="${EVAL_CONC:-}"
+export FRAMEWORK="${FRAMEWORK:-llm-d-vllm}"
+export PRECISION="${PRECISION:-}"
+export MODEL_PREFIX="${MODEL_PREFIX:-}"
+export RUNNER_TYPE="${RUNNER_TYPE:-}"
+export RESULT_FILENAME="${RESULT_FILENAME:-}"
+export SPEC_DECODING="${SPEC_DECODING:-none}"
+export IS_MULTINODE="${IS_MULTINODE:-true}"
+export CONFIG_FILE="${CONFIG_FILE:-}"
+
+# Recipe may override SLURM time limit (longer topologies need more wall time). Path is resolved from this script's dir, not the cwd.
+if [[ -n "$CONFIG_FILE" ]]; then
+    RECIPE_PATH="$(dirname "$0")/../llm-d-recipes/${CONFIG_FILE}"
+    if [[ -f "$RECIPE_PATH" ]]; then
+        RECIPE_TIME=$(python3 -c "
+import yaml, sys
+r = yaml.safe_load(open(sys.argv[1])) or {}
+t = (r.get('slurm') or {}).get('time_limit') or ''
+print(t)
+" "$RECIPE_PATH" 2>/dev/null || true)
+        [[ -n "$RECIPE_TIME" ]] && TIME_LIMIT="$RECIPE_TIME"
+    fi
+fi
+
+export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+mkdir -p "$BENCHMARK_LOGS_DIR"
+
+JOB_ID=$(sbatch \
+    --parsable \
+    --exclusive \
+    -N "$NUM_NODES" \
+    -n "$NUM_NODES" \
+    --ntasks-per-node=1 \
+    --gres=gpu:"$GPUS_PER_NODE" \
+    --time "$TIME_LIMIT" \
+    --partition "$SLURM_PARTITION" \
+    --account "$SLURM_ACCOUNT" \
+    --job-name "$RUNNER_NAME" \
+    --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" \
+    --error  "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" \
+    "$(dirname "$0")/job.slurm")
+
+if [[ -z "$JOB_ID" ]]; then
+    echo "Error: sbatch failed" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 572056956..23ad6e9e7 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -8,6 +8,87 @@ set -x
 
 if [[ "$IS_MULTINODE" == "true" ]]; then
 
+    # ------------------------------------------------------------------
+    # llm-d-vllm: InferenceX-owned multi-node path (no srt-slurm).
+    # Mirrors the AMD sglang-disagg dispatch shape: wrapper script ->
+    # benchmarks/multi_node/llm-d/submit.sh -> sbatch -> JOB_ID.
+    # ------------------------------------------------------------------
+    if [[ "$FRAMEWORK" == "llm-d-vllm" ]]; then
+        if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
+            export MODEL_PATH="/models/DeepSeek-R1-0528"
+            export MODEL_NAME="DeepSeek-R1-0528"
+        else
+            echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2
+            exit 1
+        fi
+
+        # Logs go to BENCHMARK_LOGS_DIR (NFS-accessible); mirrors AMD path.
+        export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+        mkdir -p "$BENCHMARK_LOGS_DIR"
+
+        SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_h200_llm-d-vllm.sh"
+        BENCH_SCRIPT="benchmarks/multi_node/${SCRIPT_NAME}"
+        if [[ ! -f "$BENCH_SCRIPT" ]]; then
+            echo "Error: llm-d wrapper not found: $BENCH_SCRIPT" >&2
+            exit 1
+        fi
+
+        JOB_ID=$(bash "$BENCH_SCRIPT")
+        if [[ -z "$JOB_ID" ]]; then
+            echo "Error: failed to submit llm-d job" >&2
+            exit 1
+        fi
+        echo "Submitted llm-d job: $JOB_ID"
+
+        LOG_FILE="${BENCHMARK_LOGS_DIR}/slurm_job-${JOB_ID}.out"
+
+        # Wait for log file (also catch early failures).
+        while ! ls "$LOG_FILE" &>/dev/null; do
+            if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+                echo "ERROR: job $JOB_ID failed before creating log file"
+                scontrol show job "$JOB_ID" || true
+                exit 1
+            fi
+            sleep 5
+        done
+
+        # Background poll, foreground tail.
+        (
+            while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+                sleep 10
+            done
+        ) &
+        POLL_PID=$!
+
+        tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+        wait $POLL_PID
+
+        # Result collection: same shape as AMD path, but NUL-delimited so paths are never word-split or globbed.
+        while IFS= read -r -d '' result_file; do
+            file_name=$(basename "$result_file")
+            cp "$result_file" "$GITHUB_WORKSPACE/${file_name}"
+            echo "Copied result: $file_name"
+        done < <(find "${BENCHMARK_LOGS_DIR}" -name "${RESULT_FILENAME}*.json" -print0 2>/dev/null)
+
+        if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+            EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR" -type d -name eval_results 2>/dev/null | head -1)
+            if [[ -n "$EVAL_DIR" && -d "$EVAL_DIR" ]]; then
+                shopt -s nullglob
+                for eval_file in "$EVAL_DIR"/*; do
+                    [ -f "$eval_file" ] || continue
+                    cp "$eval_file" "$GITHUB_WORKSPACE/"
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                done
+                shopt -u nullglob
+            else
+                echo "WARNING: RUN_EVAL=true but no eval_results found under $BENCHMARK_LOGS_DIR"
+            fi
+        fi
+
+        scancel "$JOB_ID" 2>/dev/null || true
+        exit 0
+    fi
+
     # MODEL_PATH: Override with pre-downloaded paths on H200 runner
     # The yaml files specify HuggingFace model IDs for portability, but we use
     # local paths to avoid repeated downloading on the shared H200 cluster.
@@ -29,7 +110,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
             exit 1
         fi
     else
-        echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang"
+        echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, llm-d-vllm"
         exit 1
     fi
 

From 1ff425809f3cc63939154dce48269910b9ddf64b Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 14:20:53 +0300
Subject: [PATCH 2/8] [NV] llm-d: address PR #1660 review (C1 sweep, C2 health
 waits, C3 NVSHMEM gate)

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 benchmarks/multi_node/llm-d/server.sh | 79 +++++++++++++++++++--------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
index 017ba83c4..74aff0cc0 100755
--- a/benchmarks/multi_node/llm-d/server.sh
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -90,13 +90,8 @@ PY
 fi
 
 # ----------------------------------------------------------------
-# Wide-EP / P/D env (from the llm-d wide-EP-lws guide manifests).
+# Multi-node DP / NIXL P/D env: needed in any topology.
 # ----------------------------------------------------------------
-export NVIDIA_GDRCOPY=enabled
-export NVSHMEM_REMOTE_TRANSPORT=ibgda
-export NVSHMEM_IB_ENABLE_IBGDA=true
-export NVSHMEM_SYMMETRIC_SIZE=16G
-export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-eth0}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export VLLM_SKIP_P2P_CHECK=1
@@ -105,6 +100,21 @@ export VLLM_USE_DEEP_GEMM=1
 export VLLM_NIXL_SIDE_CHANNEL_HOST="$HOST_IP"
 export VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
 
+# ----------------------------------------------------------------
+# Wide-EP NVSHMEM / ibgda env (from the llm-d wide-EP-lws guide
+# manifests). Gated on LWS_GROUP_SIZE > 1 - the simple 1P+1D recipe
+# explicitly avoids DeepEP, NVSHMEM ibgda, and full-mesh RDMA, so
+# leaving these set on a single-node-per-role topology is misleading
+# and could trigger ibgda code paths it does not need.
+# ----------------------------------------------------------------
+if [[ "$LWS_GROUP_SIZE" -gt 1 ]]; then
+    export NVIDIA_GDRCOPY=enabled
+    export NVSHMEM_REMOTE_TRANSPORT=ibgda
+    export NVSHMEM_IB_ENABLE_IBGDA=true
+    export NVSHMEM_SYMMETRIC_SIZE=16G
+    export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-eth0}
+fi
+
 # ----------------------------------------------------------------
 # Start vLLM (every node, prefill or decode)
 #
@@ -164,7 +174,8 @@ if [[ "$LWS_WORKER_INDEX" -eq 0 ]]; then
     fi
     echo "Starting pd-sidecar ($ROLE leader): ${SIDECAR_FLAGS[*]}"
     pd-sidecar "${SIDECAR_FLAGS[@]}" > "$SIDECAR_LOG" 2>&1 &
-    wait_for_server_ready --port "$SIDECAR_PORT" --server-log "$SIDECAR_LOG"
+    SIDECAR_PID=$!
+    wait_for_server_ready --port "$SIDECAR_PORT" --server-log "$SIDECAR_LOG" --server-pid "$SIDECAR_PID"
     echo "pd-sidecar ready on $HOST_IP:$SIDECAR_PORT"
 fi
 
@@ -217,25 +228,47 @@ PY
         > "$EPP_LOG" 2>&1 &
 
     envoy -c /etc/envoy/envoy.yaml > "$ENVOY_LOG" 2>&1 &
+    ENVOY_PID=$!
 
-    wait_for_server_ready --port "$ENVOY_PORT" --server-log "$ENVOY_LOG"
+    wait_for_server_ready --port "$ENVOY_PORT" --server-log "$ENVOY_LOG" --server-pid "$ENVOY_PID"
 
     # Wait for the prefill leader's sidecar before starting the bench.
-    wait_for_server_ready --port "$SIDECAR_PORT" --host "$PREFILL_LEADER_IP"
-
-    # Bench against Envoy. EPP routes to decode (and decode sidecar pulls
-    # from prefill via NIXL).
-    run_benchmark_serving \
-        --model "$MODEL" \
-        --port "$ENVOY_PORT" \
-        --backend openai \
-        --input-len "$BENCH_INPUT_LEN" \
-        --output-len "$BENCH_OUTPUT_LEN" \
-        --random-range-ratio "$BENCH_RANDOM_RANGE_RATIO" \
-        --num-prompts "$((BENCH_MAX_CONCURRENCY * BENCH_NUM_PROMPTS_MULTIPLIER))" \
-        --max-concurrency "$BENCH_MAX_CONCURRENCY" \
-        --result-filename "$RESULT_FILENAME" \
-        --result-dir "$BENCHMARK_LOGS_DIR/"
+    # wait_for_server_ready can only probe localhost; the prefill leader
+    # is on a different node, so poll directly with a deadline (each probe is time-capped so a hung connect cannot outlast it).
+    echo "Waiting for prefill sidecar at $PREFILL_LEADER_IP:$SIDECAR_PORT/health"
+    PREFILL_WAIT_DEADLINE=$(( $(date +%s) + 300 ))
+    until curl --output /dev/null --silent --fail --max-time 5 \
+            "http://$PREFILL_LEADER_IP:$SIDECAR_PORT/health"; do
+        if [[ "$(date +%s)" -ge "$PREFILL_WAIT_DEADLINE" ]]; then
+            echo "ERROR: prefill sidecar did not become ready within 5 min" >&2
+            exit 1
+        fi
+        sleep 5
+    done
+    echo "Prefill sidecar at $PREFILL_LEADER_IP:$SIDECAR_PORT is ready"
+
+    # Sweep concurrency. BENCH_MAX_CONCURRENCY arrives from submit.sh as
+    # an 'x'-delimited list (e.g. "2048x1024x512"); the runner / sweep
+    # configs expect one bench run per level. Same shape as
+    # benchmarks/multi_node/amd_utils/bench.sh.
+    IFS='x' read -r -a CONCURRENCIES <<< "$BENCH_MAX_CONCURRENCY"
+    for max_concurrency in "${CONCURRENCIES[@]}"; do
+        num_prompts=$(( max_concurrency * BENCH_NUM_PROMPTS_MULTIPLIER ))
+        [[ "$num_prompts" -lt 16 ]] && num_prompts=16
+        # Bench against Envoy. EPP routes to decode (and decode sidecar
+        # pulls from prefill via NIXL).
+        run_benchmark_serving \
+            --model "$MODEL" \
+            --port "$ENVOY_PORT" \
+            --backend openai \
+            --input-len "$BENCH_INPUT_LEN" \
+            --output-len "$BENCH_OUTPUT_LEN" \
+            --random-range-ratio "$BENCH_RANDOM_RANGE_RATIO" \
+            --num-prompts "$num_prompts" \
+            --max-concurrency "$max_concurrency" \
+            --result-filename "${RESULT_FILENAME}_c${max_concurrency}" \
+            --result-dir "$BENCHMARK_LOGS_DIR/"
+    done
 
     if [[ "${RUN_EVAL:-false}" == "true" ]]; then
         run_eval --framework lm-eval --port "$ENVOY_PORT"

From 43d5d05e459f340bf9a32f65c3b4631f31b58e61 Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 15:37:30 +0300
Subject: [PATCH 3/8] [NV] llm-d: remove H200 wide-EP entry, keep simple 1P+1D

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 .github/configs/nvidia-master.yaml            | 49 ----------
 .../dsr1-fp8-h200-1p1d-wideep.yaml            | 98 -------------------
 2 files changed, 147 deletions(-)
 delete mode 100644 benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 163ee0b89..675250e2b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11269,55 +11269,6 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           dp-attn: true
 
 
-# llm-d-vllm wide-EP P/D disagg on H200.
-#
-# Mirrors the llm-d wide-EP-lws guide reference topology:
-#   1 prefill instance, 2 H200 nodes, DP=16 (EP across 16 ranks)
-#   1 decode  instance, 2 H200 nodes, DP=16
-#   total 4 H200 nodes / 32 GPUs.
-# Phase 1 entry: a single search-space row exercising the wide-EP-lws
-# guide's reference benchmark (ISL=2k / OSL=2k). EPP scheduling, per-role
-# vLLM extra-args, and SLURM time limit all live in
-# benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml,
-# selected via additional-settings: CONFIG_FILE=...
-#
-# Note: each prefill / decode "instance" spans 2 nodes here. The
-# num-worker field in this schema describes Dynamo's worker-per-node
-# model; for llm-d it is fixed at 1 instance per role and the topology
-# is communicated via PREFILL_NODES / DECODE_NODES additional-settings.
-dsr1-fp8-h200-llm-d-vllm:
-  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: h200-multinode
-  precision: fp8
-  framework: llm-d-vllm
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 2048
-      osl: 2048
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 2048, 1024, 512, 256, 128 ]
-        prefill:
-          num-worker: 1
-          tp: 1
-          ep: 16
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-          - "RANDOM_RANGE_RATIO=0.05"
-          - "CONFIG_FILE=dsr1-fp8-h200-1p1d-wideep.yaml"
-        decode:
-          num-worker: 1
-          tp: 1
-          ep: 16
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=2"
-
 # llm-d-vllm simple 1P+1D P/D disagg on H200 (Phase 0).
 #
 # Simplest possible multi-node llm-d-vllm shape:
diff --git a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml
deleted file mode 100644
index 702f66367..000000000
--- a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-wideep.yaml
+++ /dev/null
@@ -1,98 +0,0 @@
-# DeepSeek-R1-0528 fp8 on H200, wide-EP P/D disagg.
-#
-# Mirrors guides/wide-ep-lws/modelserver/gpu/vllm/base/{prefill,decode}.yaml
-# from the llm-d wide-EP guide:
-#   1 prefill instance, 2 H200 nodes, DP=16 (EP across 16 ranks)
-#   1 decode  instance, 2 H200 nodes, DP=16
-#   total 4 H200 nodes / 32 GPUs.
-#
-# In our SLURM dispatcher, this recipe is selected via:
-#   additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-wideep.yaml
-# and PREFILL_NODES=2, DECODE_NODES=2 are passed to submit.sh by the
-# wrapper benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh.
-#
-# Three independent sections, all overridable per recipe:
-#   - top-level plugins/schedulingProfiles/dataLayer: EPP config, fed
-#     directly into --config-file on the EPP binary.
-#   - prefill / decode: per-role vLLM extra-args + env vars, appended to
-#     the vLLM launch command on each node.
-#   - slurm.time_limit: overrides TIME_LIMIT for this recipe.
-
-# ---- EPP scheduling config ----
-plugins:
-  - name: file-disc
-    type: file-discovery
-    parameters:
-      path: /tmp/endpoints.yaml
-      watchFile: false
-
-  - name: prefill-filter
-    type: prefill-filter
-  - name: decode-filter
-    type: decode-filter
-  - name: kv-cache-scorer
-    type: kv-cache-utilization-scorer
-  - name: random-picker
-    type: random-picker
-
-  - name: prefill-profile
-    type: single-profile-handler
-    parameters:
-      filter: prefill-filter
-      scorer: kv-cache-scorer
-      picker: random-picker
-  - name: decode-profile
-    type: single-profile-handler
-    parameters:
-      filter: decode-filter
-      scorer: kv-cache-scorer
-      picker: random-picker
-
-  - name: disagg-handler
-    type: disagg-profile-handler
-    parameters:
-      profiles:
-        prefill: prefill-profile
-        decode: decode-profile
-
-schedulingProfiles:
-  - name: default
-    plugins:
-      - pluginRef: disagg-handler
-
-dataLayer:
-  discovery:
-    pluginRef: file-disc
-
-# ---- Per-role vLLM flags ----
-# Mirrored from the llm-d wide-EP-lws prefill.yaml / decode.yaml manifests.
-# Common flags (data-parallel-hybrid-lb, enable-expert-parallel,
-# kv_transfer_config, moe-backend, etc.) are set in server.sh.
-
-prefill:
-  extra-args: >-
-    --gpu-memory-utilization 0.80
-    --enable-dbo
-    --dbo-prefill-token-threshold 32
-    --enable-eplb
-    --eplb-config '{"window_size":"1000","step_interval":"3000","num_redundant_experts":"32","log_balancedness":"False"}'
-    --all2all-backend deepep_high_throughput
-  env:
-    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
-
-decode:
-  extra-args: >-
-    --gpu-memory-utilization 0.90
-    --max-num-batched-tokens 256
-    --max-num-seqs 256
-    --enable-dbo
-    --dbo-decode-token-threshold 32
-    --enable-eplb
-    --eplb-config '{"window_size":"1000","step_interval":"3000","num_redundant_experts":"32","log_balancedness":"False"}'
-    --all2all-backend deepep_low_latency
-  env:
-    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
-
-# ---- SLURM resource directives ----
-slurm:
-  time_limit: "08:00:00"

From 5baf9c6692f09fa320dd4c256ef1a4ffd93a6279 Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 15:52:39 +0300
Subject: [PATCH 4/8] [NV] llm-d: address PR #1660 review (C4 export slurm
 vars, C5 recipe path, C7 docker filter, C8 EPP wait)

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 benchmarks/multi_node/llm-d/job.slurm |  4 ++--
 benchmarks/multi_node/llm-d/server.sh | 20 ++++++++++++++++++++
 benchmarks/multi_node/llm-d/submit.sh |  7 ++++++-
 runners/launch_h200-dgxc-slurm.sh     |  8 +++++---
 4 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_node/llm-d/job.slurm b/benchmarks/multi_node/llm-d/job.slurm
index 05f8812ee..3265402f6 100644
--- a/benchmarks/multi_node/llm-d/job.slurm
+++ b/benchmarks/multi_node/llm-d/job.slurm
@@ -68,7 +68,7 @@ srun \
 set -euo pipefail
 echo \"Rank \$SLURM_PROCID on \$(hostname)\"
 
-sudo docker ps -aq --filter name=^llmd_bench_ | xargs -r sudo docker rm -f || true
+sudo docker ps -aq --filter name=\"^${DOCKER_CONT_NAME}_\" | xargs -r sudo docker rm -f || true
 
 exec sudo docker run --rm \
     --init \
@@ -129,4 +129,4 @@ exec sudo docker run --rm \
     '
 "
 
-srun bash -c 'sudo docker ps -aq --filter name=^llmd_bench_ | xargs -r sudo docker rm -f' || true
+srun bash -c "sudo docker ps -aq --filter name=\"^${DOCKER_CONT_NAME}_\" | xargs -r sudo docker rm -f" || true
diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
index 74aff0cc0..0dd3de975 100755
--- a/benchmarks/multi_node/llm-d/server.sh
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -226,6 +226,26 @@ PY
         --grpc-health-port="$EPP_HEALTH_PORT" \
         --metrics-port="$EPP_METRICS_PORT" \
         > "$EPP_LOG" 2>&1 &
+    EPP_PID=$!
+
+    # Wait for EPP to bind its gRPC port before starting Envoy. Envoy's
+    # ext_proc filter dials 127.0.0.1:$EPP_GRPC_PORT - if Envoy comes up
+    # first the early bench requests hit ext_proc connection errors.
+    # gRPC has no plain HTTP /health, so probe the TCP listener directly.
+    echo "Waiting for EPP on 127.0.0.1:$EPP_GRPC_PORT"
+    EPP_WAIT_DEADLINE=$(( $(date +%s) + 60 ))
+    until (echo > "/dev/tcp/127.0.0.1/$EPP_GRPC_PORT") 2>/dev/null; do
+        if ! kill -0 "$EPP_PID" 2>/dev/null; then
+            echo "ERROR: EPP died before binding $EPP_GRPC_PORT" >&2
+            exit 1
+        fi
+        if [[ "$(date +%s)" -ge "$EPP_WAIT_DEADLINE" ]]; then
+            echo "ERROR: EPP did not bind $EPP_GRPC_PORT within 60s" >&2
+            exit 1
+        fi
+        sleep 1
+    done
+    echo "EPP listening on $EPP_GRPC_PORT"
 
     envoy -c /etc/envoy/envoy.yaml > "$ENVOY_LOG" 2>&1 &
     ENVOY_PID=$!
diff --git a/benchmarks/multi_node/llm-d/submit.sh b/benchmarks/multi_node/llm-d/submit.sh
index 0f35e841c..663885426 100755
--- a/benchmarks/multi_node/llm-d/submit.sh
+++ b/benchmarks/multi_node/llm-d/submit.sh
@@ -12,6 +12,11 @@
 
 set -euo pipefail
 
+# Repo root resolved from this script's location, so paths below are
+# independent of the caller's $PWD (the wrapper cd's into llm-d/ before
+# invoking this script).
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+
 check_env() {
     local name="$1"
     if [[ -z "${!name:-}" ]]; then
@@ -69,7 +74,7 @@ export CONFIG_FILE="${CONFIG_FILE:-}"
 
 # Recipe may override SLURM time limit (longer topologies need more wall time).
 if [[ -n "$CONFIG_FILE" ]]; then
-    RECIPE_PATH="benchmarks/multi_node/llm-d-recipes/${CONFIG_FILE}"
+    RECIPE_PATH="${REPO_ROOT}/benchmarks/multi_node/llm-d-recipes/${CONFIG_FILE}"
     if [[ -f "$RECIPE_PATH" ]]; then
         RECIPE_TIME=$(python3 -c "
 import yaml, sys
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 23ad6e9e7..1a948b41e 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -1,8 +1,10 @@
 #!/usr/bin/bash
 
-# System-specific configuration for H200 DGXC Slurm cluster
-SLURM_PARTITION="main"
-SLURM_ACCOUNT="sa-shared"
+# System-specific configuration for H200 DGXC Slurm cluster.
+# Exported so child processes (e.g. submit.sh invoked via nested bash)
+# inherit them.
+export SLURM_PARTITION="main"
+export SLURM_ACCOUNT="sa-shared"
 
 set -x
 

From 4863630172b78650b76f34c05aa59fbbb2e45dde Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 16:52:53 +0300
Subject: [PATCH 5/8] [NV] llm-d: address PR #1660 review (C9 marker-file
 teardown instead of in-container scancel)

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 benchmarks/multi_node/llm-d/job.slurm | 15 +++++++++++++++
 benchmarks/multi_node/llm-d/server.sh |  7 ++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/llm-d/job.slurm b/benchmarks/multi_node/llm-d/job.slurm
index 3265402f6..46a026ced 100644
--- a/benchmarks/multi_node/llm-d/job.slurm
+++ b/benchmarks/multi_node/llm-d/job.slurm
@@ -56,9 +56,24 @@ DOCKER_MOUNT_PATH="/workspace"
 
 cleanup() {
     echo "[${SLURM_JOB_ID}] cleanup on $(hostname)"
+    [[ -n "${WATCHER_PID:-}" ]] && kill "$WATCHER_PID" 2>/dev/null || true
 }
 trap cleanup INT TERM HUP EXIT
 
+# Coordinator-done watcher. server.sh on the decode coordinator writes
+# this marker after the bench finishes; we then scancel the allocation
+# from outside the container (the image has no SLURM client tools).
+# Without this, workers `wait` on local vLLM forever and the job runs
+# to TIME_LIMIT.
+BENCH_DONE_MARKER="$BENCHMARK_LOGS_DIR/.bench_done.$SLURM_JOB_ID"
+rm -f "$BENCH_DONE_MARKER"
+(
+    while [[ ! -f "$BENCH_DONE_MARKER" ]]; do sleep 5; done
+    echo "[${SLURM_JOB_ID}] coordinator finished; scancel'ing job"
+    scancel "$SLURM_JOB_ID" 2>/dev/null || true
+) &
+WATCHER_PID=$!
+
 # One docker run per node, one task per node. server.sh dispatches by NODE_RANK.
 srun \
     --kill-on-bad-exit=1 \
diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
index 0dd3de975..1a1679e70 100755
--- a/benchmarks/multi_node/llm-d/server.sh
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -295,7 +295,12 @@ PY
         append_lm_eval_summary
     fi
 
-    scancel "$SLURM_JOB_ID"
+    # Signal job.slurm (running outside the container, where SLURM
+    # client tools are available) to scancel the allocation. The image
+    # does not bundle scancel, so calling it here would just trip
+    # set -e. Workers end server.sh in `wait`; without this signal
+    # they would hold the job until TIME_LIMIT.
+    touch "$BENCHMARK_LOGS_DIR/.bench_done.$SLURM_JOB_ID"
 else
     # Workers (prefill workers, decode workers, prefill leader): just keep vLLM alive.
     wait

From fe5870b7f5e16e2f35a52cadcdff537ecbb0faaf Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 17:02:40 +0300
Subject: [PATCH 6/8] [NV] llm-d: address PR #1660 review (C10 auto-detect
 default NIC for NCCL/GLOO/NVSHMEM)

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 benchmarks/multi_node/llm-d/server.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
index 1a1679e70..67249ae70 100755
--- a/benchmarks/multi_node/llm-d/server.sh
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -32,6 +32,11 @@ EPP_METRICS_PORT=9090
 
 MODEL="${MODEL_DIR}/${MODEL_NAME}"
 HOST_IP=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
+# Default NIC for NCCL / Gloo / NVSHMEM bootstrap. Read from the default
+# route, which is normally the route HOST_IP above was resolved through,
+# so iface and IP agree where the routed NIC is not eth0 (the fallback).
+DEFAULT_IFACE=$(ip -o -4 route show to default | awk '{print $5; exit}')
+DEFAULT_IFACE="${DEFAULT_IFACE:-eth0}"
 
 VLLM_LOG="/benchmark_logs/vllm_rank${NODE_RANK}.log"
 SIDECAR_LOG="/benchmark_logs/sidecar_rank${NODE_RANK}.log"
@@ -92,8 +97,8 @@ fi
 # ----------------------------------------------------------------
 # Multi-node DP / NIXL P/D env: needed in any topology.
 # ----------------------------------------------------------------
-export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
-export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$DEFAULT_IFACE}
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$DEFAULT_IFACE}
 export VLLM_SKIP_P2P_CHECK=1
 export VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1
 export VLLM_USE_DEEP_GEMM=1
@@ -112,7 +117,7 @@ if [[ "$LWS_GROUP_SIZE" -gt 1 ]]; then
     export NVSHMEM_REMOTE_TRANSPORT=ibgda
     export NVSHMEM_IB_ENABLE_IBGDA=true
     export NVSHMEM_SYMMETRIC_SIZE=16G
-    export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-eth0}
+    export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-$DEFAULT_IFACE}
 fi
 
 # ----------------------------------------------------------------

From 873fe04384572e9fea894ec739540e2711d9d0fc Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 17:16:04 +0300
Subject: [PATCH 7/8] [NV] llm-d: address PR #1660 review (C11 split MODEL into
 path and served-name)

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 benchmarks/multi_node/llm-d/server.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
index 67249ae70..683a18d03 100755
--- a/benchmarks/multi_node/llm-d/server.sh
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -30,7 +30,12 @@ EPP_GRPC_PORT=9002
 EPP_HEALTH_PORT=9003
 EPP_METRICS_PORT=9090
 
-MODEL="${MODEL_DIR}/${MODEL_NAME}"
+# Filesystem path to the weights inside the container. job.slurm mounts
+# the host model directory at /models and sets MODEL_DIR=/models, so the
+# weights live directly under MODEL_DIR. MODEL_NAME is the OpenAI-API
+# served name passed via --served-model-name; it is not part of the
+# filesystem path.
+MODEL="${MODEL_DIR}"
 HOST_IP=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
 # Default NIC for NCCL / Gloo / NVSHMEM bootstrap. Pulled from the same
 # default route HOST_IP came from so the iface and the IP stay
@@ -135,6 +140,7 @@ KV_TRANSFER_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_load
 
 COMMON_ARGS=(
     --port "$VLLM_PORT"
+    --served-model-name "$MODEL_NAME"
     --trust-remote-code
     --api-server-count 1
     --disable-access-log-for-endpoints=/health,/metrics
@@ -283,7 +289,7 @@ PY
         # Bench against Envoy. EPP routes to decode (and decode sidecar
         # pulls from prefill via NIXL).
         run_benchmark_serving \
-            --model "$MODEL" \
+            --model "$MODEL_NAME" \
             --port "$ENVOY_PORT" \
             --backend openai \
             --input-len "$BENCH_INPUT_LEN" \

From 95b8b2fc78c039d06b5d55d0f746857d43df846b Mon Sep 17 00:00:00 2001
From: Ezra Silvera <ezra@il.ibm.com>
Date: Thu, 4 Jun 2026 17:31:39 +0300
Subject: [PATCH 8/8] [NV] llm-d: have every rank wait for its own vLLM /health
 before falling through

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
---
 benchmarks/multi_node/llm-d/server.sh | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
index 683a18d03..b5c264be6 100755
--- a/benchmarks/multi_node/llm-d/server.sh
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -167,11 +167,15 @@ vllm serve "$MODEL" "${COMMON_ARGS[@]}" $ROLE_EXTRA_ARGS \
     > "$VLLM_LOG" 2>&1 &
 VLLM_PID=$!
 
+# Every rank waits for its own engine to answer /health before falling
+# through. For wide-EP (LWS_GROUP_SIZE > 1) this prevents the bench
+# from starting before the worker-side DP shards have come up; single-node
+# behaviour is unchanged (presumably every rank there is already a leader).
+wait_for_server_ready --port "$VLLM_PORT" --server-log "$VLLM_LOG" --server-pid "$VLLM_PID"
+echo "vLLM ready on rank $NODE_RANK ($ROLE worker_index=$LWS_WORKER_INDEX)"
+
 # Only the leader of each instance accepts external requests on $VLLM_PORT.
 if [[ "$LWS_WORKER_INDEX" -eq 0 ]]; then
-    wait_for_server_ready --port "$VLLM_PORT" --server-log "$VLLM_LOG" --server-pid "$VLLM_PID"
-    echo "vLLM leader ready on rank $NODE_RANK"
-
     # ------------------------------------------------------------
     # Start pd-sidecar on each leader (prefill leader and decode leader).
     # The decode-side sidecar is what EPP routes to; the prefill-side