From 2667022429cdeb53a8fd7819e962ae7eb7d274a1 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Mon, 8 Jun 2026 12:51:33 -0700
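Subject: [PATCH 1/4] dsr1-fp4-b200-dynamo-sglang-mtp: 8k1k 6-variant MTP
 disagg sweep on local split recipes

Replace the dsr1-fp4-b200-dynamo-sglang-mtp search space with an 8k1k-only
sweep of six MTP disagg variants: three low-latency (1p5d, 1p3d, 1p1d) and
three MTP2 throughput (2p1d, 3p1d, 5p1d). Each entry points CONFIG_FILE at
its own flat recipe YAML instead of an override inside
recipes/b200-fp4/{1k1k,8k1k}.yaml.

The 1k1k scenario is removed, the image moves to
lmsysorg/sglang:v0.5.12.post1, and the runner changes from b200-multinode
to b200.
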
---
 .github/configs/nvidia-master.yaml            | 111 +++++---------
 .../disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml | 143 ++++++++++++++++++
 .../disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml | 143 ++++++++++++++++++
 .../disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml | 143 ++++++++++++++++++
 .../8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml    | 140 +++++++++++++++++
 .../8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml    | 140 +++++++++++++++++
 .../8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml    | 140 +++++++++++++++++
 perf-changelog.yaml                           |  10 ++
 runners/launch_b200-dgxc.sh                   |   6 +
 9 files changed, 904 insertions(+), 72 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..17ce00504 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7911,143 +7911,110 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
 
 dsr1-fp4-b200-dynamo-sglang-mtp:
-  image: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  image: "lmsysorg/sglang:v0.5.12.post1"
   model: deepseek-r1-fp4
   model-prefix: dsr1
-  runner: b200-multinode
+  runner: b200
   precision: fp4
   framework: dynamo-sglang
   multinode: true
   disagg: true
   scenarios:
     fixed-seq-len:
-    - isl: 1024
+    - isl: 8192
       osl: 1024
       search-space:
+      # 1p5d low-latency (decode-heavy).
       - spec-decoding: "mtp"
-        conc-list: [16, 512]
+        conc-list: [4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
-          dp-attn: true
+          ep: 1
+          dp-attn: false
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml"
         decode:
           num-worker: 5
           tp: 8
-          ep: 8
+          ep: 1
           dp-attn: false
+      # 1p3d low-latency.
       - spec-decoding: "mtp"
-        conc-list: [32, 64, 256, 512]
+        conc-list: [32, 64, 128, 256, 512]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
-          dp-attn: true
+          ep: 1
+          dp-attn: false
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml"
         decode:
-          num-worker: 6
+          num-worker: 3
           tp: 8
-          ep: 8
+          ep: 1
           dp-attn: false
+      # 1p1d low-latency.
       - spec-decoding: "mtp"
-        conc-list: [512, 1024]
+        conc-list: [32, 64, 128, 256, 512]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
-          dp-attn: true
+          ep: 1
+          dp-attn: false
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml"
         decode:
           num-worker: 1
           tp: 8
-          ep: 8
-          dp-attn: true
+          ep: 1
+          dp-attn: false
+      # MTP2 high-throughput (DEP4 prefill / DEP8 decode), one Pareto point each.
+      # 2p1d throughput.
       - spec-decoding: "mtp"
-        conc-list: [512]
+        conc-list: [768]
         prefill:
-          num-worker: 1
+          num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml"
         decode:
-          num-worker: 2
+          num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-
-    - isl: 8192
-      osl: 1024
-      search-space:
+      # 3p1d throughput.
       - spec-decoding: "mtp"
-        conc-list: [64, 128]
+        conc-list: [1024]
         prefill:
-          num-worker: 1
+          num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
-          dp-attn: false
-      - spec-decoding: "mtp"
-        conc-list: [8]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
           dp-attn: true
-          additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]"
-        decode:
-          num-worker: 5
-          tp: 8
-          ep: 8
-          dp-attn: false
+      # 5p1d throughput.
       - spec-decoding: "mtp"
-        conc-list: [4, 128]
+        conc-list: [2048]
         prefill:
-          num-worker: 2
+          num-worker: 5
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]"
-        decode:
-          num-worker: 5
-          tp: 8
-          ep: 8
-          dp-attn: false
-      - spec-decoding: "mtp"
-        conc-list: [4, 8, 16, 64]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml"
         decode:
           num-worker: 1
           tp: 8
-          ep: 1
-          dp-attn: false
+          ep: 8
+          dp-attn: true
 
 kimik2.5-fp4-gb200-dynamo-trt:
   image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml
new file mode 100644
index 000000000..37225d44e
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml
@@ -0,0 +1,143 @@
+name: "b200-fp4-mtp2-throughput-2p-dep4-1d-dep8"
+
+# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml
+# base + override_mtp2_throughput_2p1d, DEP4 prefill / DEP8 decode, MTP2).
+# One flat YAML per concrete topology, matching the InferenceX glm5 disagg
+# layout (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+dynamo:
+  hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727"
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "lmsysorg/sglang:v0.5.12.post1"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 2048
+      scheduler-recv-interval: 1
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: null
+      data-parallel-size: 4
+      tensor-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+    decode:
+      disaggregation-mode: "decode"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      prefill-round-robin-balance: true
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.85
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 1024
+      max-running-requests: 2048
+      scheduler-recv-interval: 1
+      data-parallel-size: 8
+      tensor-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  req_rate: 300
+  concurrencies: "768"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml
new file mode 100644
index 000000000..d8e401620
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml
@@ -0,0 +1,143 @@
+name: "b200-fp4-mtp2-throughput-3p-dep4-1d-dep8"
+
+# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml
+# base + override_mtp2_throughput_3p1d, DEP4 prefill / DEP8 decode, MTP2).
+# One flat YAML per concrete topology, matching the InferenceX glm5 disagg
+# layout (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+dynamo:
+  hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727"
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "lmsysorg/sglang:v0.5.12.post1"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 2
+  decode_nodes: 1
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 2048
+      scheduler-recv-interval: 1
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: null
+      data-parallel-size: 4
+      tensor-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+    decode:
+      disaggregation-mode: "decode"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      prefill-round-robin-balance: true
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.85
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 1024
+      max-running-requests: 2048
+      scheduler-recv-interval: 1
+      data-parallel-size: 8
+      tensor-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  req_rate: 300
+  concurrencies: "1024"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml
new file mode 100644
index 000000000..bcbedcb68
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml
@@ -0,0 +1,143 @@
+name: "b200-fp4-mtp2-throughput-5p-dep4-1d-dep8"
+
+# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml
+# base + override_mtp2_throughput_5p1d, DEP4 prefill / DEP8 decode, MTP2).
+# One flat YAML per concrete topology, matching the InferenceX glm5 disagg
+# layout (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+dynamo:
+  hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727"
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "lmsysorg/sglang:v0.5.12.post1"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 3
+  decode_nodes: 1
+  prefill_workers: 5
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 2048
+      scheduler-recv-interval: 1
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: null
+      data-parallel-size: 4
+      tensor-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+    decode:
+      disaggregation-mode: "decode"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      prefill-round-robin-balance: true
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.85
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 1024
+      max-running-requests: 2048
+      scheduler-recv-interval: 1
+      data-parallel-size: 8
+      tensor-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  req_rate: 300
+  concurrencies: "2048"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml
new file mode 100644
index 000000000..160854ebc
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml
@@ -0,0 +1,140 @@
+name: "b200-fp4-mtp-low-latency-1p-tp4-5d-tp8"
+
+# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml
+# base + zip_override_mtp_lowlat[0], 1p-tp4 prefill / 5d-tp8 decode, MTP3).
+# One flat YAML per concrete topology, matching the InferenceX glm5 disagg
+# layout (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+dynamo:
+  hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727"
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "lmsysorg/sglang:v0.5.12.post1"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  decode_nodes: 5
+  prefill_workers: 1
+  decode_workers: 5
+  gpus_per_prefill: 4
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: 30001
+      data-parallel-size: 1
+      tensor-parallel-size: 4
+      expert-parallel-size: 1
+      enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+
+    decode:
+      disaggregation-mode: "decode"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      prefill-round-robin-balance: true
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.85
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      tensor-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  req_rate: 300
+  concurrencies: "4x8x16x32x64"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml
new file mode 100644
index 000000000..fd1d4a4f5
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml
@@ -0,0 +1,140 @@
+name: "b200-fp4-mtp-low-latency-1p-tp4-3d-tp8"
+
+# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml
+# base + zip_override_mtp_lowlat[1], 1p-tp4 prefill / 3d-tp8 decode, MTP3).
+# One flat YAML per concrete topology, matching the InferenceX glm5 disagg
+# layout (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+dynamo:
+  hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727"
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "lmsysorg/sglang:v0.5.12.post1"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  decode_nodes: 3
+  prefill_workers: 1
+  decode_workers: 3
+  gpus_per_prefill: 4
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: 30001
+      data-parallel-size: 1
+      tensor-parallel-size: 4
+      expert-parallel-size: 1
+      enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+
+    decode:
+      disaggregation-mode: "decode"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      prefill-round-robin-balance: true
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.85
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      tensor-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  req_rate: 300
+  concurrencies: "32x64x128x256x512"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml
new file mode 100644
index 000000000..bcceaf872
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml
@@ -0,0 +1,140 @@
+name: "b200-fp4-mtp-low-latency-1p-tp4-1d-tp8"
+
+# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml
+# base + zip_override_mtp_lowlat[2], 1p-tp4 prefill / 1d-tp8 decode).
+# One flat YAML per concrete topology, matching the InferenceX glm5 disagg
+# layout (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+dynamo:
+  hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727"
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "lmsysorg/sglang:v0.5.12.post1"  # pinned image tag; the dsr1-fp4-b200-dynamo-sglang-mtp entry in nvidia-master.yaml sets the same tag as its image
+  precision: "fp4"
+
+resources:  # 1 prefill worker + 1 decode worker (the "1p" / "1d" in the recipe name)
+  gpu_type: "b200"
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4  # equals the prefill tensor-parallel-size (4)
+  gpus_per_node: 8  # equals the decode tensor-parallel-size (8)
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_SPEC_V2: "1"
+    UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: 30001
+      data-parallel-size: 1
+      tensor-parallel-size: 4
+      expert-parallel-size: 1
+      enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+
+    decode:
+      disaggregation-mode: "decode"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      prefill-round-robin-balance: true
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.85
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      tensor-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192  # isl + osl = 9216, below the context-length of 9600 set for both prefill and decode
+  osl: 1024
+  req_rate: 300
+  concurrencies: "32x64x128x256x512"  # presumably an "x"-separated list of concurrency levels (TODO confirm); the largest, 512, equals max-running-requests
+  use_chat_template: true
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..1877af727 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,13 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsr1-fp4-b200-dynamo-sglang-mtp
+  description:
+    - "Restructure the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to an 8k1k-only, 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/)"
+    - "Variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)"
+    - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the recipe model.container"
+    - "MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments"
+    - "Clone srt-slurm at NVIDIA/srt-slurm@main for the dsr1-fp4 b200 recipes"
+  pr-link: XXX
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index 9eeed2af6..58823355f 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -122,6 +122,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         git checkout sa-submission-q2-2026
         mkdir -p recipes/sglang/glm5/b200-fp8
         cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8" recipes/sglang/glm5/b200-fp8
+    elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then  # dsr1 fp4 on dynamo-sglang: clone srt-slurm and overlay the in-repo recipes
+        git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+        cd "$SRT_REPO_DIR" || exit 1
+        git checkout main  # a branch name, not a pinned commit: the checked-out revision follows the tip of main
+        mkdir -p recipes/sglang/dsr1/b200-fp4
+        cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4" recipes/sglang/dsr1/b200-fp4  # presumably so CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/... paths resolve inside the clone
     else
         git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
         cd "$SRT_REPO_DIR" || exit 1

From b2a48726da49a8ca6022cc7a724a1e91825bd466 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Mon, 8 Jun 2026 12:52:10 -0700
Subject: [PATCH 2/4] Update perf-changelog pr-link for #1688

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1877af727..0dd760963 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3540,4 +3540,4 @@
     - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the recipe model.container"
     - "MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments"
     - "Clone srt-slurm at NVIDIA/srt-slurm@main for the dsr1-fp4 b200 recipes"
-  pr-link: XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1688

From f0142affd23975ec3fc9d4a809d4b42df0d0fedc Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Mon, 8 Jun 2026 13:04:55 -0700
Subject: [PATCH 3/4] dsr1-fp4-b200-dynamo-sglang-mtp: keep 1k1k scenario,
 restore b200-multinode runner

Only the 8k1k scenario is updated (6-variant local split recipes). The
1k1k scenario and the b200-multinode runner are unchanged from main; the
image bump to v0.5.12.post1 is shared (1k1k follows via the dynamo-sglang
container alias).
---
 .github/configs/nvidia-master.yaml | 65 +++++++++++++++++++++++++++++-
 perf-changelog.yaml                | 10 ++---
 2 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 17ce00504..1633d90d4 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7914,13 +7914,76 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
   image: "lmsysorg/sglang:v0.5.12.post1"
   model: deepseek-r1-fp4
   model-prefix: dsr1
-  runner: b200
+  runner: b200-multinode
   precision: fp4
   framework: dynamo-sglang
   multinode: true
   disagg: true
   scenarios:
     fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "mtp"
+        conc-list: [16, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
+          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
+        decode:
+          num-worker: 5
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - spec-decoding: "mtp"
+        conc-list: [32, 64, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
+          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
+        decode:
+          num-worker: 6
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - spec-decoding: "mtp"
+        conc-list: [512, 1024]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
+          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - spec-decoding: "mtp"
+        conc-list: [512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
+          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
     - isl: 8192
       osl: 1024
       search-space:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0dd760963..9d68ca5ff 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3535,9 +3535,9 @@
 - config-keys:
     - dsr1-fp4-b200-dynamo-sglang-mtp
   description:
-    - "Restructure the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to an 8k1k-only, 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/)"
-    - "Variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)"
-    - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the recipe model.container"
-    - "MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments"
-    - "Clone srt-slurm at NVIDIA/srt-slurm@main for the dsr1-fp4 b200 recipes"
+    - "Update the 8k1k scenario of the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to a 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/); the 1k1k scenario is unchanged"
+    - "8k1k variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)"
+    - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the 8k1k recipe model.container; the 1k1k in-repo recipes use the dynamo-sglang container alias and follow the config image"
+    - "8k1k MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments"
+    - "Point the dsr1-fp4 srt-slurm clone at NVIDIA/srt-slurm@main, which carries both the b200-fp4 1k1k and 8k1k in-repo recipes"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1688

From 79a9f56f263efe5e2189f1d40a03714958f62b5e Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Mon, 8 Jun 2026 13:42:07 -0700
Subject: [PATCH 4/4] dsr1-fp4-b200-dynamo-sglang-mtp: move 1k1k scenario to
 local split recipes

Flatten the srt-slurm b200-fp4 1k1k recipe (base + zip_override_mtp_*[i])
into 4 standalone per-topology recipes under
benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/
(copied to recipes/sglang/dsr1/b200-fp4/ in the srt-slurm clone by
runners/launch_b200-dgxc.sh), matching the 8k1k local layout, and point the
config at them instead of the upstream srt-slurm recipe. Behavior is
unchanged (faithful flatten; dynamo-sglang container alias preserved).
---
 .github/configs/nvidia-master.yaml            |  12 +-
 .../1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml    | 117 +++++++++++++++++
 .../1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml    | 117 +++++++++++++++++
 .../1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml    | 122 ++++++++++++++++++
 .../1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml    | 122 ++++++++++++++++++
 perf-changelog.yaml                           |  10 +-
 6 files changed, 487 insertions(+), 13 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1633d90d4..cb8708bc7 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7932,8 +7932,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml"
         decode:
           num-worker: 5
           tp: 8
@@ -7947,8 +7946,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml"
         decode:
           num-worker: 6
           tp: 8
@@ -7962,8 +7960,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -7977,8 +7974,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml"
         decode:
           num-worker: 2
           tp: 8
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml
new file mode 100644
index 000000000..098a4575a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml
@@ -0,0 +1,117 @@
+# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml
+# base + zip_override_mtp_lowlat[0]): 1p5d low-latency (dep4 prefill / tep8 decode, 5 decode nodes).
+# One flat YAML per concrete topology, matching the 8k1k local recipe layout
+# (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+name: b200-fp4-mtp-low-latency-dep4-1p-tep8-5d
+model:
+  path: dsr1
+  container: dynamo-sglang
+  precision: fp4
+resources:  # 1 prefill worker + 5 decode workers (the "1p" / "5d" in the recipe name)
+  gpu_type: b200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # equals the prefill tensor-parallel-size (4)
+  decode_nodes: 5
+  decode_workers: 5  # same as decode num-worker: 5 in the nvidia-master.yaml entry that points at this file
+  gpus_per_node: 8  # equals the decode tensor-parallel-size (8)
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  sglang_config:
+    prefill:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      moe-dense-tp-size: 1
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+    decode:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+      speculative-algorithm: EAGLE
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+benchmark:
+  type: sa-bench
+  isl: 1024  # isl + osl = 2048, below the context-length of 2200 set for both prefill and decode
+  osl: 1024
+  req_rate: inf  # plain "inf" (no leading dot) loads as a YAML string, not a float
+  concurrencies: 16x512  # same values as conc-list [16, 512] in the nvidia-master.yaml entry that points at this file
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml
new file mode 100644
index 000000000..4ab9a7558
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml
@@ -0,0 +1,117 @@
+# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml
+# base + zip_override_mtp_lowlat[1]): 1p6d low-latency (dep4 prefill / tep8 decode, 6 decode nodes).
+# One flat YAML per concrete topology, matching the 8k1k local recipe layout
+# (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+name: b200-fp4-mtp-low-latency-dep4-1p-tep8-6d
+model:
+  path: dsr1
+  container: dynamo-sglang
+  precision: fp4
+resources:  # 1 prefill worker + 6 decode workers (the "1p" / "6d" in the recipe name)
+  gpu_type: b200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # equals the prefill tensor-parallel-size (4)
+  decode_nodes: 6
+  decode_workers: 6  # same as decode num-worker: 6 in the nvidia-master.yaml entry that points at this file
+  gpus_per_node: 8  # equals the decode tensor-parallel-size (8)
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  sglang_config:
+    prefill:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      moe-dense-tp-size: 1
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+    decode:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+      speculative-algorithm: EAGLE
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+benchmark:
+  type: sa-bench
+  isl: 1024  # isl + osl = 2048, below the context-length of 2200 set for both prefill and decode
+  osl: 1024
+  req_rate: inf  # plain "inf" (no leading dot) loads as a YAML string, not a float
+  concurrencies: 32x64x256x512  # same values as conc-list [32, 64, 256, 512] in the nvidia-master.yaml entry that points at this file
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml
new file mode 100644
index 000000000..8ffbfeaf0
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml
@@ -0,0 +1,122 @@
+# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml
+# base + zip_override_mtp_maxtpt[0]): 1p1d max-throughput (dep4 prefill / dep8 decode, decode mem-fraction-static 0.75).
+# One flat YAML per concrete topology, matching the 8k1k local recipe layout
+# (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+name: b200-fp4-mtp-max-tpt-dep4-1p-dep8-1d
+model:
+  path: dsr1
+  container: dynamo-sglang
+  precision: fp4
+resources:  # 1 prefill worker + 1 decode worker (the "1p" / "1d" in the recipe name)
+  gpu_type: b200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # equals the prefill tensor-parallel-size (4)
+  decode_nodes: 1
+  decode_workers: 1  # same as decode num-worker: 1 in the nvidia-master.yaml entry that points at this file
+  gpus_per_node: 8  # equals the decode tensor-parallel-size (8)
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  decode_environment:  # prefill_environment keys plus SGLANG_DECODE_BOOTSTRAP_TIMEOUT, SGLANG_MOE_NVFP4_DISPATCH and SGLANG_FLASHINFER_FP4_GEMM_BACKEND
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000'  # decode-only: not set in prefill_environment
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_MOE_NVFP4_DISPATCH: '1'  # decode-only: not set in prefill_environment
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: cutlass  # NOTE(review): sglang_config.decode also sets fp4-gemm-backend: flashinfer_trtllm; confirm which setting takes effect
+    SGLANG_ENABLE_SPEC_V2: '1'
+  sglang_config:
+    prefill:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      disable-cuda-graph: true
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      moe-dense-tp-size: 1
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+    decode:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.75
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-dense-tp-size: 1
+      speculative-algorithm: EAGLE
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+benchmark:
+  type: sa-bench
+  isl: 1024  # isl + osl = 2048, below the context-length of 2200 set for both prefill and decode
+  osl: 1024
+  req_rate: inf  # plain "inf" (no leading dot) loads as a YAML string, not a float
+  concurrencies: 512x1024  # same values as conc-list [512, 1024] in the nvidia-master.yaml entry; the largest equals max-running-requests (1024)
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml
new file mode 100644
index 000000000..a59fd6ea2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml
@@ -0,0 +1,122 @@
+# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml
+# base + zip_override_mtp_maxtpt[1]): 1p2d max-throughput (dep4 prefill / dep8 decode, mem-fraction 0.85).
+# One flat YAML per concrete topology, matching the 8k1k local recipe layout
+# (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+name: b200-fp4-mtp-max-tpt-dep4-1p-dep8-2d  # dep4-1p = 1 prefill worker at TP/DP/EP 4; dep8-2d = 2 decode workers at TP/DP/EP 8
+model:
+  path: dsr1
+  container: dynamo-sglang  # container alias, not a pinned image tag; the perf-changelog entry says 1k1k recipes follow the config image
+  precision: fp4
+resources:
+  gpu_type: b200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # equals prefill tensor-parallel-size (4) in sglang_config below
+  decode_nodes: 2
+  decode_workers: 2  # presumably one 8-GPU worker per decode node (decode tensor-parallel-size is 8) - TODO confirm
+  gpus_per_node: 8
+backend:
+  prefill_environment:  # env values are quoted so YAML keeps them as strings rather than ints/bools
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl, not mooncake - confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'  # NOTE(review): "INBALANCE" spelling is used in both environments; presumably mirrors the upstream name - confirm before "fixing"
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  decode_environment:  # same keys as prefill_environment plus three decode-only entries marked below
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000'  # decode-only: not set in prefill_environment
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_MOE_NVFP4_DISPATCH: '1'  # decode-only: not set in prefill_environment
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: cutlass  # decode-only; NOTE(review): decode also sets fp4-gemm-backend: flashinfer_trtllm - confirm which one wins
+    SGLANG_ENABLE_SPEC_V2: '1'
+  sglang_config:
+    prefill:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200  # above the benchmark's isl + osl = 1024 + 1024 = 2048 tokens
+      max-running-requests: 512
+      disable-cuda-graph: true  # prefill only; decode keeps CUDA graphs (cuda-graph-max-bs: 512)
+      tensor-parallel-size: 4  # TP = DP = EP = 4: the "dep4" in the recipe name
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      moe-dense-tp-size: 1
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+    decode:
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512  # same value as max-running-requests above
+      tensor-parallel-size: 8  # TP = DP = EP = 8: the "dep8" in the recipe name
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-dense-tp-size: 1
+      speculative-algorithm: EAGLE  # speculative-* keys appear on decode only; prefill sets none
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+health_check:
+  max_attempts: 360  # presumably 360 attempts x 10 s = up to 3600 s of polling - TODO confirm retry semantics
+  interval_seconds: 10
+benchmark:
+  type: sa-bench
+  isl: 1024  # 1k input / 1k output: the "1k1k" scenario
+  osl: 1024
+  req_rate: inf  # plain scalar "inf" loads as a string in YAML (the float form is .inf)
+  concurrencies: '512'  # quoted so it stays a string; this variant has a single concurrency point
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9d68ca5ff..1d830e8cf 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3535,9 +3535,9 @@
 - config-keys:
     - dsr1-fp4-b200-dynamo-sglang-mtp
   description:
-    - "Update the 8k1k scenario of the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to a 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/); the 1k1k scenario is unchanged"
-    - "8k1k variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)"
-    - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the 8k1k recipe model.container; the 1k1k in-repo recipes use the dynamo-sglang container alias and follow the config image"
-    - "8k1k MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments"
-    - "Point the dsr1-fp4 srt-slurm clone at NVIDIA/srt-slurm@main, which carries both the b200-fp4 1k1k and 8k1k in-repo recipes"
+    - "Move both the 1k1k and 8k1k scenarios of the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/{1k1k,8k1k}/disagg/mtp/), rather than referencing recipes from the srt-slurm repo"
+    - "1k1k: 4 MTP variants, behavior unchanged from the previous srt-slurm 1k1k recipe — 2 low-latency (dep4-1p prefill / tep8 decode at 5 and 6 decode nodes, conc up to 512) + 2 max-throughput (dep4-1p prefill / dep8 decode at 1 and 2 decode nodes, conc up to 1024)"
+    - "8k1k: 6-variant sweep — 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048); MTP2 recipes use scheduler-recv-interval=1, enable-dp-lm-head, spec 2 steps / 3 draft tokens, and UCX_TLS in the prefill/decode environments"
+    - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130); the 1k1k recipes keep the dynamo-sglang container alias and follow the config image"
+    - "Clone srt-slurm at NVIDIA/srt-slurm@main for the srtctl/dynamo tooling and copy the local b200-fp4 recipes into the checkout"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1688