SemiAnalysisAI · Ankur-singh · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
@@ -7911,7 +7911,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
 
 dsr1-fp4-b200-dynamo-sglang-mtp:
-  image: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  image: "lmsysorg/sglang:v0.5.12.post1"
   model: deepseek-r1-fp4
   model-prefix: dsr1
   runner: b200-multinode
@@ -7932,8 +7932,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml"
         decode:
           num-worker: 5
           tp: 8
@@ -7947,8 +7946,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml"
         decode:
           num-worker: 6
           tp: 8
@@ -7962,8 +7960,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -7977,77 +7974,106 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
-
     - isl: 8192
       osl: 1024
       search-space:
+      # 1p5d low-latency (decode-heavy).
       - spec-decoding: "mtp"
-        conc-list: [64, 128]
+        conc-list: [4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
-          dp-attn: true
+          ep: 1
+          dp-attn: false
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml"
         decode:
+          num-worker: 5
+          tp: 8
+          ep: 1
+          dp-attn: false
+      # 1p3d low-latency.
+      - spec-decoding: "mtp"
+        conc-list: [32, 64, 128, 256, 512]
+        prefill:
           num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml"
+        decode:
+          num-worker: 3
           tp: 8
-          ep: 8
+          ep: 1
           dp-attn: false
+      # 1p1d low-latency.
       - spec-decoding: "mtp"
-        conc-list: [8]
+        conc-list: [32, 64, 128, 256, 512]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
-          dp-attn: true
+          ep: 1
+          dp-attn: false
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml"
         decode:
-          num-worker: 5
+          num-worker: 1
           tp: 8
-          ep: 8
+          ep: 1
           dp-attn: false
+      # MTP2 high-throughput (DEP4 prefill / DEP8 decode), one Pareto point each.
+      # 2p1d throughput.
       - spec-decoding: "mtp"
-        conc-list: [4, 128]
+        conc-list: [768]
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml"
         decode:
-          num-worker: 5
+          num-worker: 1
           tp: 8
           ep: 8
-          dp-attn: false
+          dp-attn: true
+      # 3p1d throughput.
       - spec-decoding: "mtp"
-        conc-list: [4, 8, 16, 64]
+        conc-list: [1024]
         prefill:
+          num-worker: 3
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml"
+        decode:
           num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+      # 5p1d throughput.
+      - spec-decoding: "mtp"
+        conc-list: [2048]
+        prefill:
+          num-worker: 5
           tp: 4
-          ep: 1
-          dp-attn: false
+          ep: 4
+          dp-attn: true
           additional-settings:
-          # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
-          - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4"
+          - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml"
         decode:
           num-worker: 1
           tp: 8
-          ep: 1
-          dp-attn: false
+          ep: 8
+          dp-attn: true
 
 kimik2.5-fp4-gb200-dynamo-trt:
   image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2

diff --git a/.../multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/.../multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml
@@ -0,0 +1,117 @@
+# Derived from srt-slurm recipes/b200-fp4/1k1k.yaml (base + zip_override_mtp_lowlat[0]): 1p5d low-latency,
+# i.e. 1 prefill worker (DEP4: TP4/EP4 + DP attention) and 5 decode workers (TEP8: TP8/EP8, one per node).
+# One flat YAML per concrete topology, matching the 8k1k local recipe layout
+# (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).
+
+name: b200-fp4-mtp-low-latency-dep4-1p-tep8-5d  # mirrors the topology below: 1 prefill worker (tp/dp/ep 4), 5 decode workers (tp/ep 8)
+model:
+  path: dsr1
+  container: dynamo-sglang
+  precision: fp4
+resources:
+  gpu_type: b200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # same value as prefill tensor-parallel-size below
+  decode_nodes: 5
+  decode_workers: 5  # equals decode_nodes, i.e. one decode worker per node
+  gpus_per_node: 8  # NOTE(review): no gpus_per_decode key is set; presumably a decode worker takes a full node (decode tp is 8) - confirm
+backend:
+  prefill_environment:  # every value is quoted (or a bare word) so it loads as a string, not a bool/int
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): Mooncake-named setting, but disaggregation-transfer-backend below is nixl - confirm it is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'  # NOTE(review): "INBALANCE" spelling presumably matches the upstream variable name - verify before "fixing"
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'  # NOTE(review): set for prefill too, although speculative-* flags appear only under sglang_config.decode - confirm intended
+  decode_environment:  # identical to prefill_environment except for the extra SGLANG_DECODE_BOOTSTRAP_TIMEOUT entry
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000'  # decode-only entry; not present in prefill_environment
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_ENABLE_SPEC_V2: '1'
+  sglang_config:
+    prefill:  # "DEP4" side of the recipe name: tp = dp = ep = 4 with DP attention enabled
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200  # benchmark isl 1024 + osl 1024 = 2048 tokens, plus headroom
+      max-running-requests: 512
+      disable-cuda-graph: true  # prefill only; decode sets cuda-graph-max-bs instead
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      moe-dense-tp-size: 1
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+    decode:  # "TEP8" side of the recipe name: tp = ep = 8, dp = 1, no DP-attention flags
+      served-model-name: deepseek-ai/DeepSeek-R1
+      trust-remote-code: true
+      quantization: modelopt_fp4
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200  # kept equal to the prefill context-length
+      max-running-requests: 512
+      cuda-graph-max-bs: 512  # same value as max-running-requests
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+      attention-backend: trtllm_mla
+      kv-cache-dtype: fp8_e4m3
+      moe-runner-backend: flashinfer_trtllm
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: flashinfer_trtllm
+      speculative-algorithm: EAGLE  # speculative-decoding flags are set on the decode side only
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3  # equals speculative-num-steps + 1 with topk 1
+health_check:
+  max_attempts: 360  # presumably 360 attempts x 10 s interval = up to 1 h of polling - confirm against the runner
+  interval_seconds: 10
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  req_rate: inf  # plain scalar: loads as the string "inf" (YAML float infinity is spelled .inf)
+  concurrencies: 16x512  # NOTE(review): presumably an 'x'-separated concurrency list (16 and 512) - confirm against the sa-bench parser