From db84c591866af8e33d59a86280b175bd9be684af Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 8 Jun 2026 13:28:08 -0700 Subject: [PATCH 1/2] dsv4-fp4-gb300-dynamo-trt: add disagg trtllm recipes for STP and MTP on GB300 --- .github/configs/nvidia-master.yaml | 819 +++++++++++++++++++++++++++++ perf-changelog.yaml | 12 + runners/launch_gb300-nv.sh | 7 + 3 files changed, 838 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..184d8846f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8753,6 +8753,825 @@ dsv4-fp4-gb300-dynamo-vllm: ep: 16 dp-attn: true +dsv4-fp4-gb300-dynamo-trt: + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-nv + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [5] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [15] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [25] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [55] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [167] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch4_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch4_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [333] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch8_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch8_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [666] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch16_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch16_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [1229] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [2253] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [4301] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [8192] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep16_batch512_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep16_batch512_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [8192] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [16384] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep16_batch1024_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep16_batch1024_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [5] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [15] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [25] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [55] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [154] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen1dep32_batch4_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx1dep4_gen1dep32_batch4_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [308] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep32_batch8_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep32_batch8_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [615] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep32_batch16_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep32_batch16_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [1127] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep32_batch32_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep32_batch32_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [1229] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep16_batch64_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep16_batch64_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [2253] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep8_batch256_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep8_batch256_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2253] + prefill: + num-worker: 9 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx9dep4_gen1dep16_batch128_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx9dep4_gen1dep16_batch128_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4301] + prefill: + num-worker: 10 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep8_batch512_eplb384_mtp0.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep8_batch512_eplb384_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + +dsv4-fp4-gb300-dynamo-trt-mtp: + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-nv + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [15] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [25] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [90] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch2_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch2_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [167] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch4_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch4_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [333] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch8_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch8_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [615] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [2253] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [4301] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep16_batch256_eplb384_mtp1.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep16_batch256_eplb384_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [4301] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx4dep4_gen1dep32_batch128_eplb384_mtp1.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx4dep4_gen1dep32_batch128_eplb384_mtp1.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep8_batch1024_eplb384_mtp1.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep8_batch1024_eplb384_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep16_batch512_eplb384_mtp1.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep16_batch512_eplb384_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [15] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [30] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [84] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep32_batch2_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep32_batch2_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [180] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [333] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx4dep4_gen1dep32_batch8_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx4dep4_gen1dep32_batch8_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [615] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch16_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch16_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [666] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx6dep4_gen1dep16_batch32_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx6dep4_gen1dep16_batch32_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 10 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb384_mtp3.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb384_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [2253] + prefill: + num-worker: 9 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx9dep4_gen1dep8_batch256_eplb384_mtp1.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx9dep4_gen1dep8_batch256_eplb384_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [4301] + prefill: + num-worker: 12 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx12dep4_gen1dep8_batch512_eplb384_mtp1.yaml + - "CONFIG_FILE=recipes/DeepSeek-V4-Pro/disagg/trtllm_dynamo/gb300_mxfp4/ISL8K_OSL1K/MTP/ctx12dep4_gen1dep8_batch512_eplb384_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd model: deepseek-ai/DeepSeek-V4-Pro diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..4cacd0747 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,15 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-gb300-dynamo-trt + - dsv4-fp4-gb300-dynamo-trt-mtp + description: + - "Add DeepSeek-V4-Pro MXFP4 GB300 disaggregated TRT-LLM benchmarks via Dynamo (27 STP + 27 MTP configs)" + - "New configs: dsv4-fp4-gb300-dynamo-trt (STP) and dsv4-fp4-gb300-dynamo-trt-mtp (MTP)" + - "Covers ISL 1024/OSL 1024 (14 STP + 14 MTP) and ISL 8192/OSL 1024 (13 STP + 13 MTP)" + - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 329b2326d..d21c91d10 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -155,6 +155,13 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS git checkout main mkdir -p recipes/vllm/minimax-m2.5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5" recipes/vllm/minimax-m2.5 +elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "dsv4" ]]; then + # DSv4 dynamo-trt recipes use the HuggingFace model ID as model.path, + # so override SRT_SLURM_MODEL_PREFIX to match the recipe's model path key. + SRT_SLURM_MODEL_PREFIX="deepseek-ai/DeepSeek-V4-Pro" + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From e2dd5ca5b334e33b6692f81d0dbf9ce59b860d96 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 8 Jun 2026 13:28:38 -0700 Subject: [PATCH 2/2] Update perf-changelog pr-link for #1689 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4cacd0747..cd137b83d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3542,4 +3542,4 @@ - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1" - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689