Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 61 additions & 35 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7911,7 +7911,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
dp-attn: true

dsr1-fp4-b200-dynamo-sglang-mtp:
image: "lmsysorg/sglang:v0.5.8.post1-cu130"
image: "lmsysorg/sglang:v0.5.12.post1"
model: deepseek-r1-fp4
model-prefix: dsr1
runner: b200-multinode
Expand All @@ -7932,8 +7932,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml"
decode:
num-worker: 5
tp: 8
Expand All @@ -7947,8 +7946,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml"
decode:
num-worker: 6
tp: 8
Expand All @@ -7962,8 +7960,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml"
decode:
num-worker: 1
tp: 8
Expand All @@ -7977,77 +7974,106 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

- isl: 8192
osl: 1024
search-space:
# 1p5d low-latency (decode-heavy).
- spec-decoding: "mtp"
conc-list: [64, 128]
conc-list: [4, 8, 16, 32, 64]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
ep: 1
dp-attn: false
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml"
decode:
num-worker: 5
tp: 8
ep: 1
dp-attn: false
# 1p3d low-latency.
- spec-decoding: "mtp"
conc-list: [32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml"
decode:
num-worker: 3
tp: 8
ep: 8
ep: 1
dp-attn: false
# 1p1d low-latency.
- spec-decoding: "mtp"
conc-list: [8]
conc-list: [32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
ep: 1
dp-attn: false
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml"
decode:
num-worker: 5
num-worker: 1
tp: 8
ep: 8
ep: 1
dp-attn: false
# MTP2 high-throughput (DEP4 prefill / DEP8 decode), one Pareto point each.
# 2p1d throughput.
- spec-decoding: "mtp"
conc-list: [4, 128]
conc-list: [768]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml"
decode:
num-worker: 5
num-worker: 1
tp: 8
ep: 8
dp-attn: false
dp-attn: true
# 3p1d throughput.
- spec-decoding: "mtp"
conc-list: [4, 8, 16, 64]
conc-list: [1024]
prefill:
num-worker: 3
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# 5p1d throughput.
- spec-decoding: "mtp"
conc-list: [2048]
prefill:
num-worker: 5
tp: 4
ep: 1
dp-attn: false
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml
- "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4"
- "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
ep: 8
dp-attn: true

kimik2.5-fp4-gb200-dynamo-trt:
image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml
# base + zip_override_mtp_lowlat[0]): 1p5d low-latency (dep4 prefill / tep8
# decode, 5 decode nodes).
# One flat YAML per concrete topology, matching the 8k1k local recipe layout
# (sglang/<model>/<hw>-<precision>/<seq>/disagg/<variant>/...).

name: b200-fp4-mtp-low-latency-dep4-1p-tep8-5d

model:
  path: dsr1
  container: dynamo-sglang
  precision: fp4

# Topology: 1 prefill worker on 4 GPUs of a single node, plus 5 decode workers
# on 5 nodes of 8 GPUs each (decode uses tensor-parallel-size 8 below, i.e. one
# worker per node).
resources:
  gpu_type: b200
  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4
  decode_nodes: 5
  decode_workers: 5
  gpus_per_node: 8

backend:
  # Environment for the prefill workers. Every value is quoted so it loads as a
  # string rather than a YAML int/bool.
  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
    PYTHONUNBUFFERED: '1'
    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
    # NOTE(review): "INBALANCE" is spelled this way in the original recipe;
    # presumably it matches the name the runtime reads. Confirm before renaming.
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
    MC_FORCE_MNNVL: '1'
    NCCL_MNNVL_ENABLE: '1'
    NCCL_CUMEM_ENABLE: '1'
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
    DYN_REQUEST_PLANE: nats
    SGLANG_ENABLE_SPEC_V2: '1'

  # Environment for the decode workers. Same as prefill_environment plus
  # SGLANG_DECODE_BOOTSTRAP_TIMEOUT.
  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
    PYTHONUNBUFFERED: '1'
    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
    SGLANG_ENABLE_JIT_DEEPGEMM: 'false'
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000'
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
    # NOTE(review): spelling kept as in the original recipe (see prefill).
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
    MC_FORCE_MNNVL: '1'
    NCCL_MNNVL_ENABLE: '1'
    NCCL_CUMEM_ENABLE: '1'
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1'
    DYN_REQUEST_PLANE: nats
    SGLANG_ENABLE_SPEC_V2: '1'

  sglang_config:
    # "dep4" prefill: TP 4 / DP 4 / EP 4 with DP attention enabled.
    # CUDA graphs are disabled and no speculative-* options are set here.
    prefill:
      served-model-name: deepseek-ai/DeepSeek-R1
      trust-remote-code: true
      quantization: modelopt_fp4
      disaggregation-mode: prefill
      disaggregation-transfer-backend: nixl
      mem-fraction-static: 0.85
      max-prefill-tokens: 32768
      chunked-prefill-size: 32768
      # 2200 >= isl + osl of the benchmark below (1024 + 1024 = 2048).
      context-length: 2200
      max-running-requests: 512
      disable-cuda-graph: true
      tensor-parallel-size: 4
      data-parallel-size: 4
      expert-parallel-size: 4
      enable-dp-attention: true
      enable-dp-lm-head: true
      attention-backend: trtllm_mla
      kv-cache-dtype: fp8_e4m3
      moe-runner-backend: flashinfer_trtllm
      moe-dense-tp-size: 1
      stream-interval: 30
      watchdog-timeout: 1000000
      enable-flashinfer-allreduce-fusion: true
      disable-radix-cache: true
      fp4-gemm-backend: flashinfer_trtllm

    # "tep8" decode: TP 8 / EP 8 with data-parallel-size 1 (no DP attention).
    # Speculative decoding (the MTP variant of this recipe) is configured only
    # on the decode side.
    decode:
      served-model-name: deepseek-ai/DeepSeek-R1
      trust-remote-code: true
      quantization: modelopt_fp4
      disaggregation-mode: decode
      disaggregation-transfer-backend: nixl
      mem-fraction-static: 0.85
      max-prefill-tokens: 32768
      chunked-prefill-size: 32768
      # 2200 >= isl + osl of the benchmark below (1024 + 1024 = 2048).
      context-length: 2200
      max-running-requests: 512
      cuda-graph-max-bs: 512
      tensor-parallel-size: 8
      data-parallel-size: 1
      expert-parallel-size: 8
      attention-backend: trtllm_mla
      kv-cache-dtype: fp8_e4m3
      moe-runner-backend: flashinfer_trtllm
      stream-interval: 30
      watchdog-timeout: 1000000
      enable-flashinfer-allreduce-fusion: true
      disable-radix-cache: true
      fp4-gemm-backend: flashinfer_trtllm
      speculative-algorithm: EAGLE
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

# 360 attempts at a 10 s interval, presumably up to one hour of waiting for
# the workers to become healthy.
# NOTE(review): placed at top level next to `benchmark`; confirm against the
# recipe schema that this is not expected under `backend`.
health_check:
  max_attempts: 360
  interval_seconds: 10

benchmark:
  type: sa-bench
  isl: 1024
  osl: 1024
  # Plain `inf` (no leading dot) loads as the string "inf", not a YAML float.
  req_rate: inf
  # NOTE(review): presumably an "x"-separated concurrency list (16 and 512);
  # confirm against the benchmark runner.
  concurrencies: 16x512
Loading