Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8604,6 +8604,143 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
ep: 8
dp-attn: true

# MTP variant of dsv4-fp4-gb200-dynamo-sglang.
# Matrix entry with a single fixed-seq-len scenario (ISL 8192 / OSL 1024) and
# eight search-space points. Every point sets spec-decoding to "mtp" and names
# its recipe file through a CONFIG_FILE entry in additional-settings.
# The node counts quoted in the per-point comments are consistent with 4 GPUs
# per node (gpus_per_node: 4 in the 1p1d low-latency recipe):
#   nodes = (prefill workers * prefill tp + decode workers * decode tp) / 4
dsv4-fp4-gb200-dynamo-sglang-mtp:
# Same image tag as model.container in the 1p1d low-latency recipe.
# NOTE(review): presumably the two have to be bumped together — confirm.
image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb200
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
scenarios:
fixed-seq-len:
- isl: 8192
osl: 1024
search-space:
# Low-latency baseline: 1p1d-tp8-tp8. 4 nodes.
# (1 * 8 + 1 * 8) / 4 = 4 nodes; single concurrency point.
- spec-decoding: "mtp"
conc-list: [1]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
# Low-latency 1p6d-dep8-tp8: 1P (DEP=8) + 6 TP=8 decode workers. 14 nodes.
# Recipe runs concurrencies=32x64x128; matrix tracks the max.
# (1 * 8 + 6 * 8) / 4 = 14 nodes.
- spec-decoding: "mtp"
conc-list: [128]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false
# Mid curve 1p1d-dep8-dep16. 6 nodes.
# The mid-curve points below differ only in prefill num-worker, conc-list and
# the recipe path; decode stays at one tp=16 / ep=16 worker with dp-attn.
- spec-decoding: "mtp"
conc-list: [1024]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 2p1d-dep8-dep16. 8 nodes.
- spec-decoding: "mtp"
conc-list: [2048]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 3p1d-dep8-dep16. 10 nodes.
- spec-decoding: "mtp"
conc-list: [3072]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 4p1d-dep8-dep16. 12 nodes.
# NOTE(review): concurrency jumps from 3072 to 6144 here instead of following
# the 1024-per-prefill-worker steps of the previous points — confirm intended.
- spec-decoding: "mtp"
conc-list: [6144]
prefill:
num-worker: 4
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 5p1d-dep8-dep16. 14 nodes.
- spec-decoding: "mtp"
conc-list: [8192]
prefill:
num-worker: 5
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 6p1d-dep8-dep16. 16 nodes.
- spec-decoding: "mtp"
conc-list: [16384]
prefill:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

dsv4-fp4-b300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Recipe identifier; the name spells out model, platform, 8k/1k sequence
# lengths and the 1p1d TP8/TP8 MTP topology.
name: "dsv4-pro-gb200-disagg-8k1k-low-latency-1p1d-tp8-tp8-mtp"

# Dynamo frontend with multiple frontends enabled (8 additional instances).
frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 8

# Dynamo revision pinned by commit hash; the hash stays quoted so it is always
# read as a string.
# NOTE(review): `install: true` presumably tells the launcher to install this
# revision — confirm against the launcher.
dynamo:
  hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e"
  install: true

# Model location, container image and precision used by the workers.
model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85"
  precision: "fp4"

# Extra sbatch directives. In Slurm, mem=0 requests all memory on the node.
sbatch_directives:
  cpus-per-task: "144"
  mem: "0"

# Cluster topology for this recipe: 4 nodes of 4 GB200 GPUs in total.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  # One prefill worker on 8 GPUs -> 2 nodes.
  prefill_nodes: 2
  prefill_workers: 1
  gpus_per_prefill: 8
  # One decode worker on 8 GPUs -> 2 nodes.
  decode_nodes: 2
  decode_workers: 1
  gpus_per_decode: 8

# SGLang backend settings: per-role process environments for the prefill and
# decode workers. All values are quoted so they reach the process as strings.
backend:
  type: sglang

  # Environment for the prefill worker.
  # NOTE(review): prefill is also TP=8 on 4-GPU nodes (2 nodes per worker);
  # confirm whether SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 must be disabled here
  # as well. The PR review only calls out the decode side.
  prefill_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_FORCE_MISS: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DEFAULT_THINKING: "1"
    SGLANG_DSV4_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"

  # Environment for the decode worker. Same variables as prefill, plus the
  # custom all-reduce override at the end.
  decode_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_FORCE_MISS: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DEFAULT_THINKING: "1"
    SGLANG_DSV4_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
    # Decode runs TP=8 with 4 GPUs per node, so the worker spans 2 nodes.
    # Per the PR review, custom all-reduce v2 is on by default, is single-node
    # only, and corrupts results in multi-node decode. Disable it explicitly,
    # as the GB200 mid-curve MTP recipes do.
    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing multinode all-reduce disable

High Severity

The low-latency GB200 MTP recipes run TP8 decode across two nodes per worker but their decode_environment blocks never set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 to 0. The mid-curve GB200 MTP recipes added in the same change do set that flag, and existing DSV4 slurm recipe notes tie custom all-reduce v2 to incorrect results on multi-node decode.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 5c201f6. Configure here.

Comment on lines +50 to +64

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 The two new low-latency GB200 recipes (this file and disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml) configure decode at TP=8 on a GPU with gpus_per_node: 4, so each decode worker spans 2 nodes — but their decode_environment does not set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0". SGLang's default for CAR_V2 is on, and existing recipes in this directory document that CAR_V2 "is single-node only and corrupts results in 2-node decode setups". The six new GB200 mid-curve recipes in this same PR all correctly set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" — please mirror that in both low-latency decode envs as well.

Extended reasoning...

What the bug is. Both new low-latency GB200 files in this PR — disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml and disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml — declare gpu_type: "gb200", gpus_per_node: 4, and decode tensor-parallel-size: 8. Each decode worker therefore spans 8/4 = 2 nodes. In that regime, every other recipe in this directory explicitly disables SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 in decode_environment, but these two new files do not.

Why existing code doesn't prevent it. The default for CAR_V2 in SGLang is on. perf-changelog.yaml:3221 (PR #1506) explicitly notes "Remove env vars redundant with sglang defaults (..., SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)", and the single-node benchmark benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh:29 sets SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 to match. So omitting the var in the YAML means CAR_V2 runs.

Documented impact. disagg-low-latency-1p1d-tp4-tp4-mtp.yaml carries this comment for the omitted var:

> # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
> # is single-node only and corrupts results in 2-node decode setups.

That comment is fine for the GB300 1p1d-tp4-tp4 / 1p6d-dep4-tp4 files because their decode TP=4 fits a single 4-GPU node. It does not apply to the new GB200 low-latency files, whose decode TP=8 always spans 2 nodes.

Inconsistency with the rest of this same PR. All six new GB200 mid-curve recipes added here (disagg-gb200-mid-curve-{1..6}p1d-dep8-dep16-mtp.yaml) set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" in decode_environment (lines ~75 of each). Their decode is TP=16 (4 nodes); the same multi-node constraint applies. The low-latency files are the only outliers in the PR — a clear copy/paste oversight rather than an intentional choice.

Step-by-step proof for disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml.
1. The file declares gpus_per_node: 4 (line 23) and decode tensor-parallel-size: 8 (line 100, in the decode block of sglang_config).
2. With 4 GPUs per node and TP=8, the decode worker spans 8/4 = 2 nodes — confirmed by the file's own decode_nodes: 2, decode_workers: 1 (lines 26–28).
3. The container is lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85 whose default for SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 is 1 (on) per the perf-changelog #1506 entry above.
4. decode_environment (lines 50–64) does not override this var, so CAR_V2 is active on a 2-node decode all-reduce — the exact regime the in-repo comments say "corrupts results."
5. The same logic applies to disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml, whose decode workers also run TP=8 on gpus_per_node: 4.

Fix. Add a single line to the decode_environment of both new GB200 low-latency files (mirroring the mid-curve siblings):

```yaml
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only.
```

Leaving the bug in place means the low-latency points on the published curve this PR adds will be silently incorrect rather than crashing, which is the worst failure mode for a benchmark recipe.


# Per-role SGLang server arguments. Prefill and decode share the model,
# parallelism and MoE settings; they differ in disaggregation mode, batching
# limits, and the speculative-decoding options set on decode only.
sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tool-call-parser: deepseekv4

    # Prefill side of the disaggregated pair; transfers go through mooncake.
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake

    # TP=8 with data and expert parallelism both set to 1.
    tensor-parallel-size: 8
    data-parallel-size: 1
    expert-parallel-size: 1

    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true

    mem-fraction-static: 0.9
    max-running-requests: 16
    cuda-graph-max-bs: 8
    chunked-prefill-size: 65536

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tool-call-parser: deepseekv4

    # Decode side of the disaggregated pair; transfers go through mooncake.
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake

    # TP=8 with data and expert parallelism both set to 1.
    tensor-parallel-size: 8
    data-parallel-size: 1
    expert-parallel-size: 1

    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true

    # Speculative decoding: EAGLE, 3 steps, top-k 1, 4 draft tokens.
    speculative-algo: "EAGLE"
    speculative-num-steps: 3
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 4

    mem-fraction-static: 0.9
    max-running-requests: 8
    cuda-graph-max-bs: 8
    swa-full-tokens-ratio: 0.1
    context-length: 16384

# Load-generator settings: sa-bench at ISL 8192 / OSL 1024, one concurrency
# level (1), unbounded request rate, chat template applied.
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  random_range_ratio: 0.8
  # Kept as quoted strings, exactly as the recipe supplies them.
  concurrencies: "1"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

Loading
Loading