Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8604,6 +8604,133 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
ep: 8
dp-attn: true

# DeepSeek-V4-Pro, FP4, GB200: disaggregated prefill/decode sweep on Dynamo + SGLang.
# Every search-space point pins its own recipe file through CONFIG_FILE.
# NOTE(review): nesting was reconstructed from a flattened diff view — confirm it
# matches the sibling entries in this file.
dsv4-fp4-gb200-dynamo-sglang:
  image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb200
  precision: fp4
  framework: dynamo-sglang
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      - isl: 8192
        osl: 1024
        search-space:
          # Latency-optimised point, 1p1d-tp8-tp8 (4 nodes total).
          - conc-list: [1]
            prefill:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-tp8-tp8-4-c1.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: false
          # 1p4d-dep8-tp8: DEP8 prefill feeding four TP8 decode workers (10 nodes total).
          - conc-list: [64]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-10-c64.yaml"
            decode:
              num-worker: 4
              tp: 8
              ep: 1
              dp-attn: false
          # WideEP decode at TP=16, 1p2d-dep8-dep16 (10 nodes total).
          - conc-list: [256]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p2d-dep8-dep16-10-c256.yaml"
            decode:
              num-worker: 2
              tp: 16
              ep: 16
              dp-attn: true
          # WideEP decode at TP=16, 1p1d-dep8-dep16 (6 nodes total).
          - conc-list: [1024]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep16-6-c1024.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # WideEP decode at TP=16, 2p1d-dep8-dep16 (8 nodes total).
          - conc-list: [2048]
            prefill:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep16-8-c2048.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # WideEP decode at TP=16, 4p1d-dep8-dep16 (12 nodes total).
          - conc-list: [4096]
            prefill:
              num-worker: 4
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-4p1d-dep8-dep16-12-c4096.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # WideEP decode at TP=16, 5p1d-dep8-dep16 (14 nodes total).
          - conc-list: [8192]
            prefill:
              num-worker: 5
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-5p1d-dep8-dep16-14-c8192.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # WideEP decode at TP=12, 6p1d-dep8-dep12 (15 nodes total).
          # Same concurrency as the previous point, different topology.
          - conc-list: [8192]
            prefill:
              num-worker: 6
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-6p1d-dep8-dep12-15-c8192.yaml"
            decode:
              num-worker: 1
              tp: 12
              ep: 12
              dp-attn: true

dsv4-fp4-b300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Recipe identifier: GB200, 1 prefill / 1 decode worker, DEP8 -> DEP16, 6 nodes, concurrency 1024.
name: "disagg-gb200-1p1d-dep8-dep16-6-c1024"


# Model alias, container image and numeric precision used by every worker.
model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85"
  precision: "fp4"

# Dynamo is installed at the pinned hash below.
# NOTE(review): presumably a git commit of the Dynamo repository — confirm.
dynamo:
  hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e"
  install: true

# Slurm job wall-clock limit (HH:MM:SS).
slurm:
  time_limit: "03:00:00"

# Extra sbatch directives; values are kept as strings.
# NOTE(review): placed at top level because the flattened diff lost indentation —
# confirm it is not meant to nest under `slurm`.
sbatch_directives:
  cpus-per-task: "144"
  mem: "0"

# Cluster footprint: 2 prefill nodes + 4 decode nodes at 4 GPUs per node,
# i.e. one 8-GPU prefill worker and one 16-GPU decode worker (6 nodes total).
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  prefill_workers: 1
  gpus_per_prefill: 8
  decode_nodes: 4
  decode_workers: 1
  gpus_per_decode: 16

# Single Dynamo frontend with the KV router; `env` holds environment variables
# and `args` holds router flags for the frontend.
frontend:
  type: dynamo
  enable_multiple_frontends: false
  env:
    DYN_ROUTER_LOAD_BLOCK_SIZE: "1"
  args:
    router-mode: "kv"
    router-kv-overlap-score-weight: 0
    router-queue-threshold: 64
    router-temperature: 0.5
    no-kv-events: true

# SGLang backend: per-role environment variables followed by per-role server flags.
# NOTE(review): nesting was reconstructed from a flattened diff view — confirm it
# matches the sibling recipes in this directory.
backend:
  type: sglang

  # Environment for the prefill worker. Compared with decode it additionally sets
  # SGLANG_OPT_FP8_WO_A_GEMM and uses a larger MEGA_MOE max-tokens-per-rank (8192).
  prefill_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_FORCE_MISS: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_ENABLE_THINKING: "1"
    SGLANG_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
    SGLANG_OPT_USE_FAST_MASK_EP: "1"
    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1"
    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
    SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
    SGLANG_OPT_FP8_WO_A_GEMM: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_LOG_FORWARD_ITERS: "1"
    SGLANG_LOG_MS: "1"
    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"

  # Environment for the decode worker. MEGA_MOE max-tokens-per-rank (1280) equals
  # cuda-graph-max-bs in the decode flags below.
  decode_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_FORCE_MISS: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_ENABLE_THINKING: "1"
    SGLANG_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
    SGLANG_OPT_USE_FAST_MASK_EP: "1"
    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1"
    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
    SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_LOG_FORWARD_ITERS: "1"
    SGLANG_LOG_MS: "1"
    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"

  sglang_config:
    # Prefill worker flags: TP/DP/EP all 8 with DP attention (the "dep8" in the recipe name).
    prefill:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      trust-remote-code: true
      watchdog-timeout: 86400
      # The server does not initialise a tokenizer with this flag set.
      skip-tokenizer-init: true
      stream-interval: 60

      tensor-parallel-size: 8
      data-parallel-size: 8
      expert-parallel-size: 8

      enable-dp-attention: true
      moe-a2a-backend: "megamoe"
      deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}'
      moe-dense-tp-size: 1

      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: mooncake

      mem-fraction-static: 0.80
      max-running-requests: 1024
      chunked-prefill-size: 65536

    # Decode worker flags: TP/DP/EP all 16 with DP attention (the "dep16" in the recipe name).
    decode:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      trust-remote-code: true
      watchdog-timeout: 86400
      # The server does not initialise a tokenizer with this flag set.
      skip-tokenizer-init: true
      stream-interval: 60

      load-balance-method: "total_requests"
      moe-a2a-backend: "megamoe"

      disaggregation-mode: "decode"
      disaggregation-transfer-backend: mooncake
      disaggregation-decode-polling-interval: 8

      mem-fraction-static: 0.94
      swa-full-tokens-ratio: 0.056
      # 9216 = 8192 (isl) + 1024 (osl) from the benchmark section of this recipe.
      context-length: 9216
      tensor-parallel-size: 16
      data-parallel-size: 16
      expert-parallel-size: 16
      enable-dp-attention: true
      enable-dp-lm-head: true
      max-running-requests: 21504
      cuda-graph-max-bs: 1280


# sa-bench client settings for the 8k-in / 1k-out run at concurrency 1024.
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "1024"
  req_rate: "inf"
  use_chat_template: false
  # Prefill and decode both run with skip-tokenizer-init: true, so the client has
  # to tokenize. Pin the DeepSeek-V4 tokenizer explicitly rather than relying on
  # sa-bench's default, so ISL/OSL accounting stays comparable with the sibling
  # DSv4 SGLang recipes.
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Comment on lines +154 to +159

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 All 8 new GB200 yaml files omit custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" from the benchmark: block, while all 14 existing DSv4 SGLang sibling configs in the same directory set it. Since 7 of the 8 new configs set skip-tokenizer-init: true on both prefill and decode (SGLang exchanges raw token IDs with the client), the sa-bench client must own DSv4-Pro tokenization end-to-end — without this field, it falls back to a default tokenizer producing wrong token-id streams and skewing ISL/OSL accounting, making the new GB200 numbers non-comparable to the GB300 baseline. Affected files: disagg-gb200-1p1d-tp8-tp8-4-c1.yaml, 1p4d-dep8-tp8-10-c64.yaml, 1p2d-dep8-dep16-10-c256.yaml, 1p1d-dep8-dep16-6-c1024.yaml, 2p1d-dep8-dep16-8-c2048.yaml, 4p1d-dep8-dep16-12-c4096.yaml, 5p1d-dep8-dep16-14-c8192.yaml, 6p1d-dep8-dep12-15-c8192.yaml. Fix: add custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" to each benchmark: block.

Extended reasoning...

What the bug is

The benchmark: block at the bottom of every one of the 8 new GB200 DSv4 SGLang yaml files looks like:

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "1024"
  req_rate: "inf"
  use_chat_template: false

The custom_tokenizer field is missing. Every other DSv4 SGLang recipe in the same directory — benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/ — sets it, including the direct GB300 counterparts that this sweep is meant to complement:

File Line
disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml 182
disagg-gb300-1p1d-tp4-tp4-2-c1.yaml 165
disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml 182
disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml 182
disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml 182
disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml 182
8 mid-curve / high-conc / low-latency variants 121–141

That is 14/14 existing configs setting custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" and 0/8 new ones.

How it manifests

7 of the 8 new configs (everything except 1p1d-tp8-tp8-4-c1.yaml) set skip-tokenizer-init: true on both prefill and decode sglang_config: blocks. With that flag, SGLang refuses to do tokenization itself and expects the client to send raw token-id streams — DSv4-Pro's own tokenizer with its special tokens (<|begin▁of▁sentence|>, <|User|>, <|Assistant|>, thinking/DSML/task tokens) must be applied client-side. Without custom_tokenizer, sa-bench falls back to its default tokenizer, which does not encode these specials correctly. The result: wrong token-id streams, ISL/OSL accounting that does not match what the GB300 baseline measured, and 8k/1k numbers that cannot be apples-to-apples compared with the existing GB300 sweep.

Even the 1p1d-tp8-tp8-4-c1.yaml low-latency config (which uses disable-radix-cache: true instead of skip-tokenizer-init: true) still breaks the established convention — its GB300 sibling at disagg-gb300-1p1d-tp4-tp4-2-c1.yaml:165 does set custom_tokenizer.

Why existing code does not prevent it

The benchmark: block has no schema validation that would flag a missing custom tokenizer; sa-bench just picks a default. Failure is silent — the run completes, numbers come back, and only a careful reviewer comparing against the GB300 baseline would notice the discrepancy.

Step-by-step proof of impact

  1. Launch sweep picks up the new dsv4-fp4-gb200-dynamo-sglang config from .github/configs/nvidia-master.yaml.
  2. For e.g. concurrency 1024, the harness selects disagg-gb200-1p1d-dep8-dep16-6-c1024.yaml.
  3. SGLang prefill and decode are started with skip-tokenizer-init: true (lines 108, 131 of that file) — the server now accepts only token ids, not text.
  4. sa-bench reads the benchmark: block. Because custom_tokenizer is absent, it falls back to a generic HF tokenizer (or whatever its default is).
  5. The generic tokenizer encodes the 8k input prompt differently from SGLangDeepseekV4Tokenizer — DSv4-specific specials are tokenized as multiple BPE pieces instead of the intended single ids.
  6. The token stream sent to SGLang has a different length and content than the GB300 sweep produced. ISL is no longer exactly 8192 in DSv4 tokenization, OSL accounting is correspondingly off, and per-token latency / throughput numbers are skewed relative to the GB300 baseline.
  7. The PR description explicitly positions this as the GB200 counterpart to the existing GB300 sweep, so apples-to-apples comparison is the whole point — and it is silently broken.

Fix

Add one line to each of the 8 new yaml benchmark: blocks, matching the existing convention:

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "..."
  req_rate: "inf"
  use_chat_template: false
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

Loading
Loading