From 85613e47c46a44f7d171f70cf81c961512e9a9ce Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Fri, 5 Jun 2026 16:25:34 -0700 Subject: [PATCH 1/3] Add DSv4-Pro FP4 GB200 SGLang disagg + MTP config Initial submission of the MTP-decoded variant of the DSv4-Pro FP4 disagg GB200 SGLang config at 8k/1k. Eight prefill/decode topologies: two low-latency (1p1d-tp8-tp8, 1p6d-dep8-tp8) and six mid-curve points (1p1d through 6p1d-dep8-dep16). Each scenario sets `spec-decoding: "mtp"` so the matrix turns on the MTP speculative-decode path; chat template enabled accordingly. Image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85 --- .github/configs/nvidia-master.yaml | 137 +++++++++++++++++ ...gg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml | 124 +++++++++++++++ ...g-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml | 131 ++++++++++++++++ ...g-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml | 143 ++++++++++++++++++ ...g-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml | 143 ++++++++++++++++++ ...g-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml | 143 ++++++++++++++++++ ...g-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml | 143 ++++++++++++++++++ ...g-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml | 143 ++++++++++++++++++ ...g-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml | 143 ++++++++++++++++++ perf-changelog.yaml | 8 + 10 files changed, 1258 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e24ec38c0..e41cfc302 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8604,6 +8604,143 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true +# MTP variant of dsv4-fp4-gb200-dynamo-sglang. +dsv4-fp4-gb200-dynamo-sglang-mtp: + image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # Low-latency baseline: 1p1d-tp8-tp8. 4 nodes. + - spec-decoding: "mtp" + conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Low-latency 1p6d-dep8-tp8: 1P (DEP=8) + 6 TP=8 decode workers. 14 nodes. + # Recipe runs concurrencies=32x64x128; matrix tracks the max. + - spec-decoding: "mtp" + conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # Mid curve 1p1d-dep8-dep16. 6 nodes. + - spec-decoding: "mtp" + conc-list: [1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 2p1d-dep8-dep16. 8 nodes. + - spec-decoding: "mtp" + conc-list: [2048] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 3p1d-dep8-dep16. 10 nodes. + - spec-decoding: "mtp" + conc-list: [3072] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 4p1d-dep8-dep16. 12 nodes. + - spec-decoding: "mtp" + conc-list: [6144] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 5p1d-dep8-dep16. 14 nodes. + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 6p1d-dep8-dep16. 16 nodes. + - spec-decoding: "mtp" + conc-list: [16384] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + dsv4-fp4-b300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml new file mode 100644 index 000000000..0b2423a8e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml @@ -0,0 +1,124 @@ +name: "dsv4-pro-gb200-disagg-8k1k-low-latency-1p1d-tp8-tp8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 16 + cuda-graph-max-bs: 8 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml new file mode 100644 index 000000000..79c9a46bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml @@ -0,0 +1,131 @@ +name: "dsv4-pro-gb200-disagg-8k1k-low-latency-1p6d-dep8-tp8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 12 + decode_workers: 6 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 128 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "32x64x128" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml new file mode 100644 index 000000000..1bf4f0e85 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml @@ -0,0 +1,143 @@ +name: "dsv4-pro-gb200-disagg-8k1k-mid-curve-1p1d-dep8-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.8 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 65536 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.85 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml new file mode 100644 index 000000000..82519e378 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml @@ -0,0 +1,143 @@ +name: "dsv4-pro-gb200-disagg-8k1k-mid-curve-2p1d-dep8-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 2 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.8 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 65536 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.85 + max-running-requests: 2048 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "2048" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml new file mode 100644 index 000000000..e69c5e604 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml @@ -0,0 +1,143 @@ +name: "dsv4-pro-gb200-disagg-8k1k-mid-curve-3p1d-dep8-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.8 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 65536 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.85 + max-running-requests: 4096 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "3072" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml new file mode 100644 index 000000000..73bcecaec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml @@ -0,0 +1,143 @@ +name: "dsv4-pro-gb200-disagg-8k1k-mid-curve-4p1d-dep8-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 4 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.8 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 65536 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.85 + max-running-requests: 6144 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "6144" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml new file mode 100644 index 000000000..66829c404 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml @@ -0,0 +1,143 @@ +name: "dsv4-pro-gb200-disagg-8k1k-mid-curve-5p1d-dep8-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 5 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.8 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 65536 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.85 + max-running-requests: 16384 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml new file mode 100644 index 000000000..34b71a918 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml @@ -0,0 +1,143 @@ +name: "dsv4-pro-gb200-disagg-8k1k-mid-curve-6p1d-dep8-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 6 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.8 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 65536 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.85 + max-running-requests: 21504 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "16384" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 47cfcebc1..26ff85b00 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3502,3 +3502,11 @@ - "Update GPT-OSS model for MI355X vLLM from amd/gpt-oss-120b-w-mxfp4-a-fp8 to openai/gpt-oss-120b" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1670 +- config-keys: + - dsv4-fp4-gb200-dynamo-sglang-mtp + description: + - "Initial submission: DSv4-Pro FP4 disagg on GB200 with SGLang + MTP speculative decoding (8k/1k)." + - "Image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + - "8 topologies: low-latency 1p1d-tp8-tp8 + 1p6d-dep8-tp8; mid-curve 1p1d through 6p1d-dep8-dep16." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + From 5c201f649425e53da06707d597f7ef05de66cee7 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Fri, 5 Jun 2026 16:25:57 -0700 Subject: [PATCH 2/3] Update perf-changelog pr-link for #1676 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 26ff85b00..b2f51cb90 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3508,5 +3508,5 @@ - "Initial submission: DSv4-Pro FP4 disagg on GB200 with SGLang + MTP speculative decoding (8k/1k)." - "Image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" - "8 topologies: low-latency 1p1d-tp8-tp8 + 1p6d-dep8-tp8; mid-curve 1p1d through 6p1d-dep8-dep16." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1676 From e222dc75d93733b2ce216067151aa95d4f79481b Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Fri, 5 Jun 2026 16:36:00 -0700 Subject: [PATCH 3/3] Use NVIDIA/srt-slurm:main for DSv4 SGLang clone (drop submission-branch pin) --- runners/launch_gb200-nv.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 45ef3a952..351f12d31 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -249,13 +249,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then - # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026 - # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias) - # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm - # has no upstream sglang DSV4 disagg recipes yet, hence the overlay. + # Stay on NVIDIA/srt-slurm:main (default) — submission branch no + # longer needed; overlay our hand-rolled DSV4 sglang recipes onto it. git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then