diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e24ec38c0..f70a46f83 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8604,6 +8604,133 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true +dsv4-fp4-gb200-dynamo-sglang: + image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # Low latency: 1p1d-tp8-tp8. 4 nodes. + - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-tp8-tp8-4-c1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # 1p4d-dep8-tp8. 10 nodes. + - conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-10-c64.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + # WideEP TP=16 decode: 1p2d-dep8-dep16. 10 nodes. + - conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p2d-dep8-dep16-10-c256.yaml" + decode: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: true + # WideEP TP=16 decode: 1p1d-dep8-dep16. 6 nodes. + - conc-list: [1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep16-6-c1024.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # WideEP TP=16 decode: 2p1d-dep8-dep16. 8 nodes. + - conc-list: [2048] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep16-8-c2048.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # WideEP TP=16 decode: 4p1d-dep8-dep16. 12 nodes. + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-4p1d-dep8-dep16-12-c4096.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # WideEP TP=16 decode: 5p1d-dep8-dep16. 14 nodes. + - conc-list: [8192] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-5p1d-dep8-dep16-14-c8192.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # WideEP TP=12 decode: 6p1d-dep8-dep12. 15 nodes. + - conc-list: [8192] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-6p1d-dep8-dep12-15-c8192.yaml" + decode: + num-worker: 1 + tp: 12 + ep: 12 + dp-attn: true + dsv4-fp4-b300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep16-6-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep16-6-c1024.yaml new file mode 100644 index 000000000..b0ac33276 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep16-6-c1024.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb200-1p1d-dep8-dep16-6-c1024" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.056 + context-length: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 21504 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-tp8-tp8-4-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-tp8-tp8-4-c1.yaml new file mode 100644 index 000000000..d84efe9a1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-tp8-tp8-4-c1.yaml @@ -0,0 +1,120 @@ +name: "disagg-gb200-1p1d-tp8-tp8-4-c1" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 512 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p2d-dep8-dep16-10-c256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p2d-dep8-dep16-10-c256.yaml new file mode 100644 index 000000000..0ca853ea2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p2d-dep8-dep16-10-c256.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb200-1p2d-dep8-dep16-10-c256" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 8 + decode_workers: 2 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.056 + context-length: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 21504 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-10-c64.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-10-c64.yaml new file mode 100644 index 000000000..eb0dba6c4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-10-c64.yaml @@ -0,0 +1,145 @@ +name: "disagg-gb200-1p4d-dep8-tp8-10-c64" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + decode_nodes: 8 + decode_workers: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep16-8-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep16-8-c2048.yaml new file mode 100644 index 000000000..c18e01977 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep16-8-c2048.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb200-2p1d-dep8-dep16-8-c2048" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 2 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.056 + context-length: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 21504 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-4p1d-dep8-dep16-12-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-4p1d-dep8-dep16-12-c4096.yaml new file mode 100644 index 000000000..c54738627 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-4p1d-dep8-dep16-12-c4096.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb200-4p1d-dep8-dep16-12-c4096" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 4 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.056 + context-length: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 21504 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-5p1d-dep8-dep16-14-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-5p1d-dep8-dep16-14-c8192.yaml new file mode 100644 index 000000000..9e43a2b0b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-5p1d-dep8-dep16-14-c8192.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb200-5p1d-dep8-dep16-14-c8192" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 5 + gpus_per_prefill: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.056 + context-length: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 21504 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-6p1d-dep8-dep12-15-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-6p1d-dep8-dep12-15-c8192.yaml new file mode 100644 index 000000000..84e90dc2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-6p1d-dep8-dep12-15-c8192.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb200-6p1d-dep8-dep12-15-c8192" + + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + precision: "fp4" + +dynamo: + hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 6 + gpus_per_prefill: 8 + decode_nodes: 3 + decode_workers: 1 + gpus_per_decode: 12 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.80 + max-running-requests: 1024 + chunked-prefill-size: 65536 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.056 + context-length: 9216 + tensor-parallel-size: 12 + data-parallel-size: 12 + expert-parallel-size: 12 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 21504 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 47cfcebc1..1e4b5c9eb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3502,3 +3502,11 @@ - "Update GPT-OSS model for MI355X vLLM from amd/gpt-oss-120b-w-mxfp4-a-fp8 to openai/gpt-oss-120b" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1670 +- config-keys: + - dsv4-fp4-gb200-dynamo-sglang + description: + - "Initial submission: DSv4-Pro FP4 disagg on GB200 with SGLang (8k/1k)." + - "Image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" + - "8 topologies sweeping low-latency (1p1d-tp8-tp8) through max throughput (6p1d-dep8-dep12)." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1675 + diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 45ef3a952..36c8af203 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -16,10 +16,10 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Same compute-node-local NVMe path as the dynamo-vllm dsv4 - # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX - # matches the model.path alias in our DSV4 sglang recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + # Lustre-resident weights staged on the GB200 external cluster. + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our + # DSV4 sglang recipes. + export MODEL_PATH="/mnt/lustre01/models/deepseek-v4-pro" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else export MODEL_PATH=$MODEL @@ -249,13 +249,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then - # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026 - # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias) - # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm - # has no upstream sglang DSV4 disagg recipes yet, hence the overlay. + # Stay on NVIDIA/srt-slurm:main (default) — submission branch no + # longer needed; overlay our hand-rolled DSV4 sglang recipes onto it. git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then