From 2667022429cdeb53a8fd7819e962ae7eb7d274a1 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 8 Jun 2026 12:51:33 -0700 Subject: [PATCH 1/4] dsr1-fp4-b200-dynamo-sglang-mtp: 8k1k 6-variant MTP disagg sweep on local split recipes --- .github/configs/nvidia-master.yaml | 111 +++++--------- .../disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml | 143 ++++++++++++++++++ .../disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml | 143 ++++++++++++++++++ .../disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml | 143 ++++++++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml | 140 +++++++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml | 140 +++++++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml | 140 +++++++++++++++++ perf-changelog.yaml | 10 ++ runners/launch_b200-dgxc.sh | 6 + 9 files changed, 904 insertions(+), 72 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..17ce00504 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7911,143 +7911,110 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true dsr1-fp4-b200-dynamo-sglang-mtp: - image: "lmsysorg/sglang:v0.5.8.post1-cu130" + image: "lmsysorg/sglang:v0.5.12.post1" model: deepseek-r1-fp4 model-prefix: dsr1 - 
runner: b200-multinode + runner: b200 precision: fp4 framework: dynamo-sglang multinode: true disagg: true scenarios: fixed-seq-len: - - isl: 1024 + - isl: 8192 osl: 1024 search-space: + # 1p5d low-latency (decode-heavy). - spec-decoding: "mtp" - conc-list: [16, 512] + conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 4 - ep: 4 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml" decode: num-worker: 5 tp: 8 - ep: 8 + ep: 1 dp-attn: false + # 1p3d low-latency. - spec-decoding: "mtp" - conc-list: [32, 64, 256, 512] + conc-list: [32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 4 - ep: 4 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml" decode: - num-worker: 6 + num-worker: 3 tp: 8 - ep: 8 + ep: 1 dp-attn: false + # 1p1d low-latency. - spec-decoding: "mtp" - conc-list: [512, 1024] + conc-list: [32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 4 - ep: 4 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml" decode: num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false + # MTP2 high-throughput (DEP4 prefill / DEP8 decode), one Pareto point each. + # 2p1d throughput. 
- spec-decoding: "mtp" - conc-list: [512] + conc-list: [768] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: + # 3p1d throughput. - spec-decoding: "mtp" - conc-list: [64, 128] + conc-list: [1024] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml" decode: num-worker: 1 tp: 8 ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 4 - ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false + # 5p1d throughput. 
- spec-decoding: "mtp" - conc-list: [4, 128] + conc-list: [2048] prefill: - num-worker: 2 + num-worker: 5 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [4, 8, 16, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml" decode: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false + ep: 8 + dp-attn: true kimik2.5-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml new file mode 100644 index 000000000..37225d44e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_2p1d.yaml @@ -0,0 +1,143 @@ +name: "b200-fp4-mtp2-throughput-2p-dep4-1d-dep8" + +# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml +# base + override_mtp2_throughput_2p1d, DEP4 prefill / DEP8 decode, MTP2). +# One flat YAML per concrete topology, matching the InferenceX glm5 disagg +# layout (sglang/{model}/{gpu}-{precision}/{seqlen}/disagg/{spec-decoding}/...).
+ +dynamo: + hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.12.post1" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: 
"modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 2048 + scheduler-recv-interval: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: null + data-parallel-size: 4 + tensor-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.85 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 1024 + max-running-requests: 2048 + scheduler-recv-interval: 1 + data-parallel-size: 8 + tensor-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: 300 + concurrencies: "768" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml new file mode 100644 index 000000000..d8e401620 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_3p1d.yaml @@ -0,0 +1,143 @@ +name: "b200-fp4-mtp2-throughput-3p-dep4-1d-dep8" + +# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml +# base + override_mtp2_throughput_3p1d, DEP4 prefill / DEP8 decode, MTP2). +# One flat YAML per concrete topology, matching the InferenceX glm5 disagg +# layout (sglang/{model}/{gpu}-{precision}/{seqlen}/disagg/{spec-decoding}/...). + +dynamo: + hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.12.post1" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + decode_nodes: 1 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+ SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 2048 + scheduler-recv-interval: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: null + data-parallel-size: 4 + tensor-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.85 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 1024 + max-running-requests: 2048 + scheduler-recv-interval: 1 + data-parallel-size: 8 + tensor-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + 
enable-dp-lm-head: true + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: 300 + concurrencies: "1024" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml new file mode 100644 index 000000000..bcbedcb68 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp2_throughput_5p1d.yaml @@ -0,0 +1,143 @@ +name: "b200-fp4-mtp2-throughput-5p-dep4-1d-dep8" + +# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml +# base + override_mtp2_throughput_5p1d, DEP4 prefill / DEP8 decode, MTP2). +# One flat YAML per concrete topology, matching the InferenceX glm5 disagg +# layout (sglang/{model}/{gpu}-{precision}/{seqlen}/disagg/{spec-decoding}/...).
+ +dynamo: + hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.12.post1" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 3 + decode_nodes: 1 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: 
"modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 2048 + scheduler-recv-interval: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: null + data-parallel-size: 4 + tensor-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.85 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 1024 + max-running-requests: 2048 + scheduler-recv-interval: 1 + data-parallel-size: 8 + tensor-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: 300 + concurrencies: "2048" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml new file mode 100644 index 000000000..160854ebc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml @@ -0,0 +1,140 @@ +name: "b200-fp4-mtp-low-latency-1p-tp4-5d-tp8" + +# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml +# base + zip_override_mtp_lowlat[0], 1p-tp4 prefill / 5d-tp8 decode). +# One flat YAML per concrete topology, matching the InferenceX glm5 disagg +# layout (sglang/{model}/{gpu}-{precision}/{seqlen}/disagg/{spec-decoding}/...). + +dynamo: + hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.12.post1" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + decode_nodes: 5 + prefill_workers: 1 + decode_workers: 5 + gpus_per_prefill: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" +
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.85 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + scheduler-recv-interval: 10 + tensor-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + 
disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: 300 + concurrencies: "4x8x16x32x64" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml new file mode 100644 index 000000000..fd1d4a4f5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml @@ -0,0 +1,140 @@ +name: "b200-fp4-mtp-low-latency-1p-tp4-3d-tp8" + +# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml +# base + zip_override_mtp_lowlat[1], 1p-tp4 prefill / 3d-tp8 decode). +# One flat YAML per concrete topology, matching the InferenceX glm5 disagg +# layout (sglang/{model}/{gpu}-{precision}/{seqlen}/disagg/{spec-decoding}/...).
+ +dynamo: + hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.12.post1" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: 
"modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.85 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + scheduler-recv-interval: 10 + tensor-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: 300 + concurrencies: "32x64x128x256x512" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml new file mode 100644 index 000000000..bcceaf872 --- 
/dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml @@ -0,0 +1,140 @@ +name: "b200-fp4-mtp-low-latency-1p-tp4-1d-tp8" + +# Derived from the srt-slurm b200-fp4 8k1k recipe (recipes/b200-fp4/8k1k.yaml +# base + zip_override_mtp_lowlat[2], 1p-tp4 prefill / 1d-tp8 decode). +# One flat YAML per concrete topology, matching the InferenceX glm5 disagg +# layout (sglang/{model}/{gpu}-{precision}/{seqlen}/disagg/{spec-decoding}/...). + +dynamo: + hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.12.post1" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" +
NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_SPEC_V2: "1" + UCX_TLS: "rc,cuda_ipc,cuda_copy,tcp,self" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.85 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + scheduler-recv-interval: 10 + tensor-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + speculative-algorithm: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + +health_check: + 
max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: 300 + concurrencies: "32x64x128x256x512" + use_chat_template: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..1877af727 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,13 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsr1-fp4-b200-dynamo-sglang-mtp + description: + - "Restructure the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to an 8k1k-only, 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/)" + - "Variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)" + - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the recipe model.container" + - "MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments" + - "Clone srt-slurm at NVIDIA/srt-slurm@main for the dsr1-fp4 b200 recipes" + pr-link: XXX diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 
9eeed2af6..58823355f 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -122,6 +122,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/glm5/b200-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8" recipes/sglang/glm5/b200-fp8 + elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout main + mkdir -p recipes/sglang/dsr1/b200-fp4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4" recipes/sglang/dsr1/b200-fp4 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 From b2a48726da49a8ca6022cc7a724a1e91825bd466 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 8 Jun 2026 12:52:10 -0700 Subject: [PATCH 2/4] Update perf-changelog pr-link for #1688 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1877af727..0dd760963 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3540,4 +3540,4 @@ - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the recipe model.container" - "MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments" - "Clone srt-slurm at NVIDIA/srt-slurm@main for the dsr1-fp4 b200 recipes" - pr-link: XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1688 From f0142affd23975ec3fc9d4a809d4b42df0d0fedc Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 8 Jun 2026 13:04:55 -0700 Subject: [PATCH 3/4] dsr1-fp4-b200-dynamo-sglang-mtp: keep 1k1k scenario, restore b200-multinode runner Only the 8k1k scenario is updated (6-variant local split recipes). 
The 1k1k scenario and the b200-multinode runner are unchanged from main; the image bump to v0.5.12.post1 is shared (1k1k follows via the dynamo-sglang container alias). --- .github/configs/nvidia-master.yaml | 65 +++++++++++++++++++++++++++++- perf-changelog.yaml | 10 ++--- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 17ce00504..1633d90d4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7914,13 +7914,76 @@ dsr1-fp4-b200-dynamo-sglang-mtp: image: "lmsysorg/sglang:v0.5.12.post1" model: deepseek-r1-fp4 model-prefix: dsr1 - runner: b200 + runner: b200-multinode precision: fp4 framework: dynamo-sglang multinode: true disagg: true scenarios: fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [16, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32, 64, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [512, 1024] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 1 + tp: 4 + 
ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true - isl: 8192 osl: 1024 search-space: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0dd760963..9d68ca5ff 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3535,9 +3535,9 @@ - config-keys: - dsr1-fp4-b200-dynamo-sglang-mtp description: - - "Restructure the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to an 8k1k-only, 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/)" - - "Variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)" - - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the recipe model.container" - - "MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments" - - "Clone srt-slurm at NVIDIA/srt-slurm@main for the dsr1-fp4 b200 recipes" + - "Update the 8k1k scenario of the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to a 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/); the 1k1k scenario is unchanged" + - "8k1k variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)" + - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the 8k1k recipe model.container; 
the 1k1k in-repo recipes use the dynamo-sglang container alias and follow the config image" + - "8k1k MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments" + - "Point the dsr1-fp4 srt-slurm clone at NVIDIA/srt-slurm@main, which carries both the b200-fp4 1k1k and 8k1k in-repo recipes" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1688 From 79a9f56f263efe5e2189f1d40a03714958f62b5e Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 8 Jun 2026 13:42:07 -0700 Subject: [PATCH 4/4] dsr1-fp4-b200-dynamo-sglang-mtp: move 1k1k scenario to local split recipes Flatten the srt-slurm b200-fp4 1k1k recipe (base + zip_override_mtp_*[i]) into 4 standalone per-topology recipes under recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/, matching the 8k1k local layout, and point the config at them instead of srt-slurm. Behavior is unchanged (faithful flatten; dynamo-sglang container alias preserved). 
--- .github/configs/nvidia-master.yaml | 12 +- .../1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml | 117 +++++++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml | 117 +++++++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml | 122 ++++++++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml | 122 ++++++++++++++++++ perf-changelog.yaml | 10 +- 6 files changed, 487 insertions(+), 13 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1633d90d4..cb8708bc7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7932,8 +7932,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml" decode: num-worker: 5 tp: 8 @@ -7947,8 +7946,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml" decode: num-worker: 6 tp: 8 @@ -7962,8 +7960,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml" decode: num-worker: 1 tp: 8 @@ -7977,8 +7974,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" + - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml" decode: num-worker: 2 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml new file mode 100644 index 000000000..098a4575a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml @@ -0,0 +1,117 @@ +# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml +# base + zip_override_mtp_lowlat[0]): 1p5d low-latency (dep4 prefill / tep8 decode, 5 decode nodes). +# One flat YAML per concrete topology, matching the 8k1k local recipe layout +# (sglang/{model}/{gpu}-{precision}/{seq-len}/disagg/{variant}/...). 
+ +name: b200-fp4-mtp-low-latency-dep4-1p-tep8-5d +model: + path: dsr1 + container: dynamo-sglang + precision: fp4 +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + tensor-parallel-size: 4 + data-parallel-size: 4 + 
expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + decode: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + req_rate: inf + concurrencies: 16x512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml new file mode 100644 index 000000000..4ab9a7558 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml @@ -0,0 +1,117 @@ +# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml +# base + zip_override_mtp_lowlat[1]): 1p6d low-latency (dep4 prefill / tep8 decode, 6 decode nodes). +# One flat YAML per concrete topology, matching the 8k1k local recipe layout +# (sglang/{model}/{gpu}-{precision}/{seq-len}/disagg/{variant}/...). 
+ +name: b200-fp4-mtp-low-latency-dep4-1p-tep8-6d +model: + path: dsr1 + container: dynamo-sglang + precision: fp4 +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + tensor-parallel-size: 4 + data-parallel-size: 4 + 
expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + decode: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + req_rate: inf + concurrencies: 32x64x256x512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml new file mode 100644 index 000000000..8ffbfeaf0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml @@ -0,0 +1,122 @@ +# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml +# base + zip_override_mtp_maxtpt[0]): 1p1d max-throughput (dep4 prefill / dep8 decode, mem-fraction 0.75). 
+# One flat YAML per concrete topology, matching the 8k1k local recipe layout +# (sglang/{model}/{gpu}-{precision}/{seq-len}/disagg/{variant}/...). + +name: b200-fp4-mtp-max-tpt-dep4-1p-dep8-1d +model: + path: dsr1 + container: dynamo-sglang + precision: fp4 +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_MOE_NVFP4_DISPATCH: '1' + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: cutlass + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + 
max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + disable-cuda-graph: true + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + decode: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.75 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + req_rate: inf + concurrencies: 512x1024 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml new file mode 100644 index 000000000..a59fd6ea2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml @@ -0,0 +1,122 @@ +# Derived from the 
srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml +# base + zip_override_mtp_maxtpt[1]): 1p2d max-throughput (dep4 prefill / dep8 decode, mem-fraction 0.85). +# One flat YAML per concrete topology, matching the 8k1k local recipe layout +# (sglang/{model}/{gpu}-{precision}/{seq-len}/disagg/{variant}/...). + +name: b200-fp4-mtp-max-tpt-dep4-1p-dep8-2d +model: + path: dsr1 + container: dynamo-sglang + precision: fp4 +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_ENABLE_JIT_DEEPGEMM: 'false' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + DYN_REQUEST_PLANE: nats + SGLANG_MOE_NVFP4_DISPATCH: '1' + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: cutlass + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-R1 + 
trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + decode: + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: modelopt_fp4 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + moe-runner-backend: flashinfer_trtllm + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: flashinfer_trtllm + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + req_rate: inf + concurrencies: '512' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9d68ca5ff..1d830e8cf 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3535,9 +3535,9 @@ - config-keys: - dsr1-fp4-b200-dynamo-sglang-mtp description: - - "Update the 8k1k scenario of the DeepSeek-R1 FP4 
B200 dynamo-sglang MTP disagg sweep to a 6-variant configuration backed by local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/); the 1k1k scenario is unchanged" - - "8k1k variants: 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048)" - - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130) to match the 8k1k recipe model.container; the 1k1k in-repo recipes use the dynamo-sglang container alias and follow the config image" - - "8k1k MTP2 throughput recipes use scheduler-recv-interval=1, enable-dp-lm-head, and spec 2 steps / 3 draft tokens; add UCX_TLS to the prefill/decode environments" - - "Point the dsr1-fp4 srt-slurm clone at NVIDIA/srt-slurm@main, which carries both the b200-fp4 1k1k and 8k1k in-repo recipes" + - "Move both the 1k1k and 8k1k scenarios of the DeepSeek-R1 FP4 B200 dynamo-sglang MTP disagg sweep to local split recipes (one flat recipe YAML per topology under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/{1k1k,8k1k}/disagg/mtp/), rather than referencing recipes from the srt-slurm repo" + - "1k1k: 4 MTP variants, behavior unchanged from the previous srt-slurm 1k1k recipe — 2 low-latency (dep4-1p prefill / tep8 decode at 5 and 6 decode nodes, conc up to 512) + 2 max-throughput (dep4-1p prefill / dep8 decode at 1 and 2 decode nodes, conc up to 1024)" + - "8k1k: 6-variant sweep — 3 low-latency (1p5d / 1p3d / 1p1d, TP4 prefill / TP8 decode, conc up to 512) + 3 MTP2 high-throughput (2p1d / 3p1d / 5p1d, DEP4 prefill / DEP8 decode, single concurrency 768 / 1024 / 2048); MTP2 recipes use scheduler-recv-interval=1, enable-dp-lm-head, spec 2 steps / 3 draft tokens, and UCX_TLS in the prefill/decode environments" + - "Bump container image to lmsysorg/sglang:v0.5.12.post1 (from v0.5.8.post1-cu130); 
the 1k1k recipes keep the dynamo-sglang container alias and follow the config image" + - "Clone srt-slurm at NVIDIA/srt-slurm@main for the srtctl/dynamo tooling and copy the local b200-fp4 recipes into the checkout" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1688