From 97ae8034092cf47c83da245324f0ecc8db9895f9 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 8 Jun 2026 16:52:09 +0800 Subject: [PATCH 1/6] dsv4-fp4-b300-sglang: align env vars to GB300 and add fp4-indexer flag --- .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index b451dee0d..2079d4165 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -30,8 +30,11 @@ fi nvidia-smi -# ─── Common env vars (all profiles) ─────────────────────────────────────────── -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# ─── Common env vars (all profiles, GB300-aligned) ────────────────────────── +export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 +export SGLANG_RADIX_FORCE_MISS=1 +export SGLANG_DEFAULT_THINKING=1 +export SGLANG_DSV4_REASONING_EFFORT=max export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 SERVER_LOG="$PWD/server.log" @@ -46,9 +49,25 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" +# ─── DP-attention env vars (GB300-aligned) ─────────────────────────────────── +# Shared across all DP-attention profiles (conc >= 512). Set before per-conc +# tuning so individual blocks only carry NVSHMEM / batch-size overrides. +if [ "$CONC" != "1" ] && [ "$CONC" != "32" ]; then + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 + export SGLANG_OPT_USE_ONLINE_COMPRESS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192 + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_LOG_MS=1 + export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60 + export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8 +fi + # ─── Per-concurrency launch profile ────────────────────────────────────────── # Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO, -# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars. +# and optionally MAX_RUNNING_REQUESTS. # # SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. @@ -61,11 +80,11 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --moe-runner-backend flashinfer_mxfp4 --chunked-prefill-size 8192 --disable-flashinfer-autotune + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "512" ]; then # DP attention, flashinfer_mxfp4 - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 MEM_FRACTION_STATIC=0.94 SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) PARALLEL_ARGS=( @@ -75,15 +94,12 @@ elif [ "$CONC" = "512" ]; then --disable-flashinfer-autotune --chunked-prefill-size 16384 --enable-prefill-delayer + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "2048" ]; then # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 MEM_FRACTION_STATIC=0.87 SWA_FULL_TOKENS_RATIO=0.06 MAX_RUNNING_REQUESTS=2560 @@ -95,14 +111,12 @@ elif [ "$CONC" = "2048" ]; then --chunked-prefill-size 65536 --tokenizer-worker-num 4 --enable-prefill-delayer + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "4096" ]; then # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 MEM_FRACTION_STATIC=0.835 SWA_FULL_TOKENS_RATIO=0.075 MAX_RUNNING_REQUESTS=4352 @@ -115,15 +129,12 @@ elif [ "$CONC" = "4096" ]; then --tokenizer-worker-num 8 --enable-prefill-delayer --decode-log-interval 5 + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "8192" ]; then # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_USE_ONLINE_COMPRESS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 MEM_FRACTION_STATIC=0.80 SWA_FULL_TOKENS_RATIO=0.3 MAX_RUNNING_REQUESTS=8192 @@ -136,6 +147,7 @@ elif [ "$CONC" = "8192" ]; then --tokenizer-worker-num 16 --enable-prefill-delayer --stream-interval 30 + --enable-deepseek-v4-fp4-indexer ) else From a2ad3bdb2e9a43ccbc757f02afabf2f1c22ded92 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 8 Jun 2026 16:54:25 +0800 Subject: [PATCH 2/6] dsv4-fp4-b300-sglang: bump image to nightly-dev-cu13-20260608-303757cc --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..f6374402c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 16133e4f29c26ef899a27ecfaa09466ab72b4ff0 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 8 Jun 2026 17:25:10 +0800 Subject: [PATCH 3/6] Add perf-changelog entry for dsv4-fp4-b300-sglang env var alignment --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 178a95abb..17e96cfee 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3523,3 +3523,12 @@ - "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512" - "Remove 4 dominated old configs (4p-dep16-8n, 8p-dep16-12n, 10p-dep16-14n, 12p-dep12-15n) superseded by wide-EP frontier" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Align env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max" + - "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192" + - "Add --enable-deepseek-v4-fp4-indexer to all concurrency profiles" + - "Bump image to nightly-dev-cu13-20260608-303757cc" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682 From 6befeb8db298e87a782d95e4ee562ae7ad312afe Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 00:39:34 +0800 Subject: [PATCH 4/6] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260606-b3e4c204 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f6374402c..51ea64ff4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc + image: lmsysorg/sglang:nightly-dev-cu13-20260606-b3e4c204 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From fc3bad2b536f13c37578dae62edd6e7b779ad92a Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 01:19:51 +0800 Subject: [PATCH 5/6] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260604-14ed9b44 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 51ea64ff4..bc6b5effb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260606-b3e4c204 + image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 80a03bac7c061928c561c39f8505ab24a006afc7 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 02:00:01 +0800 Subject: [PATCH 6/6] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260601-373cadc9 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index bc6b5effb..d204aa7c1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 + image: lmsysorg/sglang:nightly-dev-cu13-20260601-373cadc9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300