Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
# DeepSeek-V4-Pro on B300 with sglang (non-MTP).
# Uses nightly image with megamoe backend for high-concurrency profiles.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
image: lmsysorg/sglang:nightly-dev-cu13-20260601-373cadc9
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand Down
42 changes: 27 additions & 15 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,11 @@ fi

nvidia-smi

# ─── Common env vars (all profiles) ───────────────────────────────────────────
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
# ─── Common env vars (all profiles, GB300-aligned) ──────────────────────────
# Exported unconditionally, ahead of any per-concurrency tuning, so every
# launch profile inherits the same baseline.
export \
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 \
    SGLANG_RADIX_FORCE_MISS=1 \
    SGLANG_DEFAULT_THINKING=1 \
    SGLANG_DSV4_REASONING_EFFORT=max \
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1

SERVER_LOG="$PWD/server.log"
Expand All @@ -46,9 +49,25 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# ─── DP-attention env vars (GB300-aligned) ───────────────────────────────────
# Applied to every CONC value other than 1 and 32, i.e. the DP-attention
# profiles (conc >= 512). Exported ahead of the per-concurrency profiles so
# those blocks only carry their NVSHMEM / batch-size overrides.
case "$CONC" in
    1 | 32)
        # Nothing extra to export for these two concurrencies.
        :
        ;;
    *)
        export \
            SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 \
            SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 \
            SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 \
            SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 \
            SGLANG_OPT_USE_ONLINE_COMPRESS=1 \
            SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192 \
            SGLANG_LOG_FORWARD_ITERS=1 \
            SGLANG_LOG_MS=1 \
            SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60 \
            SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
        ;;
esac


# ─── Per-concurrency launch profile ──────────────────────────────────────────
# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
# and optionally MAX_RUNNING_REQUESTS plus NVSHMEM_DISABLE_IB (megamoe profiles).
#
# SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was
# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
Expand All @@ -61,11 +80,11 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
--enable-deepseek-v4-fp4-indexer
)

elif [ "$CONC" = "512" ]; then
# DP attention, flashinfer_mxfp4
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
MEM_FRACTION_STATIC=0.94
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
Expand All @@ -75,15 +94,12 @@ elif [ "$CONC" = "512" ]; then
--disable-flashinfer-autotune
--chunked-prefill-size 16384
--enable-prefill-delayer
--enable-deepseek-v4-fp4-indexer
)

elif [ "$CONC" = "2048" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_LOG_FORWARD_ITERS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
MAX_RUNNING_REQUESTS=2560
Expand All @@ -95,14 +111,12 @@ elif [ "$CONC" = "2048" ]; then
--chunked-prefill-size 65536
--tokenizer-worker-num 4
--enable-prefill-delayer
--enable-deepseek-v4-fp4-indexer
)

elif [ "$CONC" = "4096" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
MAX_RUNNING_REQUESTS=4352
Expand All @@ -115,15 +129,12 @@ elif [ "$CONC" = "4096" ]; then
--tokenizer-worker-num 8
--enable-prefill-delayer
--decode-log-interval 5
--enable-deepseek-v4-fp4-indexer
)

elif [ "$CONC" = "8192" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
MAX_RUNNING_REQUESTS=8192
Expand All @@ -136,6 +147,7 @@ elif [ "$CONC" = "8192" ]; then
--tokenizer-worker-num 16
--enable-prefill-delayer
--stream-interval 30
--enable-deepseek-v4-fp4-indexer
)

else
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3523,3 +3523,12 @@
- "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512"
- "Remove 4 dominated old configs (4p-dep16-8n, 8p-dep16-12n, 10p-dep16-14n, 12p-dep12-15n) superseded by wide-EP frontier"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Align env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max"
- "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192"
- "Add --enable-deepseek-v4-fp4-indexer to all concurrency profiles"
- "Bump image to nightly-dev-cu13-20260601-373cadc9"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682
Loading