From ee1619400d80684a071c2248b322628d7b9f93ad Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Wed, 3 Jun 2026 10:16:31 -0700 Subject: [PATCH 1/4] Use nvfp4 model --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3ee32f6c6..3a2a3e059 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3000,7 +3000,7 @@ dsv4-fp8-h200-sglang-mtp: # layouts on 4 allocated GPUs. dsv4-fp4-b300-vllm: image: vllm/vllm-openai:v0.22.0 - model: deepseek-ai/DeepSeek-V4-Pro + model: nvidia/DeepSeek-V4-Pro-NVFP4 model-prefix: dsv4 runner: b300 precision: fp4 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1b9d2f0db..263f893e9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3430,3 +3430,9 @@ - "Image: vllm/vllm-openai:v0.20.1" - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Update B300 dsv4 model to nvfp4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652 From e8ecb38a0985a241ad9e5ac8fd7fbfb29ed8eb16 Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Wed, 3 Jun 2026 10:26:52 -0700 Subject: [PATCH 2/4] Update PR number --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 263f893e9..7f9732b99 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3435,4 +3435,4 @@ - dsv4-fp4-b300-vllm description: - "Update B300 dsv4 model to nvfp4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1656 From 8df7b138dc82b7000a88a965fc0398dfeed2a706 Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Thu, 4 Jun 2026 12:00:18 -0700 Subject: 
[PATCH 3/4] Stage dsv4 b300 --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh | 6 +++--- runners/launch_b300-nv.sh | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh index 8aef70a8e..8e8be8353 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh @@ -53,9 +53,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then fi MOE_ARGS=() -if [ "${DP_ATTENTION}" = "true" ]; then - MOE_ARGS=(--moe-backend deep_gemm_mega_moe) -fi +# if [ "${DP_ATTENTION}" = "true" ]; then +# MOE_ARGS=(--moe-backend deep_gemm_mega_moe) +# fi if [ "${DP_ATTENTION}" = "true" ]; then MAX_NUM_BATCHED_TOKENS=2048 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e4a253ba3..a1d07d952 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -305,6 +305,7 @@ else DeepSeek-R1-0528-NVFP4-v2 DeepSeek-V4-Flash DeepSeek-V4-Pro + DeepSeek-V4-Pro-NVFP4 GLM-5-FP8 GLM-5-NVFP4 GLM-5.1 From bb079db604550666cf89431ab9c2a6c98dcdff02 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Mon, 8 Jun 2026 16:09:48 -0400 Subject: [PATCH 4/4] Adjust GPU memory utilization parameter --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh index 8e8be8353..f1d680385 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh @@ -92,6 +92,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PO --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 \ + --gpu-memory-utilization 0.97 \ --max-cudagraph-capture-size 2048 \ 
--max-model-len "$SERVE_MAX_MODEL_LEN" \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &