From ee1619400d80684a071c2248b322628d7b9f93ad Mon Sep 17 00:00:00 2001
From: wzhao18 <wzhao18.sz@gmail.com>
Date: Wed, 3 Jun 2026 10:16:31 -0700
Subject: [PATCH 1/4] Use nvfp4 model

---
 .github/configs/nvidia-master.yaml | 2 +-
 perf-changelog.yaml                | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3ee32f6c6..3a2a3e059 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3000,7 +3000,7 @@ dsv4-fp8-h200-sglang-mtp:
   # layouts on 4 allocated GPUs.
 dsv4-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.22.0
-  model: deepseek-ai/DeepSeek-V4-Pro
+  model: nvidia/DeepSeek-V4-Pro-NVFP4
   model-prefix: dsv4
   runner: b300
   precision: fp4
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1b9d2f0db..263f893e9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3430,3 +3430,9 @@
     - "Image: vllm/vllm-openai:v0.20.1"
     - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652
+
+- config-keys:
+    - dsv4-fp4-b300-vllm
+  description:
+    - "Update B300 dsv4 model to nvfp4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652

From e8ecb38a0985a241ad9e5ac8fd7fbfb29ed8eb16 Mon Sep 17 00:00:00 2001
From: wzhao18 <wzhao18.sz@gmail.com>
Date: Wed, 3 Jun 2026 10:26:52 -0700
Subject: [PATCH 2/4] Update PR number

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 263f893e9..7f9732b99 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3435,4 +3435,4 @@
     - dsv4-fp4-b300-vllm
   description:
     - "Update B300 dsv4 model to nvfp4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1656

From 8df7b138dc82b7000a88a965fc0398dfeed2a706 Mon Sep 17 00:00:00 2001
From: wzhao18 <wzhao18.sz@gmail.com>
Date: Thu, 4 Jun 2026 12:00:18 -0700
Subject: [PATCH 3/4] Stage DeepSeek-V4-Pro-NVFP4 on b300; disable deep_gemm_mega_moe

---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh | 6 +++---
 runners/launch_b300-nv.sh                                  | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
index 8aef70a8e..8e8be8353 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
@@ -53,9 +53,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
 fi
 
 MOE_ARGS=()
-if [ "${DP_ATTENTION}" = "true" ]; then
-    MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
-fi
+# if [ "${DP_ATTENTION}" = "true" ]; then
+#     MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
+# fi
 
 if [ "${DP_ATTENTION}" = "true" ]; then
     MAX_NUM_BATCHED_TOKENS=2048
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index e4a253ba3..a1d07d952 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -305,6 +305,7 @@ else
         DeepSeek-R1-0528-NVFP4-v2
         DeepSeek-V4-Flash
         DeepSeek-V4-Pro
+        DeepSeek-V4-Pro-NVFP4
         GLM-5-FP8
         GLM-5-NVFP4
         GLM-5.1

From bb079db604550666cf89431ab9c2a6c98dcdff02 Mon Sep 17 00:00:00 2001
From: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:09:48 -0400
Subject: [PATCH 4/4] Set --gpu-memory-utilization to 0.97 for dsv4 b300 vllm

---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
index 8e8be8353..f1d680385 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
@@ -92,6 +92,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PO
     --tool-call-parser deepseek_v4 \
     --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 \
+    --gpu-memory-utilization 0.97 \
     --max-cudagraph-capture-size 2048 \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &