SemiAnalysisAI · wzhao18 · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -3000,7 +3000,7 @@ dsv4-fp8-h200-sglang-mtp:
   # layouts on 4 allocated GPUs.
 dsv4-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.22.0
-  model: deepseek-ai/DeepSeek-V4-Pro
+  model: nvidia/DeepSeek-V4-Pro-NVFP4
   model-prefix: dsv4
   runner: b300
   precision: fp4

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
@@ -53,9 +53,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
 fi
 
 MOE_ARGS=()
-if [ "${DP_ATTENTION}" = "true" ]; then
-    MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
-fi
+# if [ "${DP_ATTENTION}" = "true" ]; then
+#     MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
+# fi
 
 if [ "${DP_ATTENTION}" = "true" ]; then
     MAX_NUM_BATCHED_TOKENS=2048
@@ -92,6 +92,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PO
     --tool-call-parser deepseek_v4 \
     --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 \
+    --gpu-memory-utilization 0.97 \
     --max-cudagraph-capture-size 2048 \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3531,3 +3531,9 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b300-vllm
+  description:
+    - "Switch B300 dsv4 model to nvidia/DeepSeek-V4-Pro-NVFP4 checkpoint; bump gpu-memory-utilization to 0.97; disable deep_gemm_mega_moe backend for dp-attn"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1656
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
@@ -314,6 +314,7 @@ else
         DeepSeek-R1-0528-NVFP4-v2
         DeepSeek-V4-Flash
         DeepSeek-V4-Pro
+        DeepSeek-V4-Pro-NVFP4
         GLM-5-FP8
         GLM-5-NVFP4
         GLM-5.1