diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..3da86ad98 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3000,7 +3000,7 @@ dsv4-fp8-h200-sglang-mtp: # layouts on 4 allocated GPUs. dsv4-fp4-b300-vllm: image: vllm/vllm-openai:v0.22.0 - model: deepseek-ai/DeepSeek-V4-Pro + model: nvidia/DeepSeek-V4-Pro-NVFP4 model-prefix: dsv4 runner: b300 precision: fp4 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh index 8aef70a8e..f1d680385 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh @@ -53,9 +53,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then fi MOE_ARGS=() -if [ "${DP_ATTENTION}" = "true" ]; then - MOE_ARGS=(--moe-backend deep_gemm_mega_moe) -fi +# if [ "${DP_ATTENTION}" = "true" ]; then +# MOE_ARGS=(--moe-backend deep_gemm_mega_moe) +# fi if [ "${DP_ATTENTION}" = "true" ]; then MAX_NUM_BATCHED_TOKENS=2048 @@ -92,6 +92,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PO --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 \ + --gpu-memory-utilization 0.97 \ --max-cudagraph-capture-size 2048 \ --max-model-len "$SERVE_MAX_MODEL_LEN" \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..86ef1f46c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,9 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, 
frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Update B300 dsv4 model to the NVFP4 checkpoint (nvidia/DeepSeek-V4-Pro-NVFP4); disable the deep_gemm_mega_moe MoE backend and set --gpu-memory-utilization 0.97" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1656 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index fc0ac297f..13f54c33b 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -314,6 +314,7 @@ else DeepSeek-R1-0528-NVFP4-v2 DeepSeek-V4-Flash DeepSeek-V4-Pro + DeepSeek-V4-Pro-NVFP4 GLM-5-FP8 GLM-5-NVFP4 GLM-5.1