From 6c9c79ff7520c6c09629482eb325cd8399e919bc Mon Sep 17 00:00:00 2001 From: "cliu1004@amd.com" Date: Fri, 5 Jun 2026 20:48:03 +0000 Subject: [PATCH] [AMD][MI355X] Bump qwen3.5-bf16 single-node SGLang image to v0.5.12.post1 Pin both qwen3.5-bf16-mi355x-sglang and qwen3.5-bf16-mi355x-sglang-mtp to lmsysorg/sglang:v0.5.12.post1-rocm720-mi35x (was lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517) so the e2e matrix runs on the image where we already measured the MTP EAGLE acceleration. Measured on a single MI355X node (mia1-p01-g09), Qwen/Qwen3.5-397B-A17B, 1k/1k, TP=8, EP=1, no DP-attn, --attention-backend triton, EAGLE num_steps=3 / eagle_topk=1 / num_draft_tokens=4. MTP delivers +34..69% total token throughput and 28..42% lower median TPOT than non-MTP for conc 1..32; the conc=64 row is depressed on tok/s (+6.9%) because EAGLE silently caps max_running_requests=48 and 16 of 64 requests queue (TPOT speedup unchanged at 1.39x). Co-authored-by: Cursor --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78fdffa9a..0e79ad150 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -166,7 +166,7 @@ dsr1-fp8-mi355x-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang:v0.5.12.post1-rocm720-mi35x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -185,7 +185,7 @@ qwen3.5-bf16-mi355x-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-bf16-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang:v0.5.12.post1-rocm720-mi35x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x