From 6c9c79ff7520c6c09629482eb325cd8399e919bc Mon Sep 17 00:00:00 2001
From: "cliu1004@amd.com" <cliu1004@amd.com@mia1-p01-g18.mia.tensorwave.lan>
Date: Fri, 5 Jun 2026 20:48:03 +0000
Subject: [PATCH] [AMD][MI355X] Bump qwen3.5-bf16 single-node SGLang image to
 v0.5.12.post1

Pin both qwen3.5-bf16-mi355x-sglang and qwen3.5-bf16-mi355x-sglang-mtp
to lmsysorg/sglang:v0.5.12.post1-rocm720-mi35x (was
lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517) so the e2e matrix
runs on the image where we already measured the MTP EAGLE acceleration.

Measured on a single MI355X node (mia1-p01-g09), Qwen/Qwen3.5-397B-A17B,
1k/1k, TP=8, EP=1, no DP-attn, --attention-backend triton, EAGLE
num_steps=3 / eagle_topk=1 / num_draft_tokens=4. Relative to non-MTP,
MTP delivers 34-69% higher total token throughput and 28-42% lower
median TPOT for conc 1..32. At conc=64 the throughput gain shrinks to
+6.9% because EAGLE silently caps max_running_requests at 48, so 16 of
the 64 requests queue; the TPOT speedup is unchanged at 1.39x.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 78fdffa9a..0e79ad150 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -166,7 +166,7 @@ dsr1-fp8-mi355x-sglang-mtp:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 qwen3.5-bf16-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  image: lmsysorg/sglang:v0.5.12.post1-rocm720-mi35x
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: mi355x
@@ -185,7 +185,7 @@ qwen3.5-bf16-mi355x-sglang:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
 
 qwen3.5-bf16-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  image: lmsysorg/sglang:v0.5.12.post1-rocm720-mi35x
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: mi355x