SemiAnalysisAI · 1am9trash · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
@@ -426,7 +426,7 @@ qwen3.5-fp8-mi355x-sglang-disagg:
           - "DECODE_MTP_SIZE=0"
 
 qwen3.5-fp4-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604
   model: amd/Qwen3.5-397B-A17B-MXFP4
   model-prefix: qwen3.5
   runner: mi355x
@@ -468,7 +468,7 @@ qwen3.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
 qwen3.5-fp4-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604
   model: amd/Qwen3.5-397B-A17B-MXFP4
   model-prefix: qwen3.5
   runner: mi355x
@@ -480,12 +480,12 @@ qwen3.5-fp4-mi355x-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
       - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
       - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
 
 qwen3.5-fp4-mi355x-sglang-disagg:

diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh
@@ -18,6 +18,7 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGLANG_USE_AITER=1
+export SGLANG_USE_AITER_UNIFIED_ATTN=1
 
 SERVER_LOG=/workspace/server.log
 MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
@@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
 --model-loader-extra-config '{"enable_multithread_load": true}' \
 --watchdog-timeout 1200  \
 --disable-radix-cache \
+--enable-aiter-allreduce-fusion --max-running-requests $CONC \
+--page-size 16 \
 > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
@@ -18,6 +18,7 @@ fi
 hf download "$MODEL"
 
 export SGLANG_USE_AITER=1
+export SGLANG_USE_AITER_UNIFIED_ATTN=1
 
 SERVER_LOG=/workspace/server.log
 MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
@@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
 --model-loader-extra-config '{"enable_multithread_load": true}' \
 --watchdog-timeout 1200  \
 --disable-radix-cache \
+--enable-aiter-allreduce-fusion --max-running-requests $CONC \
+--page-size 16 \
 --speculative-algorithm EAGLE \
 --speculative-num-steps 3 \
 --speculative-eagle-topk 1 \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3523,3 +3523,14 @@
     - "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512"
     - "Remove 4 dominated old configs (4p-dep16-8n, 8p-dep16-12n, 10p-dep16-14n, 12p-dep12-15n) superseded by wide-EP frontier"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586
+
+- config-keys:
+    - qwen3.5-fp4-mi355x-sglang
+    - qwen3.5-fp4-mi355x-sglang-mtp
+  description:
+    - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604."
+    - "Update scripts to use the aiter attention backend instead of triton."
+    - "Enable aiter unified attention."
+    - "Enable aiter allreduce fusion."
+    - "Lower conc-end from 256 to 128 for the mtp tp2 sweep; the mtp+cc256+tp2 config may OOM."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1680