From 08809b93abc0e595546e976eea1926a01c894c7f Mon Sep 17 00:00:00 2001 From: "jacky.cheng" Date: Mon, 8 Jun 2026 07:48:51 +0000 Subject: [PATCH 1/4] [AMD][MI355X] Qwen3.5-fp4: add aiter unified attn, allreduce fusion, and page-size 16 --- benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh index e400729ff..8cc36c8a3 100644 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh @@ -18,6 +18,7 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_USE_AITER=1 +export SGLANG_USE_AITER_UNIFIED_ATTN=1 SERVER_LOG=/workspace/server.log MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8} @@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --model-loader-extra-config '{"enable_multithread_load": true}' \ --watchdog-timeout 1200 \ --disable-radix-cache \ +--enable-aiter-allreduce-fusion --max-running-requests 512 \ +--page-size 16 \ > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 60da2b8a18ec02008a963d7b232b5163c5f81f3d Mon Sep 17 00:00:00 2001 From: thomawan Date: Mon, 8 Jun 2026 16:09:28 +0800 Subject: [PATCH 2/4] Update config --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh | 2 +- .../single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78fdffa9a..9ada18291 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -426,7 +426,7 @@ qwen3.5-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" qwen3.5-fp4-mi355x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi35x + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604 model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x @@ -468,7 +468,7 @@ qwen3.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 16 } qwen3.5-fp4-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604 model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh index 8cc36c8a3..7d87eea27 100644 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh @@ -39,7 +39,7 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --model-loader-extra-config '{"enable_multithread_load": true}' \ --watchdog-timeout 1200 \ --disable-radix-cache \ ---enable-aiter-allreduce-fusion --max-running-requests 512 \ +--enable-aiter-allreduce-fusion --max-running-requests $CONC \ --page-size 16 \ > $SERVER_LOG 2>&1 & diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh index 
e98dec2db..317147668 100755 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh @@ -18,6 +18,7 @@ fi hf download "$MODEL" export SGLANG_USE_AITER=1 +export SGLANG_USE_AITER_UNIFIED_ATTN=1 SERVER_LOG=/workspace/server.log MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8} @@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --model-loader-extra-config '{"enable_multithread_load": true}' \ --watchdog-timeout 1200 \ --disable-radix-cache \ +--enable-aiter-allreduce-fusion --max-running-requests $CONC \ +--page-size 16 \ --speculative-algorithm EAGLE \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ From 59ba6f59a411f8601a21cb653ac480587af3afa7 Mon Sep 17 00:00:00 2001 From: thomawan Date: Mon, 8 Jun 2026 16:16:28 +0800 Subject: [PATCH 3/4] Update change log --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 178a95abb..60e3d87e1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3523,3 +3523,13 @@ - "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512" - "Remove 4 dominated old configs (4p-dep16-8n, 8p-dep16-12n, 10p-dep16-14n, 12p-dep12-15n) superseded by wide-EP frontier" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586 + +- config-keys: + - qwen3.5-fp4-mi355x-sglang + - qwen3.5-fp4-mi355x-sglang-mtp + description: + - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604." + - "Update script for aiter attention backend from triton." + - "Enable aiter unified attention." + - "Enable aiter allreduce fusion." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1680 From c8804e2252f14f6a127482b6fd9b4e7587185821 Mon Sep 17 00:00:00 2001 From: thomawan Date: Mon, 8 Jun 2026 18:14:56 +0800 Subject: [PATCH 4/4] Update change log and remove OOM config --- .github/configs/amd-master.yaml | 4 ++-- perf-changelog.yaml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9ada18291..714b47706 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -480,12 +480,12 @@ qwen3.5-fp4-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } qwen3.5-fp4-mi355x-sglang-disagg: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 60e3d87e1..828170e3f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3532,4 +3532,5 @@ - "Update script for aiter attention backend from triton." - "Enable aiter unified attention." - "Enable aiter allreduce fusion." + - "Remove sweep config mtp+cc256+tp2, which may OOM." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1680