diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78fdffa9a..714b47706 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -426,7 +426,7 @@ qwen3.5-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" qwen3.5-fp4-mi355x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi35x + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604 model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x @@ -468,7 +468,7 @@ qwen3.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 16 } qwen3.5-fp4-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604 model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x @@ -480,12 +480,12 @@ qwen3.5-fp4-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } qwen3.5-fp4-mi355x-sglang-disagg: diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh index e400729ff..7d87eea27 100644 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh @@ -18,6 +18,7 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_USE_AITER=1 +export SGLANG_USE_AITER_UNIFIED_ATTN=1 SERVER_LOG=/workspace/server.log MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8} @@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --model-loader-extra-config '{"enable_multithread_load": true}' 
\ --watchdog-timeout 1200 \ --disable-radix-cache \ +--enable-aiter-allreduce-fusion --max-running-requests $CONC \ +--page-size 16 \ > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh index e98dec2db..317147668 100755 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh @@ -18,6 +18,7 @@ fi hf download "$MODEL" export SGLANG_USE_AITER=1 +export SGLANG_USE_AITER_UNIFIED_ATTN=1 SERVER_LOG=/workspace/server.log MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8} @@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --model-loader-extra-config '{"enable_multithread_load": true}' \ --watchdog-timeout 1200 \ --disable-radix-cache \ +--enable-aiter-allreduce-fusion --max-running-requests $CONC \ +--page-size 16 \ --speculative-algorithm EAGLE \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 178a95abb..828170e3f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3523,3 +3523,14 @@ - "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512" - "Remove 4 dominated old configs (4p-dep16-8n, 8p-dep16-12n, 10p-dep16-14n, 12p-dep12-15n) superseded by wide-EP frontier" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586 + +- config-keys: + - qwen3.5-fp4-mi355x-sglang + - qwen3.5-fp4-mi355x-sglang-mtp + description: + - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604." + - "Update script for aiter attention backend from triton." + - "Enable aiter unified attention." + - "Enable aiter allreduce fusion." + - "Remove sweep config mtp+cc256+tp2, which may OOM." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1680