Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ qwen3.5-fp8-mi355x-sglang-disagg:
- "DECODE_MTP_SIZE=0"

qwen3.5-fp4-mi355x-sglang:
image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604
model: amd/Qwen3.5-397B-A17B-MXFP4
model-prefix: qwen3.5
runner: mi355x
Expand Down Expand Up @@ -468,7 +468,7 @@ qwen3.5-fp4-mi355x-atom:
- { tp: 4, conc-start: 4, conc-end: 16 }

qwen3.5-fp4-mi355x-sglang-mtp:
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604
model: amd/Qwen3.5-397B-A17B-MXFP4
model-prefix: qwen3.5
runner: mi355x
Expand All @@ -480,12 +480,12 @@ qwen3.5-fp4-mi355x-sglang-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }

qwen3.5-fp4-mi355x-sglang-disagg:
Expand Down
3 changes: 3 additions & 0 deletions benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ fi
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

export SGLANG_USE_AITER=1
export SGLANG_USE_AITER_UNIFIED_ATTN=1

SERVER_LOG=/workspace/server.log
MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
Expand All @@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
--model-loader-extra-config '{"enable_multithread_load": true}' \
--watchdog-timeout 1200 \
--disable-radix-cache \
--enable-aiter-allreduce-fusion --max-running-requests $CONC \
--page-size 16 \
Comment thread
1am9trash marked this conversation as resolved.
> $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ fi
hf download "$MODEL"

export SGLANG_USE_AITER=1
export SGLANG_USE_AITER_UNIFIED_ATTN=1

SERVER_LOG=/workspace/server.log
MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
Expand All @@ -38,6 +39,8 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
--model-loader-extra-config '{"enable_multithread_load": true}' \
--watchdog-timeout 1200 \
--disable-radix-cache \
--enable-aiter-allreduce-fusion --max-running-requests $CONC \
Comment on lines 21 to +42

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for contributing, @1am9trash. Can you add this to the cookbook too?

Image

--page-size 16 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3523,3 +3523,14 @@
- "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512"
- "Remove 4 dominated old configs (4p-dep16-8n, 8p-dep16-12n, 10p-dep16-14n, 12p-dep12-15n) superseded by wide-EP frontier"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586

- config-keys:
- qwen3.5-fp4-mi355x-sglang
- qwen3.5-fp4-mi355x-sglang-mtp
description:
- "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604."
- "Update script to use the aiter attention backend instead of triton."
- "Enable aiter unified attention."
- "Enable aiter allreduce fusion."
- "Remove sweep config mtp+cc256+tp2, which may OOM."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1680