From 286dc1ae7d78a3514286e192c50ebf35f6a47296 Mon Sep 17 00:00:00 2001 From: nehaprakriya Date: Wed, 3 Jun 2026 17:35:36 +0000 Subject: [PATCH] AMD - gpt-oss vllm mxfp4: AITER tuning + n-gram spec decode + server parameter tuning --- .../fixed_seq_len/gptoss_fp4_mi355x.sh | 25 ++++++++++++++++--- perf-changelog.yaml | 8 ++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh index 14dedb141..662c47fd8 100644 --- a/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh @@ -33,13 +33,29 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -export AMDGCN_USE_BUFFER_OPS=0 +# --- AITER backend optimizations (env-var tuning) --- export VLLM_ROCM_USE_AITER=1 +export VLLM_USE_ROCM_AITER_MXFP4=1 +export VLLM_ROCM_USE_AITER_PAGED_ATTN=1 +export VLLM_ROCM_USE_AITER_LINEAR=1 export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=1 +export VLLM_ROCM_USE_AITER_TRITON_GEMM=0 +export VLLM_ROCM_MOE_PADDING=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export AITER_BF16_FP8_BOUND=0 +export AITER_USE_OPUS_MOE_SORTING=1 +export AITER_USE_NT=0 +export AMDGCN_USE_BUFFER_OPS=1 +export CK_MXFP4_MOE_DIM_ALIGNMENT=64 +export GPU_MAX_HW_QUEUES=4 + ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" +# --- Speculative decoding (06/02 — n-gram prompt lookup, lossless) --- +SPEC_DECODE="--speculative-config {\"method\":\"ngram\",\"num_speculative_tokens\":3,\"prompt_lookup_min\":2,\"prompt_lookup_max\":64}" + SERVER_LOG=/workspace/server.log if [ "${EVAL_ONLY}" = "true" ]; then @@ -53,10 +69,13 @@ set -x vllm serve $MODEL --port $PORT \ $ATTN_BACKEND $FUSE_ROPE_KVCACHE \ --tensor-parallel-size=$TP \ - --gpu-memory-utilization 0.95 \ + --gpu-memory-utilization 0.97 \ --max-model-len $MAX_MODEL_LEN \ + --max-num-seqs 256 \ + --max-num-batched-tokens 16384 \ --block-size=64 \ - --no-enable-prefix-caching > $SERVER_LOG 2>&1 & + --no-enable-prefix-caching \ + $SPEC_DECODE > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1b9d2f0db..fcf77b73e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3430,3 +3430,11 @@ - "Image: vllm/vllm-openai:v0.20.1" - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652 + +- config-keys: + - gptoss-fp4-mi355x-vllm + description: + - "Enable n-gram speculative decoding (prompt-lookup, num_speculative_tokens=3) for 3.26x decode throughput improvement" + - "Add full AITER env-var tuning: MXFP4, FP4 ASM GEMM, unified paged attention, inductor graph partition, opus MoE sorting" + - "Set gpu-memory-utilization=0.97, max-num-seqs=256, max-num-batched-tokens=16384, GPU_MAX_HW_QUEUES=4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1657