Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7a86001
[AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, refactor…
seungrokj Jun 3, 2026
19d1ca5
[AMD] minimaxm2.5 agentic: change LMCACHE_CHUNK_SIZE default from 256…
seungrokj Jun 3, 2026
d184078
[AMD] agentx-v0.4: add MODEL_PATH support and --served-model-name for…
seungrokj Jun 3, 2026
d2b2826
[AMD] kimik2.5-fp4-mi355x-vllm-agentic-lmcache: fix config, use v0.21…
seungrokj Jun 3, 2026
dc999ef
[AMD] agentx-v0.4: fix configs for MiniMax/Kimi/Qwen agentic entries
seungrokj Jun 3, 2026
fc0d0d4
[AMD] qwen3.5-fp8-mi355x-sglang-agentic-hicache: fix runner to mi355x
seungrokj Jun 3, 2026
fe3afa9
[AMD] qwen3.5-fp8-mi355x-sglang-agentic-hicache: update conc-list to …
seungrokj Jun 3, 2026
dc25a0b
[AMD] minimaxm2.5-fp8-mi355x-vllm-agentic: add lmcache variant config…
seungrokj Jun 3, 2026
34cd6b0
[AMD] glm5.1-fp4-mi355x-sglang-agentic: add hicache variant config an…
seungrokj Jun 3, 2026
3707963
[AMD] qwen3.5-fp8-mi355x-sglang-agentic: update conc-list and remove …
seungrokj Jun 4, 2026
b3b3476
[AMD] qwen3.5-fp4-mi355x-sglang-agentic: add hicache variant config a…
seungrokj Jun 4, 2026
56d9dc5
[AMD] amd-master: fix entry names and tp for qwen3.5-fp4 and minimaxm…
seungrokj Jun 4, 2026
c697c16
[AMD] minimaxm2.5-fp4-mi355x-agentic: increase lmcache DRAM size
seungrokj Jun 4, 2026
6dbef81
[AMD] minimaxm2.5-fp8-mi355x-agentic: increase lmcache DRAM size
seungrokj Jun 4, 2026
0c4464e
[AMD] minimaxm2.5-fp4-mi355x-agentic: switch to 060226 trace (no 256k…
seungrokj Jun 7, 2026
0607cc7
[AMD] kimik2.5-fp4-mi355x-agentic: switch to no-256k trace, increase …
seungrokj Jun 7, 2026
111b2d5
[AMD] amd-master: fix typo in kimik2.5 agentic lmcache entry name (la…
seungrokj Jun 7, 2026
8029bac
[AMD] kimik2.5-fp4-mi355x-agentic: add lmcache-060226DRAM3000GB targe…
seungrokj Jun 7, 2026
4315997
[AMD] kimik2.5-fp4-mi355x-agentic: reduce lmcache DRAM to 2500GB
seungrokj Jun 7, 2026
beba96b
[AMD] kimik2.5-fp4-mi355x-agentic: reduce lmcache DRAM to 1500GB
seungrokj Jun 7, 2026
244de16
[AMD] launch_mi355x-amds: exclude mia1-p01-g37 from salloc
seungrokj Jun 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 104 additions & 3 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang:
- { tp: 2, conc-start: 4, conc-end: 256 }
- { tp: 4, conc-start: 4, conc-end: 16 }

# target
# Agentic-coding sweep for Qwen3.5 MXFP4 on MI355X (SGLang).
# Both search-space rows share tp/ep/conc-list and differ only in `offloading`.
qwen3.5-fp4-mi355x-sglang-agentic-hicache:
  # NOTE(review): the other sglang agentic entries pull from
  # lmsysorg/sglang-rocm with a dated tag; confirm this repo/tag is intended.
  image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
  model: amd/Qwen3.5-397B-A17B-MXFP4
  model-prefix: qwen3.5
  runner: mi355x
  precision: fp4
  framework: sglang
  multinode: false
  scenarios:
    agentic-coding:
    - duration: 1800
      search-space:
      - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
      - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }

qwen3.5-fp4-mi355x-atom:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
model: amd/Qwen3.5-397B-A17B-MXFP4
Expand Down Expand Up @@ -872,6 +888,22 @@ minimaxm2.5-fp4-mi355x-atom:
- { tp: 4, conc-start: 4, conc-end: 128 }
- { tp: 8, conc-start: 4, conc-end: 16 }

# target
# Agentic-coding sweep for MiniMax-M2.5 MXFP4 on MI355X (vLLM).
# Both search-space rows share tp/ep/conc-list and differ only in `offloading`.
minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
  image: vllm/vllm-openai-rocm:v0.22.0
  model: amd/MiniMax-M2.5-MXFP4
  model-prefix: minimaxm2.5
  runner: mi355x
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    agentic-coding:
    - duration: 1800
      search-space:
      - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }

minimaxm2.5-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.22.0
model: amd/MiniMax-M2.5-MXFP4
Expand Down Expand Up @@ -2494,6 +2526,23 @@ glm5.1-fp4-mi355x-sglang-agentic:
# sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }

# target
# Agentic-coding sweep for GLM-5.1 MXFP4 on MI355X (SGLang).
# Both search-space rows share tp/ep/conc-list and differ only in `offloading`.
glm5.1-fp4-mi355x-sglang-agentic-hicache:
  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
  model: amd/GLM-5.1-MXFP4
  model-prefix: glm5.1
  runner: mi355x
  precision: fp4
  framework: sglang
  multinode: false
  scenarios:
    agentic-coding:
    - duration: 1800
      search-space:
      # sglang manages KV eviction. This entry sweeps tp=2 up to conc=48.
      # NOTE(review): the "caps at tp=4 conc=16 in fixed-seq, so cap
      # conservatively" remark belongs to glm5.1-fp4-mi355x-sglang-agentic
      # (tp=4) and does not describe the rows below; confirm the tp=2 ceiling.
      - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
      - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] }

kimik2.5-fp4-mi355x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
model: amd/Kimi-K2.5-MXFP4
Expand All @@ -2518,6 +2567,38 @@ kimik2.5-fp4-mi355x-vllm-agentic:
- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
- { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] }

# target
# Agentic-coding sweep for Kimi-K2.5 MXFP4 on MI355X (vLLM).
# Both search-space rows share tp/ep/conc-list and differ only in `offloading`.
kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
  image: vllm/vllm-openai-rocm:v0.22.0
  model: amd/Kimi-K2.5-MXFP4
  model-prefix: kimik2.5
  runner: mi355x
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    agentic-coding:
    - duration: 1800
      search-space:
      - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }

# target
# LMCache-only variant of kimik2.5-fp4-mi355x-vllm-agentic-lmcache.
# NOTE(review): the entry name encodes a trace date (060226) and a DRAM budget
# (1500GB), but neither value is set in this entry; presumably the benchmark
# script selects them. Confirm the name still matches the script's settings.
kimik2.5-fp4-mi355x-vllm-agentic-lmcache-060226DRAM1500GB:
  image: vllm/vllm-openai-rocm:v0.22.0
  model: amd/Kimi-K2.5-MXFP4
  model-prefix: kimik2.5
  runner: mi355x
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    agentic-coding:
    - duration: 1800
      search-space:
      # Baseline (offloading: none) row is disabled here; the identical row is
      # listed under kimik2.5-fp4-mi355x-vllm-agentic-lmcache.
      # - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }

minimaxm2.5-fp8-mi355x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
model: MiniMaxAI/MiniMax-M2.5
Expand All @@ -2536,6 +2617,25 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
- { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
- { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] }

# target
# Agentic-coding sweep for MiniMax-M2.5 FP8 on MI355X (vLLM).
# Both search-space rows share tp/ep/conc-list and differ only in `offloading`.
minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
  image: vllm/vllm-openai-rocm:v0.22.0
  model: MiniMaxAI/MiniMax-M2.5
  model-prefix: minimaxm2.5
  runner: mi355x
  precision: fp8
  framework: vllm
  multinode: false
  scenarios:
    agentic-coding:
    # This entry runs tp=2 ep=1 with offloading none vs lmcache.
    # NOTE(review): the earlier comment here (tp=4 ep=4 compute ceiling ~60,
    # KV cliff ~91, native OffloadingConnector for cpu offload) was copied from
    # minimaxm2.5-fp8-mi355x-vllm-agentic and does not describe this
    # configuration; re-measure the ceilings for tp=2 before relying on them.
    - duration: 1800
      search-space:
      - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
      - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }

minimaxm2.5-fp8-mi300x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
model: MiniMaxAI/MiniMax-M2.5
Expand Down Expand Up @@ -2573,8 +2673,9 @@ minimaxm2.5-fp8-mi325x-vllm-agentic:
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
- { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }

# target
qwen3.5-fp8-mi355x-sglang-agentic-hicache:
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi355x
Expand All @@ -2585,8 +2686,8 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache:
agentic-coding:
- duration: 1800
search-space:
- { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
- { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
- { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
- { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }

dsv4-fp4-mi355x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
Expand Down
96 changes: 87 additions & 9 deletions benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,29 @@
set -euo pipefail
set -x

# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
#
# The script layout derives from the upstream MI300X Qwen3.5 reference
# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe), but
# the server flags used here are GLM-5.1 specific (tilelang NSA prefill/decode
# backends, mem-fraction-static 0.85), not that recipe's aiter / 0.75 settings.
# The agentic harness (resolve_trace_source / build_replay_cmd /
# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
# --disable-radix-cache is not passed because agentic replay needs prefix reuse.
#
# Required env vars:
#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE,
#   DP_ATTENTION
# Optional env vars (value used when unset):
#   MAX_MODEL_LEN (131072; also used when set to 0), PORT (8888), DURATION (1800)
#
# OFFLOADING values:
#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.

# Pull in the shared helpers (check_env_vars and the agentic replay functions
# called later in this script).
source "$(dirname "$0")/../../benchmark_lib.sh"

# Validate harness-provided inputs once. DURATION is intentionally not in this
# list because it has a default below; the older
# `check_env_vars MODEL TP CONC RESULT_DIR DURATION` call was superseded by this
# one and is dropped so the two lists cannot disagree.
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION

# Optional knobs. MAX_MODEL_LEN feeds --context-length at server launch; an
# unset, empty, or "0" value falls back to 131072 so the launch never sees an
# unbound variable under `set -u`.
if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
    MAX_MODEL_LEN=131072
fi
PORT=${PORT:-8888}
DURATION=${DURATION:-1800}
# EP_SIZE is already listed in check_env_vars above; this fallback only matters
# if that helper accepts an empty value (not visible from this script).
EP_SIZE=${EP_SIZE:-1}

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
Expand All @@ -30,8 +41,16 @@ else
hf download "$MODEL"
export MODEL_PATH="$MODEL"
fi

rocm-smi || true
amd-smi || true
# ---- Select trace corpus ----------------------------------------------------
# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
# signal at high concurrency.
# Previous 256k-capped corpus, kept for reference:
#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
# Current corpus: the 060226 variant of the 256k-capped corpus.
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k

# ---- Resolve traces and install deps ----------------------------------------
resolve_trace_source
Expand All @@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR"

pip install -U transformers

# Translate OFFLOADING into SGLang server flags.
#   CACHE_ARGS         extra cache flags appended to the launch command
#   WARMUP_ARGS        extra warmup flags appended to the launch command
#   CUDA_GRAPH_MAX_BS  graph-capture batch-size cap; starts at CONC and is
#                      lowered for hicache (intended for --cuda-graph-max-bs)
CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
    none)
        # Leave SGLang's default RadixAttention prefix cache on — agentic
        # replay needs it; --disable-radix-cache would zero the hit rate.
        ;;
    hicache)
        # GLM-5.1 FP4 uses a standard transformer (no hybrid Mamba path),
        # so one HiCache host pool per TP rank is sufficient.
        # The node-total DRAM budget divides by TP and host-pool count.
        #
        # This script sizes HiCache from a fixed 3000 GB node budget rather than
        # the harness-provided TOTAL_CPU_DRAM_GB. The budget is overridable via
        # HICACHE_TOTAL_CPU_DRAM_GB (default keeps the previous hard-coded
        # value), and the override is logged instead of happening silently.
        HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-3000}"
        if [ "${TOTAL_CPU_DRAM_GB:-}" != "$HICACHE_TOTAL_CPU_DRAM_GB" ]; then
            echo "Note: HiCache sizing uses a ${HICACHE_TOTAL_CPU_DRAM_GB} GB node DRAM budget (harness TOTAL_CPU_DRAM_GB=${TOTAL_CPU_DRAM_GB:-unset})"
        fi
        TOTAL_CPU_DRAM_GB="$HICACHE_TOTAL_CPU_DRAM_GB"
        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
        # Per-rank-per-pool ceiling; HICACHE_MAX_SIZE_GB_PER_RANK is honoured as
        # a fallback name, then 500 GB.
        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
        # GLM-5.1 uses standard paged attention (no no_buffer scheduler constraint),
        # so page_size can be left at the default. Keep the safer direct/layer_first
        # copy path on ROCm.
        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
        # Integer division: budget / TP / pool count, then clamp to the ceiling.
        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
        fi
        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
            exit 1
        fi
        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
        CACHE_ARGS=(
            --page-size "$HICACHE_PAGE_SIZE"
            --enable-hierarchical-cache
            --hicache-size "$HICACHE_SIZE_GB"
            --hicache-io-backend "$HICACHE_IO_BACKEND"
            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
            --hicache-write-policy "$HICACHE_WRITE_POLICY"
        )
        # HiCache startup reaches API readiness but SGLang's internal warmup
        # request can time out on this path; let aiperf own benchmark traffic.
        WARMUP_ARGS=(--skip-server-warmup)
        # Don't force ROCm graph capture at every high concurrency point; conc=16
        # is the highest known-good capture size for this model/server path.
        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
        fi
        ;;
    *)
        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
        exit 1
        ;;
esac

# ---- Launch server ----------------------------------------------------------
# Starts SGLang in the background, logging to $SERVER_LOG, and records its PID.
echo "Starting SGLang server..."
export PYTHONNOUSERSITE=1

# Notes on the command below (comments cannot sit inside the continuation):
#  * --cuda-graph-max-bs takes CUDA_GRAPH_MAX_BS, the cap computed from
#    OFFLOADING, instead of raw CONC; it falls back to CONC if the cap is unset.
#  * --model-path / --served-model-name are passed exactly once.
#  * CACHE_ARGS / WARMUP_ARGS use the ${arr[@]+...} form so empty arrays do
#    not trip `set -u` on bash older than 4.4.
python3 -m sglang.launch_server \
    --model-path "$MODEL_PATH" \
    --served-model-name "$MODEL" \
    --host=0.0.0.0 \
    --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --trust-remote-code \
    --cuda-graph-max-bs "${CUDA_GRAPH_MAX_BS:-$CONC}" \
    --max-running-requests "$CONC" \
    --context-length "$MAX_MODEL_LEN" \
    --mem-fraction-static 0.85 \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
    --nsa-prefill-backend tilelang \
    --nsa-decode-backend tilelang \
    --watchdog-timeout 1200 \
    --kv-cache-dtype fp8_e4m3 \
    --tokenizer-worker-num $((TP*2)) \
    ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \
    ${WARMUP_ARGS[@]+"${WARMUP_ARGS[@]}"} \
    --enable-metrics > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"
Expand All @@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
# ---- Run benchmark ----------------------------------------------------------
# Build the replay command for this result directory, then run the replay once.
# The call was listed twice (an end-of-file newline diff artifact); a second
# invocation would replay the whole benchmark again.
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
Loading