Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
f632aa4
agentic(trace-source): default non-DSv4 to v6 (060226) corpus
camatsemianalysis Jun 2, 2026
5544a44
configs(master): consolidate agentic recipes at end + split combined …
cquil11 Jun 2, 2026
76aedd6
configs(master): bump all vllm images to v0.22.0
cquil11 Jun 2, 2026
6dede7b
configs(master): strip stale narrative comments
cquil11 Jun 2, 2026
3257275
chore(aiperf): bump submodule for 060226 loader allowlist fix
cquil11 Jun 2, 2026
321fd44
(testing) b300 dsv4 simple offloading
cquil11 Jun 2, 2026
3283934
runners(b300-nv): remap container UID to root for apt-get install
cquil11 Jun 2, 2026
360bcf0
benchmarks(agentic): skip hf download when MODEL_PATH is pre-staged
cquil11 Jun 2, 2026
57d4adb
benchmarks(agentic): launch server from MODEL_PATH, not the HF id
cquil11 Jun 2, 2026
1bccc5c
benchmarks(dsv4-b300): enable VLLM_PREFIX_CACHE_RETENTION_INTERVAL
cquil11 Jun 2, 2026
0946107
configs(dsv4-b300-vllm-agentic): bump cquil image to 6c529f30 for ret…
cquil11 Jun 3, 2026
38c365c
benchmarks(dsv4-b300-vllm): override trace loader to 060226 (v6)
cquil11 Jun 3, 2026
ee8d743
[AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, update Q…
seungrokj Jun 3, 2026
616f4db
[AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/Qwen scr…
seungrokj Jun 3, 2026
574d891
Revert "[AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/…
seungrokj Jun 3, 2026
5ec21d4
utils(process_agentic_result): align cache metrics + theoretical-trac…
cquil11 Jun 3, 2026
d7841d8
feat(agentic): route DEP traffic through native vLLM router
cquil11 Jun 3, 2026
fc5a792
benchmarks(agentic): disable DCGM gpu_telemetry in aiperf invocation
cquil11 Jun 3, 2026
ba65df8
refactor(agentic): hardcode DSv4 B300 router settings
cquil11 Jun 3, 2026
76a3f09
fix(agentic): fail jobs with excessive aiperf errors
cquil11 Jun 3, 2026
923186d
feat(agentic): route B200 DEP traffic through native vLLM router
cquil11 Jun 3, 2026
5291955
benchmarks(agentic): default DSv4 recipes to v6 (060226) corpus
cquil11 Jun 3, 2026
40736e8
chore(agentic): bump aiperf for warmup progress logging
cquil11 Jun 3, 2026
70529f2
chore(agentic): bump aiperf for phase-continuous replay
cquil11 Jun 3, 2026
1ed0001
chore(agentic): bump aiperf snapshot accessor rename
cquil11 Jun 3, 2026
1c84916
fix(agentic): align B200 DSv4 with bespoke vLLM image
cquil11 Jun 3, 2026
4bd54ce
chore(agentic): bump aiperf for heartbeat-only warmup logging
cquil11 Jun 3, 2026
97576fa
test(agentic): run B300 CPU offload in eager mode
cquil11 Jun 4, 2026
e08ba71
config(dsv4-fp4 agentic): run offloading=none with expanded concurren…
cquil11 Jun 4, 2026
60f3be0
fix(agentic): extend native router request timeout
cquil11 Jun 4, 2026
3747263
fix(agentic): use native B300 KV offloading
cquil11 Jun 4, 2026
cb21694
(testing) add offload off scenario to dsv4 b300
cquil11 Jun 4, 2026
06a4ea7
test(agentic): enable blocking CUDA offload diagnostics
cquil11 Jun 4, 2026
fb362a6
chore(agentic): remove stale B200 sweep comments
cquil11 Jun 4, 2026
2f27bea
feat(agentic): use Mooncake store for B300 offload
cquil11 Jun 4, 2026
97c4b65
test(agentic): validate Mooncake over TCP on B300
cquil11 Jun 4, 2026
49b9967
fix(agentic): pool Mooncake TCP connections
cquil11 Jun 4, 2026
c3cfe74
fix(agentic): increase Mooncake TCP transfer slices
cquil11 Jun 4, 2026
f585282
test(agentic): pin Mooncake RDMA on B300
cquil11 Jun 4, 2026
3599c78
perf(agentic): map Mooncake RDMA NICs on B300
cquil11 Jun 5, 2026
fce4996
fix(agentic): use shared Mooncake RDMA NIC on B300
cquil11 Jun 5, 2026
8d6b735
perf(agentic): tune Mooncake RDMA transfers on B300
cquil11 Jun 5, 2026
d98d7ae
perf(agentic): use full B300 Mooncake memory budget
cquil11 Jun 5, 2026
b83265d
perf(agentic): evict Mooncake cache before rank exhaustion
cquil11 Jun 5, 2026
4178b78
feat(agentic): default to 060526 weka corpus (DSv4 base, others 256k)
cquil11 Jun 5, 2026
0731a8e
go
cquil11 Jun 5, 2026
e6fe59c
feat(agentic): use Mooncake offload for DSv4 B200
cquil11 Jun 5, 2026
cffe496
go
cquil11 Jun 5, 2026
5b0e1a0
fix(agentic): bump aiperf for Weka context resets
cquil11 Jun 8, 2026
63c7c59
test(agentic): reduce B200 Mooncake memory budget
cquil11 Jun 8, 2026
5a566b3
fix(agentic): use Mooncake TCP fallback on B200
cquil11 Jun 8, 2026
6f2d292
fix(agentic): reuse B200 Mooncake TCP connections
cquil11 Jun 8, 2026
4e8ec16
fix(agentic): use current Mooncake TCP transport on B200
cquil11 Jun 8, 2026
bb55646
fix(agentic): preserve cached Mooncake wheel filename
cquil11 Jun 8, 2026
eebefc1
test(agentic): use standalone Mooncake store on B200
cquil11 Jun 8, 2026
1112011
fix(agentic): bound B200 Mooncake transfer batches
cquil11 Jun 8, 2026
191c4fe
test(agentic): raise B200 Mooncake batch limit
cquil11 Jun 8, 2026
8894d58
fix(agentic): extend B200 Mooncake read lease
cquil11 Jun 8, 2026
077a4d0
test(agentic): use stock Mooncake DMA-BUF RDMA on B200
cquil11 Jun 8, 2026
acfeb45
feat(agentic): default to 060826 weka corpus (DSv4 base, others 256k)
cquil11 Jun 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
394 changes: 145 additions & 249 deletions .github/configs/amd-master.yaml

Large diffs are not rendered by default.

812 changes: 346 additions & 466 deletions .github/configs/nvidia-master.yaml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,10 @@ jobs:
name: agentic_${{ env.RESULT_FILENAME }}
path: |
results/server.log
results/router.log
results/lmcache_server.log
results/mooncake_master.log
results/mooncake_config.json
results/benchmark.log
results/config.yaml
results/lmcache_command.txt
Expand Down Expand Up @@ -279,7 +282,10 @@ jobs:
name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
path: |
${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }}
if-no-files-found: ignore

- name: Upload GPU metrics
Expand Down
68 changes: 61 additions & 7 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,7 @@ run_eval() {
INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}"
AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}"
AIPERF_FAILED_REQUEST_THRESHOLD=0.10

agentic_pip_install() {
local pip_install=(python3 -m pip install)
Expand All @@ -924,8 +925,21 @@ resolve_trace_source() {
# public-dataset loader names allowed by the inferencex-agentx-mvp
# scenario. Used by recipes whose servers have non-default context
# caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
# unfiltered 052726 corpus and switches to the 256k-capped variant).
local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
# unfiltered corpus and switches to the 256k-capped variant), or
# by recipes that want to pin an older corpus generation.
#
# Default (no override): the 060826 v6 corpus, selected by model family.
# DSv4 (full context) rides the unfiltered base corpus; every non-DSv4
# recipe defaults to the 256k-capped variant because those servers run at
# max_model_len ~256k and would reject >256k requests. Any recipe can still
# pin a specific corpus via WEKA_LOADER_OVERRIDE.
local default_loader
if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then
default_loader="semianalysis_cc_traces_weka_with_subagents_060826"
else
default_loader="semianalysis_cc_traces_weka_with_subagents_060826_256k"
fi
local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
local dataset
case "$loader" in
semianalysis_cc_traces_weka_with_subagents)
Expand All @@ -934,13 +948,31 @@ resolve_trace_source() {
semianalysis_cc_traces_weka_with_subagents_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060226)
dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
;;
semianalysis_cc_traces_weka_with_subagents_060226_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060526)
dataset="semianalysisai/cc-traces-weka-with-subagents-060526"
;;
semianalysis_cc_traces_weka_with_subagents_060526_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060826)
dataset="semianalysisai/cc-traces-weka-with-subagents-060826"
;;
semianalysis_cc_traces_weka_with_subagents_060826_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k"
;;
*)
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2
exit 1
;;
esac
TRACE_SOURCE_FLAG="--public-dataset $loader"
echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
# Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
# for model weights) so subsequent runs read from cache instead of
# re-downloading every job.
Expand Down Expand Up @@ -1017,7 +1049,7 @@ build_replay_cmd() {
# transient low-rate failures from killing long sweeps while still
# catching malformed payloads or server crashes before they get aggregated
# as benchmarkable data.
REPLAY_CMD+=" --failed-request-threshold 0.10"
REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD"
# Sample each trajectory's warmup start position uniformly from
# [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream).
# Avoids starting trajectories right at turn 0 where the KV cache is
Expand All @@ -1031,6 +1063,14 @@ build_replay_cmd() {
# CPU on minimax-m2.5 at high concurrency. Lossless for vLLM (server
# usage is authoritative).
REPLAY_CMD+=" --use-server-token-count"
# Disable DCGM GPU telemetry collection. aiperf's GpuMetricTimeSeries
# freezes its metric schema on the first DCGM scrape, then KeyErrors when
# an optional field (xid_errors, power_violation, encoder_utilization)
# first appears mid-run. We don't consume the gpu_telemetry artifact in
# downstream processing, and the server-metrics path (Prometheus /metrics
# from vLLM) is unaffected by this flag and still gives us KV usage,
# prefix cache hit rate, etc.
REPLAY_CMD+=" --no-gpu-telemetry"
# aiperf's dataset manager (separate from the inference parser) loads
# the model's tokenizer for trace-prompt tokenization regardless of
# --use-server-token-count. Models like kimi (amd/Kimi-K2.5-MXFP4,
Expand Down Expand Up @@ -1070,8 +1110,9 @@ build_replay_cmd() {

write_agentic_result_json() {
# Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json
# into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing
# retry-based existence check is the single success gate.
# into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow checks that
# this file exists; run_agentic_replay_and_write_outputs separately rejects
# aggregates whose request error rate exceeds the configured limit.
local result_dir="$1"
RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \
python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py"
Expand All @@ -1085,6 +1126,7 @@ write_agentic_result_json() {
run_agentic_replay_and_write_outputs() {
local result_dir="$1"
local replay_rc
local validation_rc

echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt"

Expand All @@ -1100,8 +1142,20 @@ run_agentic_replay_and_write_outputs() {
python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
"$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true

set +e
python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \
"$result_dir/aiperf_artifacts" \
--failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD"
validation_rc=$?
set -e

if [ "$replay_rc" -ne 0 ]; then
echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2
return "$replay_rc"
fi

if [ "$validation_rc" -ne 0 ]; then
echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2
return "$validation_rc"
fi
}
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsr1_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
#   - MODEL_PATH set: download $MODEL into it only when the directory is
#     missing or empty, so a pre-staged copy is reused as-is. `hf download`
#     creates the target dir if missing.
#   - MODEL_PATH unset (stand-alone runs): launch from $MODEL itself. An HF id
#     is first fetched into the HF_HUB_CACHE; an absolute local path is used
#     directly, because it is not a repo id that `hf download` can fetch.
if [[ -n "${MODEL_PATH:-}" ]]; then
  if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
    hf download "$MODEL" --local-dir "$MODEL_PATH"
  fi
else
  # Only HF ids need downloading; a leading "/" marks a local checkpoint dir.
  if [[ "$MODEL" != /* ]]; then
    hf download "$MODEL"
  fi
  export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand All @@ -33,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1

python3 -m sglang.launch_server \
--model-path $MODEL \
--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--trust-remote-code \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
#   - MODEL_PATH set: download $MODEL into it only when the directory is
#     missing or empty, so a pre-staged copy is reused as-is. `hf download`
#     creates the target dir if missing.
#   - MODEL_PATH unset (stand-alone runs): launch from $MODEL itself. An HF id
#     is first fetched into the HF_HUB_CACHE; an absolute local path is used
#     directly, because it is not a repo id that `hf download` can fetch.
if [[ -n "${MODEL_PATH:-}" ]]; then
  if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
    hf download "$MODEL" --local-dir "$MODEL_PATH"
  fi
else
  # Only HF ids need downloading; a leading "/" marks a local checkpoint dir.
  if [[ "$MODEL" != /* ]]; then
    hf download "$MODEL"
  fi
  export MODEL_PATH="$MODEL"
fi
rocm-smi
amd-smi || true

Expand All @@ -34,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
export PYTHONNOUSERSITE=1

python3 -m sglang.launch_server \
--model-path=$MODEL \
--model-path=$MODEL_PATH --served-model-name=$MODEL \
--host=0.0.0.0 \
--port=$PORT \
--trust-remote-code \
Expand Down
Loading