From a2ec8f6fe9fbd52a8d4ae24e51936a09001faf4a Mon Sep 17 00:00:00 2001 From: zzy Date: Thu, 28 May 2026 21:28:47 +0800 Subject: [PATCH] Add B200 test and SgLang test results --- .../accuracy/accuracy.json | 8 - .../env_info.json | 48 - .../online/result.json | 163 -- .../result.json | 215 --- .../accuracy/accuracy.json | 8 - .../env_info.json | 48 - .../interactive/result.json | 131 -- .../online/result.json | 151 -- .../result.json | 215 --- .../result.json | 652 ------- .../env_info.json | 25 - .../result.json | 963 ----------- .../result.json | 583 ------- .../result.json | 389 ----- .../accuracy/accuracy.json | 8 + .../burst/result.json | 229 +++ .../env_info.json | 118 ++ .../interactive/result.json | 201 +++ .../offline/result.json | 231 +++ .../online/result.json | 245 +++ .../result.json | 650 +++++++ .../sustained/result.json | 493 ++++++ .../1x/offline/result.json | 231 +++ .../1x/result.json | 236 +++ .../2x/offline/result.json | 231 +++ .../2x/result.json | 236 +++ .../4x/offline/result.json | 231 +++ .../4x/result.json | 236 +++ .../8x/offline/result.json | 231 +++ .../8x/result.json | 236 +++ .../accuracy/accuracy.json | 0 .../env_info.json | 118 ++ .../result.json | 273 +++ .../accuracy/accuracy.json | 8 + .../env_info.json | 118 ++ .../interactive/result.json | 201 +++ .../offline/result.json | 231 +++ .../online/result.json | 233 +++ .../result.json | 598 +++++++ .../sustained/result.json | 493 ++++++ .../accuracy/accuracy.json | 8 + .../burst/result.json | 164 ++ .../env_info.json | 12 +- .../interactive/result.json | 136 ++ .../offline/result.json | 166 ++ .../online/result.json | 168 ++ .../result.json | 615 +++++++ .../speculative/result.json | 166 ++ .../sustained/result.json | 428 +++++ .../bf16/accuracy/accuracy.json | 8 + .../bf16/offline/result.json | 178 ++ .../bf16/online/result.json | 180 ++ .../bf16/result.json | 395 +++++ .../bf16/sustained/result.json | 278 +++ .../env_info.json | 53 + .../result.json | 963 +++++++++++ .../w4a16/accuracy/accuracy.json | 8 + .../w4a16/offline/result.json | 178 ++ .../w4a16/online/result.json | 180 ++ .../w4a16/result.json | 395 +++++ .../w4a16/sustained/result.json | 278 +++ .../w8a16/accuracy/accuracy.json | 8 + .../w8a16/offline/result.json | 178 ++ .../w8a16/online/result.json | 180 ++ .../w8a16/result.json | 395 +++++ .../w8a16/sustained/result.json | 278 +++ .../accuracy/accuracy.json | 8 + .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 154 ++ .../online/result.json | 168 ++ .../result.json | 551 ++++++ .../speculative/result.json | 154 ++ .../sustained/result.json | 428 +++++ .../accuracy/accuracy.json | 0 .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 166 ++ .../online/result.json | 156 ++ .../result.json | 371 ++++ .../sustained/result.json | 278 +++ .../accuracy/accuracy.json | 8 + .../burst/result.json | 164 ++ .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 166 ++ .../online/result.json | 168 ++ .../result.json | 615 +++++++ .../speculative/result.json | 166 ++ .../sustained/result.json | 428 +++++ .../bf16/accuracy/accuracy.json | 8 + .../bf16/offline/result.json | 178 ++ .../bf16/online/result.json | 180 ++ .../bf16/result.json | 395 +++++ .../bf16/sustained/result.json | 278 +++ .../env_info.json | 53 + .../result.json | 963 +++++++++++ .../w4a16/accuracy/accuracy.json | 8 + .../w4a16/offline/result.json | 178 ++ .../w4a16/online/result.json | 180 ++ .../w4a16/result.json | 395 +++++ .../w4a16/sustained/result.json | 278 +++ .../w8a16/accuracy/accuracy.json | 8 + .../w8a16/offline/result.json | 178 ++ .../w8a16/online/result.json | 180 ++ .../w8a16/result.json | 395 +++++ .../w8a16/sustained/result.json | 278 +++ .../accuracy/accuracy.json | 8 + .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 154 ++ .../online/result.json | 168 ++ .../result.json | 551 ++++++ .../speculative/result.json | 154 ++ .../sustained/result.json | 428 +++++ .../accuracy/accuracy.json | 8 + .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 166 ++ .../online/result.json | 156 ++ .../result.json | 371 ++++ .../sustained/result.json | 278 +++ .../accuracy/accuracy.json | 8 + .../burst/result.json | 161 ++ .../env_info.json | 44 + .../interactive/result.json | 139 ++ .../offline/result.json | 196 +++ .../online/result.json | 195 +++ .../result.json | 694 ++++++++ .../sustained/result.json | 456 +++++ .../bf16/accuracy/accuracy.json | 8 + .../bf16/offline/result.json | 221 +++ .../bf16/result.json | 228 +++ .../env_info.json | 44 + .../fp8/accuracy/accuracy.json | 8 + .../fp8/offline/result.json | 221 +++ .../fp8/result.json | 228 +++ .../result.json | 603 +++++++ .../w4a16/accuracy/accuracy.json | 8 + .../w4a16/offline/result.json | 221 +++ .../w4a16/result.json | 228 +++ .../w8a16/accuracy/accuracy.json | 8 + .../w8a16/offline/result.json | 221 +++ .../w8a16/result.json | 228 +++ .../accuracy/accuracy.json | 8 + .../env_info.json | 44 + .../interactive/result.json | 138 ++ .../offline/result.json | 169 ++ .../online/result.json | 192 +++ .../result.json | 617 +++++++ .../sustained/result.json | 456 +++++ .../accuracy/accuracy.json | 8 + .../env_info.json | 44 + .../interactive/result.json | 139 ++ .../offline/result.json | 196 +++ .../online/result.json | 171 ++ .../result.json | 460 +++++ .../sustained/result.json | 292 ++++ .../accuracy/accuracy.json | 8 + .../burst/result.json | 143 ++ .../env_info.json | 32 + .../interactive/result.json | 115 ++ .../offline/result.json | 111 +- .../online/result.json | 147 ++ .../result.json | 594 +++++++ .../speculative}/result.json | 108 +- .../sustained/result.json | 407 +++++ .../bf16/accuracy/accuracy.json | 8 + .../bf16/offline/result.json | 157 ++ .../bf16/online/result.json | 159 ++ .../bf16/result.json | 374 ++++ .../bf16/sustained/result.json | 257 +++ .../env_info.json | 32 + .../fp8/accuracy/accuracy.json | 8 + .../fp8/offline/result.json | 157 ++ .../fp8/online/result.json | 159 ++ .../fp8/result.json | 374 ++++ .../fp8/sustained/result.json | 257 +++ .../result.json | 1499 +++++++++++++++++ .../w4a16/accuracy/accuracy.json | 8 + .../w4a16/offline/result.json | 157 ++ .../w4a16/online/result.json | 159 ++ .../w4a16/result.json | 374 ++++ .../w4a16/sustained/result.json | 257 +++ .../w8a16/accuracy/accuracy.json | 8 + .../w8a16/offline/result.json | 157 ++ .../w8a16/online/result.json | 159 ++ .../w8a16/result.json | 374 ++++ .../w8a16/sustained/result.json | 257 +++ .../w8a8/accuracy/accuracy.json | 8 + .../w8a8/offline/result.json | 157 ++ .../w8a8/online/result.json | 159 ++ .../w8a8/result.json | 374 ++++ .../w8a8/sustained/result.json | 257 +++ .../accuracy/accuracy.json | 8 + .../env_info.json | 32 + .../interactive/result.json | 115 ++ .../offline/result.json | 133 ++ .../online/result.json | 147 ++ .../result.json | 500 ++++++ .../sustained/result.json | 407 +++++ .../accuracy/accuracy.json | 8 + .../env_info.json | 32 + .../interactive/result.json | 115 ++ .../offline/result.json | 107 +- .../online/result.json | 135 ++ .../result.json | 353 ++++ .../sustained/result.json | 257 +++ .../accuracy/accuracy.json | 8 + .../burst/result.json | 164 ++ .../env_info.json | 25 +- .../interactive/result.json | 136 ++ .../offline/result.json | 166 ++ .../online/result.json | 168 ++ .../result.json | 615 +++++++ .../speculative/result.json | 166 ++ .../sustained/result.json | 428 +++++ .../bf16}/accuracy/accuracy.json | 0 .../bf16/offline/result.json | 178 ++ .../bf16/online/result.json | 180 ++ .../bf16/result.json | 395 +++++ .../bf16/sustained/result.json | 278 +++ .../env_info.json | 25 +- .../result.json | 963 +++++++++++ .../w4a16/accuracy/accuracy.json | 8 + .../w4a16/offline/result.json | 178 ++ .../w4a16/online/result.json | 180 ++ .../w4a16/result.json | 395 +++++ .../w4a16/sustained/result.json | 278 +++ .../w8a16/accuracy/accuracy.json | 8 + .../w8a16/offline/result.json | 178 ++ .../w8a16/online/result.json | 180 ++ .../w8a16/result.json | 395 +++++ .../w8a16/sustained/result.json | 278 +++ .../accuracy/accuracy.json | 8 + .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 154 ++ .../online/result.json | 168 ++ .../result.json | 551 ++++++ .../speculative/result.json | 154 ++ .../sustained/result.json | 428 +++++ .../accuracy/accuracy.json | 8 + .../env_info.json | 53 + .../interactive/result.json | 136 ++ .../offline/result.json | 98 +- .../online/result.json | 156 ++ .../result.json | 371 ++++ .../sustained/result.json | 278 +++ .../accuracy/accuracy.json | 8 - .../env_info.json | 33 - .../online/result.json | 158 -- .../result.json | 210 --- .../accuracy/accuracy.json | 8 - .../env_info.json | 33 - .../interactive/result.json | 126 -- .../online/result.json | 146 -- .../result.json | 210 --- 258 files changed, 50069 insertions(+), 4790 deletions(-) delete mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json delete mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json delete mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json delete mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json delete mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json delete mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json delete mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json delete mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json delete mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json delete mode 100644 results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/result.json delete mode 100644 results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/env_info.json delete mode 100644 results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/result.json delete mode 100644 results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/result.json delete mode 100644 results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/burst/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/env_info.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/interactive/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/sustained/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/result.json rename results/community/{nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd => nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413}/accuracy/accuracy.json (100%) create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/env_info.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/env_info.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/interactive/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/result.json create mode 100644 results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/sustained/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/burst/result.json rename results/community/{nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd => nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc}/env_info.json (50%) create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/interactive/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/speculative/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/sustained/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/sustained/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/env_info.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/sustained/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/sustained/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/accuracy/accuracy.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/env_info.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/interactive/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/speculative/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/sustained/result.json rename results/community/{nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8 => nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2}/accuracy/accuracy.json (100%) create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/env_info.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/interactive/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/offline/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/online/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/result.json create mode 100644 results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/sustained/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/accuracy/accuracy.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/burst/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/env_info.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/interactive/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/offline/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/online/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/speculative/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/sustained/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/accuracy/accuracy.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/offline/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/online/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/sustained/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/env_info.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/offline/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/online/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/sustained/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/offline/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/online/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/sustained/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/accuracy/accuracy.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/env_info.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/interactive/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/offline/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/online/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/speculative/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/sustained/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/accuracy/accuracy.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/env_info.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/interactive/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/offline/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/online/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/result.json create mode 100644 results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/sustained/result.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/burst/result.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/env_info.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/interactive/result.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/online/result.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/result.json create mode 100644 results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/sustained/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/env_info.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/result.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/env_info.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/interactive/result.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/online/result.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/result.json create mode 100644 results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/sustained/result.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/accuracy/accuracy.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/env_info.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/interactive/result.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/offline/result.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/online/result.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/result.json create mode 100644 results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/burst/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/env_info.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/interactive/result.json rename results/community/{mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0 => nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd}/offline/result.json (54%) create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/result.json rename results/community/{tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline => nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/speculative}/result.json (50%) create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/offline/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/env_info.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/offline/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/offline/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/offline/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/offline/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/env_info.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/interactive/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/offline/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/sustained/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/accuracy/accuracy.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/env_info.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/interactive/result.json rename results/community/{tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c => nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697}/offline/result.json (51%) create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/online/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/result.json create mode 100644 results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/sustained/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/accuracy/accuracy.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/burst/result.json rename results/community/{nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8 => nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5}/env_info.json (63%) create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/interactive/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/offline/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/online/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/speculative/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/sustained/result.json rename results/community/{nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97 => nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16}/accuracy/accuracy.json (100%) create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/offline/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/online/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/sustained/result.json rename results/community/{nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97 => nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb}/env_info.json (63%) create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/offline/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/online/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/sustained/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/accuracy/accuracy.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/offline/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/online/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/sustained/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/accuracy/accuracy.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/env_info.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/interactive/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/offline/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/online/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/speculative/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/sustained/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/accuracy/accuracy.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/env_info.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/interactive/result.json rename results/community/{mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d => nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50}/offline/result.json (53%) create mode 100644 results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/online/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/result.json create mode 100644 results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/sustained/result.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json delete mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json deleted file mode 100644 index 72422349..00000000 --- a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "subset_score": 0.07, - "baseline_delta": -0.53, - "valid": false, - "framework": "vllm-musa", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json deleted file mode 100644 index 4244ef73..00000000 --- a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "collected_at": "2026-05-18T09:21:31.092840+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json deleted file mode 100644 index 064d6b8a..00000000 --- a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json +++ /dev/null @@ -1,163 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_A", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", - "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", - "count": 1, - "memory_gb": 48.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T09:21:31.092840+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" - }, - "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" - }, - "model": { - "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", - "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8.0, - "precision": "BF16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenario": "online", - "num_runs": 3, - "warmup_runs": 1, - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "extra_config": null, - "runtime_metrics": null - }, - "metrics": { - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 5, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5.0, - "ttft_ms_p50": 194.45, - "ttft_ms_p90": 315.05, - "ttft_ms_p99": 424.55, - "tpot_ms_p50": 201.93, - "tpot_ms_p90": 253.8, - "tpot_ms_p99": 471.28, - "elapsed_seconds_median": 137.6, - "sla_met": true - }, - { - "target_qps": 25, - "achieved_qps": 25.0, - "ttft_ms_p50": 4796.14, - "ttft_ms_p90": 8459.18, - "ttft_ms_p99": 9348.86, - "tpot_ms_p50": 355.01, - "tpot_ms_p90": 6430.04, - "tpot_ms_p99": 15579.83, - "elapsed_seconds_median": 93.0, - "sla_met": false - }, - { - "target_qps": 100, - "achieved_qps": 100.0, - "ttft_ms_p50": 10354.27, - "ttft_ms_p90": 17651.16, - "ttft_ms_p99": 19078.89, - "tpot_ms_p50": 849.82, - "tpot_ms_p90": 8677.79, - "tpot_ms_p99": 14281.03, - "elapsed_seconds_median": 90.0, - "sla_met": false - } - ] - } - }, - "accuracy": { - "subset_score": null, - "baseline_delta": null, - "valid": false, - "notes": "Run --scenario accuracy to check model accuracy." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "17:53:38", - "run_id": "cabb7bd0", - "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", - "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T09:37:13.745117+00:00", - "benchmark_end_time": "2026-05-18T09:53:38.865501+00:00", - "benchmark_elapsed_minutes": 16.4, - "model_load_seconds": 122.7 - } -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json deleted file mode 100644 index e4b1093e..00000000 --- a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json +++ /dev/null @@ -1,215 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_A", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", - "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", - "count": 1, - "memory_gb": 48.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T09:21:31.092840+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" - }, - "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" - }, - "model": { - "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", - "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8.0, - "precision": "BF16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "online" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": null - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 8, - "throughput_tokens_per_sec": 332.62, - "throughput_tokens_per_sec_per_chip": 332.62, - "throughput_tokens_per_sec_total": 922.83, - "elapsed_seconds_median": 43.4, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 32, - "throughput_tokens_per_sec": 331.64, - "throughput_tokens_per_sec_per_chip": 331.64, - "throughput_tokens_per_sec_total": 920.1, - "elapsed_seconds_median": 43.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 128, - "throughput_tokens_per_sec": 331.76, - "throughput_tokens_per_sec_per_chip": 331.76, - "throughput_tokens_per_sec_total": 920.46, - "elapsed_seconds_median": 43.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 5, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5.0, - "ttft_ms_p50": 194.45, - "ttft_ms_p90": 315.05, - "ttft_ms_p99": 424.55, - "tpot_ms_p50": 201.93, - "tpot_ms_p90": 253.8, - "tpot_ms_p99": 471.28, - "elapsed_seconds_median": 137.6, - "sla_met": true - }, - { - "target_qps": 25, - "achieved_qps": 25.0, - "ttft_ms_p50": 4796.14, - "ttft_ms_p90": 8459.18, - "ttft_ms_p99": 9348.86, - "tpot_ms_p50": 355.01, - "tpot_ms_p90": 6430.04, - "tpot_ms_p99": 15579.83, - "elapsed_seconds_median": 93.0, - "sla_met": false - }, - { - "target_qps": 100, - "achieved_qps": 100.0, - "ttft_ms_p50": 10354.27, - "ttft_ms_p90": 17651.16, - "ttft_ms_p99": 19078.89, - "tpot_ms_p50": 849.82, - "tpot_ms_p90": 8677.79, - "tpot_ms_p99": 14281.03, - "elapsed_seconds_median": 90.0, - "sla_met": false - } - ] - } - }, - "accuracy": { - "subset_score": 0.07, - "baseline_delta": -0.53, - "valid": false, - "framework": "vllm-musa", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "17:34:52", - "run_id": "cabb7bd0", - "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", - "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": "Partial run: ['offline', 'online'] succeeded, ['accuracy'] failed.", - "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00", - "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00", - "benchmark_elapsed_minutes": 25.1, - "model_load_seconds": 116.8, - "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.", - "scenario_dirs": { - "offline": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline", - "online": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online" - } - } -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json deleted file mode 100644 index 63c6e929..00000000 --- a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "subset_score": 0.07, - "baseline_delta": -0.31, - "valid": false, - "framework": "vllm-musa", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json deleted file mode 100644 index 31f501be..00000000 --- a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "collected_at": "2026-05-18T08:40:55.208034+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json deleted file mode 100644 index 4f5ff811..00000000 --- a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", - "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", - "count": 1, - "memory_gb": 48.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T08:40:55.208034+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" - }, - "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "BF16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenario": "interactive", - "num_runs": 3, - "warmup_runs": 1, - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "extra_config": null, - "runtime_metrics": null - }, - "metrics": { - "interactive": { - "ttft_ms_p50": 25.89, - "ttft_ms_p90": 27.18, - "ttft_ms_p99": 28.51, - "tpot_ms_p50": 14.85, - "tpot_ms_p90": 15.17, - "tpot_ms_p99": 15.5, - "peak_memory_gb": null, - "elapsed_seconds_median": 481.4 - } - }, - "accuracy": { - "subset_score": null, - "baseline_delta": null, - "valid": false, - "notes": "Run --scenario accuracy to check model accuracy." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "17:21:09", - "run_id": "4f66d29d", - "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", - "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T08:56:46.686185+00:00", - "benchmark_end_time": "2026-05-18T09:21:09.800661+00:00", - "benchmark_elapsed_minutes": 24.4, - "model_load_seconds": 151.2 - } -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json deleted file mode 100644 index eb13372d..00000000 --- a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json +++ /dev/null @@ -1,151 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", - "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", - "count": 1, - "memory_gb": 48.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T08:40:55.208034+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" - }, - "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "BF16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenario": "online", - "num_runs": 3, - "warmup_runs": 1, - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "extra_config": null, - "runtime_metrics": null - }, - "metrics": { - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 40, - "results_by_qps": [ - { - "target_qps": 10, - "achieved_qps": 10.0, - "ttft_ms_p50": 47.68, - "ttft_ms_p90": 96.31, - "ttft_ms_p99": 956.22, - "tpot_ms_p50": 47.25, - "tpot_ms_p90": 80.82, - "tpot_ms_p99": 131.63, - "elapsed_seconds_median": 37.8, - "sla_met": false - }, - { - "target_qps": 40, - "achieved_qps": 40.0, - "ttft_ms_p50": 94.5, - "ttft_ms_p90": 194.64, - "ttft_ms_p99": 331.88, - "tpot_ms_p50": 74.76, - "tpot_ms_p90": 287.01, - "tpot_ms_p99": 444.19, - "elapsed_seconds_median": 19.0, - "sla_met": true - } - ] - } - }, - "accuracy": { - "subset_score": null, - "baseline_delta": null, - "valid": false, - "notes": "Run --scenario accuracy to check model accuracy." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "16:53:54", - "run_id": "4f66d29d", - "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", - "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T08:51:01.188901+00:00", - "benchmark_end_time": "2026-05-18T08:53:54.250762+00:00", - "benchmark_elapsed_minutes": 2.9, - "model_load_seconds": 132.6 - } -} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json deleted file mode 100644 index a1c073de..00000000 --- a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json +++ /dev/null @@ -1,215 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", - "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", - "count": 1, - "memory_gb": 48.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T08:40:55.208034+00:00", - "accelerators": [ - { - "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", - "firmware_version": null, - "supports_bf16": true - } - ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" - }, - "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "BF16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "online", - "interactive" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": null - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 1994.51, - "throughput_tokens_per_sec_per_chip": 1994.51, - "throughput_tokens_per_sec_total": 3642.41, - "elapsed_seconds_median": 12.5, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 16, - "throughput_tokens_per_sec": 1998.44, - "throughput_tokens_per_sec_per_chip": 1998.44, - "throughput_tokens_per_sec_total": 3649.59, - "elapsed_seconds_median": 12.5, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 64, - "throughput_tokens_per_sec": 2004.02, - "throughput_tokens_per_sec_per_chip": 2004.02, - "throughput_tokens_per_sec_total": 3659.77, - "elapsed_seconds_median": 12.5, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 40, - "results_by_qps": [ - { - "target_qps": 10, - "achieved_qps": 10.0, - "ttft_ms_p50": 47.68, - "ttft_ms_p90": 96.31, - "ttft_ms_p99": 956.22, - "tpot_ms_p50": 47.25, - "tpot_ms_p90": 80.82, - "tpot_ms_p99": 131.63, - "elapsed_seconds_median": 37.8, - "sla_met": false - }, - { - "target_qps": 40, - "achieved_qps": 40.0, - "ttft_ms_p50": 94.5, - "ttft_ms_p90": 194.64, - "ttft_ms_p99": 331.88, - "tpot_ms_p50": 74.76, - "tpot_ms_p90": 287.01, - "tpot_ms_p99": 444.19, - "elapsed_seconds_median": 19.0, - "sla_met": true - } - ] - }, - "interactive": { - "ttft_ms_p50": 25.89, - "ttft_ms_p90": 27.18, - "ttft_ms_p99": 28.51, - "tpot_ms_p50": 14.85, - "tpot_ms_p90": 15.17, - "tpot_ms_p99": 15.5, - "peak_memory_gb": null, - "elapsed_seconds_median": 481.4 - } - }, - "accuracy": { - "subset_score": 0.07, - "baseline_delta": -0.31, - "valid": false, - "framework": "vllm-musa", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "16:48:27", - "run_id": "4f66d29d", - "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", - "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": "Partial run: ['offline', 'online', 'interactive'] succeeded, ['accuracy'] failed.", - "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00", - "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00", - "benchmark_elapsed_minutes": 29.8, - "model_load_seconds": 146.8, - "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.", - "scenario_dirs": { - "offline": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline", - "online": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online", - "interactive": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive" - } - } -} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/result.json b/results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/result.json deleted file mode 100644 index adcbafe3..00000000 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/result.json +++ /dev/null @@ -1,652 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_A", - "implementation_id": "nvidia_sglang_c43a8309", - "chip": { - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 40, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-06T11:15:11.081772+00:00", - "accelerators": [ - { - "index": 0, - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "memory_gb": 40, - "driver_version": "565.57.01", - "firmware_version": null, - "compute_capability": "8.0", - "supports_bf16": true - } - ], - "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tPXB\tNODE\tSYS\t0-31,64-95\t0\t\tN/A\nNIC0\tPXB\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tPXB\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", - "intra_node_interconnect": null, - "cpu": { - "model": "AMD EPYC 7532 32-Core Processor", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.7, - "pcie_generation": "PCIe Gen 4", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_2", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_3", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20", - "kernel_version": "5.15.0-60-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "SGLang", - "framework_version": "0.5.6", - "driver_version": "565.57.01", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20" - }, - "model": { - "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", - "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8, - "precision": "BF16", - "effective_dtype": "bfloat16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "online", - "interactive", - "sustained", - "speculative", - "burst" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": null - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 8, - "throughput_tokens_per_sec": 3144.73, - "throughput_tokens_per_sec_per_chip": 3144.73, - "elapsed_seconds_median": 11.2, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 32, - "throughput_tokens_per_sec": 3146.66, - "throughput_tokens_per_sec_per_chip": 3146.66, - "elapsed_seconds_median": 11.2, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 128, - "throughput_tokens_per_sec": 3146.09, - "throughput_tokens_per_sec_per_chip": 3146.09, - "elapsed_seconds_median": 11.2, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 100, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5, - "ttft_ms_p50": 43.91, - "ttft_ms_p90": 62.26, - "ttft_ms_p99": 972.47, - "tpot_ms_p50": 15.63, - "tpot_ms_p90": 17.36, - "tpot_ms_p99": 18.58, - "elapsed_seconds_median": 66.1, - "sla_met": false - }, - { - "target_qps": 25, - "achieved_qps": 25, - "ttft_ms_p50": 52.85, - "ttft_ms_p90": 67.65, - "ttft_ms_p99": 80.71, - "tpot_ms_p50": 36.16, - "tpot_ms_p90": 41.45, - "tpot_ms_p99": 57.42, - "elapsed_seconds_median": 17.1, - "sla_met": true - }, - { - "target_qps": 100, - "achieved_qps": 100, - "ttft_ms_p50": 50.85, - "ttft_ms_p90": 62.88, - "ttft_ms_p99": 245.1, - "tpot_ms_p50": 41.47, - "tpot_ms_p90": 53.07, - "tpot_ms_p99": 177.42, - "elapsed_seconds_median": 10.8, - "sla_met": true - } - ] - }, - "interactive": { - "ttft_ms_p50": 32.52, - "ttft_ms_p90": 44.71, - "ttft_ms_p99": 61.84, - "tpot_ms_p50": 12.93, - "tpot_ms_p90": 12.98, - "tpot_ms_p99": 13.03, - "peak_memory_gb": null, - "elapsed_seconds_median": 381.4 - }, - "sustained": { - "sustained_concurrency": 8, - "duration_minutes": 30, - "warmup_minutes": 2, - "sample_interval_seconds": 60, - "samples": [ - { - "minute": 1, - "is_warmup": true, - "throughput_tokens_per_sec": 477, - "tokens_out": 28638, - "tokens_in": 0, - "requests_completed": 154, - "ttft_ms_p50": 49.6, - "ttft_ms_p99": 6750.3 - }, - { - "minute": 2, - "is_warmup": false, - "throughput_tokens_per_sec": 566.2, - "tokens_out": 33972, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.6, - "ttft_ms_p99": 61.2 - }, - { - "minute": 3, - "is_warmup": false, - "throughput_tokens_per_sec": 560.8, - "tokens_out": 33639, - "tokens_in": 0, - "requests_completed": 177, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 61.3 - }, - { - "minute": 4, - "is_warmup": false, - "throughput_tokens_per_sec": 565.3, - "tokens_out": 33929, - "tokens_in": 0, - "requests_completed": 181, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 60.8 - }, - { - "minute": 5, - "is_warmup": false, - "throughput_tokens_per_sec": 561.4, - "tokens_out": 33685, - "tokens_in": 0, - "requests_completed": 179, - "ttft_ms_p50": 44.2, - "ttft_ms_p99": 61.6 - }, - { - "minute": 6, - "is_warmup": false, - "throughput_tokens_per_sec": 561.9, - "tokens_out": 33707, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 60.8 - }, - { - "minute": 7, - "is_warmup": false, - "throughput_tokens_per_sec": 570, - "tokens_out": 34190, - "tokens_in": 0, - "requests_completed": 179, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 61.4 - }, - { - "minute": 8, - "is_warmup": false, - "throughput_tokens_per_sec": 558.3, - "tokens_out": 33498, - "tokens_in": 0, - "requests_completed": 177, - "ttft_ms_p50": 44.6, - "ttft_ms_p99": 62.6 - }, - { - "minute": 9, - "is_warmup": false, - "throughput_tokens_per_sec": 563.3, - "tokens_out": 33801, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 61.8 - }, - { - "minute": 10, - "is_warmup": false, - "throughput_tokens_per_sec": 552.7, - "tokens_out": 33163, - "tokens_in": 0, - "requests_completed": 176, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 52.4 - }, - { - "minute": 11, - "is_warmup": false, - "throughput_tokens_per_sec": 569.3, - "tokens_out": 34157, - "tokens_in": 0, - "requests_completed": 181, - "ttft_ms_p50": 44.1, - "ttft_ms_p99": 60.4 - }, - { - "minute": 12, - "is_warmup": false, - "throughput_tokens_per_sec": 558.9, - "tokens_out": 33526, - "tokens_in": 0, - "requests_completed": 177, - "ttft_ms_p50": 44.2, - "ttft_ms_p99": 47.9 - }, - { - "minute": 13, - "is_warmup": false, - "throughput_tokens_per_sec": 568.4, - "tokens_out": 34113, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 139.2 - }, - { - "minute": 14, - "is_warmup": false, - "throughput_tokens_per_sec": 557.2, - "tokens_out": 33424, - "tokens_in": 0, - "requests_completed": 178, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 61.5 - }, - { - "minute": 15, - "is_warmup": false, - "throughput_tokens_per_sec": 565.5, - "tokens_out": 33942, - "tokens_in": 0, - "requests_completed": 181, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 61.7 - }, - { - "minute": 16, - "is_warmup": false, - "throughput_tokens_per_sec": 554.1, - "tokens_out": 33238, - "tokens_in": 0, - "requests_completed": 175, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 61.2 - }, - { - "minute": 17, - "is_warmup": false, - "throughput_tokens_per_sec": 563.7, - "tokens_out": 33832, - "tokens_in": 0, - "requests_completed": 178, - "ttft_ms_p50": 44.4, - "ttft_ms_p99": 61.7 - }, - { - "minute": 18, - "is_warmup": false, - "throughput_tokens_per_sec": 563.3, - "tokens_out": 33783, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 62.1 - }, - { - "minute": 19, - "is_warmup": false, - "throughput_tokens_per_sec": 565, - "tokens_out": 33912, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.6, - "ttft_ms_p99": 62.2 - }, - { - "minute": 20, - "is_warmup": false, - "throughput_tokens_per_sec": 563.3, - "tokens_out": 33773, - "tokens_in": 0, - "requests_completed": 179, - "ttft_ms_p50": 44.6, - "ttft_ms_p99": 61.6 - }, - { - "minute": 21, - "is_warmup": false, - "throughput_tokens_per_sec": 564.7, - "tokens_out": 33889, - "tokens_in": 0, - "requests_completed": 178, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 61.3 - }, - { - "minute": 22, - "is_warmup": false, - "throughput_tokens_per_sec": 564.3, - "tokens_out": 33853, - "tokens_in": 0, - "requests_completed": 179, - "ttft_ms_p50": 44.6, - "ttft_ms_p99": 61.1 - }, - { - "minute": 23, - "is_warmup": false, - "throughput_tokens_per_sec": 562.3, - "tokens_out": 33744, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 61.2 - }, - { - "minute": 24, - "is_warmup": false, - "throughput_tokens_per_sec": 569.5, - "tokens_out": 34180, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 61.3 - }, - { - "minute": 25, - "is_warmup": false, - "throughput_tokens_per_sec": 550.8, - "tokens_out": 33047, - "tokens_in": 0, - "requests_completed": 176, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 61.5 - }, - { - "minute": 26, - "is_warmup": false, - "throughput_tokens_per_sec": 562.5, - "tokens_out": 33749, - "tokens_in": 0, - "requests_completed": 178, - "ttft_ms_p50": 44.5, - "ttft_ms_p99": 63.9 - }, - { - "minute": 27, - "is_warmup": false, - "throughput_tokens_per_sec": 561.7, - "tokens_out": 33689, - "tokens_in": 0, - "requests_completed": 179, - "ttft_ms_p50": 44.7, - "ttft_ms_p99": 61.6 - }, - { - "minute": 28, - "is_warmup": false, - "throughput_tokens_per_sec": 566.5, - "tokens_out": 34010, - "tokens_in": 0, - "requests_completed": 181, - "ttft_ms_p50": 44.7, - "ttft_ms_p99": 141.7 - }, - { - "minute": 29, - "is_warmup": false, - "throughput_tokens_per_sec": 558.2, - "tokens_out": 33464, - "tokens_in": 0, - "requests_completed": 178, - "ttft_ms_p50": 44.3, - "ttft_ms_p99": 60.9 - } - ], - "sustained_throughput_tokens_per_sec": 562.5, - "throttle_ratio": 0.966, - "throttle_onset_minute": null, - "ttft_p99_drift_ms": -0.3, - "throughput_post_warmup_reliability": { - "n": 28, - "mean": 562.5, - "std": 4.9, - "cv_pct": 0.86, - "stability": "stable", - "runs": [ - 566.2, - 560.8, - 565.3, - 561.4, - 561.9, - 570.0, - 558.3, - 563.3, - 552.7, - 569.3, - 558.9, - 568.4, - 557.2, - 565.5, - 554.1, - 563.7, - 563.3, - 565.0, - 563.3, - 564.7, - 564.3, - 562.3, - 569.5, - 550.8, - 562.5, - 561.7, - 566.5, - 558.2 - ] - } - }, - "speculative": { - "results_by_concurrency": [ - { - "client_concurrency": 8, - "throughput_tokens_per_sec": 705.16, - "throughput_tokens_per_sec_per_chip": 705.16, - "elapsed_seconds_median": 49.7, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 32, - "throughput_tokens_per_sec": 703.58, - "throughput_tokens_per_sec_per_chip": 703.58, - "elapsed_seconds_median": 49.8, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 128, - "throughput_tokens_per_sec": 704.29, - "throughput_tokens_per_sec_per_chip": 704.29, - "elapsed_seconds_median": 49.7, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "burst": { - "sla_ttft_ms": 500, - "burst_steady_qps": 5, - "burst_peak_qps": 25, - "burst_duration_seconds": 30, - "burst_interval_seconds": 120, - "steady_requests_total": 1812, - "burst_requests_total": 2245, - "steady_ttft_p50_ms": 43.06, - "steady_ttft_p99_ms": 3985.36, - "burst_ttft_p50_ms": 57.82, - "burst_ttft_p99_ms": 99.11, - "sla_met_during_burst": true, - "burst_degradation_ratio": 0.025, - "results_by_cycle": [ - { - "cycle": 1, - "steady_requests": 581, - "burst_requests": 760, - "steady_ttft_p99_ms": 5093.98, - "burst_ttft_p99_ms": 103.16 - }, - { - "cycle": 2, - "steady_requests": 595, - "burst_requests": 734, - "steady_ttft_p99_ms": 63.56, - "burst_ttft_p99_ms": 90.45 - }, - { - "cycle": 3, - "steady_requests": 636, - "burst_requests": 751, - "steady_ttft_p99_ms": 65.44, - "burst_ttft_p99_ms": 85.05 - } - ] - } - }, - "accuracy": { - "subset_score": 0.61, - "baseline_delta": 0.01, - "valid": true, - "framework": "SGLang", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." - }, - "meta": { - "submitted_by": "Gong-K", - "submission_type": "individual", - "date": "2026-05-06", - "time": "11:21:34", - "run_id": "958afbbd", - "run_name": "nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd", - "flagged": null, - "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", - "env_info_file": "../env_info.json", - "log_file": null, - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-06T11:19:15.947406+00:00", - "benchmark_end_time": "2026-05-06T11:21:34.758403+00:00", - "benchmark_elapsed_minutes": 74.2, - "model_load_seconds": 50, - "benchmark_elapsed_minutes_note": "Total across [offline, online, interactive, sustained, speculative, burst] scenarios.", - "scenario_dirs": { - "offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/offline", - "online": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/online", - "interactive": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/interactive", - "sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/sustained", - "speculative": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/speculative", - "burst": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/burst" - } - } -} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/env_info.json b/results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/env_info.json deleted file mode 100644 index 967b71e8..00000000 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/env_info.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "collected_at": "2026-04-30T00:00:00Z", - "accelerators": [ - { - "index": 0, - "name": "NVIDIA A100-SXM4-40GB", - "memory_gb": 40, - "driver_version": "565.57.01", - "firmware_version": null - } - ], - "accelerator_topology": null, - "cpu": { - "model": "unknown", - "physical_cores": 1, - "logical_cores": 1, - "numa_nodes": 1 - }, - "system_memory_gb": 0, - "pcie_generation": "unknown", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "kernel_version": "unknown", - "runtime_version": "CUDA 12.8" -} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/result.json b/results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/result.json deleted file mode 100644 index 5105bc28..00000000 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/result.json +++ /dev/null @@ -1,963 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_C", - "implementation_id": "nvidia_sglang_c43a8309", - "chip": { - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 40, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "software": { - "framework": "SGLang", - "framework_version": "0.5.6", - "driver_version": "565.57.01", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20" - }, - "model": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8, - "precision": "BF16", - "effective_dtype": "bfloat16", - "quantization_method": null, - "model_format": "HuggingFace original", - "_note": "suite model_id. Each precision level uses its own quantized checkpoint." - }, - "task": { - "scenarios_run": [ - "accuracy", - "offline", - "online", - "sustained" - ], - "precision_levels_run": [ - "BF16", - "FP8", - "W8A8", - "W8A16", - "W4A16" - ], - "precision_levels_skipped": [ - "FP16" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": null - }, - "metrics": { - "quantization": { - "results_by_precision": [ - { - "precision": "BF16", - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "best_throughput_tokens_per_sec": 3160.74, - "accuracy_score": 0.57, - "accuracy_baseline_delta": 0.01, - "accuracy_valid": true, - "quality_efficiency": 1801.6, - "speedup_vs_bf16": 1, - "results_by_concurrency": [ - { - "client_concurrency": 1, - "throughput_tokens_per_sec": 3149.6, - "throughput_tokens_per_sec_per_chip": 3149.6, - "elapsed_seconds_median": 11.4, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 3160.74, - "throughput_tokens_per_sec_per_chip": 3160.74, - "elapsed_seconds_median": 11.3, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 16, - "throughput_tokens_per_sec": 3148.17, - "throughput_tokens_per_sec_per_chip": 3148.17, - "elapsed_seconds_median": 11.3, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 64, - "throughput_tokens_per_sec": 3156.58, - "throughput_tokens_per_sec_per_chip": 3156.58, - "elapsed_seconds_median": 11.3, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ], - "result_dir": "bf16", - "effective_dtype": "bfloat16", - "quantization_method": null - }, - { - "precision": "W8A16", - "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", - "best_throughput_tokens_per_sec": 3396.91, - "accuracy_score": 0.58, - "accuracy_baseline_delta": -0.01, - "accuracy_valid": true, - "quality_efficiency": 1970.2, - "speedup_vs_bf16": 1.075, - "results_by_concurrency": [ - { - "client_concurrency": 1, - "throughput_tokens_per_sec": 3396.91, - "throughput_tokens_per_sec_per_chip": 3396.91, - "elapsed_seconds_median": 10.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 3316.93, - "throughput_tokens_per_sec_per_chip": 3316.93, - "elapsed_seconds_median": 10.8, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 16, - "throughput_tokens_per_sec": 3387.33, - "throughput_tokens_per_sec_per_chip": 3387.33, - "elapsed_seconds_median": 10.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 64, - "throughput_tokens_per_sec": 3395.75, - "throughput_tokens_per_sec_per_chip": 3395.75, - "elapsed_seconds_median": 10.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ], - "result_dir": "w8a16", - "effective_dtype": "auto", - "quantization_method": "compressed-tensors" - }, - { - "precision": "W4A16", - "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", - "best_throughput_tokens_per_sec": 1817.91, - "accuracy_score": 0.56, - "accuracy_baseline_delta": -0.01, - "accuracy_valid": true, - "quality_efficiency": 1018, - "speedup_vs_bf16": 0.575, - "results_by_concurrency": [ - { - "client_concurrency": 1, - "throughput_tokens_per_sec": 1808.4, - "throughput_tokens_per_sec_per_chip": 1808.4, - "elapsed_seconds_median": 19, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 1810.14, - "throughput_tokens_per_sec_per_chip": 1810.14, - "elapsed_seconds_median": 19, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 16, - "throughput_tokens_per_sec": 1810.03, - "throughput_tokens_per_sec_per_chip": 1810.03, - "elapsed_seconds_median": 19, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 64, - "throughput_tokens_per_sec": 1817.91, - "throughput_tokens_per_sec_per_chip": 1817.91, - "elapsed_seconds_median": 19, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ], - "result_dir": "w4a16", - "effective_dtype": "auto", - "quantization_method": "gptq" - } - ] - }, - "derived": {}, - "quantization_online": { - "results_by_precision": [ - { - "precision": "BF16", - "max_valid_qps": 50, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5, - "ttft_ms_p50": 44.84, - "ttft_ms_p90": 63.85, - "ttft_ms_p99": 1627.45, - "tpot_ms_p50": 15.7, - "tpot_ms_p90": 17.63, - "tpot_ms_p99": 19.03, - "elapsed_seconds_median": 66.2, - "sla_met": false - }, - { - "target_qps": 10, - "achieved_qps": 10, - "ttft_ms_p50": 47, - "ttft_ms_p90": 56.65, - "ttft_ms_p99": 65.26, - "tpot_ms_p50": 21.61, - "tpot_ms_p90": 23.39, - "tpot_ms_p99": 26.07, - "elapsed_seconds_median": 32.9, - "sla_met": true - }, - { - "target_qps": 25, - "achieved_qps": 25, - "ttft_ms_p50": 52.95, - "ttft_ms_p90": 66.65, - "ttft_ms_p99": 78.1, - "tpot_ms_p50": 35.4, - "tpot_ms_p90": 40.44, - "tpot_ms_p99": 49.08, - "elapsed_seconds_median": 17.1, - "sla_met": true - }, - { - "target_qps": 50, - "achieved_qps": 50, - "ttft_ms_p50": 51.84, - "ttft_ms_p90": 67.78, - "ttft_ms_p99": 87.9, - "tpot_ms_p50": 41.22, - "tpot_ms_p90": 49.79, - "tpot_ms_p99": 75.08, - "elapsed_seconds_median": 12.9, - "sla_met": true - } - ] - }, - { - "precision": "W8A16", - "max_valid_qps": 50, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5, - "ttft_ms_p50": 36.98, - "ttft_ms_p90": 65.92, - "ttft_ms_p99": 1707.78, - "tpot_ms_p50": 9.64, - "tpot_ms_p90": 10.99, - "tpot_ms_p99": 14.19, - "elapsed_seconds_median": 64.8, - "sla_met": false - }, - { - "target_qps": 10, - "achieved_qps": 10, - "ttft_ms_p50": 42.23, - "ttft_ms_p90": 54.06, - "ttft_ms_p99": 61.13, - "tpot_ms_p50": 15.34, - "tpot_ms_p90": 19.73, - "tpot_ms_p99": 21.49, - "elapsed_seconds_median": 31.8, - "sla_met": true - }, - { - "target_qps": 25, - "achieved_qps": 25, - "ttft_ms_p50": 54.16, - "ttft_ms_p90": 70.81, - "ttft_ms_p99": 86.6, - "tpot_ms_p50": 38.87, - "tpot_ms_p90": 45.19, - "tpot_ms_p99": 56.31, - "elapsed_seconds_median": 17.6, - "sla_met": true - }, - { - "target_qps": 50, - "achieved_qps": 50, - "ttft_ms_p50": 55.28, - "ttft_ms_p90": 74.41, - "ttft_ms_p99": 101.43, - "tpot_ms_p50": 47.2, - "tpot_ms_p90": 55.92, - "tpot_ms_p99": 82.54, - "elapsed_seconds_median": 14.1, - "sla_met": true - } - ] - }, - { - "precision": "W4A16", - "max_valid_qps": 50, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5, - "ttft_ms_p50": 57.96, - "ttft_ms_p90": 100.93, - "ttft_ms_p99": 1674.78, - "tpot_ms_p50": 23.16, - "tpot_ms_p90": 36.67, - "tpot_ms_p99": 42.7, - "elapsed_seconds_median": 66.6, - "sla_met": false - }, - { - "target_qps": 10, - "achieved_qps": 10, - "ttft_ms_p50": 65.68, - "ttft_ms_p90": 85.76, - "ttft_ms_p99": 92.43, - "tpot_ms_p50": 42.17, - "tpot_ms_p90": 43.43, - "tpot_ms_p99": 46.09, - "elapsed_seconds_median": 35.7, - "sla_met": true - }, - { - "target_qps": 25, - "achieved_qps": 25, - "ttft_ms_p50": 64.12, - "ttft_ms_p90": 88.09, - "ttft_ms_p99": 113.73, - "tpot_ms_p50": 53.25, - "tpot_ms_p90": 59.64, - "tpot_ms_p99": 73.73, - "elapsed_seconds_median": 20.9, - "sla_met": true - }, - { - "target_qps": 50, - "achieved_qps": 50, - "ttft_ms_p50": 57.15, - "ttft_ms_p90": 81.87, - "ttft_ms_p99": 103.31, - "tpot_ms_p50": 55.67, - "tpot_ms_p90": 67.41, - "tpot_ms_p99": 86.73, - "elapsed_seconds_median": 16.4, - "sla_met": true - } - ] - } - ] - }, - "quantization_sustained": { - "results_by_precision": [ - { - "precision": "BF16", - "sustained_throughput_tokens_per_sec": 558.6, - "throttle_ratio": 0.889, - "throttle_onset_minute": 1, - "ttft_p99_drift_ms": -2930, - "sustained_concurrency": 8, - "duration_minutes": 15, - "samples": [ - { - "minute": 1, - "is_warmup": false, - "throughput_tokens_per_sec": 510.3, - "tokens_out": 30617, - "tokens_in": 0, - "requests_completed": 168, - "ttft_ms_p50": 47, - "ttft_ms_p99": 2980.6 - }, - { - "minute": 2, - "is_warmup": false, - "throughput_tokens_per_sec": 566.2, - "tokens_out": 33989, - "tokens_in": 0, - "requests_completed": 185, - "ttft_ms_p50": 43.1, - "ttft_ms_p99": 59.5 - }, - { - "minute": 3, - "is_warmup": false, - "throughput_tokens_per_sec": 555.8, - "tokens_out": 33345, - "tokens_in": 0, - "requests_completed": 183, - "ttft_ms_p50": 43.1, - "ttft_ms_p99": 50.9 - }, - { - "minute": 4, - "is_warmup": false, - "throughput_tokens_per_sec": 574.1, - "tokens_out": 34447, - "tokens_in": 0, - "requests_completed": 183, - "ttft_ms_p50": 42.7, - "ttft_ms_p99": 59.1 - }, - { - "minute": 5, - "is_warmup": false, - "throughput_tokens_per_sec": 564.4, - "tokens_out": 33852, - "tokens_in": 0, - "requests_completed": 182, - "ttft_ms_p50": 43, - "ttft_ms_p99": 45.9 - }, - { - "minute": 6, - "is_warmup": false, - "throughput_tokens_per_sec": 552.2, - "tokens_out": 33145, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 43.3, - "ttft_ms_p99": 59.3 - }, - { - "minute": 7, - "is_warmup": false, - "throughput_tokens_per_sec": 562.1, - "tokens_out": 33715, - "tokens_in": 0, - "requests_completed": 184, - "ttft_ms_p50": 43.1, - "ttft_ms_p99": 59.1 - }, - { - "minute": 8, - "is_warmup": false, - "throughput_tokens_per_sec": 562.6, - "tokens_out": 33751, - "tokens_in": 0, - "requests_completed": 183, - "ttft_ms_p50": 43, - "ttft_ms_p99": 58.6 - }, - { - "minute": 9, - "is_warmup": false, - "throughput_tokens_per_sec": 565.5, - "tokens_out": 33923, - "tokens_in": 0, - "requests_completed": 183, - "ttft_ms_p50": 43.4, - "ttft_ms_p99": 46.6 - }, - { - "minute": 10, - "is_warmup": false, - "throughput_tokens_per_sec": 559.5, - "tokens_out": 33594, - "tokens_in": 0, - "requests_completed": 180, - "ttft_ms_p50": 43.5, - "ttft_ms_p99": 59.5 - }, - { - "minute": 11, - "is_warmup": false, - "throughput_tokens_per_sec": 555.9, - "tokens_out": 33329, - "tokens_in": 0, - "requests_completed": 181, - "ttft_ms_p50": 43.2, - "ttft_ms_p99": 58.6 - }, - { - "minute": 12, - "is_warmup": false, - "throughput_tokens_per_sec": 561.2, - "tokens_out": 33679, - "tokens_in": 0, - "requests_completed": 183, - "ttft_ms_p50": 43.2, - "ttft_ms_p99": 59.9 - }, - { - "minute": 13, - "is_warmup": false, - "throughput_tokens_per_sec": 568, - "tokens_out": 34091, - "tokens_in": 0, - "requests_completed": 186, - "ttft_ms_p50": 43.4, - "ttft_ms_p99": 57.6 - }, - { - "minute": 14, - "is_warmup": false, - "throughput_tokens_per_sec": 562.5, - "tokens_out": 33735, - "tokens_in": 0, - "requests_completed": 183, - "ttft_ms_p50": 43.4, - "ttft_ms_p99": 50.6 - } - ] - }, - { - "precision": "W8A16", - "sustained_throughput_tokens_per_sec": 841.8, - "throttle_ratio": 0.902, - "throttle_onset_minute": null, - "ttft_p99_drift_ms": -3044.7, - "sustained_concurrency": 8, - "duration_minutes": 15, - "samples": [ - { - "minute": 1, - "is_warmup": false, - "throughput_tokens_per_sec": 770, - "tokens_out": 46214, - "tokens_in": 0, - "requests_completed": 254, - "ttft_ms_p50": 35.2, - "ttft_ms_p99": 3097.4 - }, - { - "minute": 2, - "is_warmup": false, - "throughput_tokens_per_sec": 851.3, - "tokens_out": 51089, - "tokens_in": 0, - "requests_completed": 281, - "ttft_ms_p50": 34.7, - "ttft_ms_p99": 45.2 - }, - { - "minute": 3, - "is_warmup": false, - "throughput_tokens_per_sec": 851.8, - "tokens_out": 51090, - "tokens_in": 0, - "requests_completed": 275, - "ttft_ms_p50": 34.8, - "ttft_ms_p99": 52.2 - }, - { - "minute": 4, - "is_warmup": false, - "throughput_tokens_per_sec": 839.2, - "tokens_out": 50347, - "tokens_in": 0, - "requests_completed": 277, - "ttft_ms_p50": 34.8, - "ttft_ms_p99": 49.7 - }, - { - "minute": 5, - "is_warmup": false, - "throughput_tokens_per_sec": 849.4, - "tokens_out": 50977, - "tokens_in": 0, - "requests_completed": 278, - "ttft_ms_p50": 35, - "ttft_ms_p99": 53 - }, - { - "minute": 6, - "is_warmup": false, - "throughput_tokens_per_sec": 850.5, - "tokens_out": 51013, - "tokens_in": 0, - "requests_completed": 279, - "ttft_ms_p50": 34.8, - "ttft_ms_p99": 47.5 - }, - { - "minute": 7, - "is_warmup": false, - "throughput_tokens_per_sec": 850.2, - "tokens_out": 51029, - "tokens_in": 0, - "requests_completed": 275, - "ttft_ms_p50": 35.2, - "ttft_ms_p99": 52.9 - }, - { - "minute": 8, - "is_warmup": false, - "throughput_tokens_per_sec": 833.1, - "tokens_out": 49975, - "tokens_in": 0, - "requests_completed": 273, - "ttft_ms_p50": 35.1, - "ttft_ms_p99": 52.4 - }, - { - "minute": 9, - "is_warmup": false, - "throughput_tokens_per_sec": 853.8, - "tokens_out": 51245, - "tokens_in": 0, - "requests_completed": 281, - "ttft_ms_p50": 34.9, - "ttft_ms_p99": 47.7 - }, - { - "minute": 10, - "is_warmup": false, - "throughput_tokens_per_sec": 852.4, - "tokens_out": 51154, - "tokens_in": 0, - "requests_completed": 280, - "ttft_ms_p50": 35, - "ttft_ms_p99": 39.7 - }, - { - "minute": 11, - "is_warmup": false, - "throughput_tokens_per_sec": 849.4, - "tokens_out": 50955, - "tokens_in": 0, - "requests_completed": 277, - "ttft_ms_p50": 35, - "ttft_ms_p99": 51.7 - }, - { - "minute": 12, - "is_warmup": false, - "throughput_tokens_per_sec": 840.2, - "tokens_out": 50400, - "tokens_in": 0, - "requests_completed": 275, - "ttft_ms_p50": 34.9, - "ttft_ms_p99": 52.7 - }, - { - "minute": 13, - "is_warmup": false, - "throughput_tokens_per_sec": 850.7, - "tokens_out": 51044, - "tokens_in": 0, - "requests_completed": 281, - "ttft_ms_p50": 35, - "ttft_ms_p99": 47.5 - }, - { - "minute": 14, - "is_warmup": false, - "throughput_tokens_per_sec": 843.5, - "tokens_out": 50629, - "tokens_in": 0, - "requests_completed": 277, - "ttft_ms_p50": 35.2, - "ttft_ms_p99": 52.7 - } - ] - }, - { - "precision": "W4A16", - "sustained_throughput_tokens_per_sec": 760.9, - "throttle_ratio": 0.887, - "throttle_onset_minute": 1, - "ttft_p99_drift_ms": -2750.7, - "sustained_concurrency": 8, - "duration_minutes": 15, - "samples": [ - { - "minute": 1, - "is_warmup": false, - "throughput_tokens_per_sec": 687.5, - "tokens_out": 41259, - "tokens_in": 0, - "requests_completed": 236, - "ttft_ms_p50": 36.8, - "ttft_ms_p99": 2802.1 - }, - { - "minute": 2, - "is_warmup": false, - "throughput_tokens_per_sec": 770.2, - "tokens_out": 46209, - "tokens_in": 0, - "requests_completed": 256, - "ttft_ms_p50": 35.4, - "ttft_ms_p99": 51.3 - }, - { - "minute": 3, - "is_warmup": false, - "throughput_tokens_per_sec": 764, - "tokens_out": 45832, - "tokens_in": 0, - "requests_completed": 258, - "ttft_ms_p50": 35.3, - "ttft_ms_p99": 51.2 - }, - { - "minute": 4, - "is_warmup": false, - "throughput_tokens_per_sec": 768.7, - "tokens_out": 46151, - "tokens_in": 0, - "requests_completed": 257, - "ttft_ms_p50": 35.5, - "ttft_ms_p99": 51.9 - }, - { - "minute": 5, - "is_warmup": false, - "throughput_tokens_per_sec": 766.7, - "tokens_out": 45997, - "tokens_in": 0, - "requests_completed": 258, - "ttft_ms_p50": 35.3, - "ttft_ms_p99": 51.5 - }, - { - "minute": 6, - "is_warmup": false, - "throughput_tokens_per_sec": 768.2, - "tokens_out": 46086, - "tokens_in": 0, - "requests_completed": 257, - "ttft_ms_p50": 35.4, - "ttft_ms_p99": 47.7 - }, - { - "minute": 7, - "is_warmup": false, - "throughput_tokens_per_sec": 764.3, - "tokens_out": 45881, - "tokens_in": 0, - "requests_completed": 258, - "ttft_ms_p50": 35.4, - "ttft_ms_p99": 51.4 - }, - { - "minute": 8, - "is_warmup": false, - "throughput_tokens_per_sec": 768.5, - "tokens_out": 46105, - "tokens_in": 0, - "requests_completed": 260, - "ttft_ms_p50": 35.3, - "ttft_ms_p99": 51.9 - }, - { - "minute": 9, - "is_warmup": false, - "throughput_tokens_per_sec": 762.5, - "tokens_out": 45749, - "tokens_in": 0, - "requests_completed": 253, - "ttft_ms_p50": 35.2, - "ttft_ms_p99": 52.1 - }, - { - "minute": 10, - "is_warmup": false, - "throughput_tokens_per_sec": 773, - "tokens_out": 46367, - "tokens_in": 0, - "requests_completed": 260, - "ttft_ms_p50": 35.5, - "ttft_ms_p99": 51.4 - }, - { - "minute": 11, - "is_warmup": false, - "throughput_tokens_per_sec": 761.1, - "tokens_out": 45663, - "tokens_in": 0, - "requests_completed": 254, - "ttft_ms_p50": 35.4, - "ttft_ms_p99": 49.6 - }, - { - "minute": 12, - "is_warmup": false, - "throughput_tokens_per_sec": 760.9, - "tokens_out": 45671, - "tokens_in": 0, - "requests_completed": 256, - "ttft_ms_p50": 35.4, - "ttft_ms_p99": 45.8 - }, - { - "minute": 13, - "is_warmup": false, - "throughput_tokens_per_sec": 775.3, - "tokens_out": 46524, - "tokens_in": 0, - "requests_completed": 259, - "ttft_ms_p50": 35.4, - "ttft_ms_p99": 51.6 - }, - { - "minute": 14, - "is_warmup": false, - "throughput_tokens_per_sec": 762.3, - "tokens_out": 45727, - "tokens_in": 0, - "requests_completed": 257, - "ttft_ms_p50": 35.3, - "ttft_ms_p99": 51.4 - } - ] - } - ] - } - }, - "accuracy": null, - "meta": { - "submitted_by": "Gong-K", - "submission_type": "individual", - "date": "2026-04-30", - "time": "08:29:45", - "run_id": "651fefa6", - "run_name": "nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6", - "flagged": null, - "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", - "env_info_file": "../../env_info.json", - "log_file": null, - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-04-30T08:26:37.946702+00:00", - "benchmark_end_time": "2026-04-30T08:29:45.379126+00:00", - "benchmark_elapsed_minutes": 76.2, - "model_load_seconds": 65.9, - "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", - "scenario_dirs": { - "bf16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/offline", - "bf16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/online", - "bf16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/sustained", - "fp8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/offline", - "fp8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/online", - "fp8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/sustained", - "w8a8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/offline", - "w8a8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/online", - "w8a8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/sustained", - "w8a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/offline", - "w8a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/online", - "w8a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/sustained", - "w4a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/offline", - "w4a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/online", - "w4a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/sustained" - }, - "precision_dirs": { - "BF16": "bf16", - "FP8": "fp8", - "W8A8": "w8a8", - "W8A16": "w8a16", - "W4A16": "w4a16" - }, - "precision_model_map": { - "BF16": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", - "dtype_override": "bfloat16" - }, - "FP8": { - "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", - "engine_kwargs": { - "quantization": "compressed-tensors" - }, - "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." - }, - "W8A8": { - "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", - "engine_kwargs": { - "quantization": "compressed-tensors" - }, - "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." - }, - "W8A16": { - "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", - "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", - "engine_kwargs": { - "quantization": "compressed-tensors" - }, - "_note": "INT8 weights, FP16 activations. Weight-only quantization \u2014 reduces memory bandwidth, not compute dtype." - }, - "W4A16": { - "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", - "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", - "engine_kwargs": { - "quantization": "gptq" - }, - "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization \u2014 larger memory saving than W8A16." - } - } - } -} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/result.json b/results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/result.json deleted file mode 100644 index e110ab18..00000000 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/result.json +++ /dev/null @@ -1,583 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_D", - "implementation_id": "nvidia_sglang_c43a8309", - "chip": { - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 40, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-07T06:55:48.459765+00:00", - "accelerators": [ - { - "index": 0, - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "memory_gb": 40, - "driver_version": "565.57.01", - "firmware_version": null, - "compute_capability": "8.0", - "supports_bf16": true - } - ], - "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", - "intra_node_interconnect": null, - "cpu": { - "model": "AMD EPYC 7532 32-Core Processor", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.7, - "pcie_generation": "PCIe Gen 4", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_2", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20", - "kernel_version": "5.15.0-60-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "SGLang", - "framework_version": "0.5.6", - "driver_version": "565.57.01", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20" - }, - "model": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8, - "precision": "BF16", - "effective_dtype": "bfloat16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "interactive", - "sustained", - "online", - "speculative" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 2, - "extra_config": null - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 1, - "throughput_tokens_per_sec": 59.89, - "throughput_tokens_per_sec_per_chip": 59.89, - "elapsed_seconds_median": 214.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 59.82, - "throughput_tokens_per_sec_per_chip": 59.82, - "elapsed_seconds_median": 214.8, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "interactive": { - "ttft_ms_p50": 2987.93, - "ttft_ms_p90": 3151.31, - "ttft_ms_p99": 3185.98, - "tpot_ms_p50": 15.65, - "tpot_ms_p90": 15.72, - "tpot_ms_p99": 15.76, - "peak_memory_gb": null, - "elapsed_seconds_median": 650.6 - }, - "sustained": { - "sustained_concurrency": 8, - "duration_minutes": 30, - "warmup_minutes": 2, - "sample_interval_seconds": 60, - "samples": [ - { - "minute": 1, - "is_warmup": true, - "throughput_tokens_per_sec": 37.5, - "tokens_out": 2250, - "tokens_in": 0, - "requests_completed": 10, - "ttft_ms_p50": 14034.2, - "ttft_ms_p99": 30569.8 - }, - { - "minute": 2, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23679.8, - "ttft_ms_p99": 29684.9 - }, - { - "minute": 3, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 22756.1, - "ttft_ms_p99": 29093.4 - }, - { - "minute": 4, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23284, - "ttft_ms_p99": 29407.4 - }, - { - "minute": 5, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23627, - "ttft_ms_p99": 29689.4 - }, - { - "minute": 6, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23576.2, - "ttft_ms_p99": 29714.6 - }, - { - "minute": 7, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23169.5, - "ttft_ms_p99": 29430 - }, - { - "minute": 8, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23525.8, - "ttft_ms_p99": 29430.1 - }, - { - "minute": 9, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23823.6, - "ttft_ms_p99": 29827.2 - }, - { - "minute": 10, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 22851.5, - "ttft_ms_p99": 29426.4 - }, - { - "minute": 11, - "is_warmup": false, - "throughput_tokens_per_sec": 37.5, - "tokens_out": 2250, - "tokens_in": 0, - "requests_completed": 10, - "ttft_ms_p50": 23268.9, - "ttft_ms_p99": 29167.7 - }, - { - "minute": 12, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23677.8, - "ttft_ms_p99": 29717.6 - }, - { - "minute": 13, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23598.2, - "ttft_ms_p99": 29748 - }, - { - "minute": 14, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23189.8, - "ttft_ms_p99": 29437.1 - }, - { - "minute": 15, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23568.1, - "ttft_ms_p99": 29461.6 - }, - { - "minute": 16, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23841.9, - "ttft_ms_p99": 29818.9 - }, - { - "minute": 17, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 22839, - "ttft_ms_p99": 29428.4 - }, - { - "minute": 18, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23356.8, - "ttft_ms_p99": 29448.1 - }, - { - "minute": 19, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23860, - "ttft_ms_p99": 29836.8 - }, - { - "minute": 20, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 22877.8, - "ttft_ms_p99": 29251.6 - }, - { - "minute": 21, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23360.2, - "ttft_ms_p99": 29503.3 - }, - { - "minute": 22, - "is_warmup": false, - "throughput_tokens_per_sec": 37.5, - "tokens_out": 2250, - "tokens_in": 0, - "requests_completed": 10, - "ttft_ms_p50": 23633.9, - "ttft_ms_p99": 29457.5 - }, - { - "minute": 23, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23851.7, - "ttft_ms_p99": 29866.9 - }, - { - "minute": 24, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 22862.3, - "ttft_ms_p99": 29426.1 - }, - { - "minute": 25, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23381.4, - "ttft_ms_p99": 29497.2 - }, - { - "minute": 26, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23862.4, - "ttft_ms_p99": 29847.2 - }, - { - "minute": 27, - "is_warmup": false, - "throughput_tokens_per_sec": 56.3, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 22872.5, - "ttft_ms_p99": 29246.9 - }, - { - "minute": 28, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23368.3, - "ttft_ms_p99": 29473.1 - }, - { - "minute": 29, - "is_warmup": false, - "throughput_tokens_per_sec": 56.2, - "tokens_out": 3375, - "tokens_in": 0, - "requests_completed": 15, - "ttft_ms_p50": 23691.2, - "ttft_ms_p99": 29750.9 - } - ], - "sustained_throughput_tokens_per_sec": 54.9, - "throttle_ratio": 0.666, - "throttle_onset_minute": 11, - "ttft_p99_drift_ms": 66, - "throughput_post_warmup_reliability": { - "n": 28, - "mean": 54.9, - "std": 4.9, - "cv_pct": 8.95, - "stability": "high-variance", - "runs": [ - 56.2, - 56.3, - 56.2, - 56.3, - 56.3, - 56.2, - 56.2, - 56.2, - 56.3, - 37.5, - 56.3, - 56.2, - 56.3, - 56.2, - 56.3, - 56.2, - 56.3, - 56.2, - 56.3, - 56.2, - 37.5, - 56.3, - 56.2, - 56.2, - 56.3, - 56.3, - 56.2, - 56.2 - ] - } - }, - "online": { - "sla_ttft_ms": 5000, - "max_valid_qps": 0, - "results_by_qps": [ - { - "target_qps": 0.5, - "achieved_qps": 0.5, - "ttft_ms_p50": 112272.07, - "ttft_ms_p90": 202401.64, - "ttft_ms_p99": 216182.98, - "tpot_ms_p50": 52.34, - "tpot_ms_p90": 78.65, - "tpot_ms_p99": 80.48, - "elapsed_seconds_median": 413.5, - "sla_met": false - }, - { - "target_qps": 1, - "achieved_qps": 1, - "ttft_ms_p50": 145998.66, - "ttft_ms_p90": 264672.22, - "ttft_ms_p99": 294893.64, - "tpot_ms_p50": 52.5, - "tpot_ms_p90": 78.93, - "tpot_ms_p99": 80.57, - "elapsed_seconds_median": 414.8, - "sla_met": false - }, - { - "target_qps": 2, - "achieved_qps": 2, - "ttft_ms_p50": 179802.9, - "ttft_ms_p90": 322496.7, - "ttft_ms_p99": 356490.83, - "tpot_ms_p50": 52.59, - "tpot_ms_p90": 79.01, - "tpot_ms_p99": 80.69, - "elapsed_seconds_median": 412.7, - "sla_met": false - } - ] - }, - "speculative": { - "results_by_concurrency": [ - { - "client_concurrency": 1, - "throughput_tokens_per_sec": 36.86, - "throughput_tokens_per_sec_per_chip": 36.86, - "elapsed_seconds_median": 348.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 36.85, - "throughput_tokens_per_sec_per_chip": 36.85, - "elapsed_seconds_median": 348.7, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - } - }, - "accuracy": { - "subset_score": 0.57, - "baseline_delta": 0.01, - "valid": true, - "framework": "SGLang", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." - }, - "meta": { - "submitted_by": "Gong-K", - "submission_type": "individual", - "date": "2026-05-07", - "time": "07:22:09", - "run_id": "99c43b97", - "run_name": "nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97", - "flagged": null, - "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", - "env_info_file": "../env_info.json", - "log_file": null, - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-07T07:00:40.025406+00:00", - "benchmark_end_time": "2026-05-07T07:22:09.476338+00:00", - "benchmark_elapsed_minutes": 150.5, - "model_load_seconds": 52.9, - "benchmark_elapsed_minutes_note": "Total across [offline, interactive, sustained, online, speculative] scenarios.", - "scenario_dirs": { - "offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/offline", - "interactive": "results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/interactive", - "sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/sustained", - "online": "results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/online", - "speculative": "results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/speculative" - } - } -} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/result.json b/results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/result.json deleted file mode 100644 index 54c9f403..00000000 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/result.json +++ /dev/null @@ -1,389 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "nvidia_sglang_c43a8309", - "chip": { - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 40, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-07T10:52:35.700123+00:00", - "accelerators": [ - { - "index": 0, - "name": "NVIDIA A100-SXM4-40GB", - "vendor": "NVIDIA", - "memory_gb": 40, - "driver_version": "565.57.01", - "firmware_version": null, - "compute_capability": "8.0", - "supports_bf16": true - } - ], - "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", - "intra_node_interconnect": null, - "cpu": { - "model": "AMD EPYC 7532 32-Core Processor", - "physical_cores": 64, - "logical_cores": 128, - "numa_nodes": 2 - }, - "system_memory_gb": 1007.7, - "pcie_generation": "PCIe Gen 4", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_2", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20", - "kernel_version": "5.15.0-60-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "SGLang", - "framework_version": "0.5.6", - "driver_version": "565.57.01", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.4 LTS", - "python_version": "3.10.20" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "BF16", - "effective_dtype": "bfloat16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "online", - "interactive", - "sustained" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": null - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 11447.71, - "throughput_tokens_per_sec_per_chip": 11447.71, - "elapsed_seconds_median": 3.7, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 16, - "throughput_tokens_per_sec": 11507.48, - "throughput_tokens_per_sec_per_chip": 11507.48, - "elapsed_seconds_median": 3.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 64, - "throughput_tokens_per_sec": 11509.2, - "throughput_tokens_per_sec_per_chip": 11509.2, - "elapsed_seconds_median": 3.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 40, - "results_by_qps": [ - { - "target_qps": 10, - "achieved_qps": 10, - "ttft_ms_p50": 18.63, - "ttft_ms_p90": 31.07, - "ttft_ms_p99": 1226.53, - "tpot_ms_p50": 2.56, - "tpot_ms_p90": 3.01, - "tpot_ms_p99": 4.21, - "elapsed_seconds_median": 31.9, - "sla_met": false - }, - { - "target_qps": 40, - "achieved_qps": 40, - "ttft_ms_p50": 34.28, - "ttft_ms_p90": 41.32, - "ttft_ms_p99": 44.41, - "tpot_ms_p50": 20.08, - "tpot_ms_p90": 24.83, - "tpot_ms_p99": 31.82, - "elapsed_seconds_median": 10, - "sla_met": true - } - ] - }, - "interactive": { - "ttft_ms_p50": 16.46, - "ttft_ms_p90": 17.26, - "ttft_ms_p99": 18.42, - "tpot_ms_p50": 1.89, - "tpot_ms_p90": 1.91, - "tpot_ms_p99": 1.99, - "peak_memory_gb": null, - "elapsed_seconds_median": 56.5 - }, - "sustained": { - "sustained_concurrency": 32, - "duration_minutes": 15, - "warmup_minutes": 1, - "sample_interval_seconds": 60, - "samples": [ - { - "minute": 1, - "is_warmup": false, - "throughput_tokens_per_sec": 6616.4, - "tokens_out": 397010, - "tokens_in": 0, - "requests_completed": 2131, - "ttft_ms_p50": 19.6, - "ttft_ms_p99": 4749.5 - }, - { - "minute": 2, - "is_warmup": false, - "throughput_tokens_per_sec": 7181.7, - "tokens_out": 430976, - "tokens_in": 0, - "requests_completed": 2317, - "ttft_ms_p50": 19.3, - "ttft_ms_p99": 41.7 - }, - { - "minute": 3, - "is_warmup": false, - "throughput_tokens_per_sec": 7188.9, - "tokens_out": 431427, - "tokens_in": 0, - "requests_completed": 2312, - "ttft_ms_p50": 19.1, - "ttft_ms_p99": 41 - }, - { - "minute": 4, - "is_warmup": false, - "throughput_tokens_per_sec": 7110.2, - "tokens_out": 426673, - "tokens_in": 0, - "requests_completed": 2292, - "ttft_ms_p50": 19.4, - "ttft_ms_p99": 34 - }, - { - "minute": 5, - "is_warmup": false, - "throughput_tokens_per_sec": 7106.5, - "tokens_out": 426157, - "tokens_in": 0, - "requests_completed": 2287, - "ttft_ms_p50": 19.2, - "ttft_ms_p99": 40.6 - }, - { - "minute": 6, - "is_warmup": false, - "throughput_tokens_per_sec": 7144.4, - "tokens_out": 428781, - "tokens_in": 0, - "requests_completed": 2311, - "ttft_ms_p50": 19.2, - "ttft_ms_p99": 33.6 - }, - { - "minute": 7, - "is_warmup": false, - "throughput_tokens_per_sec": 7158.3, - "tokens_out": 429690, - "tokens_in": 0, - "requests_completed": 2306, - "ttft_ms_p50": 19.3, - "ttft_ms_p99": 40.3 - }, - { - "minute": 8, - "is_warmup": false, - "throughput_tokens_per_sec": 7020, - "tokens_out": 421197, - "tokens_in": 0, - "requests_completed": 2265, - "ttft_ms_p50": 19.3, - "ttft_ms_p99": 41.4 - }, - { - "minute": 9, - "is_warmup": false, - "throughput_tokens_per_sec": 7160.7, - "tokens_out": 429349, - "tokens_in": 0, - "requests_completed": 2303, - "ttft_ms_p50": 19.4, - "ttft_ms_p99": 41.1 - }, - { - "minute": 10, - "is_warmup": false, - "throughput_tokens_per_sec": 7183.9, - "tokens_out": 431115, - "tokens_in": 0, - "requests_completed": 2319, - "ttft_ms_p50": 19.4, - "ttft_ms_p99": 38 - }, - { - "minute": 11, - "is_warmup": false, - "throughput_tokens_per_sec": 7180.8, - "tokens_out": 431065, - "tokens_in": 0, - "requests_completed": 2308, - "ttft_ms_p50": 19.2, - "ttft_ms_p99": 34.1 - }, - { - "minute": 12, - "is_warmup": false, - "throughput_tokens_per_sec": 7127.9, - "tokens_out": 427694, - "tokens_in": 0, - "requests_completed": 2301, - "ttft_ms_p50": 19.3, - "ttft_ms_p99": 34.3 - }, - { - "minute": 13, - "is_warmup": false, - "throughput_tokens_per_sec": 7111.1, - "tokens_out": 426396, - "tokens_in": 0, - "requests_completed": 2282, - "ttft_ms_p50": 19.2, - "ttft_ms_p99": 41 - }, - { - "minute": 14, - "is_warmup": false, - "throughput_tokens_per_sec": 7044.6, - "tokens_out": 422801, - "tokens_in": 0, - "requests_completed": 2269, - "ttft_ms_p50": 19.4, - "ttft_ms_p99": 41.8 - } - ], - "sustained_throughput_tokens_per_sec": 7095.4, - "throttle_ratio": 0.92, - "throttle_onset_minute": null, - "ttft_p99_drift_ms": -4707.7, - "throughput_post_warmup_reliability": { - "n": 14, - "mean": 7095.4, - "std": 147.1, - "cv_pct": 2.07, - "stability": "stable", - "runs": [ - 6616.4, - 7181.7, - 7188.9, - 7110.2, - 7106.5, - 7144.4, - 7158.3, - 7020.0, - 7160.7, - 7183.9, - 7180.8, - 7127.9, - 7111.1, - 7044.6 - ] - } - } - }, - "accuracy": { - "subset_score": 0.41, - "baseline_delta": 0.03, - "valid": true, - "framework": "SGLang", - "precision": "BF16", - "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." - }, - "meta": { - "submitted_by": "Gong-K", - "submission_type": "individual", - "date": "2026-05-07", - "time": "10:56:30", - "run_id": "435424a8", - "run_name": "nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8", - "flagged": null, - "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", - "env_info_file": "../env_info.json", - "log_file": null, - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-07T10:55:44.424768+00:00", - "benchmark_end_time": "2026-05-07T10:56:30.330070+00:00", - "benchmark_elapsed_minutes": 20.7, - "model_load_seconds": 33.4, - "benchmark_elapsed_minutes_note": "Total across [offline, online, interactive, sustained] scenarios.", - "scenario_dirs": { - "offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/offline", - "online": "results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/online", - "interactive": "results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/interactive", - "sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/sustained" - } - } -} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/accuracy/accuracy.json new file mode 100644 index 00000000..9930c9e2 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.77, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/burst/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/burst/result.json new file mode 100644 index 00000000..c39a27f2 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/burst/result.json @@ -0,0 +1,229 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_B", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 1000, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 98.54, + "steady_ttft_p99_ms": 201.51, + "burst_ttft_p50_ms": 140.87, + "burst_ttft_p99_ms": 384.05, + "sla_met_during_burst": true, + "burst_degradation_ratio": 1.906, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 364.09, + "burst_ttft_p99_ms": 404.43 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 173.75, + "burst_ttft_p99_ms": 391.83 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 174.59, + "burst_ttft_p99_ms": 340.29 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "00:44:05", + "run_id": "40a62dd1", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T00:35:27.385471+00:00", + "benchmark_end_time": "2026-05-08T00:44:05.823975+00:00", + "benchmark_elapsed_minutes": 8.6, + "model_load_seconds": 444.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/env_info.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/env_info.json new file mode 100644 index 00000000..4eeed5c5 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/env_info.json @@ -0,0 +1,118 @@ +{ + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/interactive/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/interactive/result.json new file mode 100644 index 00000000..4f2ece9d --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/interactive/result.json @@ -0,0 +1,201 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_B", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 72.21, + "ttft_ms_p90": 78.15, + "ttft_ms_p99": 308.29, + "tpot_ms_p50": 18.52, + "tpot_ms_p90": 18.58, + "tpot_ms_p99": 18.6, + "peak_memory_gb": null, + "elapsed_seconds_median": 176.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "00:26:55", + "run_id": "40a62dd1", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T00:18:03.615707+00:00", + "benchmark_end_time": "2026-05-08T00:26:55.443402+00:00", + "benchmark_elapsed_minutes": 8.9, + "model_load_seconds": 446.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/offline/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/offline/result.json new file mode 100644 index 00000000..76d2c725 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/offline/result.json @@ -0,0 +1,231 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_B", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 2234.48, + "throughput_tokens_per_sec_per_chip": 279.31, + "elapsed_seconds_median": 15.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 2233.83, + "throughput_tokens_per_sec_per_chip": 279.23, + "elapsed_seconds_median": 15.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 2236.02, + "throughput_tokens_per_sec_per_chip": 279.5, + "elapsed_seconds_median": 15.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "23:12:06", + "run_id": "40a62dd1", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T23:08:55.136710+00:00", + "benchmark_end_time": "2026-05-07T23:12:06.765021+00:00", + "benchmark_elapsed_minutes": 3.2, + "model_load_seconds": 480.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/online/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/online/result.json new file mode 100644 index 00000000..4de35ed5 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/online/result.json @@ -0,0 +1,245 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_B", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 1000, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 92.29, + "ttft_ms_p90": 136.26, + "ttft_ms_p99": 184.14, + "tpot_ms_p50": 25.88, + "tpot_ms_p90": 27.81, + "tpot_ms_p99": 30.42, + "elapsed_seconds_median": 104.6, + "sla_met": true + }, + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 96.6, + "ttft_ms_p90": 143.81, + "ttft_ms_p99": 166.6, + "tpot_ms_p50": 32.04, + "tpot_ms_p90": 34.57, + "tpot_ms_p99": 37.5, + "elapsed_seconds_median": 47.5, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 115.36, + "ttft_ms_p90": 157.84, + "ttft_ms_p99": 185.54, + "tpot_ms_p50": 47.27, + "tpot_ms_p90": 54.71, + "tpot_ms_p99": 57.26, + "elapsed_seconds_median": 26.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 135.31, + "ttft_ms_p90": 174.62, + "ttft_ms_p99": 789.97, + "tpot_ms_p50": 56.94, + "tpot_ms_p90": 71.74, + "tpot_ms_p99": 146.67, + "elapsed_seconds_median": 17.6, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "23:30:30", + "run_id": "40a62dd1", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T23:20:37.906520+00:00", + "benchmark_end_time": "2026-05-07T23:30:30.833319+00:00", + "benchmark_elapsed_minutes": 9.9, + "model_load_seconds": 461.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/result.json new file mode 100644 index 00000000..d9b1803e --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/result.json @@ -0,0 +1,650 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_B", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained", + "interactive", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 2234.48, + "throughput_tokens_per_sec_per_chip": 279.31, + "elapsed_seconds_median": 15.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 2233.83, + "throughput_tokens_per_sec_per_chip": 279.23, + "elapsed_seconds_median": 15.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 2236.02, + "throughput_tokens_per_sec_per_chip": 279.5, + "elapsed_seconds_median": 15.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 1000, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 92.29, + "ttft_ms_p90": 136.26, + "ttft_ms_p99": 184.14, + "tpot_ms_p50": 25.88, + "tpot_ms_p90": 27.81, + "tpot_ms_p99": 30.42, + "elapsed_seconds_median": 104.6, + "sla_met": true + }, + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 96.6, + "ttft_ms_p90": 143.81, + "ttft_ms_p99": 166.6, + "tpot_ms_p50": 32.04, + "tpot_ms_p90": 34.57, + "tpot_ms_p99": 37.5, + "elapsed_seconds_median": 47.5, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 115.36, + "ttft_ms_p90": 157.84, + "ttft_ms_p99": 185.54, + "tpot_ms_p50": 47.27, + "tpot_ms_p90": 54.71, + "tpot_ms_p99": 57.26, + "elapsed_seconds_median": 26.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 135.31, + "ttft_ms_p90": 174.62, + "ttft_ms_p99": 789.97, + "tpot_ms_p50": 56.94, + "tpot_ms_p90": 71.74, + "tpot_ms_p99": 146.67, + "elapsed_seconds_median": 17.6, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 4, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 166.1, + "tokens_out": 9973, + "tokens_in": 0, + "requests_completed": 54, + "ttft_ms_p50": 97.1, + "ttft_ms_p99": 745.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.2, + "tokens_out": 10872, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 95.8, + "ttft_ms_p99": 150.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 186.1, + "tokens_out": 11167, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 138.2 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.3, + "tokens_out": 10997, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.8, + "ttft_ms_p99": 143.4 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.9, + "tokens_out": 10911, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.6, + "ttft_ms_p99": 114.2 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 179.2, + "tokens_out": 10755, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.0, + "ttft_ms_p99": 131.9 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.2, + "tokens_out": 10992, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 142.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.1, + "tokens_out": 10805, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.4, + "ttft_ms_p99": 116.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.9, + "tokens_out": 10855, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 138.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 185.0, + "tokens_out": 11100, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 112.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.6, + "tokens_out": 10833, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 132.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 185.3, + "tokens_out": 11128, + "tokens_in": 0, + "requests_completed": 60, + "ttft_ms_p50": 93.7, + "ttft_ms_p99": 143.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.0, + "tokens_out": 10859, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 147.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 182.3, + "tokens_out": 10933, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.4, + "ttft_ms_p99": 143.0 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 179.6, + "tokens_out": 10776, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 96.8 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.8, + "tokens_out": 10736, + "tokens_in": 0, + "requests_completed": 55, + "ttft_ms_p50": 93.0, + "ttft_ms_p99": 158.0 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 187.3, + "tokens_out": 11233, + "tokens_in": 0, + "requests_completed": 60, + "ttft_ms_p50": 92.9, + "ttft_ms_p99": 111.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.4, + "tokens_out": 10885, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 92.8, + "ttft_ms_p99": 155.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.1, + "tokens_out": 10985, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 92.7, + "ttft_ms_p99": 142.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.8, + "tokens_out": 11032, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 115.7 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 177.4, + "tokens_out": 10641, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 159.1 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 182.3, + "tokens_out": 10937, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.0, + "ttft_ms_p99": 95.0 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.8, + "tokens_out": 10723, + "tokens_in": 0, + "requests_completed": 55, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 137.9 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.9, + "tokens_out": 10856, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.1, + "ttft_ms_p99": 138.4 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 184.6, + "tokens_out": 11084, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 131.4 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 186.9, + "tokens_out": 11212, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 137.1 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 182.4, + "tokens_out": 10943, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 96.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.6, + "tokens_out": 10722, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 93.5, + "ttft_ms_p99": 142.5 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.7, + "tokens_out": 10717, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.6, + "ttft_ms_p99": 138.1 + } + ], + "sustained_throughput_tokens_per_sec": 182.0, + "throttle_ratio": 0.947, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -12.7 + }, + "interactive": { + "ttft_ms_p50": 72.21, + "ttft_ms_p90": 78.15, + "ttft_ms_p99": 308.29, + "tpot_ms_p50": 18.52, + "tpot_ms_p90": 18.58, + "tpot_ms_p99": 18.6, + "peak_memory_gb": null, + "elapsed_seconds_median": 176.9 + }, + "burst": { + "sla_ttft_ms": 1000, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 98.54, + "steady_ttft_p99_ms": 201.51, + "burst_ttft_p50_ms": 140.87, + "burst_ttft_p99_ms": 384.05, + "sla_met_during_burst": true, + "burst_degradation_ratio": 1.906, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 364.09, + "burst_ttft_p99_ms": 404.43 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 173.75, + "burst_ttft_p99_ms": 391.83 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 174.59, + "burst_ttft_p99_ms": 340.29 + } + ] + } + }, + "accuracy": { + "subset_score": 0.77, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "23:12:06", + "run_id": "40a62dd1", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T23:08:55.136710+00:00", + "benchmark_end_time": "2026-05-07T23:12:06.765021+00:00", + "benchmark_elapsed_minutes": 60.7, + "model_load_seconds": 480.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained', 'interactive', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/offline", + "online": "results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/online", + "sustained": "results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/sustained", + "interactive": "results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/interactive", + "burst": "results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/burst" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/sustained/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/sustained/result.json new file mode 100644 index 00000000..0cc90236 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1/sustained/result.json @@ -0,0 +1,493 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_B", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T22:51:04.801985+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 4, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 166.1, + "tokens_out": 9973, + "tokens_in": 0, + "requests_completed": 54, + "ttft_ms_p50": 97.1, + "ttft_ms_p99": 745.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.2, + "tokens_out": 10872, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 95.8, + "ttft_ms_p99": 150.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 186.1, + "tokens_out": 11167, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 138.2 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.3, + "tokens_out": 10997, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.8, + "ttft_ms_p99": 143.4 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.9, + "tokens_out": 10911, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.6, + "ttft_ms_p99": 114.2 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 179.2, + "tokens_out": 10755, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.0, + "ttft_ms_p99": 131.9 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.2, + "tokens_out": 10992, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 142.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.1, + "tokens_out": 10805, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.4, + "ttft_ms_p99": 116.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.9, + "tokens_out": 10855, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 138.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 185.0, + "tokens_out": 11100, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 112.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.6, + "tokens_out": 10833, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 132.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 185.3, + "tokens_out": 11128, + "tokens_in": 0, + "requests_completed": 60, + "ttft_ms_p50": 93.7, + "ttft_ms_p99": 143.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.0, + "tokens_out": 10859, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 147.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 182.3, + "tokens_out": 10933, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.4, + "ttft_ms_p99": 143.0 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 179.6, + "tokens_out": 10776, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 96.8 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.8, + "tokens_out": 10736, + "tokens_in": 0, + "requests_completed": 55, + "ttft_ms_p50": 93.0, + "ttft_ms_p99": 158.0 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 187.3, + "tokens_out": 11233, + "tokens_in": 0, + "requests_completed": 60, + "ttft_ms_p50": 92.9, + "ttft_ms_p99": 111.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 181.4, + "tokens_out": 10885, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 92.8, + "ttft_ms_p99": 155.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.1, + "tokens_out": 10985, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 92.7, + "ttft_ms_p99": 142.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 183.8, + "tokens_out": 11032, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 115.7 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 177.4, + "tokens_out": 10641, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 159.1 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 182.3, + "tokens_out": 10937, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.0, + "ttft_ms_p99": 95.0 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.8, + "tokens_out": 10723, + "tokens_in": 0, + "requests_completed": 55, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 137.9 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 180.9, + "tokens_out": 10856, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.1, + "ttft_ms_p99": 138.4 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 184.6, + "tokens_out": 11084, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 131.4 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 186.9, + "tokens_out": 11212, + "tokens_in": 0, + "requests_completed": 58, + "ttft_ms_p50": 93.2, + "ttft_ms_p99": 137.1 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 182.4, + "tokens_out": 10943, + "tokens_in": 0, + "requests_completed": 59, + "ttft_ms_p50": 93.3, + "ttft_ms_p99": 96.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.6, + "tokens_out": 10722, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 93.5, + "ttft_ms_p99": 142.5 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 178.7, + "tokens_out": 10717, + "tokens_in": 0, + "requests_completed": 57, + "ttft_ms_p50": 93.6, + "ttft_ms_p99": 138.1 + } + ], + "sustained_throughput_tokens_per_sec": 182.0, + "throttle_ratio": 0.947, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -12.7 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "00:09:25", + "run_id": "40a62dd1", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_B_nvidia_sglang_c43a8309_40a62dd1", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T23:39:17.027251+00:00", + "benchmark_end_time": "2026-05-08T00:09:25.500189+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 456.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/offline/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/offline/result.json new file mode 100644 index 00000000..a6a37cf6 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/offline/result.json @@ -0,0 +1,231 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 40.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 4199.41, + "throughput_tokens_per_sec_per_chip": 4199.41, + "elapsed_seconds_median": 12.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 4196.73, + "throughput_tokens_per_sec_per_chip": 4196.73, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 4200.78, + "throughput_tokens_per_sec_per_chip": 4200.78, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:54:23", + "run_id": "e61755d3", + "run_name": "nvidia_a100_sxm4_40gbx1_suite_E_nvidia_sglang_c43a8309_e61755d3", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:51:48.053693+00:00", + "benchmark_end_time": "2026-05-09T01:54:23.461144+00:00", + "benchmark_elapsed_minutes": 2.6, + "model_load_seconds": 55.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/result.json new file mode 100644 index 00000000..aa566c0d --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/result.json @@ -0,0 +1,236 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 40.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 4199.41, + "throughput_tokens_per_sec_per_chip": 4199.41, + "elapsed_seconds_median": 12.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 4196.73, + "throughput_tokens_per_sec_per_chip": 4196.73, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 4200.78, + "throughput_tokens_per_sec_per_chip": 4200.78, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to populate." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:54:23", + "run_id": "e61755d3", + "run_name": "nvidia_a100_sxm4_40gbx1_suite_E_nvidia_sglang_c43a8309_e61755d3", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:51:48.053693+00:00", + "benchmark_end_time": "2026-05-09T01:54:23.461144+00:00", + "benchmark_elapsed_minutes": 2.6, + "model_load_seconds": 55.5, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/offline/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/offline/result.json new file mode 100644 index 00000000..8db270ea --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/offline/result.json @@ -0,0 +1,231 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 2, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 5095.96, + "throughput_tokens_per_sec_per_chip": 2547.98, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 5098.68, + "throughput_tokens_per_sec_per_chip": 2549.34, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 5107.25, + "throughput_tokens_per_sec_per_chip": 2553.63, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:50:22", + "run_id": "b52e1fb8", + "run_name": "nvidia_a100_sxm4_40gbx2_suite_E_nvidia_sglang_c43a8309_b52e1fb8", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:48:13.765095+00:00", + "benchmark_end_time": "2026-05-09T01:50:22.725663+00:00", + "benchmark_elapsed_minutes": 2.1, + "model_load_seconds": 61.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/result.json new file mode 100644 index 00000000..d7858378 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/result.json @@ -0,0 +1,236 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 2, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 5095.96, + "throughput_tokens_per_sec_per_chip": 2547.98, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 5098.68, + "throughput_tokens_per_sec_per_chip": 2549.34, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 5107.25, + "throughput_tokens_per_sec_per_chip": 2553.63, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to populate." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:50:22", + "run_id": "b52e1fb8", + "run_name": "nvidia_a100_sxm4_40gbx2_suite_E_nvidia_sglang_c43a8309_b52e1fb8", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:48:13.765095+00:00", + "benchmark_end_time": "2026-05-09T01:50:22.725663+00:00", + "benchmark_elapsed_minutes": 2.1, + "model_load_seconds": 61.7, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/2x/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/offline/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/offline/result.json new file mode 100644 index 00000000..3dd2ffd2 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/offline/result.json @@ -0,0 +1,231 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 4, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 10106.45, + "throughput_tokens_per_sec_per_chip": 2526.61, + "elapsed_seconds_median": 5.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 10486.01, + "throughput_tokens_per_sec_per_chip": 2621.5, + "elapsed_seconds_median": 5.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 10488.02, + "throughput_tokens_per_sec_per_chip": 2622.0, + "elapsed_seconds_median": 5.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:46:44", + "run_id": "cf842629", + "run_name": "nvidia_a100_sxm4_40gbx4_suite_E_nvidia_sglang_c43a8309_cf842629", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:45:38.186874+00:00", + "benchmark_end_time": "2026-05-09T01:46:44.010738+00:00", + "benchmark_elapsed_minutes": 1.1, + "model_load_seconds": 75.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/result.json new file mode 100644 index 00000000..e7ed4ae1 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/result.json @@ -0,0 +1,236 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 4, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 10106.45, + "throughput_tokens_per_sec_per_chip": 2526.61, + "elapsed_seconds_median": 5.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 10486.01, + "throughput_tokens_per_sec_per_chip": 2621.5, + "elapsed_seconds_median": 5.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 10488.02, + "throughput_tokens_per_sec_per_chip": 2622.0, + "elapsed_seconds_median": 5.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to populate." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:46:44", + "run_id": "cf842629", + "run_name": "nvidia_a100_sxm4_40gbx4_suite_E_nvidia_sglang_c43a8309_cf842629", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:45:38.186874+00:00", + "benchmark_end_time": "2026-05-09T01:46:44.010738+00:00", + "benchmark_elapsed_minutes": 1.1, + "model_load_seconds": 75.6, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/4x/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/offline/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/offline/result.json new file mode 100644 index 00000000..5f9467a7 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/offline/result.json @@ -0,0 +1,231 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 13319.56, + "throughput_tokens_per_sec_per_chip": 1664.95, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 13335.41, + "throughput_tokens_per_sec_per_chip": 1666.93, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 13332.3, + "throughput_tokens_per_sec_per_chip": 1666.54, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:43:53", + "run_id": "67683413", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:42:59.242491+00:00", + "benchmark_end_time": "2026-05-09T01:43:53.069174+00:00", + "benchmark_elapsed_minutes": 0.9, + "model_load_seconds": 99.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/result.json new file mode 100644 index 00000000..14c1dd52 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/result.json @@ -0,0 +1,236 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 13319.56, + "throughput_tokens_per_sec_per_chip": 1664.95, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 13335.41, + "throughput_tokens_per_sec_per_chip": 1666.93, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 13332.3, + "throughput_tokens_per_sec_per_chip": 1666.54, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to populate." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:43:53", + "run_id": "67683413", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:42:59.242491+00:00", + "benchmark_end_time": "2026-05-09T01:43:53.069174+00:00", + "benchmark_elapsed_minutes": 0.9, + "model_load_seconds": 99.9, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/8x/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/accuracy/accuracy.json similarity index 100% rename from results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/accuracy/accuracy.json rename to results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/accuracy/accuracy.json diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/env_info.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/env_info.json new file mode 100644 index 00000000..18bde9fe --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/env_info.json @@ -0,0 +1,118 @@ +{ + "collected_at": "2026-05-09T01:37:37.031654+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/result.json new file mode 100644 index 00000000..f4fb58a0 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/result.json @@ -0,0 +1,273 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_E", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null, + "_count_note": "Maximum chip count used in this suite. See task.chip_counts_run for all counts tested." + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "chip_counts_run": [ + 1, + 2, + 4, + 8 + ], + "parallelism_note": "Each chip_count uses tensor_parallel_size=N", + "num_runs": 3 + }, + "metrics": { + "scaling": { + "base_chip_count": 1, + "base_throughput_tokens_per_sec": 4200.78, + "results_by_chip_count": [ + { + "chip_count": 1, + "best_throughput_tokens_per_sec": 4200.78, + "throughput_tokens_per_sec_per_chip": 4200.78, + "scaling_efficiency": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 4199.41, + "throughput_tokens_per_sec_per_chip": 4199.41, + "elapsed_seconds_median": 12.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 4196.73, + "throughput_tokens_per_sec_per_chip": 4196.73, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 4200.78, + "throughput_tokens_per_sec_per_chip": 4200.78, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "1x" + }, + { + "chip_count": 2, + "best_throughput_tokens_per_sec": 5107.25, + "throughput_tokens_per_sec_per_chip": 2553.62, + "scaling_efficiency": 0.608, + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 5095.96, + "throughput_tokens_per_sec_per_chip": 2547.98, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 5098.68, + "throughput_tokens_per_sec_per_chip": 2549.34, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 5107.25, + "throughput_tokens_per_sec_per_chip": 2553.62, + "elapsed_seconds_median": 10.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "2x" + }, + { + "chip_count": 4, + "best_throughput_tokens_per_sec": 10488.02, + "throughput_tokens_per_sec_per_chip": 2622.01, + "scaling_efficiency": 0.624, + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 10106.45, + "throughput_tokens_per_sec_per_chip": 2526.61, + "elapsed_seconds_median": 5.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 10486.01, + "throughput_tokens_per_sec_per_chip": 2621.5, + "elapsed_seconds_median": 5.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 10488.02, + "throughput_tokens_per_sec_per_chip": 2622.01, + "elapsed_seconds_median": 5.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "4x" + }, + { + "chip_count": 8, + "best_throughput_tokens_per_sec": 13335.41, + "throughput_tokens_per_sec_per_chip": 1666.93, + "scaling_efficiency": 0.397, + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 13319.56, + "throughput_tokens_per_sec_per_chip": 1664.94, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 13335.41, + "throughput_tokens_per_sec_per_chip": 1666.93, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 13332.3, + "throughput_tokens_per_sec_per_chip": 1666.54, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "8x" + } + ] + }, + "derived": {} + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "01:54:23", + "run_id": "67683413", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T01:51:48.053693+00:00", + "benchmark_end_time": "2026-05-09T01:54:23.461144+00:00", + "benchmark_elapsed_minutes": 6.7, + "model_load_seconds": 55.5, + "benchmark_elapsed_minutes_note": "Sum of per-chip-count benchmark_elapsed_minutes (excludes sleep gaps, orchestrator overhead, and skipped counts).", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_E_nvidia_sglang_c43a8309_67683413/1x/offline" + }, + "chip_count_dirs": { + "1": "1x", + "2": "2x", + "4": "4x", + "8": "8x" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/accuracy/accuracy.json new file mode 100644 index 00000000..25ffb30c --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.66, + "baseline_delta": 0.04, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/env_info.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/env_info.json new file mode 100644 index 00000000..d278d605 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/env_info.json @@ -0,0 +1,118 @@ +{ + "collected_at": "2026-05-09T19:16:00.016713+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/interactive/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/interactive/result.json new file mode 100644 index 00000000..324baf20 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/interactive/result.json @@ -0,0 +1,201 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_G", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T19:16:00.016713+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_revision": "eba92302a2861cdc0098cc54bc9f17cb2c47eb61", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "moe", + "parameter_count_b": 7.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 42.97, + "ttft_ms_p90": 45.86, + "ttft_ms_p99": 942.53, + "tpot_ms_p50": 4.94, + "tpot_ms_p90": 5.0, + "tpot_ms_p99": 5.75, + "peak_memory_gb": null, + "elapsed_seconds_median": 50.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "19:59:26", + "run_id": "9e9c88dd", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T19:56:50.783467+00:00", + "benchmark_end_time": "2026-05-09T19:59:26.812585+00:00", + "benchmark_elapsed_minutes": 2.6, + "model_load_seconds": 353.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/offline/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/offline/result.json new file mode 100644 index 00000000..6e88b068 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/offline/result.json @@ -0,0 +1,231 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_G", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T19:16:00.016713+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_revision": "eba92302a2861cdc0098cc54bc9f17cb2c47eb61", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "moe", + "parameter_count_b": 7.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4096.26, + "throughput_tokens_per_sec_per_chip": 512.03, + "elapsed_seconds_median": 8.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4330.45, + "throughput_tokens_per_sec_per_chip": 541.31, + "elapsed_seconds_median": 7.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4346.6, + "throughput_tokens_per_sec_per_chip": 543.32, + "elapsed_seconds_median": 7.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "19:32:15", + "run_id": "9e9c88dd", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T19:30:32.944483+00:00", + "benchmark_end_time": "2026-05-09T19:32:15.237392+00:00", + "benchmark_elapsed_minutes": 1.7, + "model_load_seconds": 358.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/online/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/online/result.json new file mode 100644 index 00000000..e99cb8e0 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/online/result.json @@ -0,0 +1,233 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_G", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T19:16:00.016713+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_revision": "eba92302a2861cdc0098cc54bc9f17cb2c47eb61", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "moe", + "parameter_count_b": 7.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 52.25, + "ttft_ms_p90": 78.57, + "ttft_ms_p99": 553.67, + "tpot_ms_p50": 11.24, + "tpot_ms_p90": 13.21, + "tpot_ms_p99": 18.14, + "elapsed_seconds_median": 159.1, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 63.44, + "ttft_ms_p90": 97.44, + "ttft_ms_p99": 196.44, + "tpot_ms_p50": 20.84, + "tpot_ms_p90": 23.14, + "tpot_ms_p99": 26.92, + "elapsed_seconds_median": 32.3, + "sla_met": true + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 92.92, + "ttft_ms_p90": 110.13, + "ttft_ms_p99": 161.4, + "tpot_ms_p50": 49.96, + "tpot_ms_p90": 66.04, + "tpot_ms_p99": 127.08, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "19:49:44", + "run_id": "9e9c88dd", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T19:39:29.143471+00:00", + "benchmark_end_time": "2026-05-09T19:49:44.926315+00:00", + "benchmark_elapsed_minutes": 10.3, + "model_load_seconds": 380.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/result.json new file mode 100644 index 00000000..3c975318 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/result.json @@ -0,0 +1,598 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_G", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T19:16:00.016713+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_revision": "eba92302a2861cdc0098cc54bc9f17cb2c47eb61", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "moe", + "parameter_count_b": 7.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4096.26, + "throughput_tokens_per_sec_per_chip": 512.03, + "elapsed_seconds_median": 8.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4330.45, + "throughput_tokens_per_sec_per_chip": 541.31, + "elapsed_seconds_median": 7.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4346.6, + "throughput_tokens_per_sec_per_chip": 543.32, + "elapsed_seconds_median": 7.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 52.25, + "ttft_ms_p90": 78.57, + "ttft_ms_p99": 553.67, + "tpot_ms_p50": 11.24, + "tpot_ms_p90": 13.21, + "tpot_ms_p99": 18.14, + "elapsed_seconds_median": 159.1, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 63.44, + "ttft_ms_p90": 97.44, + "ttft_ms_p99": 196.44, + "tpot_ms_p50": 20.84, + "tpot_ms_p90": 23.14, + "tpot_ms_p99": 26.92, + "elapsed_seconds_median": 32.3, + "sla_met": true + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 92.92, + "ttft_ms_p90": 110.13, + "ttft_ms_p99": 161.4, + "tpot_ms_p50": 49.96, + "tpot_ms_p90": 66.04, + "tpot_ms_p99": 127.08, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 42.97, + "ttft_ms_p90": 45.86, + "ttft_ms_p99": 942.53, + "tpot_ms_p50": 4.94, + "tpot_ms_p90": 5.0, + "tpot_ms_p99": 5.75, + "peak_memory_gb": null, + "elapsed_seconds_median": 50.9 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 599.8, + "tokens_out": 35995, + "tokens_in": 0, + "requests_completed": 187, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 1593.9 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 629.0, + "tokens_out": 37760, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.7, + "ttft_ms_p99": 90.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 632.3, + "tokens_out": 37933, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 91.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 636.0, + "tokens_out": 38148, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.5, + "ttft_ms_p99": 90.9 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.8, + "tokens_out": 37654, + "tokens_in": 0, + "requests_completed": 195, + "ttft_ms_p50": 57.1, + "ttft_ms_p99": 90.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 631.7, + "tokens_out": 37903, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 640.5, + "tokens_out": 38458, + "tokens_in": 0, + "requests_completed": 200, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 91.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.2, + "tokens_out": 37625, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.8 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.3, + "tokens_out": 37630, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 93.1 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 639.5, + "tokens_out": 38365, + "tokens_in": 0, + "requests_completed": 200, + "ttft_ms_p50": 57.0, + "ttft_ms_p99": 90.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 620.4, + "tokens_out": 37220, + "tokens_in": 0, + "requests_completed": 193, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 91.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 638.6, + "tokens_out": 38323, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.1, + "ttft_ms_p99": 91.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.5, + "tokens_out": 37637, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 632.4, + "tokens_out": 37975, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 92.2 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 637.7, + "tokens_out": 38230, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 92.4 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 641.6, + "tokens_out": 38514, + "tokens_in": 0, + "requests_completed": 200, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.5 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 629.3, + "tokens_out": 37765, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 90.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 635.1, + "tokens_out": 38103, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 91.8 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 631.9, + "tokens_out": 37909, + "tokens_in": 0, + "requests_completed": 195, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 91.1 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 631.6, + "tokens_out": 37900, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 92.5 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 636.3, + "tokens_out": 38166, + "tokens_in": 0, + "requests_completed": 199, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 92.8 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 629.9, + "tokens_out": 37790, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 90.9 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 634.3, + "tokens_out": 38083, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.5, + "ttft_ms_p99": 83.4 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 630.9, + "tokens_out": 37851, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 91.4 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 636.2, + "tokens_out": 38151, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 92.2 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 625.1, + "tokens_out": 37526, + "tokens_in": 0, + "requests_completed": 195, + "ttft_ms_p50": 57.1, + "ttft_ms_p99": 85.4 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 626.9, + "tokens_out": 37605, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 90.2 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 634.1, + "tokens_out": 38047, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.5, + "ttft_ms_p99": 92.4 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 628.0, + "tokens_out": 37674, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 94.6 + } + ], + "sustained_throughput_tokens_per_sec": 632.1, + "throttle_ratio": 0.967, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 4.2 + } + }, + "accuracy": { + "subset_score": 0.66, + "baseline_delta": 0.04, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "19:32:15", + "run_id": "9e9c88dd", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T19:30:32.944483+00:00", + "benchmark_end_time": "2026-05-09T19:32:15.237392+00:00", + "benchmark_elapsed_minutes": 44.7, + "model_load_seconds": 358.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/offline", + "online": "results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/online", + "interactive": "results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/interactive", + "sustained": "results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/sustained/result.json b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/sustained/result.json new file mode 100644 index 00000000..3b191050 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd/sustained/result.json @@ -0,0 +1,493 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_G", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "count": 8, + "memory_gb": 40.0, + "interconnect_intra_node": "NVLink", + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T19:16:00.016713+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-40GB", + "vendor": "NVIDIA", + "memory_gb": 40.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU1\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tPXB\tPXB\tNODE\t0-31,64-95\t0\t\tN/A\nGPU2\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU3\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tNV12\tNODE\tNODE\tPXB\t0-31,64-95\t0\t\tN/A\nGPU4\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU5\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU6\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tNV12\tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nGPU7\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\tNV12\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tPXB\tPXB\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tNODE\tNODE\tPXB\tPXB\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": "NVLink", + "cpu": { + "model": "AMD EPYC 7532 32-Core Processor", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_revision": "eba92302a2861cdc0098cc54bc9f17cb2c47eb61", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "moe", + "parameter_count_b": 7.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 8, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 599.8, + "tokens_out": 35995, + "tokens_in": 0, + "requests_completed": 187, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 1593.9 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 629.0, + "tokens_out": 37760, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.7, + "ttft_ms_p99": 90.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 632.3, + "tokens_out": 37933, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 91.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 636.0, + "tokens_out": 38148, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.5, + "ttft_ms_p99": 90.9 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.8, + "tokens_out": 37654, + "tokens_in": 0, + "requests_completed": 195, + "ttft_ms_p50": 57.1, + "ttft_ms_p99": 90.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 631.7, + "tokens_out": 37903, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 640.5, + "tokens_out": 38458, + "tokens_in": 0, + "requests_completed": 200, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 91.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.2, + "tokens_out": 37625, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.8 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.3, + "tokens_out": 37630, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 93.1 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 639.5, + "tokens_out": 38365, + "tokens_in": 0, + "requests_completed": 200, + "ttft_ms_p50": 57.0, + "ttft_ms_p99": 90.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 620.4, + "tokens_out": 37220, + "tokens_in": 0, + "requests_completed": 193, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 91.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 638.6, + "tokens_out": 38323, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.1, + "ttft_ms_p99": 91.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 627.5, + "tokens_out": 37637, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 632.4, + "tokens_out": 37975, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 92.2 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 637.7, + "tokens_out": 38230, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 92.4 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 641.6, + "tokens_out": 38514, + "tokens_in": 0, + "requests_completed": 200, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 90.5 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 629.3, + "tokens_out": 37765, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 90.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 635.1, + "tokens_out": 38103, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 91.8 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 631.9, + "tokens_out": 37909, + "tokens_in": 0, + "requests_completed": 195, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 91.1 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 631.6, + "tokens_out": 37900, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 92.5 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 636.3, + "tokens_out": 38166, + "tokens_in": 0, + "requests_completed": 199, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 92.8 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 629.9, + "tokens_out": 37790, + "tokens_in": 0, + "requests_completed": 197, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 90.9 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 634.3, + "tokens_out": 38083, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.5, + "ttft_ms_p99": 83.4 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 630.9, + "tokens_out": 37851, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 91.4 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 636.2, + "tokens_out": 38151, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.2, + "ttft_ms_p99": 92.2 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 625.1, + "tokens_out": 37526, + "tokens_in": 0, + "requests_completed": 195, + "ttft_ms_p50": 57.1, + "ttft_ms_p99": 85.4 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 626.9, + "tokens_out": 37605, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.4, + "ttft_ms_p99": 90.2 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 634.1, + "tokens_out": 38047, + "tokens_in": 0, + "requests_completed": 198, + "ttft_ms_p50": 57.5, + "ttft_ms_p99": 92.4 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 628.0, + "tokens_out": 37674, + "tokens_in": 0, + "requests_completed": 196, + "ttft_ms_p50": 57.3, + "ttft_ms_p99": 94.6 + } + ], + "sustained_throughput_tokens_per_sec": 632.1, + "throttle_ratio": 0.967, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 4.2 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "20:36:20", + "run_id": "9e9c88dd", + "run_name": "nvidia_a100_sxm4_40gbx8_suite_G_nvidia_sglang_c43a8309_9e9c88dd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T20:06:15.402590+00:00", + "benchmark_end_time": "2026-05-09T20:36:20.103726+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 365.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/accuracy/accuracy.json new file mode 100644 index 00000000..5b260195 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/burst/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/burst/result.json new file mode 100644 index 00000000..28a3fd9f --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/burst/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 40.23, + "steady_ttft_p99_ms": 7616.24, + "burst_ttft_p50_ms": 58.43, + "burst_ttft_p99_ms": 92.44, + "sla_met_during_burst": true, + "burst_degradation_ratio": 0.012, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 9005.58, + "burst_ttft_p99_ms": 93.71 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 60.59, + "burst_ttft_p99_ms": 90.98 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 61.04, + "burst_ttft_p99_ms": 90.39 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "07:22:26", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T07:14:36.321733+00:00", + "benchmark_end_time": "2026-05-09T07:22:26.557125+00:00", + "benchmark_elapsed_minutes": 7.8, + "model_load_seconds": 148.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/env_info.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/env_info.json similarity index 50% rename from results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/env_info.json rename to results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/env_info.json index 6f03653e..e13ead7e 100644 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/env_info.json +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/env_info.json @@ -1,22 +1,22 @@ { - "collected_at": "2026-05-06T11:15:11.081772+00:00", + "collected_at": "2026-05-09T05:48:17.734271+00:00", "accelerators": [ { "index": 0, - "name": "NVIDIA A100-SXM4-40GB", + "name": "NVIDIA A100-SXM4-80GB", "vendor": "NVIDIA", - "memory_gb": 40, + "memory_gb": 80.0, "driver_version": "565.57.01", "firmware_version": null, "compute_capability": "8.0", "supports_bf16": true } ], - "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tPXB\tNODE\tSYS\t0-31,64-95\t0\t\tN/A\nNIC0\tPXB\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tPXB\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", "intra_node_interconnect": null, "cpu": { - "model": "AMD EPYC 7532 32-Core Processor", - "physical_cores": 64, + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, "logical_cores": 128, "numa_nodes": 2 }, diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/interactive/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/interactive/result.json new file mode 100644 index 00000000..97cf6bd4 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 29.64, + "ttft_ms_p90": 42.16, + "ttft_ms_p99": 60.21, + "tpot_ms_p50": 11.01, + "tpot_ms_p90": 11.05, + "tpot_ms_p99": 11.1, + "peak_memory_gb": null, + "elapsed_seconds_median": 325.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "06:19:40", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T06:03:20.785501+00:00", + "benchmark_end_time": "2026-05-09T06:19:40.172258+00:00", + "benchmark_elapsed_minutes": 16.3, + "model_load_seconds": 74.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/offline/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/offline/result.json new file mode 100644 index 00000000..f8b644a7 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/offline/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 3826.14, + "throughput_tokens_per_sec_per_chip": 3826.14, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 3825.94, + "throughput_tokens_per_sec_per_chip": 3825.94, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 3825.6, + "throughput_tokens_per_sec_per_chip": 3825.6, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:55:09", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:53:13.837994+00:00", + "benchmark_end_time": "2026-05-09T05:55:09.368859+00:00", + "benchmark_elapsed_minutes": 1.9, + "model_load_seconds": 76.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/online/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/online/result.json new file mode 100644 index 00000000..2720c2c6 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/online/result.json @@ -0,0 +1,168 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 100, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.3, + "ttft_ms_p90": 61.57, + "ttft_ms_p99": 2755.55, + "tpot_ms_p50": 12.84, + "tpot_ms_p90": 14.51, + "tpot_ms_p99": 18.23, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.35, + "ttft_ms_p90": 76.61, + "ttft_ms_p99": 88.02, + "tpot_ms_p50": 30.58, + "tpot_ms_p90": 36.23, + "tpot_ms_p99": 44.23, + "elapsed_seconds_median": 16.2, + "sla_met": true + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 51.72, + "ttft_ms_p90": 73.43, + "ttft_ms_p99": 279.13, + "tpot_ms_p50": 38.98, + "tpot_ms_p90": 50.77, + "tpot_ms_p99": 137.32, + "elapsed_seconds_median": 10.3, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "06:01:30", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:56:56.038603+00:00", + "benchmark_end_time": "2026-05-09T06:01:30.135883+00:00", + "benchmark_elapsed_minutes": 4.6, + "model_load_seconds": 72.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/result.json new file mode 100644 index 00000000..62fe8bc2 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/result.json @@ -0,0 +1,615 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained", + "speculative", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 3826.14, + "throughput_tokens_per_sec_per_chip": 3826.14, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 3825.94, + "throughput_tokens_per_sec_per_chip": 3825.94, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 3825.6, + "throughput_tokens_per_sec_per_chip": 3825.6, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 100, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.3, + "ttft_ms_p90": 61.57, + "ttft_ms_p99": 2755.55, + "tpot_ms_p50": 12.84, + "tpot_ms_p90": 14.51, + "tpot_ms_p99": 18.23, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.35, + "ttft_ms_p90": 76.61, + "ttft_ms_p99": 88.02, + "tpot_ms_p50": 30.58, + "tpot_ms_p90": 36.23, + "tpot_ms_p99": 44.23, + "elapsed_seconds_median": 16.2, + "sla_met": true + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 51.72, + "ttft_ms_p90": 73.43, + "ttft_ms_p99": 279.13, + "tpot_ms_p50": 38.98, + "tpot_ms_p90": 50.77, + "tpot_ms_p99": 137.32, + "elapsed_seconds_median": 10.3, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 29.64, + "ttft_ms_p90": 42.16, + "ttft_ms_p99": 60.21, + "tpot_ms_p50": 11.01, + "tpot_ms_p90": 11.05, + "tpot_ms_p99": 11.1, + "peak_memory_gb": null, + "elapsed_seconds_median": 325.6 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 461.8, + "tokens_out": 27707, + "tokens_in": 0, + "requests_completed": 149, + "ttft_ms_p50": 43.5, + "ttft_ms_p99": 16624.9 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.3, + "tokens_out": 39864, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 38.8, + "ttft_ms_p99": 54.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.9, + "tokens_out": 39547, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 54.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.0, + "tokens_out": 40062, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.0 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.5, + "tokens_out": 39684, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 54.9 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.0, + "tokens_out": 39666, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 54.9 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.3, + "tokens_out": 39923, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.0, + "ttft_ms_p99": 42.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.0, + "tokens_out": 40095, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.3, + "tokens_out": 40337, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 44.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.9, + "tokens_out": 39632, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 46.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.6, + "tokens_out": 39525, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 46.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.2, + "tokens_out": 39809, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.2, + "tokens_out": 39897, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.4, + "tokens_out": 39497, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.8 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.5, + "tokens_out": 40005, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 57.0 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.9, + "tokens_out": 40133, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 55.6 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.8, + "tokens_out": 40122, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.5, + "ttft_ms_p99": 47.1 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.7, + "tokens_out": 39530, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 43.2 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.7, + "tokens_out": 39630, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 46.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.3, + "tokens_out": 39871, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.2 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.0, + "tokens_out": 39810, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 55.4 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.9, + "tokens_out": 40137, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 47.5 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.6, + "tokens_out": 39960, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 59.2 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.1, + "tokens_out": 39597, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 43.0 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.1, + "tokens_out": 39972, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.0 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.6, + "tokens_out": 39744, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 54.2 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.4, + "tokens_out": 39800, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.6 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.1, + "tokens_out": 39830, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.1 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.1, + "tokens_out": 39812, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 54.5 + } + ], + "sustained_throughput_tokens_per_sec": 664.0, + "throttle_ratio": 0.979, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 0.2 + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 732.93, + "throughput_tokens_per_sec_per_chip": 732.93, + "elapsed_seconds_median": 47.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 732.58, + "throughput_tokens_per_sec_per_chip": 732.58, + "elapsed_seconds_median": 47.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 731.36, + "throughput_tokens_per_sec_per_chip": 731.36, + "elapsed_seconds_median": 47.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 40.23, + "steady_ttft_p99_ms": 7616.24, + "burst_ttft_p50_ms": 58.43, + "burst_ttft_p99_ms": 92.44, + "sla_met_during_burst": true, + "burst_degradation_ratio": 0.012, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 9005.58, + "burst_ttft_p99_ms": 93.71 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 60.59, + "burst_ttft_p99_ms": 90.98 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 61.04, + "burst_ttft_p99_ms": 90.39 + } + ] + } + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:55:09", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:53:13.837994+00:00", + "benchmark_end_time": "2026-05-09T05:55:09.368859+00:00", + "benchmark_elapsed_minutes": 70.2, + "model_load_seconds": 76.2, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/offline", + "online": "results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/online", + "interactive": "results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/interactive", + "sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/sustained", + "speculative": "results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/speculative", + "burst": "results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/burst" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/speculative/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/speculative/result.json new file mode 100644 index 00000000..45e13030 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/speculative/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "speculative", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 732.93, + "throughput_tokens_per_sec_per_chip": 732.93, + "elapsed_seconds_median": 47.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 732.58, + "throughput_tokens_per_sec_per_chip": 732.58, + "elapsed_seconds_median": 47.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 731.36, + "throughput_tokens_per_sec_per_chip": 731.36, + "elapsed_seconds_median": 47.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "07:10:47", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T07:01:16.371264+00:00", + "benchmark_end_time": "2026-05-09T07:10:47.930065+00:00", + "benchmark_elapsed_minutes": 9.5, + "model_load_seconds": 224.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/sustained/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/sustained/result.json new file mode 100644 index 00000000..428e2715 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc/sustained/result.json @@ -0,0 +1,428 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:48:17.734271+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 461.8, + "tokens_out": 27707, + "tokens_in": 0, + "requests_completed": 149, + "ttft_ms_p50": 43.5, + "ttft_ms_p99": 16624.9 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.3, + "tokens_out": 39864, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 38.8, + "ttft_ms_p99": 54.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.9, + "tokens_out": 39547, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 54.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.0, + "tokens_out": 40062, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.0 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.5, + "tokens_out": 39684, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 54.9 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.0, + "tokens_out": 39666, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 54.9 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.3, + "tokens_out": 39923, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.0, + "ttft_ms_p99": 42.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.0, + "tokens_out": 40095, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.3, + "tokens_out": 40337, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 44.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.9, + "tokens_out": 39632, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 46.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.6, + "tokens_out": 39525, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 46.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.2, + "tokens_out": 39809, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.2, + "tokens_out": 39897, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.4, + "tokens_out": 39497, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.8 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.5, + "tokens_out": 40005, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 57.0 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.9, + "tokens_out": 40133, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 55.6 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.8, + "tokens_out": 40122, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.5, + "ttft_ms_p99": 47.1 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.7, + "tokens_out": 39530, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 43.2 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.7, + "tokens_out": 39630, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 46.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.3, + "tokens_out": 39871, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.2 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.0, + "tokens_out": 39810, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 55.4 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.9, + "tokens_out": 40137, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 47.5 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.6, + "tokens_out": 39960, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 59.2 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.1, + "tokens_out": 39597, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 43.0 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.1, + "tokens_out": 39972, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.0 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.6, + "tokens_out": 39744, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 54.2 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.4, + "tokens_out": 39800, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.6 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.1, + "tokens_out": 39830, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.1 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.1, + "tokens_out": 39812, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.1, + "ttft_ms_p99": 54.5 + } + ], + "sustained_throughput_tokens_per_sec": 664.0, + "throttle_ratio": 0.979, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 0.2 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "06:55:50", + "run_id": "b14c1ebc", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_b14c1ebc", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T06:25:45.065449+00:00", + "benchmark_end_time": "2026-05-09T06:55:49.949999+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 252.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/accuracy/accuracy.json new file mode 100644 index 00000000..37f4d824 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/offline/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/offline/result.json new file mode 100644 index 00000000..0f55961e --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3811.78, + "throughput_tokens_per_sec_per_chip": 3811.78, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3812.36, + "throughput_tokens_per_sec_per_chip": 3812.36, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3814.22, + "throughput_tokens_per_sec_per_chip": 3814.22, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3806.77, + "throughput_tokens_per_sec_per_chip": 3806.77, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:10:25", + "run_id": "29a32aea", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:07:50.935068+00:00", + "benchmark_end_time": "2026-04-30T08:10:25.900871+00:00", + "benchmark_elapsed_minutes": 2.6, + "model_load_seconds": 70.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/online/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/online/result.json new file mode 100644 index 00000000..a809fb6e --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.09, + "ttft_ms_p90": 61.91, + "ttft_ms_p99": 3163.27, + "tpot_ms_p50": 12.83, + "tpot_ms_p90": 14.44, + "tpot_ms_p99": 17.5, + "elapsed_seconds_median": 65.7, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 42.35, + "ttft_ms_p90": 57.39, + "ttft_ms_p99": 64.91, + "tpot_ms_p50": 15.94, + "tpot_ms_p90": 17.81, + "tpot_ms_p99": 19.09, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.45, + "ttft_ms_p90": 75.19, + "ttft_ms_p99": 86.75, + "tpot_ms_p50": 29.88, + "tpot_ms_p90": 35.11, + "tpot_ms_p99": 39.97, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 51.39, + "ttft_ms_p90": 75.87, + "ttft_ms_p99": 94.44, + "tpot_ms_p50": 37.98, + "tpot_ms_p90": 44.01, + "tpot_ms_p99": 60.2, + "elapsed_seconds_median": 12.1, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:18:27", + "run_id": "29a32aea", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:12:09.187300+00:00", + "benchmark_end_time": "2026-04-30T08:18:27.314631+00:00", + "benchmark_elapsed_minutes": 6.3, + "model_load_seconds": 70.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/result.json new file mode 100644 index 00000000..46c68a3a --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3811.78, + "throughput_tokens_per_sec_per_chip": 3811.78, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3812.36, + "throughput_tokens_per_sec_per_chip": 3812.36, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3814.22, + "throughput_tokens_per_sec_per_chip": 3814.22, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3806.77, + "throughput_tokens_per_sec_per_chip": 3806.77, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.09, + "ttft_ms_p90": 61.91, + "ttft_ms_p99": 3163.27, + "tpot_ms_p50": 12.83, + "tpot_ms_p90": 14.44, + "tpot_ms_p99": 17.5, + "elapsed_seconds_median": 65.7, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 42.35, + "ttft_ms_p90": 57.39, + "ttft_ms_p99": 64.91, + "tpot_ms_p50": 15.94, + "tpot_ms_p90": 17.81, + "tpot_ms_p99": 19.09, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.45, + "ttft_ms_p90": 75.19, + "ttft_ms_p99": 86.75, + "tpot_ms_p50": 29.88, + "tpot_ms_p90": 35.11, + "tpot_ms_p99": 39.97, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 51.39, + "ttft_ms_p90": 75.87, + "ttft_ms_p99": 94.44, + "tpot_ms_p50": 37.98, + "tpot_ms_p90": 44.01, + "tpot_ms_p99": 60.2, + "elapsed_seconds_median": 12.1, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 587.4, + "tokens_out": 35271, + "tokens_in": 0, + "requests_completed": 193, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 4162.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 673.7, + "tokens_out": 40390, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 56.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 657.0, + "tokens_out": 39444, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.0, + "ttft_ms_p99": 55.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.4, + "tokens_out": 39848, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 55.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.9, + "tokens_out": 39968, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 659.9, + "tokens_out": 39577, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.6, + "ttft_ms_p99": 60.7 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39897, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 57.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 657.4, + "tokens_out": 39449, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.6, + "tokens_out": 40108, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 53.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.6, + "tokens_out": 39662, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.3, + "tokens_out": 39497, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.7, + "tokens_out": 39679, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 47.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.9, + "tokens_out": 39798, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 55.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 671.5, + "tokens_out": 40279, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.4 + } + ], + "sustained_throughput_tokens_per_sec": 658.1, + "throttle_ratio": 0.872, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -4105.9 + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:10:25", + "run_id": "29a32aea", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:07:50.935068+00:00", + "benchmark_end_time": "2026-04-30T08:10:25.900871+00:00", + "benchmark_elapsed_minutes": 24.0, + "model_load_seconds": 70.3, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/offline", + "online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/online", + "sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/sustained/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/sustained/result.json new file mode 100644 index 00000000..3d008d8f --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 587.4, + "tokens_out": 35271, + "tokens_in": 0, + "requests_completed": 193, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 4162.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 673.7, + "tokens_out": 40390, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 56.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 657.0, + "tokens_out": 39444, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.0, + "ttft_ms_p99": 55.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.4, + "tokens_out": 39848, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 55.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.9, + "tokens_out": 39968, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 659.9, + "tokens_out": 39577, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.6, + "ttft_ms_p99": 60.7 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39897, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 57.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 657.4, + "tokens_out": 39449, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.6, + "tokens_out": 40108, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 53.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.6, + "tokens_out": 39662, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.3, + "tokens_out": 39497, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.7, + "tokens_out": 39679, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 47.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.9, + "tokens_out": 39798, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 55.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 671.5, + "tokens_out": 40279, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.4 + } + ], + "sustained_throughput_tokens_per_sec": 658.1, + "throttle_ratio": 0.872, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -4105.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:35:24", + "run_id": "29a32aea", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:20:20.395609+00:00", + "benchmark_end_time": "2026-04-30T08:35:24.178367+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 76.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/env_info.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/env_info.json new file mode 100644 index 00000000..b1f9df26 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/result.json new file mode 100644 index 00000000..0d242ee4 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/result.json @@ -0,0 +1,963 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original", + "_note": "suite model_id. Each precision level uses its own quantized checkpoint." + }, + "task": { + "scenarios_run": [ + "accuracy", + "offline", + "online", + "sustained" + ], + "precision_levels_run": [ + "BF16", + "FP8", + "W8A8", + "W8A16", + "W4A16" + ], + "precision_levels_skipped": [ + "FP16" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "quantization": { + "results_by_precision": [ + { + "precision": "BF16", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "best_throughput_tokens_per_sec": 3814.22, + "accuracy_score": 0.55, + "accuracy_baseline_delta": -0.01, + "accuracy_valid": true, + "quality_efficiency": 2097.8, + "speedup_vs_bf16": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3811.78, + "throughput_tokens_per_sec_per_chip": 3811.78, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3812.36, + "throughput_tokens_per_sec_per_chip": 3812.36, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3814.22, + "throughput_tokens_per_sec_per_chip": 3814.22, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3806.77, + "throughput_tokens_per_sec_per_chip": 3806.77, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "bf16", + "effective_dtype": "bfloat16", + "quantization_method": null + }, + { + "precision": "W8A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "best_throughput_tokens_per_sec": 3995.48, + "accuracy_score": 0.59, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 2357.3, + "speedup_vs_bf16": 1.048, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3994.04, + "throughput_tokens_per_sec_per_chip": 3994.04, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3995.3, + "throughput_tokens_per_sec_per_chip": 3995.3, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3993.64, + "throughput_tokens_per_sec_per_chip": 3993.64, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3995.48, + "throughput_tokens_per_sec_per_chip": 3995.48, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W4A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "best_throughput_tokens_per_sec": 2208.93, + "accuracy_score": 0.57, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 1259.1, + "speedup_vs_bf16": 0.579, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2151.78, + "throughput_tokens_per_sec_per_chip": 2151.78, + "elapsed_seconds_median": 16.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2107.59, + "throughput_tokens_per_sec_per_chip": 2107.59, + "elapsed_seconds_median": 16.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2144.09, + "throughput_tokens_per_sec_per_chip": 2144.09, + "elapsed_seconds_median": 16.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2208.93, + "throughput_tokens_per_sec_per_chip": 2208.93, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w4a16", + "effective_dtype": "auto", + "quantization_method": "gptq" + } + ] + }, + "derived": {}, + "quantization_online": { + "results_by_precision": [ + { + "precision": "BF16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.09, + "ttft_ms_p90": 61.91, + "ttft_ms_p99": 3163.27, + "tpot_ms_p50": 12.83, + "tpot_ms_p90": 14.44, + "tpot_ms_p99": 17.5, + "elapsed_seconds_median": 65.7, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 42.35, + "ttft_ms_p90": 57.39, + "ttft_ms_p99": 64.91, + "tpot_ms_p50": 15.94, + "tpot_ms_p90": 17.81, + "tpot_ms_p99": 19.09, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.45, + "ttft_ms_p90": 75.19, + "ttft_ms_p99": 86.75, + "tpot_ms_p50": 29.88, + "tpot_ms_p90": 35.11, + "tpot_ms_p99": 39.97, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 51.39, + "ttft_ms_p90": 75.87, + "ttft_ms_p99": 94.44, + "tpot_ms_p50": 37.98, + "tpot_ms_p90": 44.01, + "tpot_ms_p99": 60.2, + "elapsed_seconds_median": 12.1, + "sla_met": true + } + ] + }, + { + "precision": "W8A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 36.12, + "ttft_ms_p90": 69.07, + "ttft_ms_p99": 3268.43, + "tpot_ms_p50": 8.37, + "tpot_ms_p90": 9.71, + "tpot_ms_p99": 14.4, + "elapsed_seconds_median": 64.6, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 37.67, + "ttft_ms_p90": 55.4, + "ttft_ms_p99": 66.01, + "tpot_ms_p50": 11.2, + "tpot_ms_p90": 12.6, + "tpot_ms_p99": 14.08, + "elapsed_seconds_median": 31.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 54.14, + "ttft_ms_p90": 78.87, + "ttft_ms_p99": 95.15, + "tpot_ms_p50": 30.91, + "tpot_ms_p90": 37.44, + "tpot_ms_p99": 44.41, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 56.58, + "ttft_ms_p90": 83.52, + "ttft_ms_p99": 102.84, + "tpot_ms_p50": 41.53, + "tpot_ms_p90": 50.68, + "tpot_ms_p99": 75.53, + "elapsed_seconds_median": 12.9, + "sla_met": true + } + ] + }, + { + "precision": "W4A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 57.25, + "ttft_ms_p90": 96.93, + "ttft_ms_p99": 3210.47, + "tpot_ms_p50": 21.98, + "tpot_ms_p90": 36.21, + "tpot_ms_p99": 40.53, + "elapsed_seconds_median": 66.2, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 65.61, + "ttft_ms_p90": 87.43, + "ttft_ms_p99": 98.67, + "tpot_ms_p50": 35.19, + "tpot_ms_p90": 36.47, + "tpot_ms_p99": 39.7, + "elapsed_seconds_median": 34.8, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 63.45, + "ttft_ms_p90": 96.95, + "ttft_ms_p99": 116.42, + "tpot_ms_p50": 45.94, + "tpot_ms_p90": 50.69, + "tpot_ms_p99": 57.79, + "elapsed_seconds_median": 19.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 60.0, + "ttft_ms_p90": 97.76, + "ttft_ms_p99": 121.5, + "tpot_ms_p50": 49.66, + "tpot_ms_p90": 59.83, + "tpot_ms_p99": 75.38, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + } + ] + }, + "quantization_sustained": { + "results_by_precision": [ + { + "precision": "BF16", + "sustained_throughput_tokens_per_sec": 658.1, + "throttle_ratio": 0.872, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -4105.9, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 587.4, + "tokens_out": 35271, + "tokens_in": 0, + "requests_completed": 193, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 4162.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 673.7, + "tokens_out": 40390, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 56.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 657.0, + "tokens_out": 39444, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.0, + "ttft_ms_p99": 55.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.4, + "tokens_out": 39848, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 55.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.9, + "tokens_out": 39968, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 659.9, + "tokens_out": 39577, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.6, + "ttft_ms_p99": 60.7 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39897, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 57.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 657.4, + "tokens_out": 39449, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.6, + "tokens_out": 40108, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 53.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.6, + "tokens_out": 39662, + "tokens_in": 0, + "requests_completed": 218, + "ttft_ms_p50": 39.2, + "ttft_ms_p99": 54.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 658.3, + "tokens_out": 39497, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 55.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.7, + "tokens_out": 39679, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 47.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.9, + "tokens_out": 39798, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.4, + "ttft_ms_p99": 55.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 671.5, + "tokens_out": 40279, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 39.3, + "ttft_ms_p99": 56.4 + } + ] + }, + { + "precision": "W8A16", + "sustained_throughput_tokens_per_sec": 949.7, + "throttle_ratio": 0.895, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3559.4, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 862.6, + "tokens_out": 51782, + "tokens_in": 0, + "requests_completed": 285, + "ttft_ms_p50": 34.2, + "ttft_ms_p99": 3609.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.6, + "tokens_out": 57403, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 55.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 964.1, + "tokens_out": 57850, + "tokens_in": 0, + "requests_completed": 315, + "ttft_ms_p50": 32.9, + "ttft_ms_p99": 49.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 949.6, + "tokens_out": 56948, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 52.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.8, + "tokens_out": 57415, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 959.2, + "tokens_out": 57545, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 32.7, + "ttft_ms_p99": 50.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.0, + "tokens_out": 57358, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 44.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 951.3, + "tokens_out": 57077, + "tokens_in": 0, + "requests_completed": 310, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 50.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.1, + "tokens_out": 57391, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 51.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 960.7, + "tokens_out": 57646, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 958.1, + "tokens_out": 57460, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.6, + "tokens_out": 57271, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 52.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.0, + "tokens_out": 57231, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 56.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.1, + "tokens_out": 57378, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 49.7 + } + ] + }, + { + "precision": "W4A16", + "sustained_throughput_tokens_per_sec": 777.5, + "throttle_ratio": 0.886, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3160.0, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.0, + "tokens_out": 42136, + "tokens_in": 0, + "requests_completed": 240, + "ttft_ms_p50": 36.4, + "ttft_ms_p99": 3215.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 788.1, + "tokens_out": 47275, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 52.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 776.1, + "tokens_out": 46574, + "tokens_in": 0, + "requests_completed": 261, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 51.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 792.4, + "tokens_out": 47547, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 50.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.6, + "tokens_out": 46845, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 53.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 783.4, + "tokens_out": 47022, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 52.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 778.0, + "tokens_out": 46682, + "tokens_in": 0, + "requests_completed": 259, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 50.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 790.4, + "tokens_out": 47403, + "tokens_in": 0, + "requests_completed": 268, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 52.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.9, + "tokens_out": 46849, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.6, + "ttft_ms_p99": 52.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 774.9, + "tokens_out": 46503, + "tokens_in": 0, + "requests_completed": 258, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 51.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 789.4, + "tokens_out": 47353, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 55.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.1, + "tokens_out": 46810, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 52.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 779.7, + "tokens_out": 46798, + "tokens_in": 0, + "requests_completed": 259, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 53.6 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 789.6, + "tokens_out": 47370, + "tokens_in": 0, + "requests_completed": 267, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 55.0 + } + ] + } + ] + } + }, + "accuracy": null, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:10:25", + "run_id": "29a32aea", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:07:50.935068+00:00", + "benchmark_end_time": "2026-04-30T08:10:25.900871+00:00", + "benchmark_elapsed_minutes": 73.8, + "model_load_seconds": 70.3, + "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", + "scenario_dirs": { + "bf16/offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/offline", + "bf16/online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/online", + "bf16/sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/bf16/sustained", + "fp8/offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/fp8/offline", + "fp8/online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/fp8/online", + "fp8/sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/fp8/sustained", + "w8a8/offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a8/offline", + "w8a8/online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a8/online", + "w8a8/sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a8/sustained", + "w8a16/offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/offline", + "w8a16/online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/online", + "w8a16/sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/sustained", + "w4a16/offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/offline", + "w4a16/online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/online", + "w4a16/sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/sustained" + }, + "precision_dirs": { + "BF16": "bf16", + "FP8": "fp8", + "W8A8": "w8a8", + "W8A16": "w8a16", + "W4A16": "w4a16" + }, + "precision_model_map": { + "BF16": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dtype_override": "bfloat16" + }, + "FP8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." + }, + "W8A8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." + }, + "W8A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights, FP16 activations. Weight-only quantization — reduces memory bandwidth, not compute dtype." + }, + "W4A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "engine_kwargs": { + "quantization": "gptq" + }, + "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization — larger memory saving than W8A16." + } + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/accuracy/accuracy.json new file mode 100644 index 00000000..e2c86fd4 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.57, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/offline/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/offline/result.json new file mode 100644 index 00000000..2beadb1b --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2151.78, + "throughput_tokens_per_sec_per_chip": 2151.78, + "elapsed_seconds_median": 16.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2107.59, + "throughput_tokens_per_sec_per_chip": 2107.59, + "elapsed_seconds_median": 16.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2144.09, + "throughput_tokens_per_sec_per_chip": 2144.09, + "elapsed_seconds_median": 16.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2208.93, + "throughput_tokens_per_sec_per_chip": 2208.93, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:27:50", + "run_id": "ce081f96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_ce081f96", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:23:29.519979+00:00", + "benchmark_end_time": "2026-04-30T09:27:50.159108+00:00", + "benchmark_elapsed_minutes": 4.3, + "model_load_seconds": 71.1 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/online/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/online/result.json new file mode 100644 index 00000000..0bc84f2d --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 57.25, + "ttft_ms_p90": 96.93, + "ttft_ms_p99": 3210.47, + "tpot_ms_p50": 21.98, + "tpot_ms_p90": 36.21, + "tpot_ms_p99": 40.53, + "elapsed_seconds_median": 66.2, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 65.61, + "ttft_ms_p90": 87.43, + "ttft_ms_p99": 98.67, + "tpot_ms_p50": 35.19, + "tpot_ms_p90": 36.47, + "tpot_ms_p99": 39.7, + "elapsed_seconds_median": 34.8, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 63.45, + "ttft_ms_p90": 96.95, + "ttft_ms_p99": 116.42, + "tpot_ms_p50": 45.94, + "tpot_ms_p90": 50.69, + "tpot_ms_p99": 57.79, + "elapsed_seconds_median": 19.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 60.0, + "ttft_ms_p90": 97.76, + "ttft_ms_p99": 121.5, + "tpot_ms_p50": 49.66, + "tpot_ms_p90": 59.83, + "tpot_ms_p99": 75.38, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:36:16", + "run_id": "ce081f96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_ce081f96", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:29:31.834008+00:00", + "benchmark_end_time": "2026-04-30T09:36:16.456822+00:00", + "benchmark_elapsed_minutes": 6.7, + "model_load_seconds": 69.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/result.json new file mode 100644 index 00000000..074a24c8 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2151.78, + "throughput_tokens_per_sec_per_chip": 2151.78, + "elapsed_seconds_median": 16.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2107.59, + "throughput_tokens_per_sec_per_chip": 2107.59, + "elapsed_seconds_median": 16.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2144.09, + "throughput_tokens_per_sec_per_chip": 2144.09, + "elapsed_seconds_median": 16.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2208.93, + "throughput_tokens_per_sec_per_chip": 2208.93, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 57.25, + "ttft_ms_p90": 96.93, + "ttft_ms_p99": 3210.47, + "tpot_ms_p50": 21.98, + "tpot_ms_p90": 36.21, + "tpot_ms_p99": 40.53, + "elapsed_seconds_median": 66.2, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 65.61, + "ttft_ms_p90": 87.43, + "ttft_ms_p99": 98.67, + "tpot_ms_p50": 35.19, + "tpot_ms_p90": 36.47, + "tpot_ms_p99": 39.7, + "elapsed_seconds_median": 34.8, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 63.45, + "ttft_ms_p90": 96.95, + "ttft_ms_p99": 116.42, + "tpot_ms_p50": 45.94, + "tpot_ms_p90": 50.69, + "tpot_ms_p99": 57.79, + "elapsed_seconds_median": 19.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 60.0, + "ttft_ms_p90": 97.76, + "ttft_ms_p99": 121.5, + "tpot_ms_p50": 49.66, + "tpot_ms_p90": 59.83, + "tpot_ms_p99": 75.38, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.0, + "tokens_out": 42136, + "tokens_in": 0, + "requests_completed": 240, + "ttft_ms_p50": 36.4, + "ttft_ms_p99": 3215.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 788.1, + "tokens_out": 47275, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 52.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 776.1, + "tokens_out": 46574, + "tokens_in": 0, + "requests_completed": 261, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 51.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 792.4, + "tokens_out": 47547, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 50.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.6, + "tokens_out": 46845, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 53.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 783.4, + "tokens_out": 47022, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 52.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 778.0, + "tokens_out": 46682, + "tokens_in": 0, + "requests_completed": 259, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 50.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 790.4, + "tokens_out": 47403, + "tokens_in": 0, + "requests_completed": 268, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 52.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.9, + "tokens_out": 46849, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.6, + "ttft_ms_p99": 52.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 774.9, + "tokens_out": 46503, + "tokens_in": 0, + "requests_completed": 258, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 51.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 789.4, + "tokens_out": 47353, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 55.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.1, + "tokens_out": 46810, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 52.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 779.7, + "tokens_out": 46798, + "tokens_in": 0, + "requests_completed": 259, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 53.6 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 789.6, + "tokens_out": 47370, + "tokens_in": 0, + "requests_completed": 267, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 55.0 + } + ], + "sustained_throughput_tokens_per_sec": 777.5, + "throttle_ratio": 0.886, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3160.0 + } + }, + "accuracy": { + "subset_score": 0.57, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:27:50", + "run_id": "ce081f96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_ce081f96", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:23:29.519979+00:00", + "benchmark_end_time": "2026-04-30T09:27:50.159108+00:00", + "benchmark_elapsed_minutes": 26.1, + "model_load_seconds": 71.1, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/offline", + "online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/online", + "sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/sustained/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/sustained/result.json new file mode 100644 index 00000000..0e3ed7e4 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w4a16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.0, + "tokens_out": 42136, + "tokens_in": 0, + "requests_completed": 240, + "ttft_ms_p50": 36.4, + "ttft_ms_p99": 3215.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 788.1, + "tokens_out": 47275, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 52.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 776.1, + "tokens_out": 46574, + "tokens_in": 0, + "requests_completed": 261, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 51.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 792.4, + "tokens_out": 47547, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 50.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.6, + "tokens_out": 46845, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 53.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 783.4, + "tokens_out": 47022, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 52.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 778.0, + "tokens_out": 46682, + "tokens_in": 0, + "requests_completed": 259, + "ttft_ms_p50": 35.2, + "ttft_ms_p99": 50.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 790.4, + "tokens_out": 47403, + "tokens_in": 0, + "requests_completed": 268, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 52.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.9, + "tokens_out": 46849, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.6, + "ttft_ms_p99": 52.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 774.9, + "tokens_out": 46503, + "tokens_in": 0, + "requests_completed": 258, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 51.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 789.4, + "tokens_out": 47353, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 55.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 780.1, + "tokens_out": 46810, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 52.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 779.7, + "tokens_out": 46798, + "tokens_in": 0, + "requests_completed": 259, + "ttft_ms_p50": 35.5, + "ttft_ms_p99": 53.6 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 789.6, + "tokens_out": 47370, + "tokens_in": 0, + "requests_completed": 267, + "ttft_ms_p50": 35.4, + "ttft_ms_p99": 55.0 + } + ], + "sustained_throughput_tokens_per_sec": 777.5, + "throttle_ratio": 0.886, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3160.0 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:53:08", + "run_id": "ce081f96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_ce081f96", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:38:04.691953+00:00", + "benchmark_end_time": "2026-04-30T09:53:08.303371+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 74.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/accuracy/accuracy.json new file mode 100644 index 00000000..60c0d5fb --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/offline/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/offline/result.json new file mode 100644 index 00000000..3fa0c538 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3994.04, + "throughput_tokens_per_sec_per_chip": 3994.04, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3995.3, + "throughput_tokens_per_sec_per_chip": 3995.3, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3993.64, + "throughput_tokens_per_sec_per_chip": 3993.64, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3995.48, + "throughput_tokens_per_sec_per_chip": 3995.48, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:53:38", + "run_id": "abbf6933", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_abbf6933", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:51:11.064170+00:00", + "benchmark_end_time": "2026-04-30T08:53:38.642090+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 69.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/online/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/online/result.json new file mode 100644 index 00000000..8226db5c --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 36.12, + "ttft_ms_p90": 69.07, + "ttft_ms_p99": 3268.43, + "tpot_ms_p50": 8.37, + "tpot_ms_p90": 9.71, + "tpot_ms_p99": 14.4, + "elapsed_seconds_median": 64.6, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 37.67, + "ttft_ms_p90": 55.4, + "ttft_ms_p99": 66.01, + "tpot_ms_p50": 11.2, + "tpot_ms_p90": 12.6, + "tpot_ms_p99": 14.08, + "elapsed_seconds_median": 31.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 54.14, + "ttft_ms_p90": 78.87, + "ttft_ms_p99": 95.15, + "tpot_ms_p50": 30.91, + "tpot_ms_p90": 37.44, + "tpot_ms_p99": 44.41, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 56.58, + "ttft_ms_p90": 83.52, + "ttft_ms_p99": 102.84, + "tpot_ms_p50": 41.53, + "tpot_ms_p90": 50.68, + "tpot_ms_p99": 75.53, + "elapsed_seconds_median": 12.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:01:35", + "run_id": "abbf6933", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_abbf6933", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:55:21.326171+00:00", + "benchmark_end_time": "2026-04-30T09:01:35.215175+00:00", + "benchmark_elapsed_minutes": 6.2, + "model_load_seconds": 70.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/result.json new file mode 100644 index 00000000..59a8d13b --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3994.04, + "throughput_tokens_per_sec_per_chip": 3994.04, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3995.3, + "throughput_tokens_per_sec_per_chip": 3995.3, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3993.64, + "throughput_tokens_per_sec_per_chip": 3993.64, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3995.48, + "throughput_tokens_per_sec_per_chip": 3995.48, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 36.12, + "ttft_ms_p90": 69.07, + "ttft_ms_p99": 3268.43, + "tpot_ms_p50": 8.37, + "tpot_ms_p90": 9.71, + "tpot_ms_p99": 14.4, + "elapsed_seconds_median": 64.6, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 37.67, + "ttft_ms_p90": 55.4, + "ttft_ms_p99": 66.01, + "tpot_ms_p50": 11.2, + "tpot_ms_p90": 12.6, + "tpot_ms_p99": 14.08, + "elapsed_seconds_median": 31.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 54.14, + "ttft_ms_p90": 78.87, + "ttft_ms_p99": 95.15, + "tpot_ms_p50": 30.91, + "tpot_ms_p90": 37.44, + "tpot_ms_p99": 44.41, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 56.58, + "ttft_ms_p90": 83.52, + "ttft_ms_p99": 102.84, + "tpot_ms_p50": 41.53, + "tpot_ms_p90": 50.68, + "tpot_ms_p99": 75.53, + "elapsed_seconds_median": 12.9, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 862.6, + "tokens_out": 51782, + "tokens_in": 0, + "requests_completed": 285, + "ttft_ms_p50": 34.2, + "ttft_ms_p99": 3609.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.6, + "tokens_out": 57403, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 55.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 964.1, + "tokens_out": 57850, + "tokens_in": 0, + "requests_completed": 315, + "ttft_ms_p50": 32.9, + "ttft_ms_p99": 49.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 949.6, + "tokens_out": 56948, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 52.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.8, + "tokens_out": 57415, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 959.2, + "tokens_out": 57545, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 32.7, + "ttft_ms_p99": 50.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.0, + "tokens_out": 57358, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 44.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 951.3, + "tokens_out": 57077, + "tokens_in": 0, + "requests_completed": 310, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 50.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.1, + "tokens_out": 57391, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 51.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 960.7, + "tokens_out": 57646, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 958.1, + "tokens_out": 57460, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.6, + "tokens_out": 57271, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 52.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.0, + "tokens_out": 57231, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 56.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.1, + "tokens_out": 57378, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 49.7 + } + ], + "sustained_throughput_tokens_per_sec": 949.7, + "throttle_ratio": 0.895, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3559.4 + } + }, + "accuracy": { + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:53:38", + "run_id": "abbf6933", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_abbf6933", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:51:11.064170+00:00", + "benchmark_end_time": "2026-04-30T08:53:38.642090+00:00", + "benchmark_elapsed_minutes": 23.7, + "model_load_seconds": 69.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/offline", + "online": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/online", + "sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/sustained/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/sustained/result.json new file mode 100644 index 00000000..4878ff90 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_29a32aea/w8a16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T08:03:04.688337+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tPXB\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tPXB\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 862.6, + "tokens_out": 51782, + "tokens_in": 0, + "requests_completed": 285, + "ttft_ms_p50": 34.2, + "ttft_ms_p99": 3609.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.6, + "tokens_out": 57403, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 55.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 964.1, + "tokens_out": 57850, + "tokens_in": 0, + "requests_completed": 315, + "ttft_ms_p50": 32.9, + "ttft_ms_p99": 49.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 949.6, + "tokens_out": 56948, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 52.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.8, + "tokens_out": 57415, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 959.2, + "tokens_out": 57545, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 32.7, + "ttft_ms_p99": 50.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.0, + "tokens_out": 57358, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 44.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 951.3, + "tokens_out": 57077, + "tokens_in": 0, + "requests_completed": 310, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 50.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.1, + "tokens_out": 57391, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 51.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 960.7, + "tokens_out": 57646, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 958.1, + "tokens_out": 57460, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.1, + "ttft_ms_p99": 51.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.6, + "tokens_out": 57271, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 52.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.0, + "tokens_out": 57231, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 56.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.1, + "tokens_out": 57378, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 49.7 + } + ], + "sustained_throughput_tokens_per_sec": 949.7, + "throttle_ratio": 0.895, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3559.4 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:18:39", + "run_id": "abbf6933", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_abbf6933", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:03:36.171327+00:00", + "benchmark_end_time": "2026-04-30T09:18:39.104477+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 83.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/accuracy/accuracy.json new file mode 100644 index 00000000..37f4d824 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/env_info.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/env_info.json new file mode 100644 index 00000000..813ac789 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/interactive/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/interactive/result.json new file mode 100644 index 00000000..c2ab1376 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 2971.73, + "ttft_ms_p90": 3094.21, + "ttft_ms_p99": 3156.35, + "tpot_ms_p50": 13.3, + "tpot_ms_p90": 13.34, + "tpot_ms_p99": 13.37, + "peak_memory_gb": null, + "elapsed_seconds_median": 599.3 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "08:11:09", + "run_id": "4f45791f", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T07:51:10.821530+00:00", + "benchmark_end_time": "2026-05-09T08:11:09.370299+00:00", + "benchmark_elapsed_minutes": 20.0, + "model_load_seconds": 136.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/offline/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/offline/result.json new file mode 100644 index 00000000..ad6ea168 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/offline/result.json @@ -0,0 +1,154 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 69.59, + "throughput_tokens_per_sec_per_chip": 69.59, + "elapsed_seconds_median": 184.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 69.6, + "throughput_tokens_per_sec_per_chip": 69.6, + "elapsed_seconds_median": 184.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "07:47:42", + "run_id": "4f45791f", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T07:29:11.140810+00:00", + "benchmark_end_time": "2026-05-09T07:47:42.068128+00:00", + "benchmark_elapsed_minutes": 18.5, + "model_load_seconds": 80.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/online/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/online/result.json new file mode 100644 index 00000000..a86a4c36 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/online/result.json @@ -0,0 +1,168 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 83128.05, + "ttft_ms_p90": 154238.99, + "ttft_ms_p99": 170546.4, + "tpot_ms_p50": 128.57, + "tpot_ms_p90": 211.06, + "tpot_ms_p99": 226.9, + "elapsed_seconds_median": 362.7, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 116443.21, + "ttft_ms_p90": 209635.11, + "ttft_ms_p99": 237132.86, + "tpot_ms_p50": 128.45, + "tpot_ms_p90": 210.75, + "tpot_ms_p99": 226.84, + "elapsed_seconds_median": 358.6, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 150859.34, + "ttft_ms_p90": 267250.36, + "ttft_ms_p99": 298587.01, + "tpot_ms_p50": 128.52, + "tpot_ms_p90": 210.81, + "tpot_ms_p99": 227.0, + "elapsed_seconds_median": 356.2, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "09:24:50", + "run_id": "4f45791f", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T08:48:55.526410+00:00", + "benchmark_end_time": "2026-05-09T09:24:50.718566+00:00", + "benchmark_elapsed_minutes": 35.9, + "model_load_seconds": 160.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/result.json new file mode 100644 index 00000000..5eb9289c --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/result.json @@ -0,0 +1,551 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "interactive", + "sustained", + "online", + "speculative" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 2, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 69.59, + "throughput_tokens_per_sec_per_chip": 69.59, + "elapsed_seconds_median": 184.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 69.6, + "throughput_tokens_per_sec_per_chip": 69.6, + "elapsed_seconds_median": 184.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "interactive": { + "ttft_ms_p50": 2971.73, + "ttft_ms_p90": 3094.21, + "ttft_ms_p99": 3156.35, + "tpot_ms_p50": 13.3, + "tpot_ms_p90": 13.34, + "tpot_ms_p99": 13.37, + "peak_memory_gb": null, + "elapsed_seconds_median": 599.3 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 30.0, + "tokens_out": 1800, + "tokens_in": 0, + "requests_completed": 8, + "ttft_ms_p50": 21060.2, + "ttft_ms_p99": 30620.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12973.6, + "ttft_ms_p99": 23038.0 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12726.7, + "ttft_ms_p99": 22480.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12784.9, + "ttft_ms_p99": 22623.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13034.8, + "ttft_ms_p99": 23137.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12580.3, + "ttft_ms_p99": 22602.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 89.4, + "tokens_out": 5400, + "tokens_in": 0, + "requests_completed": 24, + "ttft_ms_p50": 12807.2, + "ttft_ms_p99": 22623.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.2, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13020.5, + "ttft_ms_p99": 23312.8 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.2, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12557.1, + "ttft_ms_p99": 22757.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12436.6, + "ttft_ms_p99": 22553.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13014.2, + "ttft_ms_p99": 23107.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12519.8, + "ttft_ms_p99": 22658.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12681.3, + "ttft_ms_p99": 22718.9 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13039.8, + "ttft_ms_p99": 23137.1 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12753.6, + "ttft_ms_p99": 22520.4 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12791.4, + "ttft_ms_p99": 22658.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13051.0, + "ttft_ms_p99": 23138.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12589.1, + "ttft_ms_p99": 22595.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12810.6, + "ttft_ms_p99": 22597.2 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13023.1, + "ttft_ms_p99": 23253.2 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12585.5, + "ttft_ms_p99": 22615.8 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12514.3, + "ttft_ms_p99": 22785.1 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12850.5, + "ttft_ms_p99": 23035.8 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12578.8, + "ttft_ms_p99": 22827.1 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12602.0, + "ttft_ms_p99": 22692.5 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12942.4, + "ttft_ms_p99": 22964.2 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12986.3, + "ttft_ms_p99": 23014.5 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12792.2, + "ttft_ms_p99": 22573.6 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12968.9, + "ttft_ms_p99": 22869.8 + } + ], + "sustained_throughput_tokens_per_sec": 61.1, + "throttle_ratio": 0.671, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": -168.2 + }, + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 83128.05, + "ttft_ms_p90": 154238.99, + "ttft_ms_p99": 170546.4, + "tpot_ms_p50": 128.57, + "tpot_ms_p90": 211.06, + "tpot_ms_p99": 226.9, + "elapsed_seconds_median": 362.7, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 116443.21, + "ttft_ms_p90": 209635.11, + "ttft_ms_p99": 237132.86, + "tpot_ms_p50": 128.45, + "tpot_ms_p90": 210.75, + "tpot_ms_p99": 226.84, + "elapsed_seconds_median": 358.6, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 150859.34, + "ttft_ms_p90": 267250.36, + "ttft_ms_p99": 298587.01, + "tpot_ms_p50": 128.52, + "tpot_ms_p90": 210.81, + "tpot_ms_p99": 227.0, + "elapsed_seconds_median": 356.2, + "sla_met": false + } + ] + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 45.04, + "throughput_tokens_per_sec_per_chip": 45.04, + "elapsed_seconds_median": 285.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 45.04, + "throughput_tokens_per_sec_per_chip": 45.04, + "elapsed_seconds_median": 285.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "07:47:42", + "run_id": "4f45791f", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T07:29:11.140810+00:00", + "benchmark_end_time": "2026-05-09T07:47:42.068128+00:00", + "benchmark_elapsed_minutes": 133.6, + "model_load_seconds": 80.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'interactive', 'sustained', 'online', 'speculative'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/offline", + "interactive": "results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/interactive", + "sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/sustained", + "online": "results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/online", + "speculative": "results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/speculative" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/speculative/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/speculative/result.json new file mode 100644 index 00000000..ab653100 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/speculative/result.json @@ -0,0 +1,154 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "speculative", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 45.04, + "throughput_tokens_per_sec_per_chip": 45.04, + "elapsed_seconds_median": 285.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 45.04, + "throughput_tokens_per_sec_per_chip": 45.04, + "elapsed_seconds_median": 285.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "09:57:54", + "run_id": "4f45791f", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T09:29:15.832962+00:00", + "benchmark_end_time": "2026-05-09T09:57:54.181380+00:00", + "benchmark_elapsed_minutes": 28.6, + "model_load_seconds": 172.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/sustained/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/sustained/result.json new file mode 100644 index 00000000..bb68a8c6 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f/sustained/result.json @@ -0,0 +1,428 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T07:24:11.439226+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 30.0, + "tokens_out": 1800, + "tokens_in": 0, + "requests_completed": 8, + "ttft_ms_p50": 21060.2, + "ttft_ms_p99": 30620.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12973.6, + "ttft_ms_p99": 23038.0 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12726.7, + "ttft_ms_p99": 22480.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12784.9, + "ttft_ms_p99": 22623.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13034.8, + "ttft_ms_p99": 23137.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12580.3, + "ttft_ms_p99": 22602.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 89.4, + "tokens_out": 5400, + "tokens_in": 0, + "requests_completed": 24, + "ttft_ms_p50": 12807.2, + "ttft_ms_p99": 22623.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.2, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13020.5, + "ttft_ms_p99": 23312.8 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.2, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12557.1, + "ttft_ms_p99": 22757.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12436.6, + "ttft_ms_p99": 22553.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13014.2, + "ttft_ms_p99": 23107.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12519.8, + "ttft_ms_p99": 22658.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12681.3, + "ttft_ms_p99": 22718.9 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13039.8, + "ttft_ms_p99": 23137.1 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12753.6, + "ttft_ms_p99": 22520.4 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12791.4, + "ttft_ms_p99": 22658.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13051.0, + "ttft_ms_p99": 23138.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12589.1, + "ttft_ms_p99": 22595.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12810.6, + "ttft_ms_p99": 22597.2 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13023.1, + "ttft_ms_p99": 23253.2 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12585.5, + "ttft_ms_p99": 22615.8 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12514.3, + "ttft_ms_p99": 22785.1 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12850.5, + "ttft_ms_p99": 23035.8 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12578.8, + "ttft_ms_p99": 22827.1 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12602.0, + "ttft_ms_p99": 22692.5 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12942.4, + "ttft_ms_p99": 22964.2 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12986.3, + "ttft_ms_p99": 23014.5 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12792.2, + "ttft_ms_p99": 22573.6 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12968.9, + "ttft_ms_p99": 22869.8 + } + ], + "sustained_throughput_tokens_per_sec": 61.1, + "throttle_ratio": 0.671, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": -168.2 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "08:45:00", + "run_id": "4f45791f", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_4f45791f", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T08:14:26.663896+00:00", + "benchmark_end_time": "2026-05-09T08:45:00.753253+00:00", + "benchmark_elapsed_minutes": 30.6, + "model_load_seconds": 130.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/accuracy/accuracy.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/accuracy/accuracy.json similarity index 100% rename from results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/accuracy/accuracy.json rename to results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/accuracy/accuracy.json diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/env_info.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/env_info.json new file mode 100644 index 00000000..df3069ce --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-09T05:17:28.177530+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/interactive/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/interactive/result.json new file mode 100644 index 00000000..4fb147b5 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:17:28.177530+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 17.46, + "ttft_ms_p90": 18.77, + "ttft_ms_p99": 22.71, + "tpot_ms_p50": 1.76, + "tpot_ms_p90": 1.78, + "tpot_ms_p99": 1.85, + "peak_memory_gb": null, + "elapsed_seconds_median": 53.3 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:30:27", + "run_id": "280f3db2", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:27:45.855974+00:00", + "benchmark_end_time": "2026-05-09T05:30:27.695194+00:00", + "benchmark_elapsed_minutes": 2.7, + "model_load_seconds": 66.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/offline/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/offline/result.json new file mode 100644 index 00000000..cb12a852 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/offline/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:17:28.177530+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 34907.16, + "throughput_tokens_per_sec_per_chip": 34907.16, + "elapsed_seconds_median": 1.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 35318.72, + "throughput_tokens_per_sec_per_chip": 35318.72, + "elapsed_seconds_median": 1.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 35341.25, + "throughput_tokens_per_sec_per_chip": 35341.25, + "elapsed_seconds_median": 1.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:22:34", + "run_id": "280f3db2", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:22:15.032061+00:00", + "benchmark_end_time": "2026-05-09T05:22:34.478538+00:00", + "benchmark_elapsed_minutes": 0.3, + "model_load_seconds": 60.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/online/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/online/result.json new file mode 100644 index 00000000..cece4dc2 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/online/result.json @@ -0,0 +1,156 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:17:28.177530+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 19.48, + "ttft_ms_p90": 32.59, + "ttft_ms_p99": 2242.09, + "tpot_ms_p50": 2.43, + "tpot_ms_p90": 2.85, + "tpot_ms_p99": 6.7, + "elapsed_seconds_median": 31.9, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 28.79, + "ttft_ms_p90": 37.57, + "ttft_ms_p99": 44.56, + "tpot_ms_p50": 4.56, + "tpot_ms_p90": 5.59, + "tpot_ms_p99": 9.8, + "elapsed_seconds_median": 7.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:26:01", + "run_id": "280f3db2", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:24:02.675359+00:00", + "benchmark_end_time": "2026-05-09T05:26:01.825237+00:00", + "benchmark_elapsed_minutes": 2.0, + "model_load_seconds": 55.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/result.json new file mode 100644 index 00000000..4e34a091 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/result.json @@ -0,0 +1,371 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:17:28.177530+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 34907.16, + "throughput_tokens_per_sec_per_chip": 34907.16, + "elapsed_seconds_median": 1.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 35318.72, + "throughput_tokens_per_sec_per_chip": 35318.72, + "elapsed_seconds_median": 1.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 35341.25, + "throughput_tokens_per_sec_per_chip": 35341.25, + "elapsed_seconds_median": 1.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 19.48, + "ttft_ms_p90": 32.59, + "ttft_ms_p99": 2242.09, + "tpot_ms_p50": 2.43, + "tpot_ms_p90": 2.85, + "tpot_ms_p99": 6.7, + "elapsed_seconds_median": 31.9, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 28.79, + "ttft_ms_p90": 37.57, + "ttft_ms_p99": 44.56, + "tpot_ms_p50": 4.56, + "tpot_ms_p90": 5.59, + "tpot_ms_p99": 9.8, + "elapsed_seconds_median": 7.9, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 17.46, + "ttft_ms_p90": 18.77, + "ttft_ms_p99": 22.71, + "tpot_ms_p50": 1.76, + "tpot_ms_p90": 1.78, + "tpot_ms_p99": 1.85, + "peak_memory_gb": null, + "elapsed_seconds_median": 53.3 + }, + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7020.2, + "tokens_out": 421272, + "tokens_in": 0, + "requests_completed": 2260, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 3103.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7497.5, + "tokens_out": 449990, + "tokens_in": 0, + "requests_completed": 2402, + "ttft_ms_p50": 25.1, + "ttft_ms_p99": 44.6 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7463.0, + "tokens_out": 447695, + "tokens_in": 0, + "requests_completed": 2397, + "ttft_ms_p50": 22.7, + "ttft_ms_p99": 42.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7409.6, + "tokens_out": 444555, + "tokens_in": 0, + "requests_completed": 2372, + "ttft_ms_p50": 23.2, + "ttft_ms_p99": 44.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7414.8, + "tokens_out": 445029, + "tokens_in": 0, + "requests_completed": 2374, + "ttft_ms_p50": 22.1, + "ttft_ms_p99": 42.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7398.7, + "tokens_out": 443888, + "tokens_in": 0, + "requests_completed": 2366, + "ttft_ms_p50": 22.9, + "ttft_ms_p99": 43.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7454.4, + "tokens_out": 447128, + "tokens_in": 0, + "requests_completed": 2389, + "ttft_ms_p50": 22.5, + "ttft_ms_p99": 43.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7395.5, + "tokens_out": 443937, + "tokens_in": 0, + "requests_completed": 2372, + "ttft_ms_p50": 22.8, + "ttft_ms_p99": 43.1 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7477.0, + "tokens_out": 448440, + "tokens_in": 0, + "requests_completed": 2398, + "ttft_ms_p50": 26.0, + "ttft_ms_p99": 43.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7481.1, + "tokens_out": 449028, + "tokens_in": 0, + "requests_completed": 2403, + "ttft_ms_p50": 22.7, + "ttft_ms_p99": 44.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7420.7, + "tokens_out": 445294, + "tokens_in": 0, + "requests_completed": 2376, + "ttft_ms_p50": 21.6, + "ttft_ms_p99": 44.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7434.3, + "tokens_out": 446024, + "tokens_in": 0, + "requests_completed": 2386, + "ttft_ms_p50": 24.1, + "ttft_ms_p99": 44.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7485.4, + "tokens_out": 448857, + "tokens_in": 0, + "requests_completed": 2392, + "ttft_ms_p50": 22.7, + "ttft_ms_p99": 43.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7420.0, + "tokens_out": 445468, + "tokens_in": 0, + "requests_completed": 2379, + "ttft_ms_p50": 22.4, + "ttft_ms_p99": 43.8 + } + ], + "sustained_throughput_tokens_per_sec": 7412.3, + "throttle_ratio": 0.936, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -3059.6 + } + }, + "accuracy": { + "subset_score": 0.41, + "baseline_delta": 0.03, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:22:34", + "run_id": "280f3db2", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:22:15.032061+00:00", + "benchmark_end_time": "2026-05-09T05:22:34.478538+00:00", + "benchmark_elapsed_minutes": 20.0, + "model_load_seconds": 60.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/offline", + "online": "results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/online", + "interactive": "results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/interactive", + "sustained": "results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/sustained/result.json b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/sustained/result.json new file mode 100644 index 00000000..0c601e54 --- /dev/null +++ b/results/community/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-09T05:17:28.177530+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\tNODE\t64-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7020.2, + "tokens_out": 421272, + "tokens_in": 0, + "requests_completed": 2260, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 3103.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7497.5, + "tokens_out": 449990, + "tokens_in": 0, + "requests_completed": 2402, + "ttft_ms_p50": 25.1, + "ttft_ms_p99": 44.6 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7463.0, + "tokens_out": 447695, + "tokens_in": 0, + "requests_completed": 2397, + "ttft_ms_p50": 22.7, + "ttft_ms_p99": 42.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7409.6, + "tokens_out": 444555, + "tokens_in": 0, + "requests_completed": 2372, + "ttft_ms_p50": 23.2, + "ttft_ms_p99": 44.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7414.8, + "tokens_out": 445029, + "tokens_in": 0, + "requests_completed": 2374, + "ttft_ms_p50": 22.1, + "ttft_ms_p99": 42.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7398.7, + "tokens_out": 443888, + "tokens_in": 0, + "requests_completed": 2366, + "ttft_ms_p50": 22.9, + "ttft_ms_p99": 43.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7454.4, + "tokens_out": 447128, + "tokens_in": 0, + "requests_completed": 2389, + "ttft_ms_p50": 22.5, + "ttft_ms_p99": 43.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7395.5, + "tokens_out": 443937, + "tokens_in": 0, + "requests_completed": 2372, + "ttft_ms_p50": 22.8, + "ttft_ms_p99": 43.1 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7477.0, + "tokens_out": 448440, + "tokens_in": 0, + "requests_completed": 2398, + "ttft_ms_p50": 26.0, + "ttft_ms_p99": 43.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7481.1, + "tokens_out": 449028, + "tokens_in": 0, + "requests_completed": 2403, + "ttft_ms_p50": 22.7, + "ttft_ms_p99": 44.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7420.7, + "tokens_out": 445294, + "tokens_in": 0, + "requests_completed": 2376, + "ttft_ms_p50": 21.6, + "ttft_ms_p99": 44.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7434.3, + "tokens_out": 446024, + "tokens_in": 0, + "requests_completed": 2386, + "ttft_ms_p50": 24.1, + "ttft_ms_p99": 44.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7485.4, + "tokens_out": 448857, + "tokens_in": 0, + "requests_completed": 2392, + "ttft_ms_p50": 22.7, + "ttft_ms_p99": 43.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7420.0, + "tokens_out": 445468, + "tokens_in": 0, + "requests_completed": 2379, + "ttft_ms_p50": 22.4, + "ttft_ms_p99": 43.8 + } + ], + "sustained_throughput_tokens_per_sec": 7412.3, + "throttle_ratio": 0.936, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -3059.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-09", + "time": "05:47:01", + "run_id": "280f3db2", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_280f3db2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-09T05:31:59.919623+00:00", + "benchmark_end_time": "2026-05-09T05:47:01.325708+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 55.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/accuracy/accuracy.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/accuracy/accuracy.json new file mode 100644 index 00000000..5b260195 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/burst/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/burst/result.json new file mode 100644 index 00000000..87444f16 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/burst/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 39.79, + "steady_ttft_p99_ms": 88.73, + "burst_ttft_p50_ms": 56.4, + "burst_ttft_p99_ms": 91.21, + "sla_met_during_burst": true, + "burst_degradation_ratio": 1.028, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 765.97, + "burst_ttft_p99_ms": 94.46 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 60.79, + "burst_ttft_p99_ms": 90.42 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 62.08, + "burst_ttft_p99_ms": 89.13 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:39:28", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:31:37.700771+00:00", + "benchmark_end_time": "2026-05-07T08:39:28.136330+00:00", + "benchmark_elapsed_minutes": 7.8, + "model_load_seconds": 55.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/env_info.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/env_info.json new file mode 100644 index 00000000..bf163b17 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/interactive/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/interactive/result.json new file mode 100644 index 00000000..70d8e422 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 31.14, + "ttft_ms_p90": 42.5, + "ttft_ms_p99": 58.74, + "tpot_ms_p50": 11.0, + "tpot_ms_p90": 11.05, + "tpot_ms_p99": 11.1, + "peak_memory_gb": null, + "elapsed_seconds_median": 325.3 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "07:45:43", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:29:25.547148+00:00", + "benchmark_end_time": "2026-05-07T07:45:43.129744+00:00", + "benchmark_elapsed_minutes": 16.3, + "model_load_seconds": 61.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/offline/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/offline/result.json new file mode 100644 index 00000000..a0d2d653 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/offline/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 3808.98, + "throughput_tokens_per_sec_per_chip": 3808.98, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 3850.31, + "throughput_tokens_per_sec_per_chip": 3850.31, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 3846.1, + "throughput_tokens_per_sec_per_chip": 3846.1, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "07:21:41", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:19:47.324731+00:00", + "benchmark_end_time": "2026-05-07T07:21:41.048200+00:00", + "benchmark_elapsed_minutes": 1.9, + "model_load_seconds": 59.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/online/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/online/result.json new file mode 100644 index 00000000..5b921271 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/online/result.json @@ -0,0 +1,168 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 100, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.79, + "ttft_ms_p90": 59.82, + "ttft_ms_p99": 1339.64, + "tpot_ms_p50": 12.88, + "tpot_ms_p90": 14.51, + "tpot_ms_p99": 15.86, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.32, + "ttft_ms_p90": 74.21, + "ttft_ms_p99": 86.11, + "tpot_ms_p50": 30.61, + "tpot_ms_p90": 35.77, + "tpot_ms_p99": 43.52, + "elapsed_seconds_median": 16.3, + "sla_met": true + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 52.52, + "ttft_ms_p90": 70.44, + "ttft_ms_p99": 166.6, + "tpot_ms_p50": 38.9, + "tpot_ms_p90": 51.41, + "tpot_ms_p99": 138.25, + "elapsed_seconds_median": 10.2, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "07:27:56", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:23:22.713485+00:00", + "benchmark_end_time": "2026-05-07T07:27:56.704379+00:00", + "benchmark_elapsed_minutes": 4.6, + "model_load_seconds": 73.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/result.json new file mode 100644 index 00000000..e58bf7fa --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/result.json @@ -0,0 +1,615 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained", + "speculative", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 3808.98, + "throughput_tokens_per_sec_per_chip": 3808.98, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 3850.31, + "throughput_tokens_per_sec_per_chip": 3850.31, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 3846.1, + "throughput_tokens_per_sec_per_chip": 3846.1, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 100, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.79, + "ttft_ms_p90": 59.82, + "ttft_ms_p99": 1339.64, + "tpot_ms_p50": 12.88, + "tpot_ms_p90": 14.51, + "tpot_ms_p99": 15.86, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.32, + "ttft_ms_p90": 74.21, + "ttft_ms_p99": 86.11, + "tpot_ms_p50": 30.61, + "tpot_ms_p90": 35.77, + "tpot_ms_p99": 43.52, + "elapsed_seconds_median": 16.3, + "sla_met": true + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 52.52, + "ttft_ms_p90": 70.44, + "ttft_ms_p99": 166.6, + "tpot_ms_p50": 38.9, + "tpot_ms_p90": 51.41, + "tpot_ms_p99": 138.25, + "elapsed_seconds_median": 10.2, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 31.14, + "ttft_ms_p90": 42.5, + "ttft_ms_p99": 58.74, + "tpot_ms_p50": 11.0, + "tpot_ms_p90": 11.05, + "tpot_ms_p99": 11.1, + "peak_memory_gb": null, + "elapsed_seconds_median": 325.3 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 628.4, + "tokens_out": 37735, + "tokens_in": 0, + "requests_completed": 201, + "ttft_ms_p50": 41.3, + "ttft_ms_p99": 1767.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.1, + "tokens_out": 39719, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 58.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.7, + "tokens_out": 39696, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 57.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.7, + "tokens_out": 39883, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 55.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.5, + "tokens_out": 39875, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.0, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 40.2, + "ttft_ms_p99": 58.2 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.2, + "tokens_out": 39719, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.5, + "ttft_ms_p99": 57.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.9, + "tokens_out": 39843, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.4, + "tokens_out": 39839, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.3, + "tokens_out": 39692, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.5 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.0, + "tokens_out": 39660, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39901, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 58.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.5, + "tokens_out": 39869, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 58.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.2, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 57.8 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.5, + "tokens_out": 39785, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.2, + "ttft_ms_p99": 58.3 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.4, + "tokens_out": 39925, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.2 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.3, + "tokens_out": 39683, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 667.4, + "tokens_out": 40046, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.9 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39630, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 49.5 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39633, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.7 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39891, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.5 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.7, + "tokens_out": 39946, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 56.4 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.7, + "tokens_out": 40114, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 40.2, + "ttft_ms_p99": 58.2 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.8, + "tokens_out": 39762, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.4 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.7, + "tokens_out": 40006, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 57.4 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.7, + "tokens_out": 39882, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 40.3, + "ttft_ms_p99": 58.9 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.0, + "tokens_out": 39843, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.4, + "ttft_ms_p99": 57.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39623, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 47.3 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.4, + "tokens_out": 39654, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 56.1 + } + ], + "sustained_throughput_tokens_per_sec": 664.1, + "throttle_ratio": 0.982, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2.0 + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 738.63, + "throughput_tokens_per_sec_per_chip": 738.63, + "elapsed_seconds_median": 46.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 739.28, + "throughput_tokens_per_sec_per_chip": 739.28, + "elapsed_seconds_median": 46.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 738.93, + "throughput_tokens_per_sec_per_chip": 738.93, + "elapsed_seconds_median": 46.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 39.79, + "steady_ttft_p99_ms": 88.73, + "burst_ttft_p50_ms": 56.4, + "burst_ttft_p99_ms": 91.21, + "sla_met_during_burst": true, + "burst_degradation_ratio": 1.028, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 765.97, + "burst_ttft_p99_ms": 94.46 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 60.79, + "burst_ttft_p99_ms": 90.42 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 62.08, + "burst_ttft_p99_ms": 89.13 + } + ] + } + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "07:21:41", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:19:47.324731+00:00", + "benchmark_end_time": "2026-05-07T07:21:41.048200+00:00", + "benchmark_elapsed_minutes": 70.1, + "model_load_seconds": 59.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/offline", + "online": "results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/online", + "interactive": "results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/interactive", + "sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/sustained", + "speculative": "results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/speculative", + "burst": "results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/burst" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/speculative/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/speculative/result.json new file mode 100644 index 00000000..a4acbb9e --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/speculative/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "speculative", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 738.63, + "throughput_tokens_per_sec_per_chip": 738.63, + "elapsed_seconds_median": 46.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 739.28, + "throughput_tokens_per_sec_per_chip": 739.28, + "elapsed_seconds_median": 46.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 738.93, + "throughput_tokens_per_sec_per_chip": 738.93, + "elapsed_seconds_median": 46.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:30:19", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:20:54.145018+00:00", + "benchmark_end_time": "2026-05-07T08:30:19.120188+00:00", + "benchmark_elapsed_minutes": 9.4, + "model_load_seconds": 166.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/sustained/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/sustained/result.json new file mode 100644 index 00000000..f5d98b09 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91/sustained/result.json @@ -0,0 +1,428 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:15:18.766397+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 628.4, + "tokens_out": 37735, + "tokens_in": 0, + "requests_completed": 201, + "ttft_ms_p50": 41.3, + "ttft_ms_p99": 1767.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.1, + "tokens_out": 39719, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 58.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.7, + "tokens_out": 39696, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 57.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.7, + "tokens_out": 39883, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 55.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.5, + "tokens_out": 39875, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.0, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 40.2, + "ttft_ms_p99": 58.2 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.2, + "tokens_out": 39719, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.5, + "ttft_ms_p99": 57.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.9, + "tokens_out": 39843, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.4, + "tokens_out": 39839, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.3, + "tokens_out": 39692, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.5 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.0, + "tokens_out": 39660, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39901, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 58.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.5, + "tokens_out": 39869, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 58.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.2, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 57.8 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.5, + "tokens_out": 39785, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.2, + "ttft_ms_p99": 58.3 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.4, + "tokens_out": 39925, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.2 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.3, + "tokens_out": 39683, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 58.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 667.4, + "tokens_out": 40046, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.9 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39630, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 49.5 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39633, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.7 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.8, + "tokens_out": 39891, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.5 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 665.7, + "tokens_out": 39946, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 56.4 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.7, + "tokens_out": 40114, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 40.2, + "ttft_ms_p99": 58.2 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.8, + "tokens_out": 39762, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 57.4 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.7, + "tokens_out": 40006, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 57.4 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.7, + "tokens_out": 39882, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 40.3, + "ttft_ms_p99": 58.9 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 664.0, + "tokens_out": 39843, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.4, + "ttft_ms_p99": 57.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39623, + "tokens_in": 0, + "requests_completed": 210, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 47.3 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.4, + "tokens_out": 39654, + "tokens_in": 0, + "requests_completed": 209, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 56.1 + } + ], + "sustained_throughput_tokens_per_sec": 664.1, + "throttle_ratio": 0.982, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2.0 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:17:02", + "run_id": "93928a91", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_A_nvidia_sglang_c43a8309_93928a91", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:46:58.710502+00:00", + "benchmark_end_time": "2026-05-07T08:17:02.619499+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 51.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/accuracy/accuracy.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/accuracy/accuracy.json new file mode 100644 index 00000000..37f4d824 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/offline/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/offline/result.json new file mode 100644 index 00000000..ef48e8b5 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3839.22, + "throughput_tokens_per_sec_per_chip": 3839.22, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3837.82, + "throughput_tokens_per_sec_per_chip": 3837.82, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3860.62, + "throughput_tokens_per_sec_per_chip": 3860.62, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3841.02, + "throughput_tokens_per_sec_per_chip": 3841.02, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:47:27", + "run_id": "8eb86278", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:44:56.309394+00:00", + "benchmark_end_time": "2026-05-07T08:47:27.583870+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 54.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/online/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/online/result.json new file mode 100644 index 00000000..d9959965 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.62, + "ttft_ms_p90": 60.95, + "ttft_ms_p99": 2890.67, + "tpot_ms_p50": 12.83, + "tpot_ms_p90": 14.52, + "tpot_ms_p99": 16.44, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 42.3, + "ttft_ms_p90": 58.83, + "ttft_ms_p99": 67.63, + "tpot_ms_p50": 16.09, + "tpot_ms_p90": 18.1, + "tpot_ms_p99": 19.77, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.31, + "ttft_ms_p90": 74.03, + "ttft_ms_p99": 87.69, + "tpot_ms_p50": 30.28, + "tpot_ms_p90": 35.96, + "tpot_ms_p99": 41.34, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 53.07, + "ttft_ms_p90": 76.66, + "ttft_ms_p99": 94.56, + "tpot_ms_p50": 37.4, + "tpot_ms_p90": 42.77, + "tpot_ms_p99": 61.19, + "elapsed_seconds_median": 12.2, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:55:45", + "run_id": "8eb86278", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:49:27.978151+00:00", + "benchmark_end_time": "2026-05-07T08:55:45.393807+00:00", + "benchmark_elapsed_minutes": 6.3, + "model_load_seconds": 86.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/result.json new file mode 100644 index 00000000..fc617f64 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3839.22, + "throughput_tokens_per_sec_per_chip": 3839.22, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3837.82, + "throughput_tokens_per_sec_per_chip": 3837.82, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3860.62, + "throughput_tokens_per_sec_per_chip": 3860.62, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3841.02, + "throughput_tokens_per_sec_per_chip": 3841.02, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.62, + "ttft_ms_p90": 60.95, + "ttft_ms_p99": 2890.67, + "tpot_ms_p50": 12.83, + "tpot_ms_p90": 14.52, + "tpot_ms_p99": 16.44, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 42.3, + "ttft_ms_p90": 58.83, + "ttft_ms_p99": 67.63, + "tpot_ms_p50": 16.09, + "tpot_ms_p90": 18.1, + "tpot_ms_p99": 19.77, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.31, + "ttft_ms_p90": 74.03, + "ttft_ms_p99": 87.69, + "tpot_ms_p50": 30.28, + "tpot_ms_p90": 35.96, + "tpot_ms_p99": 41.34, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 53.07, + "ttft_ms_p90": 76.66, + "ttft_ms_p99": 94.56, + "tpot_ms_p50": 37.4, + "tpot_ms_p90": 42.77, + "tpot_ms_p99": 61.19, + "elapsed_seconds_median": 12.2, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 581.4, + "tokens_out": 34896, + "tokens_in": 0, + "requests_completed": 191, + "ttft_ms_p50": 41.3, + "ttft_ms_p99": 5888.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 650.0, + "tokens_out": 39002, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 56.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 675.0, + "tokens_out": 40500, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 49.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 651.6, + "tokens_out": 39092, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.7, + "ttft_ms_p99": 56.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 667.4, + "tokens_out": 40053, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 58.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39637, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 59.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.3, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.6, + "ttft_ms_p99": 57.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.7, + "tokens_out": 39336, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 55.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.6, + "tokens_out": 40119, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 52.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 654.4, + "tokens_out": 39267, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 59.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.6, + "tokens_out": 39823, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 656.1, + "tokens_out": 39344, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.9, + "tokens_out": 40024, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.6 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.7, + "tokens_out": 39767, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.0 + } + ], + "sustained_throughput_tokens_per_sec": 656.2, + "throttle_ratio": 0.861, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -5832.1 + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:47:27", + "run_id": "8eb86278", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:44:56.309394+00:00", + "benchmark_end_time": "2026-05-07T08:47:27.583870+00:00", + "benchmark_elapsed_minutes": 23.9, + "model_load_seconds": 54.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/offline", + "online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/online", + "sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/sustained/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/sustained/result.json new file mode 100644 index 00000000..951f6b69 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 581.4, + "tokens_out": 34896, + "tokens_in": 0, + "requests_completed": 191, + "ttft_ms_p50": 41.3, + "ttft_ms_p99": 5888.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 650.0, + "tokens_out": 39002, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 56.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 675.0, + "tokens_out": 40500, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 49.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 651.6, + "tokens_out": 39092, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.7, + "ttft_ms_p99": 56.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 667.4, + "tokens_out": 40053, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 58.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39637, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 59.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.3, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.6, + "ttft_ms_p99": 57.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.7, + "tokens_out": 39336, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 55.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.6, + "tokens_out": 40119, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 52.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 654.4, + "tokens_out": 39267, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 59.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.6, + "tokens_out": 39823, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 656.1, + "tokens_out": 39344, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.9, + "tokens_out": 40024, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.6 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.7, + "tokens_out": 39767, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.0 + } + ], + "sustained_throughput_tokens_per_sec": 656.2, + "throttle_ratio": 0.861, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -5832.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "09:13:45", + "run_id": "8eb86278", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:58:41.478087+00:00", + "benchmark_end_time": "2026-05-07T09:13:45.970102+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 117.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/env_info.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/env_info.json new file mode 100644 index 00000000..dce5e92c --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/result.json new file mode 100644 index 00000000..0d63e281 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/result.json @@ -0,0 +1,963 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original", + "_note": "suite model_id. Each precision level uses its own quantized checkpoint." + }, + "task": { + "scenarios_run": [ + "accuracy", + "offline", + "online", + "sustained" + ], + "precision_levels_run": [ + "BF16", + "FP8", + "W8A8", + "W8A16", + "W4A16" + ], + "precision_levels_skipped": [ + "FP16" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "quantization": { + "results_by_precision": [ + { + "precision": "BF16", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "best_throughput_tokens_per_sec": 3860.62, + "accuracy_score": 0.55, + "accuracy_baseline_delta": -0.01, + "accuracy_valid": true, + "quality_efficiency": 2123.3, + "speedup_vs_bf16": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3839.22, + "throughput_tokens_per_sec_per_chip": 3839.22, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3837.82, + "throughput_tokens_per_sec_per_chip": 3837.82, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3860.62, + "throughput_tokens_per_sec_per_chip": 3860.62, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3841.02, + "throughput_tokens_per_sec_per_chip": 3841.02, + "elapsed_seconds_median": 9.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "bf16", + "effective_dtype": "bfloat16", + "quantization_method": null + }, + { + "precision": "W8A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "best_throughput_tokens_per_sec": 4024.12, + "accuracy_score": 0.59, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 2374.2, + "speedup_vs_bf16": 1.042, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 4024.12, + "throughput_tokens_per_sec_per_chip": 4024.12, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4018.97, + "throughput_tokens_per_sec_per_chip": 4018.97, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4015.67, + "throughput_tokens_per_sec_per_chip": 4015.67, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4012.73, + "throughput_tokens_per_sec_per_chip": 4012.73, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W4A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "best_throughput_tokens_per_sec": 2227.61, + "accuracy_score": 0.57, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 1269.7, + "speedup_vs_bf16": 0.577, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2210.43, + "throughput_tokens_per_sec_per_chip": 2210.43, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2147.56, + "throughput_tokens_per_sec_per_chip": 2147.56, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2227.61, + "throughput_tokens_per_sec_per_chip": 2227.61, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2158.28, + "throughput_tokens_per_sec_per_chip": 2158.28, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w4a16", + "effective_dtype": "auto", + "quantization_method": "gptq" + } + ] + }, + "derived": {}, + "quantization_online": { + "results_by_precision": [ + { + "precision": "BF16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.62, + "ttft_ms_p90": 60.95, + "ttft_ms_p99": 2890.67, + "tpot_ms_p50": 12.83, + "tpot_ms_p90": 14.52, + "tpot_ms_p99": 16.44, + "elapsed_seconds_median": 65.5, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 42.3, + "ttft_ms_p90": 58.83, + "ttft_ms_p99": 67.63, + "tpot_ms_p50": 16.09, + "tpot_ms_p90": 18.1, + "tpot_ms_p99": 19.77, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.31, + "ttft_ms_p90": 74.03, + "ttft_ms_p99": 87.69, + "tpot_ms_p50": 30.28, + "tpot_ms_p90": 35.96, + "tpot_ms_p99": 41.34, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 53.07, + "ttft_ms_p90": 76.66, + "ttft_ms_p99": 94.56, + "tpot_ms_p50": 37.4, + "tpot_ms_p90": 42.77, + "tpot_ms_p99": 61.19, + "elapsed_seconds_median": 12.2, + "sla_met": true + } + ] + }, + { + "precision": "W8A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 34.62, + "ttft_ms_p90": 60.74, + "ttft_ms_p99": 1546.3, + "tpot_ms_p50": 8.35, + "tpot_ms_p90": 9.42, + "tpot_ms_p99": 11.52, + "elapsed_seconds_median": 64.6, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 36.02, + "ttft_ms_p90": 52.89, + "ttft_ms_p99": 60.18, + "tpot_ms_p50": 11.06, + "tpot_ms_p90": 12.37, + "tpot_ms_p99": 13.91, + "elapsed_seconds_median": 31.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.44, + "ttft_ms_p90": 78.16, + "ttft_ms_p99": 93.72, + "tpot_ms_p50": 30.62, + "tpot_ms_p90": 36.67, + "tpot_ms_p99": 43.08, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 54.19, + "ttft_ms_p90": 81.26, + "ttft_ms_p99": 99.23, + "tpot_ms_p50": 40.84, + "tpot_ms_p90": 47.97, + "tpot_ms_p99": 73.29, + "elapsed_seconds_median": 12.8, + "sla_met": true + } + ] + }, + { + "precision": "W4A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 55.27, + "ttft_ms_p90": 99.64, + "ttft_ms_p99": 6498.74, + "tpot_ms_p50": 22.31, + "tpot_ms_p90": 34.92, + "tpot_ms_p99": 43.43, + "elapsed_seconds_median": 66.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 62.97, + "ttft_ms_p90": 84.16, + "ttft_ms_p99": 98.61, + "tpot_ms_p50": 35.03, + "tpot_ms_p90": 36.27, + "tpot_ms_p99": 39.72, + "elapsed_seconds_median": 34.8, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 61.87, + "ttft_ms_p90": 95.61, + "ttft_ms_p99": 113.42, + "tpot_ms_p50": 45.62, + "tpot_ms_p90": 50.37, + "tpot_ms_p99": 56.81, + "elapsed_seconds_median": 19.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 58.8, + "ttft_ms_p90": 92.82, + "ttft_ms_p99": 117.29, + "tpot_ms_p50": 49.13, + "tpot_ms_p90": 57.9, + "tpot_ms_p99": 74.91, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + } + ] + }, + "quantization_sustained": { + "results_by_precision": [ + { + "precision": "BF16", + "sustained_throughput_tokens_per_sec": 656.2, + "throttle_ratio": 0.861, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -5832.1, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 581.4, + "tokens_out": 34896, + "tokens_in": 0, + "requests_completed": 191, + "ttft_ms_p50": 41.3, + "ttft_ms_p99": 5888.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 650.0, + "tokens_out": 39002, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 40.0, + "ttft_ms_p99": 56.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 675.0, + "tokens_out": 40500, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 49.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 651.6, + "tokens_out": 39092, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.7, + "ttft_ms_p99": 56.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 667.4, + "tokens_out": 40053, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 58.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.5, + "tokens_out": 39637, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 59.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 672.3, + "tokens_out": 40332, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.6, + "ttft_ms_p99": 57.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.7, + "tokens_out": 39336, + "tokens_in": 0, + "requests_completed": 213, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 55.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 668.6, + "tokens_out": 40119, + "tokens_in": 0, + "requests_completed": 216, + "ttft_ms_p50": 40.1, + "ttft_ms_p99": 52.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 654.4, + "tokens_out": 39267, + "tokens_in": 0, + "requests_completed": 214, + "ttft_ms_p50": 39.8, + "ttft_ms_p99": 59.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.6, + "tokens_out": 39823, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 656.1, + "tokens_out": 39344, + "tokens_in": 0, + "requests_completed": 212, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 666.9, + "tokens_out": 40024, + "tokens_in": 0, + "requests_completed": 217, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.6 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.7, + "tokens_out": 39767, + "tokens_in": 0, + "requests_completed": 215, + "ttft_ms_p50": 39.9, + "ttft_ms_p99": 56.0 + } + ] + }, + { + "precision": "W8A16", + "sustained_throughput_tokens_per_sec": 953.4, + "throttle_ratio": 0.919, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2106.5, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 889.3, + "tokens_out": 53371, + "tokens_in": 0, + "requests_completed": 293, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 2159.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 957.0, + "tokens_out": 57429, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 52.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 962.6, + "tokens_out": 57761, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 51.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.0, + "tokens_out": 57366, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.3 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 963.5, + "tokens_out": 57774, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 50.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 950.6, + "tokens_out": 57041, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 51.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 957.8, + "tokens_out": 57474, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.9, + "ttft_ms_p99": 53.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 962.7, + "tokens_out": 57767, + "tokens_in": 0, + "requests_completed": 315, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 951.5, + "tokens_out": 57098, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 967.2, + "tokens_out": 58021, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.2, + "tokens_out": 57378, + "tokens_in": 0, + "requests_completed": 317, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 953.3, + "tokens_out": 57190, + "tokens_in": 0, + "requests_completed": 310, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 52.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 965.3, + "tokens_out": 57931, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.5, + "tokens_out": 57268, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 53.0 + } + ] + }, + { + "precision": "W4A16", + "sustained_throughput_tokens_per_sec": 776.2, + "throttle_ratio": 0.828, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -7501.6, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 656.9, + "tokens_out": 39423, + "tokens_in": 0, + "requests_completed": 221, + "ttft_ms_p50": 40.5, + "ttft_ms_p99": 7553.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 777.5, + "tokens_out": 46657, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 51.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.5, + "tokens_out": 46961, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.6, + "ttft_ms_p99": 51.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 793.8, + "tokens_out": 47629, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 51.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 785.2, + "tokens_out": 47106, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 53.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.6, + "tokens_out": 46958, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 52.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 784.8, + "tokens_out": 47066, + "tokens_in": 0, + "requests_completed": 260, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 51.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 793.7, + "tokens_out": 47628, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 779.5, + "tokens_out": 46778, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 54.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 781.4, + "tokens_out": 46903, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 52.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 790.8, + "tokens_out": 47444, + "tokens_in": 0, + "requests_completed": 267, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 788.0, + "tokens_out": 47284, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 53.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.9, + "tokens_out": 46949, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 787.0, + "tokens_out": 47223, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.1, + "ttft_ms_p99": 51.7 + } + ] + } + ] + } + }, + "accuracy": null, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:47:27", + "run_id": "8eb86278", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:44:56.309394+00:00", + "benchmark_end_time": "2026-05-07T08:47:27.583870+00:00", + "benchmark_elapsed_minutes": 73.6, + "model_load_seconds": 54.4, + "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", + "scenario_dirs": { + "bf16/offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/offline", + "bf16/online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/online", + "bf16/sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/bf16/sustained", + "fp8/offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/fp8/offline", + "fp8/online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/fp8/online", + "fp8/sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/fp8/sustained", + "w8a8/offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a8/offline", + "w8a8/online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a8/online", + "w8a8/sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a8/sustained", + "w8a16/offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/offline", + "w8a16/online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/online", + "w8a16/sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/sustained", + "w4a16/offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/offline", + "w4a16/online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/online", + "w4a16/sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/sustained" + }, + "precision_dirs": { + "BF16": "bf16", + "FP8": "fp8", + "W8A8": "w8a8", + "W8A16": "w8a16", + "W4A16": "w4a16" + }, + "precision_model_map": { + "BF16": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dtype_override": "bfloat16" + }, + "FP8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." + }, + "W8A8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." + }, + "W8A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights, FP16 activations. Weight-only quantization — reduces memory bandwidth, not compute dtype." + }, + "W4A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "engine_kwargs": { + "quantization": "gptq" + }, + "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization — larger memory saving than W8A16." + } + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/accuracy/accuracy.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/accuracy/accuracy.json new file mode 100644 index 00000000..e2c86fd4 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.57, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/offline/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/offline/result.json new file mode 100644 index 00000000..569537fc --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2210.43, + "throughput_tokens_per_sec_per_chip": 2210.43, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2147.56, + "throughput_tokens_per_sec_per_chip": 2147.56, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2227.61, + "throughput_tokens_per_sec_per_chip": 2227.61, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2158.28, + "throughput_tokens_per_sec_per_chip": 2158.28, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:03:43", + "run_id": "b616229d", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_b616229d", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:59:26.530242+00:00", + "benchmark_end_time": "2026-05-07T10:03:43.484063+00:00", + "benchmark_elapsed_minutes": 4.3, + "model_load_seconds": 55.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/online/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/online/result.json new file mode 100644 index 00000000..81812dec --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 55.27, + "ttft_ms_p90": 99.64, + "ttft_ms_p99": 6498.74, + "tpot_ms_p50": 22.31, + "tpot_ms_p90": 34.92, + "tpot_ms_p99": 43.43, + "elapsed_seconds_median": 66.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 62.97, + "ttft_ms_p90": 84.16, + "ttft_ms_p99": 98.61, + "tpot_ms_p50": 35.03, + "tpot_ms_p90": 36.27, + "tpot_ms_p99": 39.72, + "elapsed_seconds_median": 34.8, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 61.87, + "ttft_ms_p90": 95.61, + "ttft_ms_p99": 113.42, + "tpot_ms_p50": 45.62, + "tpot_ms_p90": 50.37, + "tpot_ms_p99": 56.81, + "elapsed_seconds_median": 19.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 58.8, + "ttft_ms_p90": 92.82, + "ttft_ms_p99": 117.29, + "tpot_ms_p50": 49.13, + "tpot_ms_p90": 57.9, + "tpot_ms_p99": 74.91, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:13:40", + "run_id": "b616229d", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_b616229d", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T10:06:55.540921+00:00", + "benchmark_end_time": "2026-05-07T10:13:40.174051+00:00", + "benchmark_elapsed_minutes": 6.7, + "model_load_seconds": 131.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/result.json new file mode 100644 index 00000000..b2cc4ef6 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2210.43, + "throughput_tokens_per_sec_per_chip": 2210.43, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2147.56, + "throughput_tokens_per_sec_per_chip": 2147.56, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2227.61, + "throughput_tokens_per_sec_per_chip": 2227.61, + "elapsed_seconds_median": 15.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2158.28, + "throughput_tokens_per_sec_per_chip": 2158.28, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 55.27, + "ttft_ms_p90": 99.64, + "ttft_ms_p99": 6498.74, + "tpot_ms_p50": 22.31, + "tpot_ms_p90": 34.92, + "tpot_ms_p99": 43.43, + "elapsed_seconds_median": 66.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 62.97, + "ttft_ms_p90": 84.16, + "ttft_ms_p99": 98.61, + "tpot_ms_p50": 35.03, + "tpot_ms_p90": 36.27, + "tpot_ms_p99": 39.72, + "elapsed_seconds_median": 34.8, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 61.87, + "ttft_ms_p90": 95.61, + "ttft_ms_p99": 113.42, + "tpot_ms_p50": 45.62, + "tpot_ms_p90": 50.37, + "tpot_ms_p99": 56.81, + "elapsed_seconds_median": 19.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 58.8, + "ttft_ms_p90": 92.82, + "ttft_ms_p99": 117.29, + "tpot_ms_p50": 49.13, + "tpot_ms_p90": 57.9, + "tpot_ms_p99": 74.91, + "elapsed_seconds_median": 14.9, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 656.9, + "tokens_out": 39423, + "tokens_in": 0, + "requests_completed": 221, + "ttft_ms_p50": 40.5, + "ttft_ms_p99": 7553.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 777.5, + "tokens_out": 46657, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 51.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.5, + "tokens_out": 46961, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.6, + "ttft_ms_p99": 51.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 793.8, + "tokens_out": 47629, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 51.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 785.2, + "tokens_out": 47106, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 53.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.6, + "tokens_out": 46958, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 52.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 784.8, + "tokens_out": 47066, + "tokens_in": 0, + "requests_completed": 260, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 51.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 793.7, + "tokens_out": 47628, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 779.5, + "tokens_out": 46778, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 54.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 781.4, + "tokens_out": 46903, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 52.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 790.8, + "tokens_out": 47444, + "tokens_in": 0, + "requests_completed": 267, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 788.0, + "tokens_out": 47284, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 53.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.9, + "tokens_out": 46949, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 787.0, + "tokens_out": 47223, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.1, + "ttft_ms_p99": 51.7 + } + ], + "sustained_throughput_tokens_per_sec": 776.2, + "throttle_ratio": 0.828, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -7501.6 + } + }, + "accuracy": { + "subset_score": 0.57, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:03:43", + "run_id": "b616229d", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_b616229d", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:59:26.530242+00:00", + "benchmark_end_time": "2026-05-07T10:03:43.484063+00:00", + "benchmark_elapsed_minutes": 26.0, + "model_load_seconds": 55.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/offline", + "online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/online", + "sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/sustained/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/sustained/result.json new file mode 100644 index 00000000..9b214d44 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w4a16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 656.9, + "tokens_out": 39423, + "tokens_in": 0, + "requests_completed": 221, + "ttft_ms_p50": 40.5, + "ttft_ms_p99": 7553.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 777.5, + "tokens_out": 46657, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 51.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.5, + "tokens_out": 46961, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.6, + "ttft_ms_p99": 51.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 793.8, + "tokens_out": 47629, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 51.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 785.2, + "tokens_out": 47106, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 53.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.6, + "tokens_out": 46958, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 52.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 784.8, + "tokens_out": 47066, + "tokens_in": 0, + "requests_completed": 260, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 51.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 793.7, + "tokens_out": 47628, + "tokens_in": 0, + "requests_completed": 266, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 779.5, + "tokens_out": 46778, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 54.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 781.4, + "tokens_out": 46903, + "tokens_in": 0, + "requests_completed": 263, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 52.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 790.8, + "tokens_out": 47444, + "tokens_in": 0, + "requests_completed": 267, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 788.0, + "tokens_out": 47284, + "tokens_in": 0, + "requests_completed": 262, + "ttft_ms_p50": 35.7, + "ttft_ms_p99": 53.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 782.9, + "tokens_out": 46949, + "tokens_in": 0, + "requests_completed": 265, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 53.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 787.0, + "tokens_out": 47223, + "tokens_in": 0, + "requests_completed": 264, + "ttft_ms_p50": 35.1, + "ttft_ms_p99": 51.7 + } + ], + "sustained_throughput_tokens_per_sec": 776.2, + "throttle_ratio": 0.828, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -7501.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:32:02", + "run_id": "b616229d", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_b616229d", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T10:17:00.381932+00:00", + "benchmark_end_time": "2026-05-07T10:32:02.983113+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 136.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/accuracy/accuracy.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/accuracy/accuracy.json new file mode 100644 index 00000000..60c0d5fb --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/offline/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/offline/result.json new file mode 100644 index 00000000..251c36d4 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 4024.12, + "throughput_tokens_per_sec_per_chip": 4024.12, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4018.97, + "throughput_tokens_per_sec_per_chip": 4018.97, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4015.67, + "throughput_tokens_per_sec_per_chip": 4015.67, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4012.73, + "throughput_tokens_per_sec_per_chip": 4012.73, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "09:28:57", + "run_id": "94455a09", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_94455a09", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:26:32.275114+00:00", + "benchmark_end_time": "2026-05-07T09:28:57.186231+00:00", + "benchmark_elapsed_minutes": 2.4, + "model_load_seconds": 42.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/online/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/online/result.json new file mode 100644 index 00000000..a3ca2f9c --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 34.62, + "ttft_ms_p90": 60.74, + "ttft_ms_p99": 1546.3, + "tpot_ms_p50": 8.35, + "tpot_ms_p90": 9.42, + "tpot_ms_p99": 11.52, + "elapsed_seconds_median": 64.6, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 36.02, + "ttft_ms_p90": 52.89, + "ttft_ms_p99": 60.18, + "tpot_ms_p50": 11.06, + "tpot_ms_p90": 12.37, + "tpot_ms_p99": 13.91, + "elapsed_seconds_median": 31.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.44, + "ttft_ms_p90": 78.16, + "ttft_ms_p99": 93.72, + "tpot_ms_p50": 30.62, + "tpot_ms_p90": 36.67, + "tpot_ms_p99": 43.08, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 54.19, + "ttft_ms_p90": 81.26, + "ttft_ms_p99": 99.23, + "tpot_ms_p50": 40.84, + "tpot_ms_p90": 47.97, + "tpot_ms_p99": 73.29, + "elapsed_seconds_median": 12.8, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "09:37:01", + "run_id": "94455a09", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_94455a09", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:30:48.092008+00:00", + "benchmark_end_time": "2026-05-07T09:37:01.606268+00:00", + "benchmark_elapsed_minutes": 6.2, + "model_load_seconds": 84.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/result.json new file mode 100644 index 00000000..6d974465 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 4024.12, + "throughput_tokens_per_sec_per_chip": 4024.12, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4018.97, + "throughput_tokens_per_sec_per_chip": 4018.97, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4015.67, + "throughput_tokens_per_sec_per_chip": 4015.67, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4012.73, + "throughput_tokens_per_sec_per_chip": 4012.73, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 34.62, + "ttft_ms_p90": 60.74, + "ttft_ms_p99": 1546.3, + "tpot_ms_p50": 8.35, + "tpot_ms_p90": 9.42, + "tpot_ms_p99": 11.52, + "elapsed_seconds_median": 64.6, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 36.02, + "ttft_ms_p90": 52.89, + "ttft_ms_p99": 60.18, + "tpot_ms_p50": 11.06, + "tpot_ms_p90": 12.37, + "tpot_ms_p99": 13.91, + "elapsed_seconds_median": 31.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 53.44, + "ttft_ms_p90": 78.16, + "ttft_ms_p99": 93.72, + "tpot_ms_p50": 30.62, + "tpot_ms_p90": 36.67, + "tpot_ms_p99": 43.08, + "elapsed_seconds_median": 16.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 54.19, + "ttft_ms_p90": 81.26, + "ttft_ms_p99": 99.23, + "tpot_ms_p50": 40.84, + "tpot_ms_p90": 47.97, + "tpot_ms_p99": 73.29, + "elapsed_seconds_median": 12.8, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 889.3, + "tokens_out": 53371, + "tokens_in": 0, + "requests_completed": 293, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 2159.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 957.0, + "tokens_out": 57429, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 52.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 962.6, + "tokens_out": 57761, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 51.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.0, + "tokens_out": 57366, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.3 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 963.5, + "tokens_out": 57774, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 50.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 950.6, + "tokens_out": 57041, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 51.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 957.8, + "tokens_out": 57474, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.9, + "ttft_ms_p99": 53.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 962.7, + "tokens_out": 57767, + "tokens_in": 0, + "requests_completed": 315, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 951.5, + "tokens_out": 57098, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 967.2, + "tokens_out": 58021, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.2, + "tokens_out": 57378, + "tokens_in": 0, + "requests_completed": 317, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 953.3, + "tokens_out": 57190, + "tokens_in": 0, + "requests_completed": 310, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 52.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 965.3, + "tokens_out": 57931, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.5, + "tokens_out": 57268, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 53.0 + } + ], + "sustained_throughput_tokens_per_sec": 953.4, + "throttle_ratio": 0.919, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2106.5 + } + }, + "accuracy": { + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "09:28:57", + "run_id": "94455a09", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_94455a09", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:26:32.275114+00:00", + "benchmark_end_time": "2026-05-07T09:28:57.186231+00:00", + "benchmark_elapsed_minutes": 23.7, + "model_load_seconds": 42.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/offline", + "online": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/online", + "sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/sustained/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/sustained/result.json new file mode 100644 index 00000000..78ca4db7 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_8eb86278/w8a16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T08:41:03.357410+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 889.3, + "tokens_out": 53371, + "tokens_in": 0, + "requests_completed": 293, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 2159.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 957.0, + "tokens_out": 57429, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 52.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 962.6, + "tokens_out": 57761, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 51.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.0, + "tokens_out": 57366, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.3 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 963.5, + "tokens_out": 57774, + "tokens_in": 0, + "requests_completed": 314, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 50.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 950.6, + "tokens_out": 57041, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 51.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 957.8, + "tokens_out": 57474, + "tokens_in": 0, + "requests_completed": 313, + "ttft_ms_p50": 33.9, + "ttft_ms_p99": 53.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 962.7, + "tokens_out": 57767, + "tokens_in": 0, + "requests_completed": 315, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 951.5, + "tokens_out": 57098, + "tokens_in": 0, + "requests_completed": 311, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 967.2, + "tokens_out": 58021, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 956.2, + "tokens_out": 57378, + "tokens_in": 0, + "requests_completed": 317, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 52.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 953.3, + "tokens_out": 57190, + "tokens_in": 0, + "requests_completed": 310, + "ttft_ms_p50": 33.5, + "ttft_ms_p99": 52.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 965.3, + "tokens_out": 57931, + "tokens_in": 0, + "requests_completed": 316, + "ttft_ms_p50": 33.7, + "ttft_ms_p99": 53.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 954.5, + "tokens_out": 57268, + "tokens_in": 0, + "requests_completed": 312, + "ttft_ms_p50": 33.6, + "ttft_ms_p99": 53.0 + } + ], + "sustained_throughput_tokens_per_sec": 953.4, + "throttle_ratio": 0.919, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2106.5 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "09:54:07", + "run_id": "94455a09", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_C_nvidia_sglang_c43a8309_94455a09", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:39:03.943887+00:00", + "benchmark_end_time": "2026-05-07T09:54:07.201192+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 97.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/accuracy/accuracy.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/accuracy/accuracy.json new file mode 100644 index 00000000..37f4d824 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/env_info.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/env_info.json new file mode 100644 index 00000000..da48eb3c --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/interactive/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/interactive/result.json new file mode 100644 index 00000000..85d7c9c6 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 2930.57, + "ttft_ms_p90": 3048.65, + "ttft_ms_p99": 3114.45, + "tpot_ms_p50": 13.29, + "tpot_ms_p90": 13.34, + "tpot_ms_p99": 13.37, + "peak_memory_gb": null, + "elapsed_seconds_median": 593.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "11:37:20", + "run_id": "1992bcc0", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T11:17:34.002807+00:00", + "benchmark_end_time": "2026-05-07T11:37:20.392956+00:00", + "benchmark_elapsed_minutes": 19.8, + "model_load_seconds": 122.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/offline/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/offline/result.json new file mode 100644 index 00000000..493f3ee0 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/offline/result.json @@ -0,0 +1,154 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 69.83, + "throughput_tokens_per_sec_per_chip": 69.83, + "elapsed_seconds_median": 184.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 69.8, + "throughput_tokens_per_sec_per_chip": 69.8, + "elapsed_seconds_median": 184.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "11:14:30", + "run_id": "1992bcc0", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T10:56:04.137696+00:00", + "benchmark_end_time": "2026-05-07T11:14:30.254228+00:00", + "benchmark_elapsed_minutes": 18.4, + "model_load_seconds": 48.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/online/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/online/result.json new file mode 100644 index 00000000..21481fa6 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/online/result.json @@ -0,0 +1,168 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 81189.61, + "ttft_ms_p90": 150755.95, + "ttft_ms_p99": 166489.39, + "tpot_ms_p50": 128.63, + "tpot_ms_p90": 211.13, + "tpot_ms_p99": 227.02, + "elapsed_seconds_median": 360.8, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 116802.47, + "ttft_ms_p90": 210291.74, + "ttft_ms_p99": 238549.48, + "tpot_ms_p50": 128.73, + "tpot_ms_p90": 211.59, + "tpot_ms_p99": 227.45, + "elapsed_seconds_median": 359.7, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 151139.12, + "ttft_ms_p90": 267788.55, + "ttft_ms_p99": 299162.46, + "tpot_ms_p50": 128.7, + "tpot_ms_p90": 211.15, + "tpot_ms_p99": 226.99, + "elapsed_seconds_median": 356.5, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "12:48:18", + "run_id": "1992bcc0", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T12:12:24.408117+00:00", + "benchmark_end_time": "2026-05-07T12:48:18.444325+00:00", + "benchmark_elapsed_minutes": 35.9, + "model_load_seconds": 62.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/result.json new file mode 100644 index 00000000..19795a21 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/result.json @@ -0,0 +1,551 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "interactive", + "sustained", + "online", + "speculative" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 2, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 69.83, + "throughput_tokens_per_sec_per_chip": 69.83, + "elapsed_seconds_median": 184.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 69.8, + "throughput_tokens_per_sec_per_chip": 69.8, + "elapsed_seconds_median": 184.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "interactive": { + "ttft_ms_p50": 2930.57, + "ttft_ms_p90": 3048.65, + "ttft_ms_p99": 3114.45, + "tpot_ms_p50": 13.29, + "tpot_ms_p90": 13.34, + "tpot_ms_p99": 13.37, + "peak_memory_gb": null, + "elapsed_seconds_median": 593.1 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 30.0, + "tokens_out": 1800, + "tokens_in": 0, + "requests_completed": 8, + "ttft_ms_p50": 18025.2, + "ttft_ms_p99": 27610.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13000.0, + "ttft_ms_p99": 23096.7 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12720.3, + "ttft_ms_p99": 22496.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 89.5, + "tokens_out": 5400, + "tokens_in": 0, + "requests_completed": 24, + "ttft_ms_p50": 12766.5, + "ttft_ms_p99": 22799.3 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.3, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13037.1, + "ttft_ms_p99": 23097.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12530.2, + "ttft_ms_p99": 22328.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12713.3, + "ttft_ms_p99": 22593.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12988.3, + "ttft_ms_p99": 23269.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12533.6, + "ttft_ms_p99": 22729.7 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12410.6, + "ttft_ms_p99": 22526.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12997.3, + "ttft_ms_p99": 23090.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12444.4, + "ttft_ms_p99": 22530.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12602.2, + "ttft_ms_p99": 22621.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12976.9, + "ttft_ms_p99": 23039.4 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12667.5, + "ttft_ms_p99": 22389.1 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12697.4, + "ttft_ms_p99": 22485.3 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12990.7, + "ttft_ms_p99": 23079.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12545.7, + "ttft_ms_p99": 22560.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12772.7, + "ttft_ms_p99": 22571.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12985.6, + "ttft_ms_p99": 23236.6 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12553.3, + "ttft_ms_p99": 22583.1 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12485.0, + "ttft_ms_p99": 22745.9 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12820.4, + "ttft_ms_p99": 23010.5 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12564.8, + "ttft_ms_p99": 22787.1 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 89.5, + "tokens_out": 5400, + "tokens_in": 0, + "requests_completed": 24, + "ttft_ms_p50": 12760.2, + "ttft_ms_p99": 23006.6 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.1, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13106.9, + "ttft_ms_p99": 23269.8 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.3, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12823.8, + "ttft_ms_p99": 22659.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12859.9, + "ttft_ms_p99": 22780.9 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13104.1, + "ttft_ms_p99": 23273.1 + } + ], + "sustained_throughput_tokens_per_sec": 62.1, + "throttle_ratio": 0.67, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": 176.4 + }, + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 81189.61, + "ttft_ms_p90": 150755.95, + "ttft_ms_p99": 166489.39, + "tpot_ms_p50": 128.63, + "tpot_ms_p90": 211.13, + "tpot_ms_p99": 227.02, + "elapsed_seconds_median": 360.8, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 116802.47, + "ttft_ms_p90": 210291.74, + "ttft_ms_p99": 238549.48, + "tpot_ms_p50": 128.73, + "tpot_ms_p90": 211.59, + "tpot_ms_p99": 227.45, + "elapsed_seconds_median": 359.7, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 151139.12, + "ttft_ms_p90": 267788.55, + "ttft_ms_p99": 299162.46, + "tpot_ms_p50": 128.7, + "tpot_ms_p90": 211.15, + "tpot_ms_p99": 226.99, + "elapsed_seconds_median": 356.5, + "sla_met": false + } + ] + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 45.11, + "throughput_tokens_per_sec_per_chip": 45.11, + "elapsed_seconds_median": 284.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 45.11, + "throughput_tokens_per_sec_per_chip": 45.11, + "elapsed_seconds_median": 284.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "11:14:30", + "run_id": "1992bcc0", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T10:56:04.137696+00:00", + "benchmark_end_time": "2026-05-07T11:14:30.254228+00:00", + "benchmark_elapsed_minutes": 133.5, + "model_load_seconds": 48.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'interactive', 'sustained', 'online', 'speculative'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/offline", + "interactive": "results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/interactive", + "sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/sustained", + "online": "results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/online", + "speculative": "results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/speculative" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/speculative/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/speculative/result.json new file mode 100644 index 00000000..d6108caf --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/speculative/result.json @@ -0,0 +1,154 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "speculative", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 45.11, + "throughput_tokens_per_sec_per_chip": 45.11, + "elapsed_seconds_median": 284.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 45.11, + "throughput_tokens_per_sec_per_chip": 45.11, + "elapsed_seconds_median": 284.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "13:20:08", + "run_id": "1992bcc0", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T12:51:36.084339+00:00", + "benchmark_end_time": "2026-05-07T13:20:08.289202+00:00", + "benchmark_elapsed_minutes": 28.5, + "model_load_seconds": 136.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/sustained/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/sustained/result.json new file mode 100644 index 00000000..690f3a3b --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0/sustained/result.json @@ -0,0 +1,428 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T10:52:35.716348+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tPXB\tNODE\t64-127,192-254\t1\t\tN/A\nNIC0\tSYS\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tPXB\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tNODE\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 30.0, + "tokens_out": 1800, + "tokens_in": 0, + "requests_completed": 8, + "ttft_ms_p50": 18025.2, + "ttft_ms_p99": 27610.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13000.0, + "ttft_ms_p99": 23096.7 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12720.3, + "ttft_ms_p99": 22496.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 89.5, + "tokens_out": 5400, + "tokens_in": 0, + "requests_completed": 24, + "ttft_ms_p50": 12766.5, + "ttft_ms_p99": 22799.3 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.3, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13037.1, + "ttft_ms_p99": 23097.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12530.2, + "ttft_ms_p99": 22328.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12713.3, + "ttft_ms_p99": 22593.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12988.3, + "ttft_ms_p99": 23269.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12533.6, + "ttft_ms_p99": 22729.7 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12410.6, + "ttft_ms_p99": 22526.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12997.3, + "ttft_ms_p99": 23090.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12444.4, + "ttft_ms_p99": 22530.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12602.2, + "ttft_ms_p99": 22621.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12976.9, + "ttft_ms_p99": 23039.4 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12667.5, + "ttft_ms_p99": 22389.1 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12697.4, + "ttft_ms_p99": 22485.3 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12990.7, + "ttft_ms_p99": 23079.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12545.7, + "ttft_ms_p99": 22560.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12772.7, + "ttft_ms_p99": 22571.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12985.6, + "ttft_ms_p99": 23236.6 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12553.3, + "ttft_ms_p99": 22583.1 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12485.0, + "ttft_ms_p99": 22745.9 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12820.4, + "ttft_ms_p99": 23010.5 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12564.8, + "ttft_ms_p99": 22787.1 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 89.5, + "tokens_out": 5400, + "tokens_in": 0, + "requests_completed": 24, + "ttft_ms_p50": 12760.2, + "ttft_ms_p99": 23006.6 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.1, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13106.9, + "ttft_ms_p99": 23269.8 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.3, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12823.8, + "ttft_ms_p99": 22659.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 12859.9, + "ttft_ms_p99": 22780.9 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 60.0, + "tokens_out": 3600, + "tokens_in": 0, + "requests_completed": 16, + "ttft_ms_p50": 13104.1, + "ttft_ms_p99": 23273.1 + } + ], + "sustained_throughput_tokens_per_sec": 62.1, + "throttle_ratio": 0.67, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": 176.4 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "12:10:57", + "run_id": "1992bcc0", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_D_nvidia_sglang_c43a8309_1992bcc0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T11:40:03.456694+00:00", + "benchmark_end_time": "2026-05-07T12:10:57.486926+00:00", + "benchmark_elapsed_minutes": 30.9, + "model_load_seconds": 106.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/accuracy/accuracy.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/accuracy/accuracy.json new file mode 100644 index 00000000..c35ff064 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.41, + "baseline_delta": 0.03, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/env_info.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/env_info.json new file mode 100644 index 00000000..6f79d2a5 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-08T06:01:51.625025+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tNODE\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/interactive/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/interactive/result.json new file mode 100644 index 00000000..d67c576f --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-08T06:01:51.625025+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tNODE\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 16.88, + "ttft_ms_p90": 17.77, + "ttft_ms_p99": 19.3, + "tpot_ms_p50": 1.75, + "tpot_ms_p90": 1.77, + "tpot_ms_p99": 1.84, + "peak_memory_gb": null, + "elapsed_seconds_median": 52.7 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "06:12:29", + "run_id": "088cfa14", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T06:09:51.154286+00:00", + "benchmark_end_time": "2026-05-08T06:12:29.847870+00:00", + "benchmark_elapsed_minutes": 2.6, + "model_load_seconds": 32.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/offline/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/offline/result.json new file mode 100644 index 00000000..62b6ca4e --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/offline/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-08T06:01:51.625025+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tNODE\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 36706.23, + "throughput_tokens_per_sec_per_chip": 36706.23, + "elapsed_seconds_median": 1.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 36713.29, + "throughput_tokens_per_sec_per_chip": 36713.29, + "elapsed_seconds_median": 1.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 36252.6, + "throughput_tokens_per_sec_per_chip": 36252.6, + "elapsed_seconds_median": 1.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "06:05:58", + "run_id": "088cfa14", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T06:05:43.054956+00:00", + "benchmark_end_time": "2026-05-08T06:05:58.915362+00:00", + "benchmark_elapsed_minutes": 0.3, + "model_load_seconds": 40.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/online/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/online/result.json new file mode 100644 index 00000000..f25540d1 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/online/result.json @@ -0,0 +1,156 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-08T06:01:51.625025+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tNODE\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 19.18, + "ttft_ms_p90": 31.91, + "ttft_ms_p99": 1454.8, + "tpot_ms_p50": 2.42, + "tpot_ms_p90": 2.82, + "tpot_ms_p99": 3.64, + "elapsed_seconds_median": 31.9, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 28.84, + "ttft_ms_p90": 38.18, + "ttft_ms_p99": 45.64, + "tpot_ms_p50": 4.62, + "tpot_ms_p90": 5.66, + "tpot_ms_p99": 10.56, + "elapsed_seconds_median": 7.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "06:08:54", + "run_id": "088cfa14", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T06:06:56.193536+00:00", + "benchmark_end_time": "2026-05-08T06:08:54.919553+00:00", + "benchmark_elapsed_minutes": 2.0, + "model_load_seconds": 32.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/result.json new file mode 100644 index 00000000..45b66d4e --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/result.json @@ -0,0 +1,371 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-08T06:01:51.625025+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tNODE\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 36706.23, + "throughput_tokens_per_sec_per_chip": 36706.23, + "elapsed_seconds_median": 1.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 36713.29, + "throughput_tokens_per_sec_per_chip": 36713.29, + "elapsed_seconds_median": 1.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 36252.6, + "throughput_tokens_per_sec_per_chip": 36252.6, + "elapsed_seconds_median": 1.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 19.18, + "ttft_ms_p90": 31.91, + "ttft_ms_p99": 1454.8, + "tpot_ms_p50": 2.42, + "tpot_ms_p90": 2.82, + "tpot_ms_p99": 3.64, + "elapsed_seconds_median": 31.9, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 28.84, + "ttft_ms_p90": 38.18, + "ttft_ms_p99": 45.64, + "tpot_ms_p50": 4.62, + "tpot_ms_p90": 5.66, + "tpot_ms_p99": 10.56, + "elapsed_seconds_median": 7.9, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 16.88, + "ttft_ms_p90": 17.77, + "ttft_ms_p99": 19.3, + "tpot_ms_p50": 1.75, + "tpot_ms_p90": 1.77, + "tpot_ms_p99": 1.84, + "peak_memory_gb": null, + "elapsed_seconds_median": 52.7 + }, + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6876.7, + "tokens_out": 412634, + "tokens_in": 0, + "requests_completed": 2224, + "ttft_ms_p50": 24.7, + "ttft_ms_p99": 3802.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7317.5, + "tokens_out": 439021, + "tokens_in": 0, + "requests_completed": 2366, + "ttft_ms_p50": 21.2, + "ttft_ms_p99": 43.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7381.5, + "tokens_out": 443011, + "tokens_in": 0, + "requests_completed": 2380, + "ttft_ms_p50": 21.1, + "ttft_ms_p99": 44.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7335.8, + "tokens_out": 440208, + "tokens_in": 0, + "requests_completed": 2361, + "ttft_ms_p50": 21.5, + "ttft_ms_p99": 44.9 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7359.0, + "tokens_out": 441625, + "tokens_in": 0, + "requests_completed": 2373, + "ttft_ms_p50": 21.4, + "ttft_ms_p99": 44.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7434.2, + "tokens_out": 445978, + "tokens_in": 0, + "requests_completed": 2394, + "ttft_ms_p50": 22.9, + "ttft_ms_p99": 45.2 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7321.9, + "tokens_out": 439358, + "tokens_in": 0, + "requests_completed": 2367, + "ttft_ms_p50": 21.5, + "ttft_ms_p99": 45.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7325.9, + "tokens_out": 439445, + "tokens_in": 0, + "requests_completed": 2360, + "ttft_ms_p50": 21.6, + "ttft_ms_p99": 45.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7424.4, + "tokens_out": 445514, + "tokens_in": 0, + "requests_completed": 2390, + "ttft_ms_p50": 21.4, + "ttft_ms_p99": 45.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7356.4, + "tokens_out": 441402, + "tokens_in": 0, + "requests_completed": 2367, + "ttft_ms_p50": 21.5, + "ttft_ms_p99": 44.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7302.7, + "tokens_out": 438070, + "tokens_in": 0, + "requests_completed": 2353, + "ttft_ms_p50": 21.0, + "ttft_ms_p99": 44.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7383.8, + "tokens_out": 443102, + "tokens_in": 0, + "requests_completed": 2384, + "ttft_ms_p50": 22.1, + "ttft_ms_p99": 45.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7300.9, + "tokens_out": 438169, + "tokens_in": 0, + "requests_completed": 2355, + "ttft_ms_p50": 21.3, + "ttft_ms_p99": 45.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7405.6, + "tokens_out": 444160, + "tokens_in": 0, + "requests_completed": 2392, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 46.7 + } + ], + "sustained_throughput_tokens_per_sec": 7323.3, + "throttle_ratio": 0.925, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -3755.5 + } + }, + "accuracy": { + "subset_score": 0.41, + "baseline_delta": 0.03, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "06:05:58", + "run_id": "088cfa14", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T06:05:43.054956+00:00", + "benchmark_end_time": "2026-05-08T06:05:58.915362+00:00", + "benchmark_elapsed_minutes": 19.9, + "model_load_seconds": 40.2, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/offline", + "online": "results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/online", + "interactive": "results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/interactive", + "sustained": "results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/sustained/result.json b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/sustained/result.json new file mode 100644 index 00000000..87c9ef54 --- /dev/null +++ b/results/community/nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-08T06:01:51.625025+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A800-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tNODE\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tNODE\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7763 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "580.65.06", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6876.7, + "tokens_out": 412634, + "tokens_in": 0, + "requests_completed": 2224, + "ttft_ms_p50": 24.7, + "ttft_ms_p99": 3802.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7317.5, + "tokens_out": 439021, + "tokens_in": 0, + "requests_completed": 2366, + "ttft_ms_p50": 21.2, + "ttft_ms_p99": 43.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7381.5, + "tokens_out": 443011, + "tokens_in": 0, + "requests_completed": 2380, + "ttft_ms_p50": 21.1, + "ttft_ms_p99": 44.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7335.8, + "tokens_out": 440208, + "tokens_in": 0, + "requests_completed": 2361, + "ttft_ms_p50": 21.5, + "ttft_ms_p99": 44.9 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7359.0, + "tokens_out": 441625, + "tokens_in": 0, + "requests_completed": 2373, + "ttft_ms_p50": 21.4, + "ttft_ms_p99": 44.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7434.2, + "tokens_out": 445978, + "tokens_in": 0, + "requests_completed": 2394, + "ttft_ms_p50": 22.9, + "ttft_ms_p99": 45.2 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7321.9, + "tokens_out": 439358, + "tokens_in": 0, + "requests_completed": 2367, + "ttft_ms_p50": 21.5, + "ttft_ms_p99": 45.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7325.9, + "tokens_out": 439445, + "tokens_in": 0, + "requests_completed": 2360, + "ttft_ms_p50": 21.6, + "ttft_ms_p99": 45.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7424.4, + "tokens_out": 445514, + "tokens_in": 0, + "requests_completed": 2390, + "ttft_ms_p50": 21.4, + "ttft_ms_p99": 45.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7356.4, + "tokens_out": 441402, + "tokens_in": 0, + "requests_completed": 2367, + "ttft_ms_p50": 21.5, + "ttft_ms_p99": 44.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7302.7, + "tokens_out": 438070, + "tokens_in": 0, + "requests_completed": 2353, + "ttft_ms_p50": 21.0, + "ttft_ms_p99": 44.8 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7383.8, + "tokens_out": 443102, + "tokens_in": 0, + "requests_completed": 2384, + "ttft_ms_p50": 22.1, + "ttft_ms_p99": 45.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7300.9, + "tokens_out": 438169, + "tokens_in": 0, + "requests_completed": 2355, + "ttft_ms_p50": 21.3, + "ttft_ms_p99": 45.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 7405.6, + "tokens_out": 444160, + "tokens_in": 0, + "requests_completed": 2392, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 46.7 + } + ], + "sustained_throughput_tokens_per_sec": 7323.3, + "throttle_ratio": 0.925, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -3755.5 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-08", + "time": "06:28:51", + "run_id": "088cfa14", + "run_name": "nvidia_a800_sxm4_80gbx1_suite_F_nvidia_sglang_c43a8309_088cfa14", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-08T06:13:50.418242+00:00", + "benchmark_end_time": "2026-05-08T06:28:51.771182+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 51.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/accuracy/accuracy.json new file mode 100644 index 00000000..c700e987 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.62, + "baseline_delta": 0.02, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/burst/result.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/burst/result.json new file mode 100644 index 00000000..0693ae77 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/burst/result.json @@ -0,0 +1,161 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 14.03, + "steady_ttft_p99_ms": 26.7, + "burst_ttft_p50_ms": 15.6, + "burst_ttft_p99_ms": 22.99, + "sla_met_during_burst": true, + "burst_degradation_ratio": 0.861, + "recovery_time_seconds": 1.0, + "recovery_time_seconds_per_cycle": [ + 1.54, + 0.47 + ], + "_recovery_definition": "Median seconds within the post-burst steady window before rolling TTFT p99 drops below 1.5x the long-term steady baseline. Lower is better; None means it never recovered within the window.", + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 33.62, + "burst_ttft_p99_ms": 22.98 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 19.28, + "burst_ttft_p99_ms": 22.86 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 19.65, + "burst_ttft_p99_ms": 23.15 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "12:15:49", + "run_id": "2c345026", + "run_name": "nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T12:07:57.193442+00:00", + "benchmark_end_time": "2026-05-22T12:15:49.102951+00:00", + "benchmark_elapsed_minutes": 7.9, + "model_load_seconds": 14.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/env_info.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/env_info.json new file mode 100644 index 00000000..0e3dfd10 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/env_info.json @@ -0,0 +1,44 @@ +{ + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/interactive/result.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/interactive/result.json new file mode 100644 index 00000000..d230dd40 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/interactive/result.json @@ -0,0 +1,139 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 10.26, + "ttft_ms_p90": 16.13, + "ttft_ms_p99": 24.23, + "tpot_ms_p50": 3.45, + "tpot_ms_p90": 3.45, + "tpot_ms_p99": 3.49, + "peak_memory_gb": null, + "elapsed_seconds_median": 182.7, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 19.4, + "std": 13.18, + "cv_pct": 67.91, + "stability": "high-variance", + "runs": [ + 34.58, + 12.75, + 10.88 + ] + } + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:36:27", + "run_id": "2c345026", + "run_name": "nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:27:18.873868+00:00", + "benchmark_end_time": "2026-05-22T11:36:27.754777+00:00", + "benchmark_elapsed_minutes": 9.1, + "model_load_seconds": 14.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/offline/result.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/offline/result.json new file mode 100644 index 00000000..7068ab41 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/offline/result.json @@ -0,0 +1,196 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 14604.36, + "throughput_tokens_per_sec_per_chip": 14604.36, + "throughput_tokens_per_sec_total": 25459.53, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14634.97, + "std": 124.14, + "cv_pct": 0.85, + "stability": "stable", + "runs": [ + 14529.0, + 14604.36, + 14771.55 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 14716.67, + "throughput_tokens_per_sec_per_chip": 14716.67, + "throughput_tokens_per_sec_total": 25633.8, + "elapsed_seconds_median": 2.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14720.92, + "std": 25.18, + "cv_pct": 0.17, + "stability": "stable", + "runs": [ + 14716.67, + 14698.13, + 14747.96 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 14609.48, + "throughput_tokens_per_sec_per_chip": 14609.48, + "throughput_tokens_per_sec_total": 25506.35, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14662.48, + "std": 95.33, + "cv_pct": 0.65, + "stability": "stable", + "runs": [ + 14605.42, + 14609.48, + 14772.53 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "10:20:27", + "run_id": "2c345026", + "run_name": "nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T10:19:58.812205+00:00", + "benchmark_end_time": "2026-05-22T10:20:27.746312+00:00", + "benchmark_elapsed_minutes": 0.5, + "model_load_seconds": 22.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/online/result.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/online/result.json new file mode 100644 index 00000000..958a5578 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/online/result.json @@ -0,0 +1,195 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 14.4, + "ttft_ms_p90": 21.6, + "ttft_ms_p99": 32.36, + "tpot_ms_p50": 3.63, + "tpot_ms_p90": 3.73, + "tpot_ms_p99": 3.86, + "elapsed_seconds_median": 64.3, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 25.5, + "std": 9.68, + "cv_pct": 37.95, + "stability": "high-variance", + "runs": [ + 36.67, + 19.85, + 19.97 + ] + } + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 15.89, + "ttft_ms_p90": 20.5, + "ttft_ms_p99": 23.82, + "tpot_ms_p50": 4.28, + "tpot_ms_p90": 4.42, + "tpot_ms_p99": 4.58, + "elapsed_seconds_median": 13.6, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 23.61, + "std": 0.42, + "cv_pct": 1.8, + "stability": "stable", + "runs": [ + 23.88, + 23.12, + 23.82 + ] + } + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 28.96, + "ttft_ms_p90": 1153.67, + "ttft_ms_p99": 1319.26, + "tpot_ms_p50": 5.78, + "tpot_ms_p90": 6.18, + "tpot_ms_p99": 7.6, + "elapsed_seconds_median": 6.5, + "sla_met": false, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 1276.65, + "std": 70.77, + "cv_pct": 5.54, + "stability": "noisy", + "runs": [ + 1196.4, + 1303.42, + 1330.13 + ] + } + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "10:25:43", + "run_id": "2c345026", + "run_name": "nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T10:21:19.944643+00:00", + "benchmark_end_time": "2026-05-22T10:25:43.914737+00:00", + "benchmark_elapsed_minutes": 4.4, + "model_load_seconds": 32.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/result.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/result.json new file mode 100644 index 00000000..b1e1251f --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/result.json @@ -0,0 +1,694 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 14604.36, + "throughput_tokens_per_sec_per_chip": 14604.36, + "throughput_tokens_per_sec_total": 25459.53, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14634.97, + "std": 124.14, + "cv_pct": 0.85, + "stability": "stable", + "runs": [ + 14529.0, + 14604.36, + 14771.55 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 14716.67, + "throughput_tokens_per_sec_per_chip": 14716.67, + "throughput_tokens_per_sec_total": 25633.8, + "elapsed_seconds_median": 2.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14720.92, + "std": 25.18, + "cv_pct": 0.17, + "stability": "stable", + "runs": [ + 14716.67, + 14698.13, + 14747.96 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 14609.48, + "throughput_tokens_per_sec_per_chip": 14609.48, + "throughput_tokens_per_sec_total": 25506.35, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14662.48, + "std": 95.33, + "cv_pct": 0.65, + "stability": "stable", + "runs": [ + 14605.42, + 14609.48, + 14772.53 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 14.4, + "ttft_ms_p90": 21.6, + "ttft_ms_p99": 32.36, + "tpot_ms_p50": 3.63, + "tpot_ms_p90": 3.73, + "tpot_ms_p99": 3.86, + "elapsed_seconds_median": 64.3, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 25.5, + "std": 9.68, + "cv_pct": 37.95, + "stability": "high-variance", + "runs": [ + 36.67, + 19.85, + 19.97 + ] + } + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 15.89, + "ttft_ms_p90": 20.5, + "ttft_ms_p99": 23.82, + "tpot_ms_p50": 4.28, + "tpot_ms_p90": 4.42, + "tpot_ms_p99": 4.58, + "elapsed_seconds_median": 13.6, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 23.61, + "std": 0.42, + "cv_pct": 1.8, + "stability": "stable", + "runs": [ + 23.88, + 23.12, + 23.82 + ] + } + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 28.96, + "ttft_ms_p90": 1153.67, + "ttft_ms_p99": 1319.26, + "tpot_ms_p50": 5.78, + "tpot_ms_p90": 6.18, + "tpot_ms_p99": 7.6, + "elapsed_seconds_median": 6.5, + "sla_met": false, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 1276.65, + "std": 70.77, + "cv_pct": 5.54, + "stability": "noisy", + "runs": [ + 1196.4, + 1303.42, + 1330.13 + ] + } + } + ] + }, + "interactive": { + "ttft_ms_p50": 10.26, + "ttft_ms_p90": 16.13, + "ttft_ms_p99": 24.23, + "tpot_ms_p50": 3.45, + "tpot_ms_p90": 3.45, + "tpot_ms_p99": 3.49, + "peak_memory_gb": null, + "elapsed_seconds_median": 182.7, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 19.4, + "std": 13.18, + "cv_pct": 67.91, + "stability": "high-variance", + "runs": [ + 34.58, + 12.75, + 10.88 + ] + } + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 2144.2, + "tokens_out": 128750, + "tokens_in": 0, + "requests_completed": 373, + "ttft_ms_p50": 14.4, + "ttft_ms_p99": 72.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2164.2, + "tokens_out": 129771, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2185.9, + "tokens_out": 131171, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 18.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2179.1, + "tokens_out": 130749, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 17.9 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2185.8, + "tokens_out": 131201, + "tokens_in": 0, + "requests_completed": 378, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2167.9, + "tokens_out": 130007, + "tokens_in": 0, + "requests_completed": 374, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 18.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2174.5, + "tokens_out": 130510, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2186.6, + "tokens_out": 131197, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 17.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2171.7, + "tokens_out": 130309, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2186.5, + "tokens_out": 131207, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2186.8, + "tokens_out": 131151, + "tokens_in": 0, + "requests_completed": 379, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2177.8, + "tokens_out": 130678, + "tokens_in": 0, + "requests_completed": 373, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2175.4, + "tokens_out": 130541, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2180.4, + "tokens_out": 130871, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.7 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2167.4, + "tokens_out": 130014, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 18.0 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2177.5, + "tokens_out": 130582, + "tokens_in": 0, + "requests_completed": 373, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2190.4, + "tokens_out": 131464, + "tokens_in": 0, + "requests_completed": 380, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 18.8 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2179.4, + "tokens_out": 130785, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 18.0 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2174.4, + "tokens_out": 130481, + "tokens_in": 0, + "requests_completed": 378, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.3 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2191.7, + "tokens_out": 131426, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 17.5 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2154.5, + "tokens_out": 129318, + "tokens_in": 0, + "requests_completed": 371, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.4 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2205.5, + "tokens_out": 132338, + "tokens_in": 0, + "requests_completed": 380, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2166.4, + "tokens_out": 130010, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.3 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2181.6, + "tokens_out": 130895, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2187.3, + "tokens_out": 131242, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 17.3 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2174.6, + "tokens_out": 130401, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2162.4, + "tokens_out": 129814, + "tokens_in": 0, + "requests_completed": 374, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2182.0, + "tokens_out": 130929, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.0 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2188.6, + "tokens_out": 131229, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.7 + } + ], + "sustained_throughput_tokens_per_sec": 2178.8, + "throttle_ratio": 0.977, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 0.2, + "throughput_post_warmup_reliability": { + "n": 28, + "mean": 2178.8, + "std": 10.7, + "cv_pct": 0.49, + "stability": "stable", + "runs": [ + 2164.2, + 2185.9, + 2179.1, + 2185.8, + 2167.9, + 2174.5, + 2186.6, + 2171.7, + 2186.5, + 2186.8, + 2177.8, + 2175.4, + 2180.4, + 2167.4, + 2177.5, + 2190.4, + 2179.4, + 2174.4, + 2191.7, + 2154.5, + 2205.5, + 2166.4, + 2181.6, + 2187.3, + 2174.6, + 2162.4, + 2182.0, + 2188.6 + ] + } + }, + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 14.03, + "steady_ttft_p99_ms": 26.7, + "burst_ttft_p50_ms": 15.6, + "burst_ttft_p99_ms": 22.99, + "sla_met_during_burst": true, + "burst_degradation_ratio": 0.861, + "recovery_time_seconds": 1.0, + "recovery_time_seconds_per_cycle": [ + 1.54, + 0.47 + ], + "_recovery_definition": "Median seconds within the post-burst steady window before rolling TTFT p99 drops below 1.5x the long-term steady baseline. Lower is better; None means it never recovered within the window.", + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 33.62, + "burst_ttft_p99_ms": 22.98 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 19.28, + "burst_ttft_p99_ms": 22.86 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 19.65, + "burst_ttft_p99_ms": 23.15 + } + ] + } + }, + "accuracy": { + "subset_score": 0.62, + "baseline_delta": 0.02, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "10:20:27", + "run_id": "2c345026", + "run_name": "nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'online', 'interactive', 'sustained', 'burst'] succeeded, ['speculative'] failed.", + "benchmark_start_time": "2026-05-22T10:19:58.812205+00:00", + "benchmark_end_time": "2026-05-22T10:20:27.746312+00:00", + "benchmark_elapsed_minutes": 51.9, + "model_load_seconds": 22.6, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/offline", + "online": "results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/online", + "interactive": "results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/interactive", + "sustained": "results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/sustained", + "burst": "results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/burst" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/sustained/result.json b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/sustained/result.json new file mode 100644 index 00000000..80ec4dad --- /dev/null +++ b/results/community/nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026/sustained/result.json @@ -0,0 +1,456 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T10:08:28.757340+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 2144.2, + "tokens_out": 128750, + "tokens_in": 0, + "requests_completed": 373, + "ttft_ms_p50": 14.4, + "ttft_ms_p99": 72.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2164.2, + "tokens_out": 129771, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2185.9, + "tokens_out": 131171, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 18.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2179.1, + "tokens_out": 130749, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 17.9 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2185.8, + "tokens_out": 131201, + "tokens_in": 0, + "requests_completed": 378, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2167.9, + "tokens_out": 130007, + "tokens_in": 0, + "requests_completed": 374, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 18.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2174.5, + "tokens_out": 130510, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2186.6, + "tokens_out": 131197, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 17.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2171.7, + "tokens_out": 130309, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2186.5, + "tokens_out": 131207, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2186.8, + "tokens_out": 131151, + "tokens_in": 0, + "requests_completed": 379, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2177.8, + "tokens_out": 130678, + "tokens_in": 0, + "requests_completed": 373, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2175.4, + "tokens_out": 130541, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2180.4, + "tokens_out": 130871, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.7 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2167.4, + "tokens_out": 130014, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 18.0 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2177.5, + "tokens_out": 130582, + "tokens_in": 0, + "requests_completed": 373, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2190.4, + "tokens_out": 131464, + "tokens_in": 0, + "requests_completed": 380, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 18.8 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2179.4, + "tokens_out": 130785, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 18.0 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2174.4, + "tokens_out": 130481, + "tokens_in": 0, + "requests_completed": 378, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.3 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2191.7, + "tokens_out": 131426, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 17.5 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2154.5, + "tokens_out": 129318, + "tokens_in": 0, + "requests_completed": 371, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.4 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2205.5, + "tokens_out": 132338, + "tokens_in": 0, + "requests_completed": 380, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2166.4, + "tokens_out": 130010, + "tokens_in": 0, + "requests_completed": 377, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.3 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2181.6, + "tokens_out": 130895, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2187.3, + "tokens_out": 131242, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 17.3 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2174.6, + "tokens_out": 130401, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2162.4, + "tokens_out": 129814, + "tokens_in": 0, + "requests_completed": 374, + "ttft_ms_p50": 14.1, + "ttft_ms_p99": 19.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2182.0, + "tokens_out": 130929, + "tokens_in": 0, + "requests_completed": 376, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.0 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2188.6, + "tokens_out": 131229, + "tokens_in": 0, + "requests_completed": 375, + "ttft_ms_p50": 14.2, + "ttft_ms_p99": 19.7 + } + ], + "sustained_throughput_tokens_per_sec": 2178.8, + "throttle_ratio": 0.977, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 0.2, + "throughput_post_warmup_reliability": { + "n": 28, + "mean": 2178.8, + "std": 10.7, + "cv_pct": 0.49, + "stability": "stable", + "runs": [ + 2164.2, + 2185.9, + 2179.1, + 2185.8, + 2167.9, + 2174.5, + 2186.6, + 2171.7, + 2186.5, + 2186.8, + 2177.8, + 2175.4, + 2180.4, + 2167.4, + 2177.5, + 2190.4, + 2179.4, + 2174.4, + 2191.7, + 2154.5, + 2205.5, + 2166.4, + 2181.6, + 2187.3, + 2174.6, + 2162.4, + 2182.0, + 2188.6 + ] + } + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "12:07:04", + "run_id": "2c345026", + "run_name": "nvidia_b200x1_suite_A_nvidia_vllm020_0f6c56e4_2c345026", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:37:01.802929+00:00", + "benchmark_end_time": "2026-05-22T12:07:04.553612+00:00", + "benchmark_elapsed_minutes": 30.0, + "model_load_seconds": 15.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/accuracy/accuracy.json new file mode 100644 index 00000000..95fced50 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.56, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/offline/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/offline/result.json new file mode 100644 index 00000000..9d89c5b8 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/offline/result.json @@ -0,0 +1,221 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 14871.53, + "throughput_tokens_per_sec_per_chip": 14871.53, + "throughput_tokens_per_sec_total": 26690.47, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14902.66, + "std": 138.54, + "cv_pct": 0.93, + "stability": "stable", + "runs": [ + 15054.11, + 14871.53, + 14782.33 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 14978.33, + "throughput_tokens_per_sec_per_chip": 14978.33, + "throughput_tokens_per_sec_total": 26758.41, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14974.05, + "std": 48.37, + "cv_pct": 0.32, + "stability": "stable", + "runs": [ + 14923.68, + 14978.33, + 15020.15 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 14970.34, + "throughput_tokens_per_sec_per_chip": 14970.34, + "throughput_tokens_per_sec_total": 26748.09, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14952.51, + "std": 88.96, + "cv_pct": 0.59, + "stability": "stable", + "runs": [ + 15031.2, + 14855.98, + 14970.34 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 14891.3, + "throughput_tokens_per_sec_per_chip": 14891.3, + "throughput_tokens_per_sec_total": 26609.16, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14889.98, + "std": 10.96, + "cv_pct": 0.07, + "stability": "stable", + "runs": [ + 14900.22, + 14878.42, + 14891.3 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:02:20", + "run_id": "ea976bca", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:01:41.553802+00:00", + "benchmark_end_time": "2026-05-22T11:02:20.190019+00:00", + "benchmark_elapsed_minutes": 0.6, + "model_load_seconds": 16.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/result.json new file mode 100644 index 00000000..e9939d8b --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/result.json @@ -0,0 +1,228 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 14871.53, + "throughput_tokens_per_sec_per_chip": 14871.53, + "throughput_tokens_per_sec_total": 26690.47, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14902.66, + "std": 138.54, + "cv_pct": 0.93, + "stability": "stable", + "runs": [ + 15054.11, + 14871.53, + 14782.33 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 14978.33, + "throughput_tokens_per_sec_per_chip": 14978.33, + "throughput_tokens_per_sec_total": 26758.41, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14974.05, + "std": 48.37, + "cv_pct": 0.32, + "stability": "stable", + "runs": [ + 14923.68, + 14978.33, + 15020.15 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 14970.34, + "throughput_tokens_per_sec_per_chip": 14970.34, + "throughput_tokens_per_sec_total": 26748.09, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14952.51, + "std": 88.96, + "cv_pct": 0.59, + "stability": "stable", + "runs": [ + 15031.2, + 14855.98, + 14970.34 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 14891.3, + "throughput_tokens_per_sec_per_chip": 14891.3, + "throughput_tokens_per_sec_total": 26609.16, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14889.98, + "std": 10.96, + "cv_pct": 0.07, + "stability": "stable", + "runs": [ + 14900.22, + 14878.42, + 14891.3 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.56, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:02:20", + "run_id": "ea976bca", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:01:41.553802+00:00", + "benchmark_end_time": "2026-05-22T11:02:20.190019+00:00", + "benchmark_elapsed_minutes": 0.6, + "model_load_seconds": 16.2, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/env_info.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/env_info.json new file mode 100644 index 00000000..23ec7e16 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/env_info.json @@ -0,0 +1,44 @@ +{ + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/accuracy/accuracy.json new file mode 100644 index 00000000..d345a007 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.56, + "baseline_delta": -0.02, + "valid": true, + "framework": "vLLM", + "precision": "FP8", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/offline/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/offline/result.json new file mode 100644 index 00000000..2dc2f5f0 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/offline/result.json @@ -0,0 +1,221 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 19428.03, + "throughput_tokens_per_sec_per_chip": 19428.03, + "throughput_tokens_per_sec_total": 34894.39, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19417.97, + "std": 21.51, + "cv_pct": 0.11, + "stability": "stable", + "runs": [ + 19393.27, + 19428.03, + 19432.61 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 19471.49, + "throughput_tokens_per_sec_per_chip": 19471.49, + "throughput_tokens_per_sec_total": 34942.69, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19468.3, + "std": 62.9, + "cv_pct": 0.32, + "stability": "stable", + "runs": [ + 19471.49, + 19403.86, + 19529.54 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 19439.08, + "throughput_tokens_per_sec_per_chip": 19439.08, + "throughput_tokens_per_sec_total": 34947.06, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19377.92, + "std": 122.1, + "cv_pct": 0.63, + "stability": "stable", + "runs": [ + 19237.32, + 19439.08, + 19457.35 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 19515.3, + "throughput_tokens_per_sec_per_chip": 19515.3, + "throughput_tokens_per_sec_total": 34972.45, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19518.76, + "std": 6.5, + "cv_pct": 0.03, + "stability": "stable", + "runs": [ + 19515.3, + 19514.73, + 19526.26 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:04:38", + "run_id": "87ccc74d", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_87ccc74d", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:04:09.398030+00:00", + "benchmark_end_time": "2026-05-22T11:04:38.829294+00:00", + "benchmark_elapsed_minutes": 0.5, + "model_load_seconds": 31.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/result.json new file mode 100644 index 00000000..b7deb51f --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/result.json @@ -0,0 +1,228 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 19428.03, + "throughput_tokens_per_sec_per_chip": 19428.03, + "throughput_tokens_per_sec_total": 34894.39, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19417.97, + "std": 21.51, + "cv_pct": 0.11, + "stability": "stable", + "runs": [ + 19393.27, + 19428.03, + 19432.61 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 19471.49, + "throughput_tokens_per_sec_per_chip": 19471.49, + "throughput_tokens_per_sec_total": 34942.69, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19468.3, + "std": 62.9, + "cv_pct": 0.32, + "stability": "stable", + "runs": [ + 19471.49, + 19403.86, + 19529.54 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 19439.08, + "throughput_tokens_per_sec_per_chip": 19439.08, + "throughput_tokens_per_sec_total": 34947.06, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19377.92, + "std": 122.1, + "cv_pct": 0.63, + "stability": "stable", + "runs": [ + 19237.32, + 19439.08, + 19457.35 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 19515.3, + "throughput_tokens_per_sec_per_chip": 19515.3, + "throughput_tokens_per_sec_total": 34972.45, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19518.76, + "std": 6.5, + "cv_pct": 0.03, + "stability": "stable", + "runs": [ + 19515.3, + 19514.73, + 19526.26 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.56, + "baseline_delta": -0.02, + "valid": true, + "framework": "vLLM", + "precision": "FP8", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:04:38", + "run_id": "87ccc74d", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_87ccc74d", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:04:09.398030+00:00", + "benchmark_end_time": "2026-05-22T11:04:38.829294+00:00", + "benchmark_elapsed_minutes": 0.5, + "model_load_seconds": 31.3, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/result.json new file mode 100644 index 00000000..9808c373 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/result.json @@ -0,0 +1,603 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original", + "_note": "suite model_id. Each precision level uses its own quantized checkpoint." + }, + "task": { + "scenarios_run": [ + "accuracy", + "offline", + "online", + "sustained" + ], + "precision_levels_run": [ + "BF16", + "FP8", + "W8A8", + "W8A16", + "W4A16" + ], + "precision_levels_skipped": [ + "FP16" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "quantization": { + "results_by_precision": [ + { + "precision": "BF16", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "best_throughput_tokens_per_sec": 14978.33, + "accuracy_score": 0.56, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 8387.9, + "speedup_vs_bf16": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 14871.53, + "throughput_tokens_per_sec_per_chip": 14871.53, + "throughput_tokens_per_sec_total": 26690.47, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14902.66, + "std": 138.54, + "cv_pct": 0.93, + "stability": "stable", + "runs": [ + 15054.11, + 14871.53, + 14782.33 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 14978.33, + "throughput_tokens_per_sec_per_chip": 14978.33, + "throughput_tokens_per_sec_total": 26758.41, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14974.05, + "std": 48.37, + "cv_pct": 0.32, + "stability": "stable", + "runs": [ + 14923.68, + 14978.33, + 15020.15 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 14970.34, + "throughput_tokens_per_sec_per_chip": 14970.34, + "throughput_tokens_per_sec_total": 26748.09, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14952.51, + "std": 88.96, + "cv_pct": 0.59, + "stability": "stable", + "runs": [ + 15031.2, + 14855.98, + 14970.34 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 14891.3, + "throughput_tokens_per_sec_per_chip": 14891.3, + "throughput_tokens_per_sec_total": 26609.16, + "elapsed_seconds_median": 2.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 14889.98, + "std": 10.96, + "cv_pct": 0.07, + "stability": "stable", + "runs": [ + 14900.22, + 14878.42, + 14891.3 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "bf16", + "effective_dtype": "bfloat16", + "quantization_method": null + }, + { + "precision": "FP8", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "best_throughput_tokens_per_sec": 19515.3, + "accuracy_score": 0.56, + "accuracy_baseline_delta": -0.02, + "accuracy_valid": true, + "quality_efficiency": 10928.6, + "speedup_vs_bf16": 1.303, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 19428.03, + "throughput_tokens_per_sec_per_chip": 19428.03, + "throughput_tokens_per_sec_total": 34894.39, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19417.97, + "std": 21.51, + "cv_pct": 0.11, + "stability": "stable", + "runs": [ + 19393.27, + 19428.03, + 19432.61 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 19471.49, + "throughput_tokens_per_sec_per_chip": 19471.49, + "throughput_tokens_per_sec_total": 34942.69, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19468.3, + "std": 62.9, + "cv_pct": 0.32, + "stability": "stable", + "runs": [ + 19471.49, + 19403.86, + 19529.54 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 19439.08, + "throughput_tokens_per_sec_per_chip": 19439.08, + "throughput_tokens_per_sec_total": 34947.06, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19377.92, + "std": 122.1, + "cv_pct": 0.63, + "stability": "stable", + "runs": [ + 19237.32, + 19439.08, + 19457.35 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 19515.3, + "throughput_tokens_per_sec_per_chip": 19515.3, + "throughput_tokens_per_sec_total": 34972.45, + "elapsed_seconds_median": 1.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 19518.76, + "std": 6.5, + "cv_pct": 0.03, + "stability": "stable", + "runs": [ + 19515.3, + 19514.73, + 19526.26 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "fp8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W8A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "best_throughput_tokens_per_sec": 8893.35, + "accuracy_score": 0.59, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 5247.1, + "speedup_vs_bf16": 0.594, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 8851.94, + "throughput_tokens_per_sec_per_chip": 8851.94, + "throughput_tokens_per_sec_total": 15963.0, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8806.03, + "std": 91.92, + "cv_pct": 1.04, + "stability": "stable", + "runs": [ + 8851.94, + 8700.21, + 8865.95 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 8857.65, + "throughput_tokens_per_sec_per_chip": 8857.65, + "throughput_tokens_per_sec_total": 16020.53, + "elapsed_seconds_median": 3.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8829.44, + "std": 137.52, + "cv_pct": 1.56, + "stability": "stable", + "runs": [ + 8950.67, + 8680.0, + 8857.65 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 8893.35, + "throughput_tokens_per_sec_per_chip": 8893.35, + "throughput_tokens_per_sec_total": 15970.82, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8827.61, + "std": 134.27, + "cv_pct": 1.52, + "stability": "stable", + "runs": [ + 8916.33, + 8673.13, + 8893.35 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 8773.67, + "throughput_tokens_per_sec_per_chip": 8773.67, + "throughput_tokens_per_sec_total": 15760.82, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8771.22, + "std": 98.04, + "cv_pct": 1.12, + "stability": "stable", + "runs": [ + 8773.67, + 8671.98, + 8868.01 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a16", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W4A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "best_throughput_tokens_per_sec": 6347.31, + "accuracy_score": 0.55, + "accuracy_baseline_delta": -0.02, + "accuracy_valid": true, + "quality_efficiency": 3491.0, + "speedup_vs_bf16": 0.424, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 6347.31, + "throughput_tokens_per_sec_per_chip": 6347.31, + "throughput_tokens_per_sec_total": 11527.65, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6344.22, + "std": 6.19, + "cv_pct": 0.1, + "stability": "stable", + "runs": [ + 6347.31, + 6348.26, + 6337.09 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6317.14, + "throughput_tokens_per_sec_per_chip": 6317.14, + "throughput_tokens_per_sec_total": 11472.26, + "elapsed_seconds_median": 5.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6222.32, + "std": 180.96, + "cv_pct": 2.91, + "stability": "stable", + "runs": [ + 6336.15, + 6013.65, + 6317.14 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6340.01, + "throughput_tokens_per_sec_per_chip": 6340.01, + "throughput_tokens_per_sec_total": 11514.39, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6510.1, + "std": 295.71, + "cv_pct": 4.54, + "stability": "noisy", + "runs": [ + 6338.74, + 6851.56, + 6340.01 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6343.3, + "throughput_tokens_per_sec_per_chip": 6343.3, + "throughput_tokens_per_sec_total": 11520.37, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6338.61, + "std": 10.39, + "cv_pct": 0.16, + "stability": "stable", + "runs": [ + 6343.3, + 6345.83, + 6326.7 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w4a16", + "effective_dtype": "float16", + "quantization_method": "gptq" + } + ] + }, + "derived": {} + }, + "accuracy": null, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:02:20", + "run_id": "ea976bca", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:01:41.553802+00:00", + "benchmark_end_time": "2026-05-22T11:02:20.190019+00:00", + "benchmark_elapsed_minutes": 3.7, + "model_load_seconds": 16.2, + "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", + "scenario_dirs": { + "bf16/offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/bf16/offline", + "fp8/offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/fp8/offline", + "w8a8/offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a8/offline", + "w8a8/online": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a8/online", + "w8a8/sustained": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a8/sustained", + "w8a16/offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/offline", + "w4a16/offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/offline" + }, + "precision_dirs": { + "BF16": "bf16", + "FP8": "fp8", + "W8A8": "w8a8", + "W8A16": "w8a16", + "W4A16": "w4a16" + }, + "precision_model_map": { + "BF16": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dtype_override": "bfloat16" + }, + "FP8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." + }, + "W8A8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." + }, + "W8A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights, FP16 activations. Weight-only quantization \u2014 reduces memory bandwidth, not compute dtype." + }, + "W4A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "engine_kwargs": { + "quantization": "gptq" + }, + "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization \u2014 larger memory saving than W8A16." + } + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/accuracy/accuracy.json new file mode 100644 index 00000000..9443ecf5 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.02, + "valid": true, + "framework": "vLLM", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/offline/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/offline/result.json new file mode 100644 index 00000000..29559b78 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/offline/result.json @@ -0,0 +1,221 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "float16", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 6347.31, + "throughput_tokens_per_sec_per_chip": 6347.31, + "throughput_tokens_per_sec_total": 11527.65, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6344.22, + "std": 6.19, + "cv_pct": 0.1, + "stability": "stable", + "runs": [ + 6347.31, + 6348.26, + 6337.09 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6317.14, + "throughput_tokens_per_sec_per_chip": 6317.14, + "throughput_tokens_per_sec_total": 11472.26, + "elapsed_seconds_median": 5.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6222.32, + "std": 180.96, + "cv_pct": 2.91, + "stability": "stable", + "runs": [ + 6336.15, + 6013.65, + 6317.14 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6340.01, + "throughput_tokens_per_sec_per_chip": 6340.01, + "throughput_tokens_per_sec_total": 11514.39, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6510.1, + "std": 295.71, + "cv_pct": 4.54, + "stability": "noisy", + "runs": [ + 6338.74, + 6851.56, + 6340.01 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6343.3, + "throughput_tokens_per_sec_per_chip": 6343.3, + "throughput_tokens_per_sec_total": 11520.37, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6338.61, + "std": 10.39, + "cv_pct": 0.16, + "stability": "stable", + "runs": [ + 6343.3, + 6345.83, + 6326.7 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:12:27", + "run_id": "109a7792", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_109a7792", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:10:59.753021+00:00", + "benchmark_end_time": "2026-05-22T11:12:27.008821+00:00", + "benchmark_elapsed_minutes": 1.5, + "model_load_seconds": 16.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/result.json new file mode 100644 index 00000000..5ca60e8e --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/result.json @@ -0,0 +1,228 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "float16", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 6347.31, + "throughput_tokens_per_sec_per_chip": 6347.31, + "throughput_tokens_per_sec_total": 11527.65, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6344.22, + "std": 6.19, + "cv_pct": 0.1, + "stability": "stable", + "runs": [ + 6347.31, + 6348.26, + 6337.09 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6317.14, + "throughput_tokens_per_sec_per_chip": 6317.14, + "throughput_tokens_per_sec_total": 11472.26, + "elapsed_seconds_median": 5.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6222.32, + "std": 180.96, + "cv_pct": 2.91, + "stability": "stable", + "runs": [ + 6336.15, + 6013.65, + 6317.14 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6340.01, + "throughput_tokens_per_sec_per_chip": 6340.01, + "throughput_tokens_per_sec_total": 11514.39, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6510.1, + "std": 295.71, + "cv_pct": 4.54, + "stability": "noisy", + "runs": [ + 6338.74, + 6851.56, + 6340.01 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6343.3, + "throughput_tokens_per_sec_per_chip": 6343.3, + "throughput_tokens_per_sec_total": 11520.37, + "elapsed_seconds_median": 5.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 6338.61, + "std": 10.39, + "cv_pct": 0.16, + "stability": "stable", + "runs": [ + 6343.3, + 6345.83, + 6326.7 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.02, + "valid": true, + "framework": "vLLM", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:12:27", + "run_id": "109a7792", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_109a7792", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:10:59.753021+00:00", + "benchmark_end_time": "2026-05-22T11:12:27.008821+00:00", + "benchmark_elapsed_minutes": 1.5, + "model_load_seconds": 16.9, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w4a16/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/accuracy/accuracy.json new file mode 100644 index 00000000..296afa87 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/offline/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/offline/result.json new file mode 100644 index 00000000..e0604587 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/offline/result.json @@ -0,0 +1,221 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 8851.94, + "throughput_tokens_per_sec_per_chip": 8851.94, + "throughput_tokens_per_sec_total": 15963.0, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8806.03, + "std": 91.92, + "cv_pct": 1.04, + "stability": "stable", + "runs": [ + 8851.94, + 8700.21, + 8865.95 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 8857.65, + "throughput_tokens_per_sec_per_chip": 8857.65, + "throughput_tokens_per_sec_total": 16020.53, + "elapsed_seconds_median": 3.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8829.44, + "std": 137.52, + "cv_pct": 1.56, + "stability": "stable", + "runs": [ + 8950.67, + 8680.0, + 8857.65 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 8893.35, + "throughput_tokens_per_sec_per_chip": 8893.35, + "throughput_tokens_per_sec_total": 15970.82, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8827.61, + "std": 134.27, + "cv_pct": 1.52, + "stability": "stable", + "runs": [ + 8916.33, + 8673.13, + 8893.35 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 8773.67, + "throughput_tokens_per_sec_per_chip": 8773.67, + "throughput_tokens_per_sec_total": 15760.82, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8771.22, + "std": 98.04, + "cv_pct": 1.12, + "stability": "stable", + "runs": [ + 8773.67, + 8671.98, + 8868.01 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:09:15", + "run_id": "051e6316", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_051e6316", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:08:09.836599+00:00", + "benchmark_end_time": "2026-05-22T11:09:15.364459+00:00", + "benchmark_elapsed_minutes": 1.1, + "model_load_seconds": 18.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/result.json b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/result.json new file mode 100644 index 00000000..0ada0247 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/result.json @@ -0,0 +1,228 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:00:26.756229+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 8851.94, + "throughput_tokens_per_sec_per_chip": 8851.94, + "throughput_tokens_per_sec_total": 15963.0, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8806.03, + "std": 91.92, + "cv_pct": 1.04, + "stability": "stable", + "runs": [ + 8851.94, + 8700.21, + 8865.95 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 8857.65, + "throughput_tokens_per_sec_per_chip": 8857.65, + "throughput_tokens_per_sec_total": 16020.53, + "elapsed_seconds_median": 3.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8829.44, + "std": 137.52, + "cv_pct": 1.56, + "stability": "stable", + "runs": [ + 8950.67, + 8680.0, + 8857.65 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 8893.35, + "throughput_tokens_per_sec_per_chip": 8893.35, + "throughput_tokens_per_sec_total": 15970.82, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8827.61, + "std": 134.27, + "cv_pct": 1.52, + "stability": "stable", + "runs": [ + 8916.33, + 8673.13, + 8893.35 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 8773.67, + "throughput_tokens_per_sec_per_chip": 8773.67, + "throughput_tokens_per_sec_total": 15760.82, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 8771.22, + "std": 98.04, + "cv_pct": 1.12, + "stability": "stable", + "runs": [ + 8773.67, + 8671.98, + 8868.01 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:09:15", + "run_id": "051e6316", + "run_name": "nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_051e6316", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:08:09.836599+00:00", + "benchmark_end_time": "2026-05-22T11:09:15.364459+00:00", + "benchmark_elapsed_minutes": 1.1, + "model_load_seconds": 18.0, + "benchmark_elapsed_minutes_note": "Total across ['offline'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_C_nvidia_vllm020_0f6c56e4_ea976bca/w8a16/offline" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/accuracy/accuracy.json new file mode 100644 index 00000000..3e6d6c6c --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/env_info.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/env_info.json new file mode 100644 index 00000000..f2bb120e --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/env_info.json @@ -0,0 +1,44 @@ +{ + "collected_at": "2026-05-22T11:13:53.452954+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/interactive/result.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/interactive/result.json new file mode 100644 index 00000000..36b21c41 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/interactive/result.json @@ -0,0 +1,138 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:13:53.452954+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 599.84, + "ttft_ms_p90": 620.68, + "ttft_ms_p99": 636.57, + "tpot_ms_p50": 4.09, + "tpot_ms_p90": 4.11, + "tpot_ms_p99": 4.12, + "peak_memory_gb": null, + "elapsed_seconds_median": 164.1, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 637.52, + "std": 2.57, + "cv_pct": 0.4, + "stability": "stable", + "runs": [ + 639.34, + 635.7 + ] + } + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-25", + "time": "08:49:00", + "run_id": "c35cf907", + "run_name": "nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-25T08:43:32.662876+00:00", + "benchmark_end_time": "2026-05-25T08:49:00.780269+00:00", + "benchmark_elapsed_minutes": 5.5, + "model_load_seconds": 32.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/offline/result.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/offline/result.json new file mode 100644 index 00000000..b3228379 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/offline/result.json @@ -0,0 +1,169 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:13:53.452954+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 392.64, + "throughput_tokens_per_sec_per_chip": 392.64, + "throughput_tokens_per_sec_total": 44317.41, + "elapsed_seconds_median": 32.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 2, + "mean": 392.64, + "std": 0.28, + "cv_pct": 0.07, + "stability": "stable", + "runs": [ + 392.84, + 392.44 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 392.15, + "throughput_tokens_per_sec_per_chip": 392.15, + "throughput_tokens_per_sec_total": 44262.41, + "elapsed_seconds_median": 32.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 2, + "mean": 392.15, + "std": 0.02, + "cv_pct": 0.01, + "stability": "stable", + "runs": [ + 392.17, + 392.14 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:18:17", + "run_id": "c35cf907", + "run_name": "nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:15:01.295237+00:00", + "benchmark_end_time": "2026-05-22T11:18:17.104254+00:00", + "benchmark_elapsed_minutes": 3.3, + "model_load_seconds": 15.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/online/result.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/online/result.json new file mode 100644 index 00000000..e299f948 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/online/result.json @@ -0,0 +1,192 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:13:53.452954+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 1, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 627.57, + "ttft_ms_p90": 1041.15, + "ttft_ms_p99": 1676.04, + "tpot_ms_p50": 6.44, + "tpot_ms_p90": 11.11, + "tpot_ms_p99": 24.51, + "elapsed_seconds_median": 201.7, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 1470.83, + "std": 392.14, + "cv_pct": 26.66, + "stability": "high-variance", + "runs": [ + 1748.11, + 1193.54 + ] + } + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 733.03, + "ttft_ms_p90": 1441.58, + "ttft_ms_p99": 1987.27, + "tpot_ms_p50": 10.92, + "tpot_ms_p90": 22.24, + "tpot_ms_p99": 29.0, + "elapsed_seconds_median": 114.9, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 1955.72, + "std": 44.61, + "cv_pct": 2.28, + "stability": "stable", + "runs": [ + 1987.27, + 1924.18 + ] + } + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 9263.55, + "ttft_ms_p90": 17389.95, + "ttft_ms_p99": 22195.2, + "tpot_ms_p50": 48.55, + "tpot_ms_p90": 49.18, + "tpot_ms_p99": 49.35, + "elapsed_seconds_median": 70.0, + "sla_met": false, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 19059.25, + "std": 4927.55, + "cv_pct": 25.85, + "stability": "high-variance", + "runs": [ + 22543.56, + 15574.95 + ] + } + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-25", + "time": "09:33:20", + "run_id": "c35cf907", + "run_name": "nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-25T09:20:11.033130+00:00", + "benchmark_end_time": "2026-05-25T09:33:20.789590+00:00", + "benchmark_elapsed_minutes": 13.2, + "model_load_seconds": 13.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/result.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/result.json new file mode 100644 index 00000000..01711191 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/result.json @@ -0,0 +1,617 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:13:53.452954+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "interactive", + "sustained", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 2, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 392.64, + "throughput_tokens_per_sec_per_chip": 392.64, + "throughput_tokens_per_sec_total": 44317.41, + "elapsed_seconds_median": 32.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 2, + "mean": 392.64, + "std": 0.28, + "cv_pct": 0.07, + "stability": "stable", + "runs": [ + 392.84, + 392.44 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 392.15, + "throughput_tokens_per_sec_per_chip": 392.15, + "throughput_tokens_per_sec_total": 44262.41, + "elapsed_seconds_median": 32.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 2, + "mean": 392.15, + "std": 0.02, + "cv_pct": 0.01, + "stability": "stable", + "runs": [ + 392.17, + 392.14 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "interactive": { + "ttft_ms_p50": 599.84, + "ttft_ms_p90": 620.68, + "ttft_ms_p99": 636.57, + "tpot_ms_p50": 4.09, + "tpot_ms_p90": 4.11, + "tpot_ms_p99": 4.12, + "peak_memory_gb": null, + "elapsed_seconds_median": 164.1, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 637.52, + "std": 2.57, + "cv_pct": 0.4, + "stability": "stable", + "runs": [ + 639.34, + 635.7 + ] + } + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 311.4, + "tokens_out": 18688, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 1017.3, + "ttft_ms_p99": 4280.7 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 970.6, + "ttft_ms_p99": 1157.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1020.2, + "ttft_ms_p99": 1128.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.2, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 982.6, + "ttft_ms_p99": 1092.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 983.7, + "ttft_ms_p99": 1170.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 999.2, + "ttft_ms_p99": 1113.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.5, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 954.9, + "ttft_ms_p99": 1161.0 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1026.8, + "ttft_ms_p99": 1161.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.5, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 982.9, + "ttft_ms_p99": 1110.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.1, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 982.9, + "ttft_ms_p99": 1155.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 997.9, + "ttft_ms_p99": 1113.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 955.1, + "ttft_ms_p99": 1164.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1019.6, + "ttft_ms_p99": 1245.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 984.3, + "ttft_ms_p99": 1109.2 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.2, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 986.7, + "ttft_ms_p99": 1171.3 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1002.4, + "ttft_ms_p99": 1122.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 975.1, + "ttft_ms_p99": 1162.6 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.0, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 1053.6, + "ttft_ms_p99": 1161.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 981.7, + "ttft_ms_p99": 1092.2 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.1, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 982.7, + "ttft_ms_p99": 1169.6 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 999.7, + "ttft_ms_p99": 1197.2 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 975.5, + "ttft_ms_p99": 1160.0 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.2, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 1021.8, + "ttft_ms_p99": 1165.8 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 980.9, + "ttft_ms_p99": 1093.8 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.0, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 981.8, + "ttft_ms_p99": 1157.1 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1018.1, + "ttft_ms_p99": 1131.0 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.1, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 979.7, + "ttft_ms_p99": 1159.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1021.3, + "ttft_ms_p99": 1131.5 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 980.8, + "ttft_ms_p99": 1085.5 + } + ], + "sustained_throughput_tokens_per_sec": 340.3, + "throttle_ratio": 0.987, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -71.6, + "throughput_post_warmup_reliability": { + "n": 28, + "mean": 340.3, + "std": 1.9, + "cv_pct": 0.54, + "stability": "stable", + "runs": [ + 341.2, + 341.3, + 337.2, + 341.3, + 341.2, + 341.5, + 341.2, + 341.5, + 341.1, + 341.3, + 341.4, + 341.3, + 341.3, + 337.2, + 341.4, + 341.3, + 337.0, + 341.2, + 337.1, + 341.4, + 341.4, + 337.2, + 341.3, + 337.0, + 341.3, + 337.1, + 341.3, + 341.3 + ] + } + }, + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 1, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 627.57, + "ttft_ms_p90": 1041.15, + "ttft_ms_p99": 1676.04, + "tpot_ms_p50": 6.44, + "tpot_ms_p90": 11.11, + "tpot_ms_p99": 24.51, + "elapsed_seconds_median": 201.7, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 1470.83, + "std": 392.14, + "cv_pct": 26.66, + "stability": "high-variance", + "runs": [ + 1748.11, + 1193.54 + ] + } + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 733.03, + "ttft_ms_p90": 1441.58, + "ttft_ms_p99": 1987.27, + "tpot_ms_p50": 10.92, + "tpot_ms_p90": 22.24, + "tpot_ms_p99": 29.0, + "elapsed_seconds_median": 114.9, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 1955.72, + "std": 44.61, + "cv_pct": 2.28, + "stability": "stable", + "runs": [ + 1987.27, + 1924.18 + ] + } + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 9263.55, + "ttft_ms_p90": 17389.95, + "ttft_ms_p99": 22195.2, + "tpot_ms_p50": 48.55, + "tpot_ms_p90": 49.18, + "tpot_ms_p99": 49.35, + "elapsed_seconds_median": 70.0, + "sla_met": false, + "ttft_ms_p99_reliability": { + "n": 2, + "mean": 19059.25, + "std": 4927.55, + "cv_pct": 25.85, + "stability": "high-variance", + "runs": [ + 22543.56, + 15574.95 + ] + } + } + ] + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:18:17", + "run_id": "c35cf907", + "run_name": "nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:15:01.295237+00:00", + "benchmark_end_time": "2026-05-22T11:18:17.104254+00:00", + "benchmark_elapsed_minutes": 52.1, + "model_load_seconds": 15.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'interactive', 'sustained', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/offline", + "interactive": "results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/interactive", + "sustained": "results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/sustained", + "online": "results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/online" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/sustained/result.json b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/sustained/result.json new file mode 100644 index 00000000..0b3813c5 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907/sustained/result.json @@ -0,0 +1,456 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:13:53.452954+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 311.4, + "tokens_out": 18688, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 1017.3, + "ttft_ms_p99": 4280.7 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 970.6, + "ttft_ms_p99": 1157.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1020.2, + "ttft_ms_p99": 1128.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.2, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 982.6, + "ttft_ms_p99": 1092.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 983.7, + "ttft_ms_p99": 1170.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 999.2, + "ttft_ms_p99": 1113.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.5, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 954.9, + "ttft_ms_p99": 1161.0 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1026.8, + "ttft_ms_p99": 1161.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.5, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 982.9, + "ttft_ms_p99": 1110.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.1, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 982.9, + "ttft_ms_p99": 1155.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 997.9, + "ttft_ms_p99": 1113.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 955.1, + "ttft_ms_p99": 1164.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1019.6, + "ttft_ms_p99": 1245.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 984.3, + "ttft_ms_p99": 1109.2 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.2, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 986.7, + "ttft_ms_p99": 1171.3 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1002.4, + "ttft_ms_p99": 1122.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 975.1, + "ttft_ms_p99": 1162.6 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.0, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 1053.6, + "ttft_ms_p99": 1161.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.2, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 981.7, + "ttft_ms_p99": 1092.2 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.1, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 982.7, + "ttft_ms_p99": 1169.6 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 999.7, + "ttft_ms_p99": 1197.2 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.4, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 975.5, + "ttft_ms_p99": 1160.0 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.2, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 1021.8, + "ttft_ms_p99": 1165.8 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 980.9, + "ttft_ms_p99": 1093.8 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.0, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 981.8, + "ttft_ms_p99": 1157.1 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1018.1, + "ttft_ms_p99": 1131.0 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 337.1, + "tokens_out": 20224, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 979.7, + "ttft_ms_p99": 1159.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 1021.3, + "ttft_ms_p99": 1131.5 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 341.3, + "tokens_out": 20480, + "tokens_in": 0, + "requests_completed": 80, + "ttft_ms_p50": 980.8, + "ttft_ms_p99": 1085.5 + } + ], + "sustained_throughput_tokens_per_sec": 340.3, + "throttle_ratio": 0.987, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -71.6, + "throughput_post_warmup_reliability": { + "n": 28, + "mean": 340.3, + "std": 1.9, + "cv_pct": 0.54, + "stability": "stable", + "runs": [ + 341.2, + 341.3, + 337.2, + 341.3, + 341.2, + 341.5, + 341.2, + 341.5, + 341.1, + 341.3, + 341.4, + 341.3, + 341.3, + 337.2, + 341.4, + 341.3, + 337.0, + 341.2, + 337.1, + 341.4, + 341.4, + 337.2, + 341.3, + 337.0, + 341.3, + 337.1, + 341.3, + 341.3 + ] + } + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-25", + "time": "09:19:38", + "run_id": "c35cf907", + "run_name": "nvidia_b200x1_suite_D_nvidia_vllm020_0f6c56e4_c35cf907", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-25T08:49:32.474610+00:00", + "benchmark_end_time": "2026-05-25T09:19:38.964660+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 13.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/accuracy/accuracy.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/accuracy/accuracy.json new file mode 100644 index 00000000..66467933 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.4, + "baseline_delta": 0.02, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/env_info.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/env_info.json new file mode 100644 index 00000000..8e183ac0 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/env_info.json @@ -0,0 +1,44 @@ +{ + "collected_at": "2026-05-22T11:19:19.953009+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/interactive/result.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/interactive/result.json new file mode 100644 index 00000000..0e4329b2 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/interactive/result.json @@ -0,0 +1,139 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:19:19.953009+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 6.14, + "ttft_ms_p90": 6.66, + "ttft_ms_p99": 7.75, + "tpot_ms_p50": 1.27, + "tpot_ms_p90": 1.29, + "tpot_ms_p99": 1.34, + "peak_memory_gb": null, + "elapsed_seconds_median": 40.8, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 7.57, + "std": 0.95, + "cv_pct": 12.48, + "stability": "high-variance", + "runs": [ + 8.43, + 7.72, + 6.56 + ] + } + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:25:57", + "run_id": "ef7c9362", + "run_name": "nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:23:54.757444+00:00", + "benchmark_end_time": "2026-05-22T11:25:57.449221+00:00", + "benchmark_elapsed_minutes": 2.0, + "model_load_seconds": 10.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/offline/result.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/offline/result.json new file mode 100644 index 00000000..d7685d4b --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/offline/result.json @@ -0,0 +1,196 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:19:19.953009+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 40406.2, + "throughput_tokens_per_sec_per_chip": 40406.2, + "throughput_tokens_per_sec_total": 60226.68, + "elapsed_seconds_median": 1.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 40333.75, + "std": 326.78, + "cv_pct": 0.81, + "stability": "stable", + "runs": [ + 40618.23, + 40406.2, + 39976.83 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 40280.84, + "throughput_tokens_per_sec_per_chip": 40280.84, + "throughput_tokens_per_sec_total": 60077.93, + "elapsed_seconds_median": 1.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 40327.57, + "std": 183.4, + "cv_pct": 0.45, + "stability": "stable", + "runs": [ + 40280.84, + 40529.82, + 40172.05 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 40387.24, + "throughput_tokens_per_sec_per_chip": 40387.24, + "throughput_tokens_per_sec_total": 60236.62, + "elapsed_seconds_median": 1.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 40423.33, + "std": 134.85, + "cv_pct": 0.33, + "stability": "stable", + "runs": [ + 40310.2, + 40572.56, + 40387.24 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:20:39", + "run_id": "ef7c9362", + "run_name": "nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:20:26.814666+00:00", + "benchmark_end_time": "2026-05-22T11:20:39.423697+00:00", + "benchmark_elapsed_minutes": 0.2, + "model_load_seconds": 17.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/online/result.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/online/result.json new file mode 100644 index 00000000..94b805d6 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/online/result.json @@ -0,0 +1,171 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:19:19.953009+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 8.15, + "ttft_ms_p90": 8.73, + "ttft_ms_p99": 12.48, + "tpot_ms_p50": 1.39, + "tpot_ms_p90": 1.5, + "tpot_ms_p99": 1.61, + "elapsed_seconds_median": 31.8, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 12.48, + "std": 0.6, + "cv_pct": 4.78, + "stability": "noisy", + "runs": [ + 13.16, + 12.19, + 12.08 + ] + } + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 9.25, + "ttft_ms_p90": 13.26, + "ttft_ms_p99": 16.48, + "tpot_ms_p50": 1.86, + "tpot_ms_p90": 2.13, + "tpot_ms_p99": 2.28, + "elapsed_seconds_median": 7.7, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 16.0, + "std": 1.57, + "cv_pct": 9.83, + "stability": "high-variance", + "runs": [ + 17.22, + 14.23, + 16.56 + ] + } + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:23:25", + "run_id": "ef7c9362", + "run_name": "nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:21:24.333603+00:00", + "benchmark_end_time": "2026-05-22T11:23:25.745227+00:00", + "benchmark_elapsed_minutes": 2.0, + "model_load_seconds": 26.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/result.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/result.json new file mode 100644 index 00000000..523af6a1 --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/result.json @@ -0,0 +1,460 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:19:19.953009+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 40406.2, + "throughput_tokens_per_sec_per_chip": 40406.2, + "throughput_tokens_per_sec_total": 60226.68, + "elapsed_seconds_median": 1.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 40333.75, + "std": 326.78, + "cv_pct": 0.81, + "stability": "stable", + "runs": [ + 40618.23, + 40406.2, + 39976.83 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 40280.84, + "throughput_tokens_per_sec_per_chip": 40280.84, + "throughput_tokens_per_sec_total": 60077.93, + "elapsed_seconds_median": 1.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 40327.57, + "std": 183.4, + "cv_pct": 0.45, + "stability": "stable", + "runs": [ + 40280.84, + 40529.82, + 40172.05 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 40387.24, + "throughput_tokens_per_sec_per_chip": 40387.24, + "throughput_tokens_per_sec_total": 60236.62, + "elapsed_seconds_median": 1.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "throughput_tokens_per_sec_reliability": { + "n": 3, + "mean": 40423.33, + "std": 134.85, + "cv_pct": 0.33, + "stability": "stable", + "runs": [ + 40310.2, + 40572.56, + 40387.24 + ] + }, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 8.15, + "ttft_ms_p90": 8.73, + "ttft_ms_p99": 12.48, + "tpot_ms_p50": 1.39, + "tpot_ms_p90": 1.5, + "tpot_ms_p99": 1.61, + "elapsed_seconds_median": 31.8, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 12.48, + "std": 0.6, + "cv_pct": 4.78, + "stability": "noisy", + "runs": [ + 13.16, + 12.19, + 12.08 + ] + } + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 9.25, + "ttft_ms_p90": 13.26, + "ttft_ms_p99": 16.48, + "tpot_ms_p50": 1.86, + "tpot_ms_p90": 2.13, + "tpot_ms_p99": 2.28, + "elapsed_seconds_median": 7.7, + "sla_met": true, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 16.0, + "std": 1.57, + "cv_pct": 9.83, + "stability": "high-variance", + "runs": [ + 17.22, + 14.23, + 16.56 + ] + } + } + ] + }, + "interactive": { + "ttft_ms_p50": 6.14, + "ttft_ms_p90": 6.66, + "ttft_ms_p99": 7.75, + "tpot_ms_p50": 1.27, + "tpot_ms_p90": 1.29, + "tpot_ms_p99": 1.34, + "peak_memory_gb": null, + "elapsed_seconds_median": 40.8, + "ttft_ms_p99_reliability": { + "n": 3, + "mean": 7.57, + "std": 0.95, + "cv_pct": 12.48, + "stability": "high-variance", + "runs": [ + 8.43, + 7.72, + 6.56 + ] + } + }, + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13004.8, + "tokens_out": 780773, + "tokens_in": 0, + "requests_completed": 3739, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 39.8 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13172.1, + "tokens_out": 790104, + "tokens_in": 0, + "requests_completed": 3781, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13168.2, + "tokens_out": 789862, + "tokens_in": 0, + "requests_completed": 3786, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13279.4, + "tokens_out": 797350, + "tokens_in": 0, + "requests_completed": 3826, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13197.2, + "tokens_out": 791576, + "tokens_in": 0, + "requests_completed": 3789, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13141.7, + "tokens_out": 788568, + "tokens_in": 0, + "requests_completed": 3794, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13155.2, + "tokens_out": 788960, + "tokens_in": 0, + "requests_completed": 3787, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13167.2, + "tokens_out": 790032, + "tokens_in": 0, + "requests_completed": 3793, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13199.1, + "tokens_out": 792415, + "tokens_in": 0, + "requests_completed": 3791, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13284.6, + "tokens_out": 796782, + "tokens_in": 0, + "requests_completed": 3818, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13196.6, + "tokens_out": 792186, + "tokens_in": 0, + "requests_completed": 3799, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13155.4, + "tokens_out": 788693, + "tokens_in": 0, + "requests_completed": 3786, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13191.5, + "tokens_out": 791827, + "tokens_in": 0, + "requests_completed": 3796, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13135.2, + "tokens_out": 787979, + "tokens_in": 0, + "requests_completed": 3779, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.4 + } + ], + "sustained_throughput_tokens_per_sec": 13174.9, + "throttle_ratio": 0.979, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -22.4, + "throughput_post_warmup_reliability": { + "n": 14, + "mean": 13174.9, + "std": 66.4, + "cv_pct": 0.5, + "stability": "stable", + "runs": [ + 13004.8, + 13172.1, + 13168.2, + 13279.4, + 13197.2, + 13141.7, + 13155.2, + 13167.2, + 13199.1, + 13284.6, + 13196.6, + 13155.4, + 13191.5, + 13135.2 + ] + } + } + }, + "accuracy": { + "subset_score": 0.4, + "baseline_delta": 0.02, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-22", + "time": "11:20:39", + "run_id": "ef7c9362", + "run_name": "nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-22T11:20:26.814666+00:00", + "benchmark_end_time": "2026-05-22T11:20:39.423697+00:00", + "benchmark_elapsed_minutes": 19.2, + "model_load_seconds": 17.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/offline", + "online": "results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/online", + "interactive": "results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/interactive", + "sustained": "results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/sustained/result.json b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/sustained/result.json new file mode 100644 index 00000000..9c1844ce --- /dev/null +++ b/results/community/nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362/sustained/result.json @@ -0,0 +1,292 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 179.1, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-22T11:19:19.953009+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA B200", + "vendor": "NVIDIA", + "memory_gb": 179.1, + "driver_version": "595.71.05", + "firmware_version": null, + "compute_capability": "10.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-47,96-143\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8559C", + "physical_cores": 96, + "logical_cores": 192, + "numa_nodes": 2 + }, + "system_memory_gb": 1996.0, + "pcie_generation": "PCIe Gen 5", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "6.17.0-1013-aws", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.9.0", + "driver_version": "595.71.05", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13004.8, + "tokens_out": 780773, + "tokens_in": 0, + "requests_completed": 3739, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 39.8 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13172.1, + "tokens_out": 790104, + "tokens_in": 0, + "requests_completed": 3781, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13168.2, + "tokens_out": 789862, + "tokens_in": 0, + "requests_completed": 3786, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13279.4, + "tokens_out": 797350, + "tokens_in": 0, + "requests_completed": 3826, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13197.2, + "tokens_out": 791576, + "tokens_in": 0, + "requests_completed": 3789, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13141.7, + "tokens_out": 788568, + "tokens_in": 0, + "requests_completed": 3794, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13155.2, + "tokens_out": 788960, + "tokens_in": 0, + "requests_completed": 3787, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13167.2, + "tokens_out": 790032, + "tokens_in": 0, + "requests_completed": 3793, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13199.1, + "tokens_out": 792415, + "tokens_in": 0, + "requests_completed": 3791, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13284.6, + "tokens_out": 796782, + "tokens_in": 0, + "requests_completed": 3818, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13196.6, + "tokens_out": 792186, + "tokens_in": 0, + "requests_completed": 3799, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13155.4, + "tokens_out": 788693, + "tokens_in": 0, + "requests_completed": 3786, + "ttft_ms_p50": 9.7, + "ttft_ms_p99": 17.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13191.5, + "tokens_out": 791827, + "tokens_in": 0, + "requests_completed": 3796, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 13135.2, + "tokens_out": 787979, + "tokens_in": 0, + "requests_completed": 3779, + "ttft_ms_p50": 9.6, + "ttft_ms_p99": 17.4 + } + ], + "sustained_throughput_tokens_per_sec": 13174.9, + "throttle_ratio": 0.979, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -22.4, + "throughput_post_warmup_reliability": { + "n": 14, + "mean": 13174.9, + "std": 66.4, + "cv_pct": 0.5, + "stability": "stable", + "runs": [ + 13004.8, + 13172.1, + 13168.2, + 13279.4, + 13197.2, + 13141.7, + 13155.2, + 13167.2, + 13199.1, + 13284.6, + 13196.6, + 13155.4, + 13191.5, + 13135.2 + ] + } + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-25", + "time": "09:59:33", + "run_id": "ef7c9362", + "run_name": "nvidia_b200x1_suite_F_nvidia_vllm020_0f6c56e4_ef7c9362", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-25T09:44:32.432302+00:00", + "benchmark_end_time": "2026-05-25T09:59:33.602088+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 25.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/accuracy/accuracy.json new file mode 100644 index 00000000..ca1b4692 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.6, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/burst/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/burst/result.json new file mode 100644 index 00000000..7c9ab3df --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/burst/result.json @@ -0,0 +1,143 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:39:40.369666+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 65.89, + "steady_ttft_p99_ms": 6708.31, + "burst_ttft_p50_ms": 9683.26, + "burst_ttft_p99_ms": 22975.84, + "sla_met_during_burst": false, + "burst_degradation_ratio": 3.425, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 7772.85, + "burst_ttft_p99_ms": 23080.27 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 119.28, + "burst_ttft_p99_ms": 22460.91 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 123.02, + "burst_ttft_p99_ms": 22360.77 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "09:30:01", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:20:56.451446+00:00", + "benchmark_end_time": "2026-05-07T09:30:01.912668+00:00", + "benchmark_elapsed_minutes": 9.1, + "model_load_seconds": 93.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/env_info.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/env_info.json new file mode 100644 index 00000000..de151680 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/env_info.json @@ -0,0 +1,32 @@ +{ + "collected_at": "2026-05-07T07:39:40.369666+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/interactive/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/interactive/result.json new file mode 100644 index 00000000..04ecf18d --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/interactive/result.json @@ -0,0 +1,115 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:39:40.369666+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 34.55, + "ttft_ms_p90": 56.25, + "ttft_ms_p99": 80.59, + "tpot_ms_p50": 16.58, + "tpot_ms_p90": 16.64, + "tpot_ms_p99": 16.75, + "peak_memory_gb": null, + "elapsed_seconds_median": 485.8 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:25:51", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:01:18.850226+00:00", + "benchmark_end_time": "2026-05-07T08:25:51.605313+00:00", + "benchmark_elapsed_minutes": 24.5, + "model_load_seconds": 163.0 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/offline/result.json similarity index 54% rename from results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json rename to results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/offline/result.json index a050fe47..9a373f6d 100644 --- a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/offline/result.json @@ -1,70 +1,54 @@ { "schema_version": "1.0", "suite_id": "suite_A", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "implementation_id": "nvidia_sglang_c43a8309", "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", "count": 1, - "memory_gb": 48.0, + "memory_gb": 24.0, "interconnect_intra_node": null, "interconnect_inter_node": null }, "environment": { - "collected_at": "2026-05-18T09:21:31.092840+00:00", + "collected_at": "2026-05-07T07:39:40.369666+00:00", "accelerators": [ { "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", - "memory_gb": 48.0, - "driver_version": "2.7.0", + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", "firmware_version": null, + "compute_capability": "8.9", "supports_bf16": true } ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", "intra_node_interconnect": null, "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", + "model": "INTEL(R) XEON(R) GOLD 6530", "physical_cores": 64, "logical_cores": 128, "numa_nodes": 2 }, - "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": [ - { - "name": "mlx5_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_1", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - }, - { - "name": "mlx5_bond_0", - "type": "InfiniBand/RoCE", - "bandwidth_gbps": null - } - ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" }, "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" }, "model": { "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", @@ -75,7 +59,7 @@ "architecture": "dense", "parameter_count_b": 8.0, "precision": "BF16", - "effective_dtype": "float16", + "effective_dtype": "bfloat16", "quantization_method": null, "model_format": "HuggingFace original" }, @@ -97,10 +81,9 @@ "results_by_concurrency": [ { "client_concurrency": 8, - "throughput_tokens_per_sec": 332.62, - "throughput_tokens_per_sec_per_chip": 332.62, - "throughput_tokens_per_sec_total": 922.83, - "elapsed_seconds_median": 43.4, + "throughput_tokens_per_sec": 2026.75, + "throughput_tokens_per_sec_per_chip": 2026.75, + "elapsed_seconds_median": 17.1, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -110,10 +93,9 @@ }, { "client_concurrency": 32, - "throughput_tokens_per_sec": 331.64, - "throughput_tokens_per_sec_per_chip": 331.64, - "throughput_tokens_per_sec_total": 920.1, - "elapsed_seconds_median": 43.6, + "throughput_tokens_per_sec": 1967.36, + "throughput_tokens_per_sec_per_chip": 1967.36, + "elapsed_seconds_median": 17.4, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -123,10 +105,9 @@ }, { "client_concurrency": 128, - "throughput_tokens_per_sec": 331.76, - "throughput_tokens_per_sec_per_chip": 331.76, - "throughput_tokens_per_sec_total": 920.46, - "elapsed_seconds_median": 43.6, + "throughput_tokens_per_sec": 1976.69, + "throughput_tokens_per_sec_per_chip": 1976.69, + "elapsed_seconds_median": 17.4, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -144,21 +125,21 @@ "notes": "Run --scenario accuracy to check model accuracy." }, "meta": { - "submitted_by": "JuhaoLiang1997", + "submitted_by": "Gong-K", "submission_type": "individual", - "date": "2026-05-18", - "time": "17:34:52", - "run_id": "cabb7bd0", - "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "date": "2026-05-07", + "time": "07:49:17", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", "env_info_file": "../env_info.json", "log_file": "run.log", "samples_file": "samples.jsonl", "notes": null, - "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00", - "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00", - "benchmark_elapsed_minutes": 8.7, - "model_load_seconds": 116.8 + "benchmark_start_time": "2026-05-07T07:45:42.464403+00:00", + "benchmark_end_time": "2026-05-07T07:49:17.840697+00:00", + "benchmark_elapsed_minutes": 3.6, + "model_load_seconds": 91.1 } } \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/online/result.json new file mode 100644 index 00000000..b1edfa56 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/online/result.json @@ -0,0 +1,147 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:39:40.369666+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 70.25, + "ttft_ms_p90": 102.66, + "ttft_ms_p99": 2415.93, + "tpot_ms_p50": 23.17, + "tpot_ms_p90": 25.89, + "tpot_ms_p99": 31.86, + "elapsed_seconds_median": 67.2, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4399.78, + "ttft_ms_p90": 8909.53, + "ttft_ms_p99": 9633.91, + "tpot_ms_p50": 40.9, + "tpot_ms_p90": 43.39, + "tpot_ms_p99": 113.85, + "elapsed_seconds_median": 25.9, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 8348.54, + "ttft_ms_p90": 16328.81, + "ttft_ms_p99": 16742.85, + "tpot_ms_p50": 35.32, + "tpot_ms_p90": 43.82, + "tpot_ms_p99": 842.87, + "elapsed_seconds_median": 24.8, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "07:57:27", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:51:35.930159+00:00", + "benchmark_end_time": "2026-05-07T07:57:27.625509+00:00", + "benchmark_elapsed_minutes": 5.9, + "model_load_seconds": 96.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/result.json new file mode 100644 index 00000000..1d7fa650 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/result.json @@ -0,0 +1,594 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:39:40.369666+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained", + "speculative", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 2026.75, + "throughput_tokens_per_sec_per_chip": 2026.75, + "elapsed_seconds_median": 17.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 1967.36, + "throughput_tokens_per_sec_per_chip": 1967.36, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 1976.69, + "throughput_tokens_per_sec_per_chip": 1976.69, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 70.25, + "ttft_ms_p90": 102.66, + "ttft_ms_p99": 2415.93, + "tpot_ms_p50": 23.17, + "tpot_ms_p90": 25.89, + "tpot_ms_p99": 31.86, + "elapsed_seconds_median": 67.2, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4399.78, + "ttft_ms_p90": 8909.53, + "ttft_ms_p99": 9633.91, + "tpot_ms_p50": 40.9, + "tpot_ms_p90": 43.39, + "tpot_ms_p99": 113.85, + "elapsed_seconds_median": 25.9, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 8348.54, + "ttft_ms_p90": 16328.81, + "ttft_ms_p99": 16742.85, + "tpot_ms_p50": 35.32, + "tpot_ms_p90": 43.82, + "tpot_ms_p99": 842.87, + "elapsed_seconds_median": 24.8, + "sla_met": false + } + ] + }, + "interactive": { + "ttft_ms_p50": 34.55, + "ttft_ms_p90": 56.25, + "ttft_ms_p99": 80.59, + "tpot_ms_p50": 16.58, + "tpot_ms_p90": 16.64, + "tpot_ms_p99": 16.75, + "peak_memory_gb": null, + "elapsed_seconds_median": 485.8 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 350.8, + "tokens_out": 21065, + "tokens_in": 0, + "requests_completed": 112, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 8410.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 442.0, + "tokens_out": 26516, + "tokens_in": 0, + "requests_completed": 142, + "ttft_ms_p50": 49.7, + "ttft_ms_p99": 61.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.8, + "tokens_out": 26138, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 49.7, + "ttft_ms_p99": 57.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 441.5, + "tokens_out": 26496, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 71.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26234, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 63.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 443.0, + "tokens_out": 26582, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 65.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 434.5, + "tokens_out": 26066, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 63.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 443.9, + "tokens_out": 26649, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 64.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.1, + "tokens_out": 26393, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 65.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.3, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 64.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.4, + "tokens_out": 26183, + "tokens_in": 0, + "requests_completed": 137, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 55.7 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 442.9, + "tokens_out": 26576, + "tokens_in": 0, + "requests_completed": 143, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 62.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.4, + "tokens_out": 26344, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 62.9 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 434.2, + "tokens_out": 26066, + "tokens_in": 0, + "requests_completed": 137, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 66.4 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 444.4, + "tokens_out": 26668, + "tokens_in": 0, + "requests_completed": 142, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 66.2 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 428.1, + "tokens_out": 25687, + "tokens_in": 0, + "requests_completed": 136, + "ttft_ms_p50": 49.9, + "ttft_ms_p99": 66.6 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 442.3, + "tokens_out": 26531, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 61.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.0, + "tokens_out": 26347, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 61.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.0, + "tokens_out": 26202, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 49.9, + "ttft_ms_p99": 60.3 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.3, + "tokens_out": 26417, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 66.2 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 443.4, + "tokens_out": 26621, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 63.3 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.7, + "tokens_out": 26254, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 49.9, + "ttft_ms_p99": 62.3 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 434.6, + "tokens_out": 26070, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 61.4 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 447.5, + "tokens_out": 26855, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.4, + "ttft_ms_p99": 63.2 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.5, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 58.9 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.9, + "tokens_out": 26161, + "tokens_in": 0, + "requests_completed": 137, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 63.0 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.5, + "tokens_out": 26231, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 63.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.4, + "tokens_out": 26443, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 65.2 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26242, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 62.0 + } + ], + "sustained_throughput_tokens_per_sec": 439.0, + "throttle_ratio": 0.957, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 0.2 + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 434.67, + "throughput_tokens_per_sec_per_chip": 434.67, + "elapsed_seconds_median": 79.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 439.2, + "throughput_tokens_per_sec_per_chip": 439.2, + "elapsed_seconds_median": 78.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 439.01, + "throughput_tokens_per_sec_per_chip": 439.01, + "elapsed_seconds_median": 78.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 65.89, + "steady_ttft_p99_ms": 6708.31, + "burst_ttft_p50_ms": 9683.26, + "burst_ttft_p99_ms": 22975.84, + "sla_met_during_burst": false, + "burst_degradation_ratio": 3.425, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 7772.85, + "burst_ttft_p99_ms": 23080.27 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 119.28, + "burst_ttft_p99_ms": 22460.91 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 123.02, + "burst_ttft_p99_ms": 22360.77 + } + ] + } + }, + "accuracy": { + "subset_score": 0.6, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "07:49:17", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T07:45:42.464403+00:00", + "benchmark_end_time": "2026-05-07T07:49:17.840697+00:00", + "benchmark_elapsed_minutes": 89.2, + "model_load_seconds": 91.1, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/online", + "interactive": "results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/interactive", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/sustained", + "speculative": "results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/speculative", + "burst": "results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/burst" + } + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/speculative/result.json similarity index 50% rename from results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json rename to results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/speculative/result.json index 2e6fc7fc..6d376d23 100644 --- a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/speculative/result.json @@ -1,55 +1,54 @@ { "schema_version": "1.0", "suite_id": "suite_A", - "implementation_id": "nvidia_onecat_vllm_12a253c2", + "implementation_id": "nvidia_sglang_c43a8309", "chip": { - "name": "Tesla V100-PCIE-32GB", + "name": "NVIDIA GeForce RTX 4090", "vendor": "NVIDIA", "count": 1, - "memory_gb": 32.0, + "memory_gb": 24.0, "interconnect_intra_node": null, "interconnect_inter_node": null }, "environment": { - "collected_at": "2026-05-18T09:38:50.346241+00:00", + "collected_at": "2026-05-07T07:39:40.369666+00:00", "accelerators": [ { "index": 0, - "name": "Tesla V100-PCIE-32GB", + "name": "NVIDIA GeForce RTX 4090", "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", + "memory_gb": 24.0, + "driver_version": "565.57.01", "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false + "compute_capability": "8.9", + "supports_bf16": true } ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", "intra_node_interconnect": null, "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, "numa_nodes": 2 }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", "cpu_accelerator_bandwidth_gbs": null, "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", "runtime_version": "CUDA 12.8", "pytorch_version": "2.9.1+cu128" }, "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" }, "model": { "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", @@ -59,13 +58,13 @@ "model_source": "local", "architecture": "dense", "parameter_count_b": 8.0, - "precision": "FP16", - "effective_dtype": "float16", + "precision": "BF16", + "effective_dtype": "bfloat16", "quantization_method": null, "model_format": "HuggingFace original" }, "task": { - "scenario": "offline", + "scenario": "speculative", "num_runs": 3, "warmup_runs": 1, "parallelism": { @@ -74,28 +73,17 @@ "expert_parallel_size": 1, "data_parallel_size": 1 }, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - }, + "extra_config": null, "runtime_metrics": null }, "metrics": { - "offline": { + "speculative": { "results_by_concurrency": [ { "client_concurrency": 8, - "throughput_tokens_per_sec": 671.32, - "throughput_tokens_per_sec_per_chip": 671.32, - "throughput_tokens_per_sec_total": 1168.67, - "elapsed_seconds_median": 51.6, + "throughput_tokens_per_sec": 434.67, + "throughput_tokens_per_sec_per_chip": 434.67, + "elapsed_seconds_median": 79.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -105,10 +93,9 @@ }, { "client_concurrency": 32, - "throughput_tokens_per_sec": 670.99, - "throughput_tokens_per_sec_per_chip": 670.99, - "throughput_tokens_per_sec_total": 1168.09, - "elapsed_seconds_median": 51.6, + "throughput_tokens_per_sec": 439.2, + "throughput_tokens_per_sec_per_chip": 439.2, + "elapsed_seconds_median": 78.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -118,10 +105,9 @@ }, { "client_concurrency": 128, - "throughput_tokens_per_sec": 671.43, - "throughput_tokens_per_sec_per_chip": 671.43, - "throughput_tokens_per_sec_total": 1168.44, - "elapsed_seconds_median": 51.6, + "throughput_tokens_per_sec": 439.01, + "throughput_tokens_per_sec_per_chip": 439.01, + "elapsed_seconds_median": 78.7, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -139,21 +125,21 @@ "notes": "Run --scenario accuracy to check model accuracy." }, "meta": { - "submitted_by": "JuhaoLiang1997", + "submitted_by": "Gong-K", "submission_type": "individual", - "date": "2026-05-18", - "time": "18:03:39", - "run_id": "4e0e6eba", - "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "date": "2026-05-07", + "time": "09:18:46", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", "env_info_file": "../env_info.json", "log_file": "run.log", "samples_file": "samples.jsonl", "notes": null, - "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00", - "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00", - "benchmark_elapsed_minutes": 10.3, - "model_load_seconds": 47.8 + "benchmark_start_time": "2026-05-07T09:02:48.231664+00:00", + "benchmark_end_time": "2026-05-07T09:18:46.844554+00:00", + "benchmark_elapsed_minutes": 16.0, + "model_load_seconds": 141.6 } } \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/sustained/result.json new file mode 100644 index 00000000..3f3146d9 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd/sustained/result.json @@ -0,0 +1,407 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T07:39:40.369666+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 350.8, + "tokens_out": 21065, + "tokens_in": 0, + "requests_completed": 112, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 8410.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 442.0, + "tokens_out": 26516, + "tokens_in": 0, + "requests_completed": 142, + "ttft_ms_p50": 49.7, + "ttft_ms_p99": 61.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.8, + "tokens_out": 26138, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 49.7, + "ttft_ms_p99": 57.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 441.5, + "tokens_out": 26496, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 71.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26234, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 63.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 443.0, + "tokens_out": 26582, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 65.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 434.5, + "tokens_out": 26066, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 63.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 443.9, + "tokens_out": 26649, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 64.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.1, + "tokens_out": 26393, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 65.4 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.3, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 64.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.4, + "tokens_out": 26183, + "tokens_in": 0, + "requests_completed": 137, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 55.7 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 442.9, + "tokens_out": 26576, + "tokens_in": 0, + "requests_completed": 143, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 62.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.4, + "tokens_out": 26344, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 62.9 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 434.2, + "tokens_out": 26066, + "tokens_in": 0, + "requests_completed": 137, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 66.4 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 444.4, + "tokens_out": 26668, + "tokens_in": 0, + "requests_completed": 142, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 66.2 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 428.1, + "tokens_out": 25687, + "tokens_in": 0, + "requests_completed": 136, + "ttft_ms_p50": 49.9, + "ttft_ms_p99": 66.6 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 442.3, + "tokens_out": 26531, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 61.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.0, + "tokens_out": 26347, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 61.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.0, + "tokens_out": 26202, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 49.9, + "ttft_ms_p99": 60.3 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.3, + "tokens_out": 26417, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 66.2 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 443.4, + "tokens_out": 26621, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 63.3 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.7, + "tokens_out": 26254, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 49.9, + "ttft_ms_p99": 62.3 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 434.6, + "tokens_out": 26070, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 49.8, + "ttft_ms_p99": 61.4 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 447.5, + "tokens_out": 26855, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.4, + "ttft_ms_p99": 63.2 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.5, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 58.9 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.9, + "tokens_out": 26161, + "tokens_in": 0, + "requests_completed": 137, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 63.0 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.5, + "tokens_out": 26231, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 63.3 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.4, + "tokens_out": 26443, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 65.2 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26242, + "tokens_in": 0, + "requests_completed": 138, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 62.0 + } + ], + "sustained_throughput_tokens_per_sec": 439.0, + "throttle_ratio": 0.957, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": 0.2 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "08:59:47", + "run_id": "9b2b01fd", + "run_name": "nvidia_geforce_rtx_4090x1_suite_A_nvidia_sglang_c43a8309_9b2b01fd", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T08:29:41.518630+00:00", + "benchmark_end_time": "2026-05-07T08:59:47.912999+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 153.1 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/accuracy/accuracy.json new file mode 100644 index 00000000..37f4d824 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/offline/result.json new file mode 100644 index 00000000..2b8d274e --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/offline/result.json @@ -0,0 +1,157 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1986.44, + "throughput_tokens_per_sec_per_chip": 1986.44, + "elapsed_seconds_median": 18.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1966.2, + "throughput_tokens_per_sec_per_chip": 1966.2, + "elapsed_seconds_median": 18.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1933.26, + "throughput_tokens_per_sec_per_chip": 1933.26, + "elapsed_seconds_median": 18.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1965.04, + "throughput_tokens_per_sec_per_chip": 1965.04, + "elapsed_seconds_median": 18.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "13:25:04", + "run_id": "4ce353c2", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:20:07.690461+00:00", + "benchmark_end_time": "2026-04-27T13:25:04.187964+00:00", + "benchmark_elapsed_minutes": 4.9, + "model_load_seconds": 81.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/online/result.json new file mode 100644 index 00000000..040498d9 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/online/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 68.99, + "ttft_ms_p90": 101.54, + "ttft_ms_p99": 1975.21, + "tpot_ms_p50": 23.2, + "tpot_ms_p90": 25.96, + "tpot_ms_p99": 29.33, + "elapsed_seconds_median": 67.1, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 77.79, + "ttft_ms_p90": 120.18, + "ttft_ms_p99": 155.35, + "tpot_ms_p50": 34.33, + "tpot_ms_p90": 37.28, + "tpot_ms_p99": 48.36, + "elapsed_seconds_median": 34.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4302.15, + "ttft_ms_p90": 8634.91, + "ttft_ms_p99": 9469.27, + "tpot_ms_p50": 40.74, + "tpot_ms_p90": 44.2, + "tpot_ms_p99": 115.81, + "elapsed_seconds_median": 25.9, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 7003.83, + "ttft_ms_p90": 13096.96, + "ttft_ms_p99": 14256.05, + "tpot_ms_p50": 35.52, + "tpot_ms_p90": 45.45, + "tpot_ms_p99": 831.83, + "elapsed_seconds_median": 24.8, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "13:34:34", + "run_id": "4ce353c2", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:26:59.027717+00:00", + "benchmark_end_time": "2026-04-27T13:34:34.420033+00:00", + "benchmark_elapsed_minutes": 7.6, + "model_load_seconds": 80.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/result.json new file mode 100644 index 00000000..7317b5bf --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/result.json @@ -0,0 +1,374 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1986.44, + "throughput_tokens_per_sec_per_chip": 1986.44, + "elapsed_seconds_median": 18.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1966.2, + "throughput_tokens_per_sec_per_chip": 1966.2, + "elapsed_seconds_median": 18.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1933.26, + "throughput_tokens_per_sec_per_chip": 1933.26, + "elapsed_seconds_median": 18.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1965.04, + "throughput_tokens_per_sec_per_chip": 1965.04, + "elapsed_seconds_median": 18.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 68.99, + "ttft_ms_p90": 101.54, + "ttft_ms_p99": 1975.21, + "tpot_ms_p50": 23.2, + "tpot_ms_p90": 25.96, + "tpot_ms_p99": 29.33, + "elapsed_seconds_median": 67.1, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 77.79, + "ttft_ms_p90": 120.18, + "ttft_ms_p99": 155.35, + "tpot_ms_p50": 34.33, + "tpot_ms_p90": 37.28, + "tpot_ms_p99": 48.36, + "elapsed_seconds_median": 34.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4302.15, + "ttft_ms_p90": 8634.91, + "ttft_ms_p99": 9469.27, + "tpot_ms_p50": 40.74, + "tpot_ms_p90": 44.2, + "tpot_ms_p99": 115.81, + "elapsed_seconds_median": 25.9, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 7003.83, + "ttft_ms_p90": 13096.96, + "ttft_ms_p99": 14256.05, + "tpot_ms_p50": 35.52, + "tpot_ms_p90": 45.45, + "tpot_ms_p99": 831.83, + "elapsed_seconds_median": 24.8, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 393.4, + "tokens_out": 23625, + "tokens_in": 0, + "requests_completed": 131, + "ttft_ms_p50": 60.8, + "ttft_ms_p99": 3031.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 433.1, + "tokens_out": 25985, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 61.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.6, + "tokens_out": 26191, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 62.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 446.9, + "tokens_out": 26811, + "tokens_in": 0, + "requests_completed": 148, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 59.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.6, + "tokens_out": 26429, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 62.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.3, + "tokens_out": 26190, + "tokens_in": 0, + "requests_completed": 144, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 61.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.4, + "tokens_out": 26187, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 71.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.7, + "tokens_out": 26197, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 64.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.8, + "tokens_out": 26436, + "tokens_in": 0, + "requests_completed": 143, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 63.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 62.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.6, + "tokens_out": 26250, + "tokens_in": 0, + "requests_completed": 144, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 62.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.1, + "tokens_out": 26162, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 58.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.8, + "tokens_out": 26158, + "tokens_in": 0, + "requests_completed": 145, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 62.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 438.7, + "tokens_out": 26320, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.4, + "ttft_ms_p99": 62.8 + } + ], + "sustained_throughput_tokens_per_sec": 434.7, + "throttle_ratio": 0.88, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2968.6 + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "13:25:04", + "run_id": "4ce353c2", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:20:07.690461+00:00", + "benchmark_end_time": "2026-04-27T13:25:04.187964+00:00", + "benchmark_elapsed_minutes": 27.6, + "model_load_seconds": 81.4, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/online", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/sustained/result.json new file mode 100644 index 00000000..468b0fda --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/sustained/result.json @@ -0,0 +1,257 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 393.4, + "tokens_out": 23625, + "tokens_in": 0, + "requests_completed": 131, + "ttft_ms_p50": 60.8, + "ttft_ms_p99": 3031.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 433.1, + "tokens_out": 25985, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 61.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.6, + "tokens_out": 26191, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 62.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 446.9, + "tokens_out": 26811, + "tokens_in": 0, + "requests_completed": 148, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 59.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.6, + "tokens_out": 26429, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 62.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.3, + "tokens_out": 26190, + "tokens_in": 0, + "requests_completed": 144, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 61.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.4, + "tokens_out": 26187, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 71.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.7, + "tokens_out": 26197, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 64.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.8, + "tokens_out": 26436, + "tokens_in": 0, + "requests_completed": 143, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 63.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 62.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.6, + "tokens_out": 26250, + "tokens_in": 0, + "requests_completed": 144, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 62.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.1, + "tokens_out": 26162, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 58.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.8, + "tokens_out": 26158, + "tokens_in": 0, + "requests_completed": 145, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 62.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 438.7, + "tokens_out": 26320, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.4, + "ttft_ms_p99": 62.8 + } + ], + "sustained_throughput_tokens_per_sec": 434.7, + "throttle_ratio": 0.88, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2968.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "13:51:48", + "run_id": "4ce353c2", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:36:40.827984+00:00", + "benchmark_end_time": "2026-04-27T13:51:48.094516+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 92.1 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/env_info.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/env_info.json new file mode 100644 index 00000000..54f5ab5a --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/env_info.json @@ -0,0 +1,32 @@ +{ + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/accuracy/accuracy.json new file mode 100644 index 00000000..1129e80c --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.58, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "FP8", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/offline/result.json new file mode 100644 index 00000000..03dc80bc --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/offline/result.json @@ -0,0 +1,157 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "model_name": null, + "model_note": "FP8 quantized by RedHatAI using llm-compressor. Weights and activations in FP8. Native speedup on H100/MI300X; computed in BF16 on A100.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3870.35, + "throughput_tokens_per_sec_per_chip": 3870.35, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3873.21, + "throughput_tokens_per_sec_per_chip": 3873.21, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3965.79, + "throughput_tokens_per_sec_per_chip": 3965.79, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3935.02, + "throughput_tokens_per_sec_per_chip": 3935.02, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:00:18", + "run_id": "c609f262", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_c609f262", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:57:50.323406+00:00", + "benchmark_end_time": "2026-04-27T14:00:18.468430+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 71.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/online/result.json new file mode 100644 index 00000000..e70d046a --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/online/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "model_name": null, + "model_note": "FP8 quantized by RedHatAI using llm-compressor. Weights and activations in FP8. Native speedup on H100/MI300X; computed in BF16 on A100.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.91, + "ttft_ms_p90": 58.78, + "ttft_ms_p99": 1717.74, + "tpot_ms_p50": 12.0, + "tpot_ms_p90": 12.87, + "tpot_ms_p99": 15.87, + "elapsed_seconds_median": 65.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 44.32, + "ttft_ms_p90": 53.51, + "ttft_ms_p99": 90.52, + "tpot_ms_p50": 18.34, + "tpot_ms_p90": 19.66, + "tpot_ms_p99": 22.26, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.71, + "ttft_ms_p90": 64.15, + "ttft_ms_p99": 109.33, + "tpot_ms_p50": 29.33, + "tpot_ms_p90": 36.05, + "tpot_ms_p99": 46.56, + "elapsed_seconds_median": 16.0, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 53.61, + "ttft_ms_p90": 1478.22, + "ttft_ms_p99": 4184.64, + "tpot_ms_p50": 38.21, + "tpot_ms_p90": 43.17, + "tpot_ms_p99": 64.28, + "elapsed_seconds_median": 13.5, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:08:19", + "run_id": "c609f262", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_c609f262", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T14:01:58.865277+00:00", + "benchmark_end_time": "2026-04-27T14:08:19.969799+00:00", + "benchmark_elapsed_minutes": 6.4, + "model_load_seconds": 67.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/result.json new file mode 100644 index 00000000..cb9da1ce --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/result.json @@ -0,0 +1,374 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "model_name": null, + "model_note": "FP8 quantized by RedHatAI using llm-compressor. Weights and activations in FP8. Native speedup on H100/MI300X; computed in BF16 on A100.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3870.35, + "throughput_tokens_per_sec_per_chip": 3870.35, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3873.21, + "throughput_tokens_per_sec_per_chip": 3873.21, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3965.79, + "throughput_tokens_per_sec_per_chip": 3965.79, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3935.02, + "throughput_tokens_per_sec_per_chip": 3935.02, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.91, + "ttft_ms_p90": 58.78, + "ttft_ms_p99": 1717.74, + "tpot_ms_p50": 12.0, + "tpot_ms_p90": 12.87, + "tpot_ms_p99": 15.87, + "elapsed_seconds_median": 65.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 44.32, + "ttft_ms_p90": 53.51, + "ttft_ms_p99": 90.52, + "tpot_ms_p50": 18.34, + "tpot_ms_p90": 19.66, + "tpot_ms_p99": 22.26, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.71, + "ttft_ms_p90": 64.15, + "ttft_ms_p99": 109.33, + "tpot_ms_p50": 29.33, + "tpot_ms_p90": 36.05, + "tpot_ms_p99": 46.56, + "elapsed_seconds_median": 16.0, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 53.61, + "ttft_ms_p90": 1478.22, + "ttft_ms_p99": 4184.64, + "tpot_ms_p50": 38.21, + "tpot_ms_p90": 43.17, + "tpot_ms_p99": 64.28, + "elapsed_seconds_median": 13.5, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 645.5, + "tokens_out": 38742, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 38.2, + "ttft_ms_p99": 2750.8 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.3, + "tokens_out": 42131, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 83.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.8, + "tokens_out": 41779, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 56.1 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.5, + "tokens_out": 41888, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 55.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 700.5, + "tokens_out": 42050, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.2 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.7, + "tokens_out": 41730, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 691.3, + "tokens_out": 41466, + "tokens_in": 0, + "requests_completed": 227, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 56.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 704.3, + "tokens_out": 42253, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 54.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 693.3, + "tokens_out": 41616, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 51.9 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 693.2, + "tokens_out": 41595, + "tokens_in": 0, + "requests_completed": 226, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 58.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.0, + "tokens_out": 41865, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.1, + "tokens_out": 42061, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 55.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.7, + "tokens_out": 41764, + "tokens_in": 0, + "requests_completed": 226, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 59.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.8, + "tokens_out": 42097, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 57.7 + } + ], + "sustained_throughput_tokens_per_sec": 694.1, + "throttle_ratio": 0.917, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2693.1 + } + }, + "accuracy": { + "subset_score": 0.58, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "FP8", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:00:18", + "run_id": "c609f262", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_c609f262", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:57:50.323406+00:00", + "benchmark_end_time": "2026-04-27T14:00:18.468430+00:00", + "benchmark_elapsed_minutes": 24.0, + "model_load_seconds": 71.6, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/online", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/sustained/result.json new file mode 100644 index 00000000..31643190 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/sustained/result.json @@ -0,0 +1,257 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "model_name": null, + "model_note": "FP8 quantized by RedHatAI using llm-compressor. Weights and activations in FP8. Native speedup on H100/MI300X; computed in BF16 on A100.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 645.5, + "tokens_out": 38742, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 38.2, + "ttft_ms_p99": 2750.8 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.3, + "tokens_out": 42131, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 83.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.8, + "tokens_out": 41779, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 56.1 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.5, + "tokens_out": 41888, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 55.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 700.5, + "tokens_out": 42050, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.2 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.7, + "tokens_out": 41730, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 691.3, + "tokens_out": 41466, + "tokens_in": 0, + "requests_completed": 227, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 56.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 704.3, + "tokens_out": 42253, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 54.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 693.3, + "tokens_out": 41616, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 51.9 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 693.2, + "tokens_out": 41595, + "tokens_in": 0, + "requests_completed": 226, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 58.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.0, + "tokens_out": 41865, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.1, + "tokens_out": 42061, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 55.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.7, + "tokens_out": 41764, + "tokens_in": 0, + "requests_completed": 226, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 59.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.8, + "tokens_out": 42097, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 57.7 + } + ], + "sustained_throughput_tokens_per_sec": 694.1, + "throttle_ratio": 0.917, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2693.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:25:07", + "run_id": "c609f262", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_c609f262", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T14:10:02.789377+00:00", + "benchmark_end_time": "2026-04-27T14:25:07.007784+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 68.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/result.json new file mode 100644 index 00000000..3c31a113 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/result.json @@ -0,0 +1,1499 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original", + "_note": "suite model_id. Each precision level uses its own quantized checkpoint." + }, + "task": { + "scenarios_run": [ + "accuracy", + "offline", + "online", + "sustained" + ], + "precision_levels_run": [ + "BF16", + "FP8", + "W8A8", + "W8A16", + "W4A16" + ], + "precision_levels_skipped": [ + "FP16" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "quantization": { + "results_by_precision": [ + { + "precision": "BF16", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "best_throughput_tokens_per_sec": 1986.44, + "accuracy_score": 0.55, + "accuracy_baseline_delta": -0.01, + "accuracy_valid": true, + "quality_efficiency": 1092.5, + "speedup_vs_bf16": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1986.44, + "throughput_tokens_per_sec_per_chip": 1986.44, + "elapsed_seconds_median": 18.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1966.2, + "throughput_tokens_per_sec_per_chip": 1966.2, + "elapsed_seconds_median": 18.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1933.26, + "throughput_tokens_per_sec_per_chip": 1933.26, + "elapsed_seconds_median": 18.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1965.04, + "throughput_tokens_per_sec_per_chip": 1965.04, + "elapsed_seconds_median": 18.3, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "bf16", + "effective_dtype": "bfloat16", + "quantization_method": null + }, + { + "precision": "FP8", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "best_throughput_tokens_per_sec": 3965.79, + "accuracy_score": 0.58, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 2300.2, + "speedup_vs_bf16": 1.996, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3870.35, + "throughput_tokens_per_sec_per_chip": 3870.35, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3873.21, + "throughput_tokens_per_sec_per_chip": 3873.21, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3965.79, + "throughput_tokens_per_sec_per_chip": 3965.79, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3935.02, + "throughput_tokens_per_sec_per_chip": 3935.02, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "fp8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W8A8", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "best_throughput_tokens_per_sec": 4044.47, + "accuracy_score": 0.6, + "accuracy_baseline_delta": 0.01, + "accuracy_valid": true, + "quality_efficiency": 2426.7, + "speedup_vs_bf16": 2.036, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3903.39, + "throughput_tokens_per_sec_per_chip": 3903.39, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4044.47, + "throughput_tokens_per_sec_per_chip": 4044.47, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4000.58, + "throughput_tokens_per_sec_per_chip": 4000.58, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4027.83, + "throughput_tokens_per_sec_per_chip": 4027.83, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W8A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "best_throughput_tokens_per_sec": 2987.4, + "accuracy_score": 0.59, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 1762.6, + "speedup_vs_bf16": 1.504, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2972.67, + "throughput_tokens_per_sec_per_chip": 2972.67, + "elapsed_seconds_median": 12.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2969.3, + "throughput_tokens_per_sec_per_chip": 2969.3, + "elapsed_seconds_median": 12.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2973.03, + "throughput_tokens_per_sec_per_chip": 2973.03, + "elapsed_seconds_median": 12.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2987.4, + "throughput_tokens_per_sec_per_chip": 2987.4, + "elapsed_seconds_median": 12.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W4A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "best_throughput_tokens_per_sec": 1611.83, + "accuracy_score": 0.55, + "accuracy_baseline_delta": -0.02, + "accuracy_valid": true, + "quality_efficiency": 886.5, + "speedup_vs_bf16": 0.811, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1611.42, + "throughput_tokens_per_sec_per_chip": 1611.42, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1611.83, + "throughput_tokens_per_sec_per_chip": 1611.83, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1609.86, + "throughput_tokens_per_sec_per_chip": 1609.86, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1610.66, + "throughput_tokens_per_sec_per_chip": 1610.66, + "elapsed_seconds_median": 21.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w4a16", + "effective_dtype": "auto", + "quantization_method": "gptq" + } + ] + }, + "derived": {}, + "quantization_online": { + "results_by_precision": [ + { + "precision": "BF16", + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 68.99, + "ttft_ms_p90": 101.54, + "ttft_ms_p99": 1975.21, + "tpot_ms_p50": 23.2, + "tpot_ms_p90": 25.96, + "tpot_ms_p99": 29.33, + "elapsed_seconds_median": 67.1, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 77.79, + "ttft_ms_p90": 120.18, + "ttft_ms_p99": 155.35, + "tpot_ms_p50": 34.33, + "tpot_ms_p90": 37.28, + "tpot_ms_p99": 48.36, + "elapsed_seconds_median": 34.2, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4302.15, + "ttft_ms_p90": 8634.91, + "ttft_ms_p99": 9469.27, + "tpot_ms_p50": 40.74, + "tpot_ms_p90": 44.2, + "tpot_ms_p99": 115.81, + "elapsed_seconds_median": 25.9, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 7003.83, + "ttft_ms_p90": 13096.96, + "ttft_ms_p99": 14256.05, + "tpot_ms_p50": 35.52, + "tpot_ms_p90": 45.45, + "tpot_ms_p99": 831.83, + "elapsed_seconds_median": 24.8, + "sla_met": false + } + ] + }, + { + "precision": "FP8", + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.91, + "ttft_ms_p90": 58.78, + "ttft_ms_p99": 1717.74, + "tpot_ms_p50": 12.0, + "tpot_ms_p90": 12.87, + "tpot_ms_p99": 15.87, + "elapsed_seconds_median": 65.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 44.32, + "ttft_ms_p90": 53.51, + "ttft_ms_p99": 90.52, + "tpot_ms_p50": 18.34, + "tpot_ms_p90": 19.66, + "tpot_ms_p99": 22.26, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.71, + "ttft_ms_p90": 64.15, + "ttft_ms_p99": 109.33, + "tpot_ms_p50": 29.33, + "tpot_ms_p90": 36.05, + "tpot_ms_p99": 46.56, + "elapsed_seconds_median": 16.0, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 53.61, + "ttft_ms_p90": 1478.22, + "ttft_ms_p99": 4184.64, + "tpot_ms_p50": 38.21, + "tpot_ms_p90": 43.17, + "tpot_ms_p99": 64.28, + "elapsed_seconds_median": 13.5, + "sla_met": false + } + ] + }, + { + "precision": "W8A8", + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 38.71, + "ttft_ms_p90": 53.38, + "ttft_ms_p99": 1589.47, + "tpot_ms_p50": 11.46, + "tpot_ms_p90": 12.3, + "tpot_ms_p99": 15.34, + "elapsed_seconds_median": 65.2, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 44.87, + "ttft_ms_p90": 53.48, + "ttft_ms_p99": 65.3, + "tpot_ms_p50": 18.42, + "tpot_ms_p90": 19.56, + "tpot_ms_p99": 21.14, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.04, + "ttft_ms_p90": 61.38, + "ttft_ms_p99": 85.27, + "tpot_ms_p50": 28.19, + "tpot_ms_p90": 33.11, + "tpot_ms_p99": 44.66, + "elapsed_seconds_median": 15.7, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 52.01, + "ttft_ms_p90": 1450.28, + "ttft_ms_p99": 3527.1, + "tpot_ms_p50": 34.6, + "tpot_ms_p90": 40.12, + "tpot_ms_p99": 58.13, + "elapsed_seconds_median": 12.6, + "sla_met": false + } + ] + }, + { + "precision": "W8A16", + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 38.31, + "ttft_ms_p90": 74.84, + "ttft_ms_p99": 1847.63, + "tpot_ms_p50": 11.74, + "tpot_ms_p90": 13.92, + "tpot_ms_p99": 18.31, + "elapsed_seconds_median": 65.3, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 39.48, + "ttft_ms_p90": 47.1, + "ttft_ms_p99": 61.58, + "tpot_ms_p50": 16.6, + "tpot_ms_p90": 17.88, + "tpot_ms_p99": 20.06, + "elapsed_seconds_median": 32.0, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.43, + "ttft_ms_p90": 78.15, + "ttft_ms_p99": 105.59, + "tpot_ms_p50": 40.03, + "tpot_ms_p90": 45.76, + "tpot_ms_p99": 61.97, + "elapsed_seconds_median": 18.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 61.94, + "ttft_ms_p90": 3528.85, + "ttft_ms_p99": 7833.34, + "tpot_ms_p50": 53.7, + "tpot_ms_p90": 59.48, + "tpot_ms_p99": 83.56, + "elapsed_seconds_median": 17.8, + "sla_met": false + } + ] + }, + { + "precision": "W4A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 33.91, + "ttft_ms_p90": 103.24, + "ttft_ms_p99": 5457.75, + "tpot_ms_p50": 10.27, + "tpot_ms_p90": 21.22, + "tpot_ms_p99": 53.71, + "elapsed_seconds_median": 64.9, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 54.88, + "ttft_ms_p90": 92.23, + "ttft_ms_p99": 103.95, + "tpot_ms_p50": 44.0, + "tpot_ms_p90": 49.52, + "tpot_ms_p99": 53.52, + "elapsed_seconds_median": 36.4, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 64.76, + "ttft_ms_p90": 102.45, + "ttft_ms_p99": 140.98, + "tpot_ms_p50": 57.46, + "tpot_ms_p90": 61.94, + "tpot_ms_p99": 74.79, + "elapsed_seconds_median": 22.2, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 60.06, + "ttft_ms_p90": 102.77, + "ttft_ms_p99": 143.02, + "tpot_ms_p50": 61.0, + "tpot_ms_p90": 66.57, + "tpot_ms_p99": 87.33, + "elapsed_seconds_median": 17.9, + "sla_met": true + } + ] + } + ] + }, + "quantization_sustained": { + "results_by_precision": [ + { + "precision": "BF16", + "sustained_throughput_tokens_per_sec": 434.7, + "throttle_ratio": 0.88, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2968.6, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 393.4, + "tokens_out": 23625, + "tokens_in": 0, + "requests_completed": 131, + "ttft_ms_p50": 60.8, + "ttft_ms_p99": 3031.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 433.1, + "tokens_out": 25985, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 61.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.6, + "tokens_out": 26191, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 62.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 446.9, + "tokens_out": 26811, + "tokens_in": 0, + "requests_completed": 148, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 59.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.6, + "tokens_out": 26429, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 62.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.3, + "tokens_out": 26190, + "tokens_in": 0, + "requests_completed": 144, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 61.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.4, + "tokens_out": 26187, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.2, + "ttft_ms_p99": 71.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.7, + "tokens_out": 26197, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 64.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 440.8, + "tokens_out": 26436, + "tokens_in": 0, + "requests_completed": 143, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 63.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.4, + "tokens_out": 26252, + "tokens_in": 0, + "requests_completed": 141, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 62.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 437.6, + "tokens_out": 26250, + "tokens_in": 0, + "requests_completed": 144, + "ttft_ms_p50": 50.1, + "ttft_ms_p99": 62.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.1, + "tokens_out": 26162, + "tokens_in": 0, + "requests_completed": 140, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 58.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 435.8, + "tokens_out": 26158, + "tokens_in": 0, + "requests_completed": 145, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 62.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 438.7, + "tokens_out": 26320, + "tokens_in": 0, + "requests_completed": 139, + "ttft_ms_p50": 50.4, + "ttft_ms_p99": 62.8 + } + ] + }, + { + "precision": "FP8", + "sustained_throughput_tokens_per_sec": 694.1, + "throttle_ratio": 0.917, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2693.1, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 645.5, + "tokens_out": 38742, + "tokens_in": 0, + "requests_completed": 211, + "ttft_ms_p50": 38.2, + "ttft_ms_p99": 2750.8 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.3, + "tokens_out": 42131, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 83.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.8, + "tokens_out": 41779, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 56.1 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.5, + "tokens_out": 41888, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 55.8 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 700.5, + "tokens_out": 42050, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.2 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.7, + "tokens_out": 41730, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 691.3, + "tokens_out": 41466, + "tokens_in": 0, + "requests_completed": 227, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 56.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 704.3, + "tokens_out": 42253, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 54.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 693.3, + "tokens_out": 41616, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 51.9 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 693.2, + "tokens_out": 41595, + "tokens_in": 0, + "requests_completed": 226, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 58.1 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.0, + "tokens_out": 41865, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 54.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.1, + "tokens_out": 42061, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 55.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 695.7, + "tokens_out": 41764, + "tokens_in": 0, + "requests_completed": 226, + "ttft_ms_p50": 37.4, + "ttft_ms_p99": 59.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.8, + "tokens_out": 42097, + "tokens_in": 0, + "requests_completed": 229, + "ttft_ms_p50": 37.2, + "ttft_ms_p99": 57.7 + } + ] + }, + { + "precision": "W8A8", + "sustained_throughput_tokens_per_sec": 719.2, + "throttle_ratio": 0.92, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2756.6, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 677.6, + "tokens_out": 40664, + "tokens_in": 0, + "requests_completed": 222, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 2809.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.9, + "tokens_out": 42916, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 52.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.3, + "tokens_out": 43580, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 46.2 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.1, + "tokens_out": 43251, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 53.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.4, + "tokens_out": 43460, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 36.2, + "ttft_ms_p99": 60.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.4, + "tokens_out": 43288, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 45.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43696, + "tokens_in": 0, + "requests_completed": 238, + "ttft_ms_p50": 36.1, + "ttft_ms_p99": 52.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.3, + "tokens_out": 42941, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 54.6 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 722.2, + "tokens_out": 43310, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 50.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 722.3, + "tokens_out": 43356, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 36.1, + "ttft_ms_p99": 66.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.5, + "tokens_out": 42869, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.2, + "ttft_ms_p99": 53.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 736.2, + "tokens_out": 44141, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 51.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.5, + "tokens_out": 42963, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 52.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.6, + "tokens_out": 43711, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 52.5 + } + ] + }, + { + "precision": "W8A16", + "sustained_throughput_tokens_per_sec": 718.4, + "throttle_ratio": 0.871, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3856.6, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 639.5, + "tokens_out": 38376, + "tokens_in": 0, + "requests_completed": 208, + "ttft_ms_p50": 38.1, + "ttft_ms_p99": 3904.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.0, + "tokens_out": 43705, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 49.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.2, + "tokens_out": 43391, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 54.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.2, + "tokens_out": 43268, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 48.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.7, + "tokens_out": 43427, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 52.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.8, + "tokens_out": 43530, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 60.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 717.4, + "tokens_out": 43059, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 47.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 734.3, + "tokens_out": 44054, + "tokens_in": 0, + "requests_completed": 240, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 51.6 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.3, + "tokens_out": 43577, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 45.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.4, + "tokens_out": 43519, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 64.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.7, + "tokens_out": 42986, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 48.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.8, + "tokens_out": 43518, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 44.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.3, + "tokens_out": 43383, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 52.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 727.9, + "tokens_out": 43664, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 47.6 + } + ] + }, + { + "precision": "W4A16", + "sustained_throughput_tokens_per_sec": 970.9, + "throttle_ratio": 0.882, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3382.4, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 868.0, + "tokens_out": 52079, + "tokens_in": 0, + "requests_completed": 293, + "ttft_ms_p50": 31.6, + "ttft_ms_p99": 3422.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 984.5, + "tokens_out": 59080, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 50.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 980.5, + "tokens_out": 58862, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 42.5 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 982.3, + "tokens_out": 58913, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 40.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 979.3, + "tokens_out": 58764, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.4, + "ttft_ms_p99": 39.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 973.9, + "tokens_out": 58410, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 44.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 974.7, + "tokens_out": 58492, + "tokens_in": 0, + "requests_completed": 325, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 41.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 977.1, + "tokens_out": 58653, + "tokens_in": 0, + "requests_completed": 328, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 45.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 980.2, + "tokens_out": 58825, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 39.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 976.1, + "tokens_out": 58525, + "tokens_in": 0, + "requests_completed": 328, + "ttft_ms_p50": 26.4, + "ttft_ms_p99": 44.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 981.0, + "tokens_out": 58888, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.7, + "ttft_ms_p99": 41.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 979.3, + "tokens_out": 58736, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 42.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 971.5, + "tokens_out": 58325, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 46.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 984.4, + "tokens_out": 59028, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.3, + "ttft_ms_p99": 39.8 + } + ] + } + ] + } + }, + "accuracy": null, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "13:25:04", + "run_id": "4ce353c2", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T13:20:07.690461+00:00", + "benchmark_end_time": "2026-04-27T13:25:04.187964+00:00", + "benchmark_elapsed_minutes": 128.5, + "model_load_seconds": 81.4, + "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", + "scenario_dirs": { + "bf16/offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/offline", + "bf16/online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/online", + "bf16/sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/bf16/sustained", + "fp8/offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/offline", + "fp8/online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/online", + "fp8/sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/fp8/sustained", + "w8a8/offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/offline", + "w8a8/online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/online", + "w8a8/sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/sustained", + "w8a16/offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/offline", + "w8a16/online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/online", + "w8a16/sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/sustained", + "w4a16/offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/offline", + "w4a16/online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/online", + "w4a16/sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/sustained" + }, + "precision_dirs": { + "BF16": "bf16", + "FP8": "fp8", + "W8A8": "w8a8", + "W8A16": "w8a16", + "W4A16": "w4a16" + }, + "precision_model_map": { + "BF16": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dtype_override": "bfloat16" + }, + "FP8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." + }, + "W8A8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." + }, + "W8A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights, FP16 activations. Weight-only quantization — reduces memory bandwidth, not compute dtype." + }, + "W4A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "engine_kwargs": { + "quantization": "gptq" + }, + "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization — larger memory saving than W8A16." + } + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/accuracy/accuracy.json new file mode 100644 index 00000000..68a15e84 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.02, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/offline/result.json new file mode 100644 index 00000000..914af955 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/offline/result.json @@ -0,0 +1,157 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1611.42, + "throughput_tokens_per_sec_per_chip": 1611.42, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1611.83, + "throughput_tokens_per_sec_per_chip": 1611.83, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1609.86, + "throughput_tokens_per_sec_per_chip": 1609.86, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1610.66, + "throughput_tokens_per_sec_per_chip": 1610.66, + "elapsed_seconds_median": 21.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:48:10", + "run_id": "98b818e7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_98b818e7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:42:22.200967+00:00", + "benchmark_end_time": "2026-04-27T15:48:10.359781+00:00", + "benchmark_elapsed_minutes": 5.8, + "model_load_seconds": 80.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/online/result.json new file mode 100644 index 00000000..188df7d2 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/online/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 33.91, + "ttft_ms_p90": 103.24, + "ttft_ms_p99": 5457.75, + "tpot_ms_p50": 10.27, + "tpot_ms_p90": 21.22, + "tpot_ms_p99": 53.71, + "elapsed_seconds_median": 64.9, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 54.88, + "ttft_ms_p90": 92.23, + "ttft_ms_p99": 103.95, + "tpot_ms_p50": 44.0, + "tpot_ms_p90": 49.52, + "tpot_ms_p99": 53.52, + "elapsed_seconds_median": 36.4, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 64.76, + "ttft_ms_p90": 102.45, + "ttft_ms_p99": 140.98, + "tpot_ms_p50": 57.46, + "tpot_ms_p90": 61.94, + "tpot_ms_p99": 74.79, + "elapsed_seconds_median": 22.2, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 60.06, + "ttft_ms_p90": 102.77, + "ttft_ms_p99": 143.02, + "tpot_ms_p50": 61.0, + "tpot_ms_p90": 66.57, + "tpot_ms_p99": 87.33, + "elapsed_seconds_median": 17.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:58:41", + "run_id": "98b818e7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_98b818e7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:51:40.707094+00:00", + "benchmark_end_time": "2026-04-27T15:58:41.603670+00:00", + "benchmark_elapsed_minutes": 7.0, + "model_load_seconds": 152.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/result.json new file mode 100644 index 00000000..19fad6ca --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/result.json @@ -0,0 +1,374 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1611.42, + "throughput_tokens_per_sec_per_chip": 1611.42, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1611.83, + "throughput_tokens_per_sec_per_chip": 1611.83, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1609.86, + "throughput_tokens_per_sec_per_chip": 1609.86, + "elapsed_seconds_median": 21.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1610.66, + "throughput_tokens_per_sec_per_chip": 1610.66, + "elapsed_seconds_median": 21.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 33.91, + "ttft_ms_p90": 103.24, + "ttft_ms_p99": 5457.75, + "tpot_ms_p50": 10.27, + "tpot_ms_p90": 21.22, + "tpot_ms_p99": 53.71, + "elapsed_seconds_median": 64.9, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 54.88, + "ttft_ms_p90": 92.23, + "ttft_ms_p99": 103.95, + "tpot_ms_p50": 44.0, + "tpot_ms_p90": 49.52, + "tpot_ms_p99": 53.52, + "elapsed_seconds_median": 36.4, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 64.76, + "ttft_ms_p90": 102.45, + "ttft_ms_p99": 140.98, + "tpot_ms_p50": 57.46, + "tpot_ms_p90": 61.94, + "tpot_ms_p99": 74.79, + "elapsed_seconds_median": 22.2, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 60.06, + "ttft_ms_p90": 102.77, + "ttft_ms_p99": 143.02, + "tpot_ms_p50": 61.0, + "tpot_ms_p90": 66.57, + "tpot_ms_p99": 87.33, + "elapsed_seconds_median": 17.9, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 868.0, + "tokens_out": 52079, + "tokens_in": 0, + "requests_completed": 293, + "ttft_ms_p50": 31.6, + "ttft_ms_p99": 3422.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 984.5, + "tokens_out": 59080, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 50.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 980.5, + "tokens_out": 58862, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 42.5 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 982.3, + "tokens_out": 58913, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 40.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 979.3, + "tokens_out": 58764, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.4, + "ttft_ms_p99": 39.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 973.9, + "tokens_out": 58410, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 44.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 974.7, + "tokens_out": 58492, + "tokens_in": 0, + "requests_completed": 325, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 41.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 977.1, + "tokens_out": 58653, + "tokens_in": 0, + "requests_completed": 328, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 45.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 980.2, + "tokens_out": 58825, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 39.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 976.1, + "tokens_out": 58525, + "tokens_in": 0, + "requests_completed": 328, + "ttft_ms_p50": 26.4, + "ttft_ms_p99": 44.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 981.0, + "tokens_out": 58888, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.7, + "ttft_ms_p99": 41.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 979.3, + "tokens_out": 58736, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 42.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 971.5, + "tokens_out": 58325, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 46.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 984.4, + "tokens_out": 59028, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.3, + "ttft_ms_p99": 39.8 + } + ], + "sustained_throughput_tokens_per_sec": 970.9, + "throttle_ratio": 0.882, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3382.4 + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.02, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:48:10", + "run_id": "98b818e7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_98b818e7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:42:22.200967+00:00", + "benchmark_end_time": "2026-04-27T15:48:10.359781+00:00", + "benchmark_elapsed_minutes": 27.9, + "model_load_seconds": 80.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/online", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/sustained/result.json new file mode 100644 index 00000000..1f0f3d66 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w4a16/sustained/result.json @@ -0,0 +1,257 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 868.0, + "tokens_out": 52079, + "tokens_in": 0, + "requests_completed": 293, + "ttft_ms_p50": 31.6, + "ttft_ms_p99": 3422.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 984.5, + "tokens_out": 59080, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 50.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 980.5, + "tokens_out": 58862, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 42.5 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 982.3, + "tokens_out": 58913, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 40.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 979.3, + "tokens_out": 58764, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.4, + "ttft_ms_p99": 39.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 973.9, + "tokens_out": 58410, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 44.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 974.7, + "tokens_out": 58492, + "tokens_in": 0, + "requests_completed": 325, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 41.7 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 977.1, + "tokens_out": 58653, + "tokens_in": 0, + "requests_completed": 328, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 45.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 980.2, + "tokens_out": 58825, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.6, + "ttft_ms_p99": 39.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 976.1, + "tokens_out": 58525, + "tokens_in": 0, + "requests_completed": 328, + "ttft_ms_p50": 26.4, + "ttft_ms_p99": 44.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 981.0, + "tokens_out": 58888, + "tokens_in": 0, + "requests_completed": 330, + "ttft_ms_p50": 26.7, + "ttft_ms_p99": 41.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 979.3, + "tokens_out": 58736, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.5, + "ttft_ms_p99": 42.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 971.5, + "tokens_out": 58325, + "tokens_in": 0, + "requests_completed": 331, + "ttft_ms_p50": 26.9, + "ttft_ms_p99": 46.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 984.4, + "tokens_out": 59028, + "tokens_in": 0, + "requests_completed": 327, + "ttft_ms_p50": 26.3, + "ttft_ms_p99": 39.8 + } + ], + "sustained_throughput_tokens_per_sec": 970.9, + "throttle_ratio": 0.882, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3382.4 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "16:15:57", + "run_id": "98b818e7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_98b818e7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T16:00:54.374341+00:00", + "benchmark_end_time": "2026-04-27T16:15:57.425199+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 97.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/accuracy/accuracy.json new file mode 100644 index 00000000..60c0d5fb --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/offline/result.json new file mode 100644 index 00000000..b409ebea --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/offline/result.json @@ -0,0 +1,157 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2972.67, + "throughput_tokens_per_sec_per_chip": 2972.67, + "elapsed_seconds_median": 12.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2969.3, + "throughput_tokens_per_sec_per_chip": 2969.3, + "elapsed_seconds_median": 12.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2973.03, + "throughput_tokens_per_sec_per_chip": 2973.03, + "elapsed_seconds_median": 12.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2987.4, + "throughput_tokens_per_sec_per_chip": 2987.4, + "elapsed_seconds_median": 12.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:08:30", + "run_id": "637ce9ca", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_637ce9ca", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:05:13.757784+00:00", + "benchmark_end_time": "2026-04-27T15:08:30.679669+00:00", + "benchmark_elapsed_minutes": 3.3, + "model_load_seconds": 90.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/online/result.json new file mode 100644 index 00000000..123c452f --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/online/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 38.31, + "ttft_ms_p90": 74.84, + "ttft_ms_p99": 1847.63, + "tpot_ms_p50": 11.74, + "tpot_ms_p90": 13.92, + "tpot_ms_p99": 18.31, + "elapsed_seconds_median": 65.3, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 39.48, + "ttft_ms_p90": 47.1, + "ttft_ms_p99": 61.58, + "tpot_ms_p50": 16.6, + "tpot_ms_p90": 17.88, + "tpot_ms_p99": 20.06, + "elapsed_seconds_median": 32.0, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.43, + "ttft_ms_p90": 78.15, + "ttft_ms_p99": 105.59, + "tpot_ms_p50": 40.03, + "tpot_ms_p90": 45.76, + "tpot_ms_p99": 61.97, + "elapsed_seconds_median": 18.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 61.94, + "ttft_ms_p90": 3528.85, + "ttft_ms_p99": 7833.34, + "tpot_ms_p50": 53.7, + "tpot_ms_p90": 59.48, + "tpot_ms_p99": 83.56, + "elapsed_seconds_median": 17.8, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:17:09", + "run_id": "637ce9ca", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_637ce9ca", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:10:30.734651+00:00", + "benchmark_end_time": "2026-04-27T15:17:09.889423+00:00", + "benchmark_elapsed_minutes": 6.7, + "model_load_seconds": 87.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/result.json new file mode 100644 index 00000000..e953270e --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/result.json @@ -0,0 +1,374 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2972.67, + "throughput_tokens_per_sec_per_chip": 2972.67, + "elapsed_seconds_median": 12.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2969.3, + "throughput_tokens_per_sec_per_chip": 2969.3, + "elapsed_seconds_median": 12.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2973.03, + "throughput_tokens_per_sec_per_chip": 2973.03, + "elapsed_seconds_median": 12.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2987.4, + "throughput_tokens_per_sec_per_chip": 2987.4, + "elapsed_seconds_median": 12.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 38.31, + "ttft_ms_p90": 74.84, + "ttft_ms_p99": 1847.63, + "tpot_ms_p50": 11.74, + "tpot_ms_p90": 13.92, + "tpot_ms_p99": 18.31, + "elapsed_seconds_median": 65.3, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 39.48, + "ttft_ms_p90": 47.1, + "ttft_ms_p99": 61.58, + "tpot_ms_p50": 16.6, + "tpot_ms_p90": 17.88, + "tpot_ms_p99": 20.06, + "elapsed_seconds_median": 32.0, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.43, + "ttft_ms_p90": 78.15, + "ttft_ms_p99": 105.59, + "tpot_ms_p50": 40.03, + "tpot_ms_p90": 45.76, + "tpot_ms_p99": 61.97, + "elapsed_seconds_median": 18.1, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 61.94, + "ttft_ms_p90": 3528.85, + "ttft_ms_p99": 7833.34, + "tpot_ms_p50": 53.7, + "tpot_ms_p90": 59.48, + "tpot_ms_p99": 83.56, + "elapsed_seconds_median": 17.8, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 639.5, + "tokens_out": 38376, + "tokens_in": 0, + "requests_completed": 208, + "ttft_ms_p50": 38.1, + "ttft_ms_p99": 3904.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.0, + "tokens_out": 43705, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 49.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.2, + "tokens_out": 43391, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 54.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.2, + "tokens_out": 43268, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 48.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.7, + "tokens_out": 43427, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 52.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.8, + "tokens_out": 43530, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 60.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 717.4, + "tokens_out": 43059, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 47.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 734.3, + "tokens_out": 44054, + "tokens_in": 0, + "requests_completed": 240, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 51.6 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.3, + "tokens_out": 43577, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 45.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.4, + "tokens_out": 43519, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 64.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.7, + "tokens_out": 42986, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 48.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.8, + "tokens_out": 43518, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 44.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.3, + "tokens_out": 43383, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 52.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 727.9, + "tokens_out": 43664, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 47.6 + } + ], + "sustained_throughput_tokens_per_sec": 718.4, + "throttle_ratio": 0.871, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3856.6 + } + }, + "accuracy": { + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:08:30", + "run_id": "637ce9ca", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_637ce9ca", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:05:13.757784+00:00", + "benchmark_end_time": "2026-04-27T15:08:30.679669+00:00", + "benchmark_elapsed_minutes": 25.1, + "model_load_seconds": 90.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/online", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/sustained/result.json new file mode 100644 index 00000000..234affa9 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a16/sustained/result.json @@ -0,0 +1,257 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 639.5, + "tokens_out": 38376, + "tokens_in": 0, + "requests_completed": 208, + "ttft_ms_p50": 38.1, + "ttft_ms_p99": 3904.2 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.0, + "tokens_out": 43705, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 49.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.2, + "tokens_out": 43391, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 54.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.2, + "tokens_out": 43268, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 48.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.7, + "tokens_out": 43427, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 52.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.8, + "tokens_out": 43530, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 60.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 717.4, + "tokens_out": 43059, + "tokens_in": 0, + "requests_completed": 231, + "ttft_ms_p50": 33.0, + "ttft_ms_p99": 47.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 734.3, + "tokens_out": 44054, + "tokens_in": 0, + "requests_completed": 240, + "ttft_ms_p50": 33.4, + "ttft_ms_p99": 51.6 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.3, + "tokens_out": 43577, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 45.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.4, + "tokens_out": 43519, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 64.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.7, + "tokens_out": 42986, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 48.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.8, + "tokens_out": 43518, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 44.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.3, + "tokens_out": 43383, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 33.2, + "ttft_ms_p99": 52.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 727.9, + "tokens_out": 43664, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 33.3, + "ttft_ms_p99": 47.6 + } + ], + "sustained_throughput_tokens_per_sec": 718.4, + "throttle_ratio": 0.871, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -3856.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "15:35:26", + "run_id": "637ce9ca", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_637ce9ca", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T15:20:22.197972+00:00", + "benchmark_end_time": "2026-04-27T15:35:26.482828+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 149.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/accuracy/accuracy.json new file mode 100644 index 00000000..5835a80c --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.6, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "W8A8", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/offline/result.json new file mode 100644 index 00000000..33a4a933 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/offline/result.json @@ -0,0 +1,157 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3903.39, + "throughput_tokens_per_sec_per_chip": 3903.39, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4044.47, + "throughput_tokens_per_sec_per_chip": 4044.47, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4000.58, + "throughput_tokens_per_sec_per_chip": 4000.58, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4027.83, + "throughput_tokens_per_sec_per_chip": 4027.83, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:33:14", + "run_id": "30a46687", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_30a46687", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T14:30:47.317448+00:00", + "benchmark_end_time": "2026-04-27T14:33:14.431323+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 71.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/online/result.json new file mode 100644 index 00000000..b02c0244 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/online/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 38.71, + "ttft_ms_p90": 53.38, + "ttft_ms_p99": 1589.47, + "tpot_ms_p50": 11.46, + "tpot_ms_p90": 12.3, + "tpot_ms_p99": 15.34, + "elapsed_seconds_median": 65.2, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 44.87, + "ttft_ms_p90": 53.48, + "ttft_ms_p99": 65.3, + "tpot_ms_p50": 18.42, + "tpot_ms_p90": 19.56, + "tpot_ms_p99": 21.14, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.04, + "ttft_ms_p90": 61.38, + "ttft_ms_p99": 85.27, + "tpot_ms_p50": 28.19, + "tpot_ms_p90": 33.11, + "tpot_ms_p99": 44.66, + "elapsed_seconds_median": 15.7, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 52.01, + "ttft_ms_p90": 1450.28, + "ttft_ms_p99": 3527.1, + "tpot_ms_p50": 34.6, + "tpot_ms_p90": 40.12, + "tpot_ms_p99": 58.13, + "elapsed_seconds_median": 12.6, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:41:14", + "run_id": "30a46687", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_30a46687", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T14:34:57.279473+00:00", + "benchmark_end_time": "2026-04-27T14:41:14.157626+00:00", + "benchmark_elapsed_minutes": 6.3, + "model_load_seconds": 69.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/result.json new file mode 100644 index 00000000..f4210cac --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/result.json @@ -0,0 +1,374 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3903.39, + "throughput_tokens_per_sec_per_chip": 3903.39, + "elapsed_seconds_median": 9.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4044.47, + "throughput_tokens_per_sec_per_chip": 4044.47, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4000.58, + "throughput_tokens_per_sec_per_chip": 4000.58, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4027.83, + "throughput_tokens_per_sec_per_chip": 4027.83, + "elapsed_seconds_median": 8.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 25, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 38.71, + "ttft_ms_p90": 53.38, + "ttft_ms_p99": 1589.47, + "tpot_ms_p50": 11.46, + "tpot_ms_p90": 12.3, + "tpot_ms_p99": 15.34, + "elapsed_seconds_median": 65.2, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 44.87, + "ttft_ms_p90": 53.48, + "ttft_ms_p99": 65.3, + "tpot_ms_p50": 18.42, + "tpot_ms_p90": 19.56, + "tpot_ms_p99": 21.14, + "elapsed_seconds_median": 32.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 50.04, + "ttft_ms_p90": 61.38, + "ttft_ms_p99": 85.27, + "tpot_ms_p50": 28.19, + "tpot_ms_p90": 33.11, + "tpot_ms_p99": 44.66, + "elapsed_seconds_median": 15.7, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 52.01, + "ttft_ms_p90": 1450.28, + "ttft_ms_p99": 3527.1, + "tpot_ms_p50": 34.6, + "tpot_ms_p90": 40.12, + "tpot_ms_p99": 58.13, + "elapsed_seconds_median": 12.6, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 677.6, + "tokens_out": 40664, + "tokens_in": 0, + "requests_completed": 222, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 2809.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.9, + "tokens_out": 42916, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 52.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.3, + "tokens_out": 43580, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 46.2 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.1, + "tokens_out": 43251, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 53.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.4, + "tokens_out": 43460, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 36.2, + "ttft_ms_p99": 60.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.4, + "tokens_out": 43288, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 45.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43696, + "tokens_in": 0, + "requests_completed": 238, + "ttft_ms_p50": 36.1, + "ttft_ms_p99": 52.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.3, + "tokens_out": 42941, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 54.6 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 722.2, + "tokens_out": 43310, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 50.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 722.3, + "tokens_out": 43356, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 36.1, + "ttft_ms_p99": 66.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.5, + "tokens_out": 42869, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.2, + "ttft_ms_p99": 53.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 736.2, + "tokens_out": 44141, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 51.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.5, + "tokens_out": 42963, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 52.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.6, + "tokens_out": 43711, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 52.5 + } + ], + "sustained_throughput_tokens_per_sec": 719.2, + "throttle_ratio": 0.92, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2756.6 + } + }, + "accuracy": { + "subset_score": 0.6, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "W8A8", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:33:14", + "run_id": "30a46687", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_30a46687", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T14:30:47.317448+00:00", + "benchmark_end_time": "2026-04-27T14:33:14.431323+00:00", + "benchmark_elapsed_minutes": 23.9, + "model_load_seconds": 71.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/online", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/sustained/result.json new file mode 100644 index 00000000..03d17a6b --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_4ce353c2/w8a8/sustained/result.json @@ -0,0 +1,257 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-27T13:14:17.420434+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 677.6, + "tokens_out": 40664, + "tokens_in": 0, + "requests_completed": 222, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 2809.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.9, + "tokens_out": 42916, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 52.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.3, + "tokens_out": 43580, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 35.9, + "ttft_ms_p99": 46.2 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.1, + "tokens_out": 43251, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 53.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.4, + "tokens_out": 43460, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 36.2, + "ttft_ms_p99": 60.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.4, + "tokens_out": 43288, + "tokens_in": 0, + "requests_completed": 234, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 45.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43696, + "tokens_in": 0, + "requests_completed": 238, + "ttft_ms_p50": 36.1, + "ttft_ms_p99": 52.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.3, + "tokens_out": 42941, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 54.6 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 722.2, + "tokens_out": 43310, + "tokens_in": 0, + "requests_completed": 232, + "ttft_ms_p50": 35.8, + "ttft_ms_p99": 50.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 722.3, + "tokens_out": 43356, + "tokens_in": 0, + "requests_completed": 235, + "ttft_ms_p50": 36.1, + "ttft_ms_p99": 66.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.5, + "tokens_out": 42869, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.2, + "ttft_ms_p99": 53.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 736.2, + "tokens_out": 44141, + "tokens_in": 0, + "requests_completed": 237, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 51.9 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.5, + "tokens_out": 42963, + "tokens_in": 0, + "requests_completed": 236, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 52.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.6, + "tokens_out": 43711, + "tokens_in": 0, + "requests_completed": 233, + "ttft_ms_p50": 36.0, + "ttft_ms_p99": 52.5 + } + ], + "sustained_throughput_tokens_per_sec": 719.2, + "throttle_ratio": 0.92, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2756.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-27", + "time": "14:58:07", + "run_id": "30a46687", + "run_name": "nvidia_geforce_rtx_4090x1_suite_C_nvidia_sglang_c43a8309_30a46687", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-27T14:43:03.698180+00:00", + "benchmark_end_time": "2026-04-27T14:58:07.654245+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 75.1 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/accuracy/accuracy.json new file mode 100644 index 00000000..37f4d824 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/env_info.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/env_info.json new file mode 100644 index 00000000..89e8ceb3 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/env_info.json @@ -0,0 +1,32 @@ +{ + "collected_at": "2026-05-07T09:31:39.284738+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/interactive/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/interactive/result.json new file mode 100644 index 00000000..c9462eb8 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/interactive/result.json @@ -0,0 +1,115 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T09:31:39.284738+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 4305.61, + "ttft_ms_p90": 4494.03, + "ttft_ms_p99": 4587.16, + "tpot_ms_p50": 20.59, + "tpot_ms_p90": 20.72, + "tpot_ms_p99": 20.96, + "peak_memory_gb": null, + "elapsed_seconds_median": 896.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:56:58", + "run_id": "3f838de7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T10:27:06.729869+00:00", + "benchmark_end_time": "2026-05-07T10:56:58.928536+00:00", + "benchmark_elapsed_minutes": 29.9, + "model_load_seconds": 160.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/offline/result.json new file mode 100644 index 00000000..a739f138 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/offline/result.json @@ -0,0 +1,133 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T09:31:39.284738+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 27.14, + "throughput_tokens_per_sec_per_chip": 27.14, + "elapsed_seconds_median": 473.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 27.07, + "throughput_tokens_per_sec_per_chip": 27.07, + "elapsed_seconds_median": 474.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:23:05", + "run_id": "3f838de7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T09:35:39.694912+00:00", + "benchmark_end_time": "2026-05-07T10:23:05.704936+00:00", + "benchmark_elapsed_minutes": 47.4, + "model_load_seconds": 81.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/online/result.json new file mode 100644 index 00000000..d12fdeb7 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/online/result.json @@ -0,0 +1,147 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T09:31:39.284738+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 351298.63, + "ttft_ms_p90": 625325.33, + "ttft_ms_p99": 680302.66, + "tpot_ms_p50": 20.59, + "tpot_ms_p90": 20.69, + "tpot_ms_p99": 20.74, + "elapsed_seconds_median": 884.9, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 381686.19, + "ttft_ms_p90": 687285.75, + "ttft_ms_p99": 759388.6, + "tpot_ms_p50": 20.59, + "tpot_ms_p90": 20.7, + "tpot_ms_p99": 20.75, + "elapsed_seconds_median": 883.3, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 415347.43, + "ttft_ms_p90": 744515.16, + "ttft_ms_p99": 822031.46, + "tpot_ms_p50": 20.61, + "tpot_ms_p90": 20.71, + "tpot_ms_p99": 20.86, + "elapsed_seconds_median": 880.7, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "13:02:24", + "run_id": "3f838de7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T11:34:06.635917+00:00", + "benchmark_end_time": "2026-05-07T13:02:24.424539+00:00", + "benchmark_elapsed_minutes": 88.3, + "model_load_seconds": 162.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/result.json new file mode 100644 index 00000000..5e822930 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/result.json @@ -0,0 +1,500 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T09:31:39.284738+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "interactive", + "sustained", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 2, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 27.14, + "throughput_tokens_per_sec_per_chip": 27.14, + "elapsed_seconds_median": 473.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 27.07, + "throughput_tokens_per_sec_per_chip": 27.07, + "elapsed_seconds_median": 474.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "interactive": { + "ttft_ms_p50": 4305.61, + "ttft_ms_p90": 4494.03, + "ttft_ms_p99": 4587.16, + "tpot_ms_p50": 20.59, + "tpot_ms_p90": 20.72, + "tpot_ms_p99": 20.96, + "peak_memory_gb": null, + "elapsed_seconds_median": 896.1 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 29474.9, + "ttft_ms_p99": 50800.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65800.9, + "ttft_ms_p99": 68681.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 66378.2, + "ttft_ms_p99": 66953.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66799.4, + "ttft_ms_p99": 67336.4 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65978.9, + "ttft_ms_p99": 66173.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65047.4, + "ttft_ms_p99": 66056.9 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65985.2, + "ttft_ms_p99": 66517.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65430.6, + "ttft_ms_p99": 65984.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 66326.7, + "ttft_ms_p99": 66529.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66475.0, + "ttft_ms_p99": 66630.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66933.2, + "ttft_ms_p99": 67325.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66210.0, + "ttft_ms_p99": 66866.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 65003.8, + "ttft_ms_p99": 65056.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65985.2, + "ttft_ms_p99": 66384.5 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65385.6, + "ttft_ms_p99": 65743.1 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66150.9, + "ttft_ms_p99": 66460.8 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66230.9, + "ttft_ms_p99": 66732.4 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 67157.7, + "ttft_ms_p99": 67520.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66791.7, + "ttft_ms_p99": 66894.9 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65156.8, + "ttft_ms_p99": 66151.1 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65935.1, + "ttft_ms_p99": 66244.4 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65618.1, + "ttft_ms_p99": 66477.5 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 65725.5, + "ttft_ms_p99": 66185.8 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66058.3, + "ttft_ms_p99": 66293.7 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66477.1, + "ttft_ms_p99": 67254.9 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66985.0, + "ttft_ms_p99": 67472.7 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 66160.4, + "ttft_ms_p99": 66370.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 64984.1, + "ttft_ms_p99": 66016.4 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66045.7, + "ttft_ms_p99": 66588.0 + } + ], + "sustained_throughput_tokens_per_sec": 25.4, + "throttle_ratio": 0.856, + "throttle_onset_minute": 3.0, + "ttft_p99_drift_ms": -2093.1 + }, + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 351298.63, + "ttft_ms_p90": 625325.33, + "ttft_ms_p99": 680302.66, + "tpot_ms_p50": 20.59, + "tpot_ms_p90": 20.69, + "tpot_ms_p99": 20.74, + "elapsed_seconds_median": 884.9, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 381686.19, + "ttft_ms_p90": 687285.75, + "ttft_ms_p99": 759388.6, + "tpot_ms_p50": 20.59, + "tpot_ms_p90": 20.7, + "tpot_ms_p99": 20.75, + "elapsed_seconds_median": 883.3, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 415347.43, + "ttft_ms_p90": 744515.16, + "ttft_ms_p99": 822031.46, + "tpot_ms_p50": 20.61, + "tpot_ms_p90": 20.71, + "tpot_ms_p99": 20.86, + "elapsed_seconds_median": 880.7, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": 0.55, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "10:23:05", + "run_id": "3f838de7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'interactive', 'sustained', 'online'] succeeded, ['speculative'] failed.", + "benchmark_start_time": "2026-05-07T09:35:39.694912+00:00", + "benchmark_end_time": "2026-05-07T10:23:05.704936+00:00", + "benchmark_elapsed_minutes": 196.6, + "model_load_seconds": 81.3, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'interactive', 'sustained', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/offline", + "interactive": "results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/interactive", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/sustained", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/online" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/sustained/result.json new file mode 100644 index 00000000..480026c0 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7/sustained/result.json @@ -0,0 +1,407 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-07T09:31:39.284738+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 29474.9, + "ttft_ms_p99": 50800.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65800.9, + "ttft_ms_p99": 68681.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 66378.2, + "ttft_ms_p99": 66953.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66799.4, + "ttft_ms_p99": 67336.4 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65978.9, + "ttft_ms_p99": 66173.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65047.4, + "ttft_ms_p99": 66056.9 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65985.2, + "ttft_ms_p99": 66517.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65430.6, + "ttft_ms_p99": 65984.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 66326.7, + "ttft_ms_p99": 66529.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66475.0, + "ttft_ms_p99": 66630.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66933.2, + "ttft_ms_p99": 67325.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66210.0, + "ttft_ms_p99": 66866.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 65003.8, + "ttft_ms_p99": 65056.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65985.2, + "ttft_ms_p99": 66384.5 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65385.6, + "ttft_ms_p99": 65743.1 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66150.9, + "ttft_ms_p99": 66460.8 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66230.9, + "ttft_ms_p99": 66732.4 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 67157.7, + "ttft_ms_p99": 67520.3 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66791.7, + "ttft_ms_p99": 66894.9 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65156.8, + "ttft_ms_p99": 66151.1 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65935.1, + "ttft_ms_p99": 66244.4 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 65618.1, + "ttft_ms_p99": 66477.5 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 65725.5, + "ttft_ms_p99": 66185.8 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66058.3, + "ttft_ms_p99": 66293.7 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66477.1, + "ttft_ms_p99": 67254.9 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66985.0, + "ttft_ms_p99": 67472.7 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 22.5, + "tokens_out": 1350, + "tokens_in": 0, + "requests_completed": 6, + "ttft_ms_p50": 66160.4, + "ttft_ms_p99": 66370.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 64984.1, + "ttft_ms_p99": 66016.4 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 66045.7, + "ttft_ms_p99": 66588.0 + } + ], + "sustained_throughput_tokens_per_sec": 25.4, + "throttle_ratio": 0.856, + "throttle_onset_minute": 3.0, + "ttft_p99_drift_ms": -2093.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-07", + "time": "11:30:07", + "run_id": "3f838de7", + "run_name": "nvidia_geforce_rtx_4090x1_suite_D_nvidia_sglang_c43a8309_3f838de7", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-07T10:59:07.695088+00:00", + "benchmark_end_time": "2026-05-07T11:30:07.732770+00:00", + "benchmark_elapsed_minutes": 31.0, + "model_load_seconds": 88.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/accuracy/accuracy.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/accuracy/accuracy.json new file mode 100644 index 00000000..0dd1af70 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.42, + "baseline_delta": 0.04, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/env_info.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/env_info.json new file mode 100644 index 00000000..5ad0f9ef --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/env_info.json @@ -0,0 +1,32 @@ +{ + "collected_at": "2026-05-14T03:54:50.748183+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/interactive/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/interactive/result.json new file mode 100644 index 00000000..1f04d8f0 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/interactive/result.json @@ -0,0 +1,115 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-14T03:54:50.748183+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 13.54, + "ttft_ms_p90": 14.88, + "ttft_ms_p99": 22.4, + "tpot_ms_p50": 1.62, + "tpot_ms_p90": 1.66, + "tpot_ms_p99": 1.78, + "peak_memory_gb": null, + "elapsed_seconds_median": 48.5 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-14", + "time": "04:07:10", + "run_id": "12d28697", + "run_name": "nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-14T04:04:43.073670+00:00", + "benchmark_end_time": "2026-05-14T04:07:10.437796+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 49.4 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/offline/result.json similarity index 51% rename from results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json rename to results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/offline/result.json index da8126bc..c346d66f 100644 --- a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/offline/result.json @@ -1,55 +1,54 @@ { "schema_version": "1.0", "suite_id": "suite_F", - "implementation_id": "nvidia_onecat_vllm_12a253c2", + "implementation_id": "nvidia_sglang_c43a8309", "chip": { - "name": "Tesla V100-PCIE-32GB", + "name": "NVIDIA GeForce RTX 4090", "vendor": "NVIDIA", "count": 1, - "memory_gb": 32.0, + "memory_gb": 24.0, "interconnect_intra_node": null, "interconnect_inter_node": null }, "environment": { - "collected_at": "2026-05-18T12:26:03.593928+00:00", + "collected_at": "2026-05-14T03:54:50.748183+00:00", "accelerators": [ { "index": 0, - "name": "Tesla V100-PCIE-32GB", + "name": "NVIDIA GeForce RTX 4090", "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", + "memory_gb": 24.0, + "driver_version": "565.57.01", "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false + "compute_capability": "8.9", + "supports_bf16": true } ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", "intra_node_interconnect": null, "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, "numa_nodes": 2 }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", "cpu_accelerator_bandwidth_gbs": null, "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", "runtime_version": "CUDA 12.8", "pytorch_version": "2.9.1+cu128" }, "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" }, "model": { "model_id": "Qwen/Qwen2.5-0.5B-Instruct", @@ -59,8 +58,8 @@ "model_source": "local", "architecture": "dense", "parameter_count_b": 0.5, - "precision": "FP16", - "effective_dtype": "float16", + "precision": "BF16", + "effective_dtype": "bfloat16", "quantization_method": null, "model_format": "HuggingFace original" }, @@ -74,17 +73,7 @@ "expert_parallel_size": 1, "data_parallel_size": 1 }, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - }, + "extra_config": null, "runtime_metrics": null }, "metrics": { @@ -92,10 +81,10 @@ "results_by_concurrency": [ { "client_concurrency": 4, - "throughput_tokens_per_sec": 6234.82, - "throughput_tokens_per_sec_per_chip": 6234.82, - "throughput_tokens_per_sec_total": 9303.11, - "elapsed_seconds_median": 6.8, + "throughput_tokens_per_sec": 14832.92, + "throughput_tokens_per_sec_per_chip": 14832.92, + "throughput_tokens_per_sec_total": 20031.5, + "elapsed_seconds_median": 2.9, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -105,10 +94,10 @@ }, { "client_concurrency": 16, - "throughput_tokens_per_sec": 6292.79, - "throughput_tokens_per_sec_per_chip": 6292.79, - "throughput_tokens_per_sec_total": 9356.18, - "elapsed_seconds_median": 6.7, + "throughput_tokens_per_sec": 14771.95, + "throughput_tokens_per_sec_per_chip": 14771.95, + "throughput_tokens_per_sec_total": 19959.08, + "elapsed_seconds_median": 2.9, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -118,10 +107,10 @@ }, { "client_concurrency": 64, - "throughput_tokens_per_sec": 6243.51, - "throughput_tokens_per_sec_per_chip": 6243.51, - "throughput_tokens_per_sec_total": 9267.55, - "elapsed_seconds_median": 6.8, + "throughput_tokens_per_sec": 14824.27, + "throughput_tokens_per_sec_per_chip": 14824.27, + "throughput_tokens_per_sec_total": 20046.95, + "elapsed_seconds_median": 2.8, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -139,21 +128,21 @@ "notes": "Run --scenario accuracy to check model accuracy." }, "meta": { - "submitted_by": "JuhaoLiang1997", + "submitted_by": "Gong-K", "submission_type": "individual", - "date": "2026-05-18", - "time": "20:28:55", - "run_id": "419b138c", - "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "date": "2026-05-14", + "time": "04:00:02", + "run_id": "12d28697", + "run_name": "nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697", "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", "env_info_file": "../env_info.json", "log_file": "run.log", "samples_file": "samples.jsonl", "notes": null, - "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00", - "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00", - "benchmark_elapsed_minutes": 1.4, - "model_load_seconds": 31.7 + "benchmark_start_time": "2026-05-14T03:59:22.196953+00:00", + "benchmark_end_time": "2026-05-14T04:00:02.171314+00:00", + "benchmark_elapsed_minutes": 0.7, + "model_load_seconds": 54.9 } } \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/online/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/online/result.json new file mode 100644 index 00000000..3d472d35 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/online/result.json @@ -0,0 +1,135 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-14T03:54:50.748183+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 15.66, + "ttft_ms_p90": 26.95, + "ttft_ms_p99": 1894.51, + "tpot_ms_p50": 2.07, + "tpot_ms_p90": 2.47, + "tpot_ms_p99": 13.98, + "elapsed_seconds_median": 31.9, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 27.64, + "ttft_ms_p90": 32.89, + "ttft_ms_p99": 41.81, + "tpot_ms_p50": 14.31, + "tpot_ms_p90": 15.93, + "tpot_ms_p99": 21.71, + "elapsed_seconds_median": 9.1, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-14", + "time": "04:03:23", + "run_id": "12d28697", + "run_name": "nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-14T04:01:21.418175+00:00", + "benchmark_end_time": "2026-05-14T04:03:23.369790+00:00", + "benchmark_elapsed_minutes": 2.0, + "model_load_seconds": 49.1 + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/result.json new file mode 100644 index 00000000..c87f4c41 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/result.json @@ -0,0 +1,353 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-14T03:54:50.748183+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 14832.92, + "throughput_tokens_per_sec_per_chip": 14832.92, + "throughput_tokens_per_sec_total": 20031.5, + "elapsed_seconds_median": 2.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 14771.95, + "throughput_tokens_per_sec_per_chip": 14771.95, + "throughput_tokens_per_sec_total": 19959.08, + "elapsed_seconds_median": 2.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 14824.27, + "throughput_tokens_per_sec_per_chip": 14824.27, + "throughput_tokens_per_sec_total": 20046.95, + "elapsed_seconds_median": 2.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 15.66, + "ttft_ms_p90": 26.95, + "ttft_ms_p99": 1894.51, + "tpot_ms_p50": 2.07, + "tpot_ms_p90": 2.47, + "tpot_ms_p99": 13.98, + "elapsed_seconds_median": 31.9, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 27.64, + "ttft_ms_p90": 32.89, + "ttft_ms_p99": 41.81, + "tpot_ms_p50": 14.31, + "tpot_ms_p90": 15.93, + "tpot_ms_p99": 21.71, + "elapsed_seconds_median": 9.1, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 13.54, + "ttft_ms_p90": 14.88, + "ttft_ms_p99": 22.4, + "tpot_ms_p50": 1.62, + "tpot_ms_p90": 1.66, + "tpot_ms_p99": 1.78, + "peak_memory_gb": null, + "elapsed_seconds_median": 48.5 + }, + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2928.3, + "tokens_out": 175824, + "tokens_in": 0, + "requests_completed": 946, + "ttft_ms_p50": 24.2, + "ttft_ms_p99": 2551.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3143.7, + "tokens_out": 188505, + "tokens_in": 0, + "requests_completed": 1013, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3116.7, + "tokens_out": 187133, + "tokens_in": 0, + "requests_completed": 1004, + "ttft_ms_p50": 23.4, + "ttft_ms_p99": 32.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3064.3, + "tokens_out": 183809, + "tokens_in": 0, + "requests_completed": 985, + "ttft_ms_p50": 23.7, + "ttft_ms_p99": 36.0 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3101.3, + "tokens_out": 186067, + "tokens_in": 0, + "requests_completed": 997, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 30.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3086.1, + "tokens_out": 185226, + "tokens_in": 0, + "requests_completed": 993, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 35.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3134.9, + "tokens_out": 188082, + "tokens_in": 0, + "requests_completed": 1007, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3063.3, + "tokens_out": 183696, + "tokens_in": 0, + "requests_completed": 986, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 32.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3108.4, + "tokens_out": 186640, + "tokens_in": 0, + "requests_completed": 1000, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 31.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3101.4, + "tokens_out": 186059, + "tokens_in": 0, + "requests_completed": 997, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 36.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3080.3, + "tokens_out": 184804, + "tokens_in": 0, + "requests_completed": 991, + "ttft_ms_p50": 23.9, + "ttft_ms_p99": 35.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3084.6, + "tokens_out": 184961, + "tokens_in": 0, + "requests_completed": 994, + "ttft_ms_p50": 23.7, + "ttft_ms_p99": 33.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3134.2, + "tokens_out": 188117, + "tokens_in": 0, + "requests_completed": 1009, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.7 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3112.2, + "tokens_out": 186781, + "tokens_in": 0, + "requests_completed": 1001, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.6 + } + ], + "sustained_throughput_tokens_per_sec": 3090.0, + "throttle_ratio": 0.931, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2518.9 + } + }, + "accuracy": { + "subset_score": 0.42, + "baseline_delta": 0.04, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-14", + "time": "04:00:02", + "run_id": "12d28697", + "run_name": "nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-14T03:59:22.196953+00:00", + "benchmark_end_time": "2026-05-14T04:00:02.171314+00:00", + "benchmark_elapsed_minutes": 20.2, + "model_load_seconds": 54.9, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/offline", + "online": "results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/online", + "interactive": "results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/interactive", + "sustained": "results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/sustained/result.json b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/sustained/result.json new file mode 100644 index 00000000..073ea2d4 --- /dev/null +++ b/results/community/nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697/sustained/result.json @@ -0,0 +1,257 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 24.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-14T03:54:50.748183+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA GeForce RTX 4090", + "vendor": "NVIDIA", + "memory_gb": 24.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.9", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-31,64-95\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "INTEL(R) XEON(R) GOLD 6530", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.6, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 2928.3, + "tokens_out": 175824, + "tokens_in": 0, + "requests_completed": 946, + "ttft_ms_p50": 24.2, + "ttft_ms_p99": 2551.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3143.7, + "tokens_out": 188505, + "tokens_in": 0, + "requests_completed": 1013, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3116.7, + "tokens_out": 187133, + "tokens_in": 0, + "requests_completed": 1004, + "ttft_ms_p50": 23.4, + "ttft_ms_p99": 32.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3064.3, + "tokens_out": 183809, + "tokens_in": 0, + "requests_completed": 985, + "ttft_ms_p50": 23.7, + "ttft_ms_p99": 36.0 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3101.3, + "tokens_out": 186067, + "tokens_in": 0, + "requests_completed": 997, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 30.8 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3086.1, + "tokens_out": 185226, + "tokens_in": 0, + "requests_completed": 993, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 35.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3134.9, + "tokens_out": 188082, + "tokens_in": 0, + "requests_completed": 1007, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3063.3, + "tokens_out": 183696, + "tokens_in": 0, + "requests_completed": 986, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 32.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3108.4, + "tokens_out": 186640, + "tokens_in": 0, + "requests_completed": 1000, + "ttft_ms_p50": 23.6, + "ttft_ms_p99": 31.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3101.4, + "tokens_out": 186059, + "tokens_in": 0, + "requests_completed": 997, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 36.4 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3080.3, + "tokens_out": 184804, + "tokens_in": 0, + "requests_completed": 991, + "ttft_ms_p50": 23.9, + "ttft_ms_p99": 35.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3084.6, + "tokens_out": 184961, + "tokens_in": 0, + "requests_completed": 994, + "ttft_ms_p50": 23.7, + "ttft_ms_p99": 33.3 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3134.2, + "tokens_out": 188117, + "tokens_in": 0, + "requests_completed": 1009, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.7 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 3112.2, + "tokens_out": 186781, + "tokens_in": 0, + "requests_completed": 1001, + "ttft_ms_p50": 23.5, + "ttft_ms_p99": 32.6 + } + ], + "sustained_throughput_tokens_per_sec": 3090.0, + "throttle_ratio": 0.931, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2518.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-14", + "time": "04:23:31", + "run_id": "12d28697", + "run_name": "nvidia_geforce_rtx_4090x1_suite_F_nvidia_sglang_c43a8309_12d28697", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-14T04:08:29.029603+00:00", + "benchmark_end_time": "2026-05-14T04:23:31.519887+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 48.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/accuracy/accuracy.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/accuracy/accuracy.json new file mode 100644 index 00000000..5b260195 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/burst/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/burst/result.json new file mode 100644 index 00000000..ac6da65e --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/burst/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 71.76, + "steady_ttft_p99_ms": 5316.14, + "burst_ttft_p50_ms": 91.93, + "burst_ttft_p99_ms": 361.51, + "sla_met_during_burst": true, + "burst_degradation_ratio": 0.068, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 6204.77, + "burst_ttft_p99_ms": 439.74 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 94.27, + "burst_ttft_p99_ms": 305.42 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 94.55, + "burst_ttft_p99_ms": 360.12 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "09:32:37", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T09:23:55.580940+00:00", + "benchmark_end_time": "2026-04-29T09:32:37.748022+00:00", + "benchmark_elapsed_minutes": 8.7, + "model_load_seconds": 125.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/env_info.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/env_info.json similarity index 63% rename from results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/env_info.json rename to results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/env_info.json index cfd770eb..e9877c8b 100644 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_F_nvidia_sglang_c43a8309_435424a8/env_info.json +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/env_info.json @@ -1,27 +1,27 @@ { - "collected_at": "2026-05-07T10:52:35.700123+00:00", + "collected_at": "2026-04-29T07:36:07.207290+00:00", "accelerators": [ { "index": 0, - "name": "NVIDIA A100-SXM4-40GB", + "name": "NVIDIA RTX A6000", "vendor": "NVIDIA", - "memory_gb": 40, + "memory_gb": 48.0, "driver_version": "565.57.01", "firmware_version": null, - "compute_capability": "8.0", + "compute_capability": "8.6", "supports_bf16": true } ], - "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", "intra_node_interconnect": null, "cpu": { - "model": "AMD EPYC 7532 32-Core Processor", - "physical_cores": 64, - "logical_cores": 128, + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, "numa_nodes": 2 }, - "system_memory_gb": 1007.7, - "pcie_generation": "PCIe Gen 4", + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", "cpu_accelerator_bandwidth_gbs": null, "network_interfaces": [ { @@ -38,6 +38,11 @@ "name": "mlx5_2", "type": "InfiniBand/RoCE", "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null } ], "os": "Ubuntu 22.04.4 LTS", diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/interactive/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/interactive/result.json new file mode 100644 index 00000000..f9238a36 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 47.37, + "ttft_ms_p90": 81.71, + "ttft_ms_p99": 111.29, + "tpot_ms_p50": 23.0, + "tpot_ms_p90": 23.08, + "tpot_ms_p99": 23.19, + "peak_memory_gb": null, + "elapsed_seconds_median": 676.7 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "08:26:48", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T07:52:53.926105+00:00", + "benchmark_end_time": "2026-04-29T08:26:48.699149+00:00", + "benchmark_elapsed_minutes": 33.9, + "model_load_seconds": 80.2 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/offline/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/offline/result.json new file mode 100644 index 00000000..9f2ab0e3 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/offline/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 2039.11, + "throughput_tokens_per_sec_per_chip": 2039.11, + "elapsed_seconds_median": 17.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 2024.26, + "throughput_tokens_per_sec_per_chip": 2024.26, + "elapsed_seconds_median": 17.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 2029.11, + "throughput_tokens_per_sec_per_chip": 2029.11, + "elapsed_seconds_median": 17.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "07:44:11", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T07:40:39.895067+00:00", + "benchmark_end_time": "2026-04-29T07:44:11.019231+00:00", + "benchmark_elapsed_minutes": 3.5, + "model_load_seconds": 58.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/online/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/online/result.json new file mode 100644 index 00000000..57011166 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/online/result.json @@ -0,0 +1,168 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 100, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 73.61, + "ttft_ms_p90": 126.19, + "ttft_ms_p99": 1989.06, + "tpot_ms_p50": 30.47, + "tpot_ms_p90": 35.48, + "tpot_ms_p99": 38.38, + "elapsed_seconds_median": 68.7, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.09, + "ttft_ms_p90": 108.59, + "ttft_ms_p99": 152.02, + "tpot_ms_p50": 68.29, + "tpot_ms_p90": 75.68, + "tpot_ms_p99": 117.11, + "elapsed_seconds_median": 23.6, + "sla_met": true + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 68.38, + "ttft_ms_p90": 80.76, + "ttft_ms_p99": 109.03, + "tpot_ms_p50": 75.85, + "tpot_ms_p90": 91.71, + "tpot_ms_p99": 361.36, + "elapsed_seconds_median": 18.4, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "07:50:57", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T07:45:27.699512+00:00", + "benchmark_end_time": "2026-04-29T07:50:57.659935+00:00", + "benchmark_elapsed_minutes": 5.5, + "model_load_seconds": 51.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/result.json new file mode 100644 index 00000000..0eb4d2a9 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/result.json @@ -0,0 +1,615 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained", + "speculative", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 2039.11, + "throughput_tokens_per_sec_per_chip": 2039.11, + "elapsed_seconds_median": 17.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 2024.26, + "throughput_tokens_per_sec_per_chip": 2024.26, + "elapsed_seconds_median": 17.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 2029.11, + "throughput_tokens_per_sec_per_chip": 2029.11, + "elapsed_seconds_median": 17.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 100, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 73.61, + "ttft_ms_p90": 126.19, + "ttft_ms_p99": 1989.06, + "tpot_ms_p50": 30.47, + "tpot_ms_p90": 35.48, + "tpot_ms_p99": 38.38, + "elapsed_seconds_median": 68.7, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.09, + "ttft_ms_p90": 108.59, + "ttft_ms_p99": 152.02, + "tpot_ms_p50": 68.29, + "tpot_ms_p90": 75.68, + "tpot_ms_p99": 117.11, + "elapsed_seconds_median": 23.6, + "sla_met": true + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 68.38, + "ttft_ms_p90": 80.76, + "ttft_ms_p99": 109.03, + "tpot_ms_p50": 75.85, + "tpot_ms_p90": 91.71, + "tpot_ms_p99": 361.36, + "elapsed_seconds_median": 18.4, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 47.37, + "ttft_ms_p90": 81.71, + "ttft_ms_p99": 111.29, + "tpot_ms_p50": 23.0, + "tpot_ms_p90": 23.08, + "tpot_ms_p99": 23.19, + "peak_memory_gb": null, + "elapsed_seconds_median": 676.7 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 255.3, + "tokens_out": 15327, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 111.7, + "ttft_ms_p99": 7156.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18877, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 134.7 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.9, + "tokens_out": 18900, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18885, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 85.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.5, + "tokens_out": 18740, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 87.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 320.4, + "tokens_out": 19229, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 86.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.0, + "tokens_out": 18898, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.0, + "ttft_ms_p99": 86.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.8, + "tokens_out": 19132, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 69.9, + "ttft_ms_p99": 86.1 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 316.1, + "tokens_out": 18971, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 69.8, + "ttft_ms_p99": 85.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.2, + "tokens_out": 18733, + "tokens_in": 0, + "requests_completed": 99, + "ttft_ms_p50": 69.9, + "ttft_ms_p99": 71.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.6, + "tokens_out": 18627, + "tokens_in": 0, + "requests_completed": 99, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 86.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 316.2, + "tokens_out": 18971, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 85.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.0, + "tokens_out": 19083, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 86.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 319.1, + "tokens_out": 19143, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 86.3 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 313.8, + "tokens_out": 18826, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.0, + "ttft_ms_p99": 72.7 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.5, + "tokens_out": 19119, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.9 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.4, + "tokens_out": 18923, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.9 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.6, + "tokens_out": 18867, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.8 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18894, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.9 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 319.1, + "tokens_out": 19135, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 87.1 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18897, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 86.1 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.4, + "tokens_out": 18919, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 71.8 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 316.3, + "tokens_out": 18980, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 86.0 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.9, + "tokens_out": 18658, + "tokens_in": 0, + "requests_completed": 99, + "ttft_ms_p50": 70.0, + "ttft_ms_p99": 85.9 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.2, + "tokens_out": 18905, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.7 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.8, + "tokens_out": 19135, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.6 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.0, + "tokens_out": 18898, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.8 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 313.6, + "tokens_out": 18815, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.8 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.1, + "tokens_out": 18908, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 86.1 + } + ], + "sustained_throughput_tokens_per_sec": 315.5, + "throttle_ratio": 0.969, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -48.6 + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 421.15, + "throughput_tokens_per_sec_per_chip": 421.15, + "elapsed_seconds_median": 82.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 421.71, + "throughput_tokens_per_sec_per_chip": 421.71, + "elapsed_seconds_median": 82.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 421.73, + "throughput_tokens_per_sec_per_chip": 421.73, + "elapsed_seconds_median": 82.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 71.76, + "steady_ttft_p99_ms": 5316.14, + "burst_ttft_p50_ms": 91.93, + "burst_ttft_p99_ms": 361.51, + "sla_met_during_burst": true, + "burst_degradation_ratio": 0.068, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 6204.77, + "burst_ttft_p99_ms": 439.74 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 94.27, + "burst_ttft_p99_ms": 305.42 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 94.55, + "burst_ttft_p99_ms": 360.12 + } + ] + } + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "07:44:11", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T07:40:39.895067+00:00", + "benchmark_end_time": "2026-04-29T07:44:11.019231+00:00", + "benchmark_elapsed_minutes": 98.5, + "model_load_seconds": 58.9, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/offline", + "online": "results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/online", + "interactive": "results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/interactive", + "sustained": "results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/sustained", + "speculative": "results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/speculative", + "burst": "results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/burst" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/speculative/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/speculative/result.json new file mode 100644 index 00000000..581494c9 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/speculative/result.json @@ -0,0 +1,166 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "speculative", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 421.15, + "throughput_tokens_per_sec_per_chip": 421.15, + "elapsed_seconds_median": 82.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 421.71, + "throughput_tokens_per_sec_per_chip": 421.71, + "elapsed_seconds_median": 82.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 421.73, + "throughput_tokens_per_sec_per_chip": 421.73, + "elapsed_seconds_median": 82.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "09:20:48", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T09:04:08.250654+00:00", + "benchmark_end_time": "2026-04-29T09:20:48.246844+00:00", + "benchmark_elapsed_minutes": 16.7, + "model_load_seconds": 170.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/sustained/result.json b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/sustained/result.json new file mode 100644 index 00000000..a3bc39e3 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5/sustained/result.json @@ -0,0 +1,428 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T07:36:07.207290+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 255.3, + "tokens_out": 15327, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 111.7, + "ttft_ms_p99": 7156.3 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18877, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 134.7 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.9, + "tokens_out": 18900, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.6 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18885, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 85.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.5, + "tokens_out": 18740, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 87.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 320.4, + "tokens_out": 19229, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 86.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.0, + "tokens_out": 18898, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.0, + "ttft_ms_p99": 86.3 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.8, + "tokens_out": 19132, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 69.9, + "ttft_ms_p99": 86.1 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 316.1, + "tokens_out": 18971, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 69.8, + "ttft_ms_p99": 85.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.2, + "tokens_out": 18733, + "tokens_in": 0, + "requests_completed": 99, + "ttft_ms_p50": 69.9, + "ttft_ms_p99": 71.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.6, + "tokens_out": 18627, + "tokens_in": 0, + "requests_completed": 99, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 86.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 316.2, + "tokens_out": 18971, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 85.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.0, + "tokens_out": 19083, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 86.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 319.1, + "tokens_out": 19143, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 86.3 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 313.8, + "tokens_out": 18826, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.0, + "ttft_ms_p99": 72.7 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.5, + "tokens_out": 19119, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.9 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.4, + "tokens_out": 18923, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.9 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.6, + "tokens_out": 18867, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.8 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18894, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.9 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 319.1, + "tokens_out": 19135, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 87.1 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 314.8, + "tokens_out": 18897, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 86.1 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.4, + "tokens_out": 18919, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 71.8 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 316.3, + "tokens_out": 18980, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 86.0 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.9, + "tokens_out": 18658, + "tokens_in": 0, + "requests_completed": 99, + "ttft_ms_p50": 70.0, + "ttft_ms_p99": 85.9 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.2, + "tokens_out": 18905, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 85.7 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.8, + "tokens_out": 19135, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.6 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.0, + "tokens_out": 18898, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.8 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 313.6, + "tokens_out": 18815, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 85.8 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.1, + "tokens_out": 18908, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.1, + "ttft_ms_p99": 86.1 + } + ], + "sustained_throughput_tokens_per_sec": 315.5, + "throttle_ratio": 0.969, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -48.6 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "09:00:11", + "run_id": "9c6920b5", + "run_name": "nvidia_rtx_a6000x1_suite_A_nvidia_sglang_c43a8309_9c6920b5", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T08:30:01.272008+00:00", + "benchmark_end_time": "2026-04-29T09:00:11.931324+00:00", + "benchmark_elapsed_minutes": 30.2, + "model_load_seconds": 128.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/accuracy/accuracy.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/accuracy/accuracy.json similarity index 100% rename from results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/accuracy/accuracy.json rename to results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/accuracy/accuracy.json diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/offline/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/offline/result.json new file mode 100644 index 00000000..3280c33f --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2044.08, + "throughput_tokens_per_sec_per_chip": 2044.08, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2043.12, + "throughput_tokens_per_sec_per_chip": 2043.12, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2043.77, + "throughput_tokens_per_sec_per_chip": 2043.77, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2045.83, + "throughput_tokens_per_sec_per_chip": 2045.83, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "10:16:44", + "run_id": "4c65fcfb", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:11:59.573174+00:00", + "benchmark_end_time": "2026-04-29T10:16:44.166472+00:00", + "benchmark_elapsed_minutes": 4.7, + "model_load_seconds": 63.0 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/online/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/online/result.json new file mode 100644 index 00000000..57f5f4c9 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 74.15, + "ttft_ms_p90": 125.13, + "ttft_ms_p99": 1721.49, + "tpot_ms_p50": 30.47, + "tpot_ms_p90": 35.85, + "tpot_ms_p99": 40.25, + "elapsed_seconds_median": 68.7, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 71.26, + "ttft_ms_p90": 85.73, + "ttft_ms_p99": 93.62, + "tpot_ms_p50": 40.73, + "tpot_ms_p90": 42.27, + "tpot_ms_p99": 46.79, + "elapsed_seconds_median": 36.0, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.68, + "ttft_ms_p90": 110.47, + "ttft_ms_p99": 158.24, + "tpot_ms_p50": 67.21, + "tpot_ms_p90": 76.53, + "tpot_ms_p99": 102.14, + "elapsed_seconds_median": 23.4, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 71.87, + "ttft_ms_p90": 100.76, + "ttft_ms_p99": 152.6, + "tpot_ms_p50": 76.16, + "tpot_ms_p90": 89.69, + "tpot_ms_p99": 138.89, + "elapsed_seconds_median": 20.4, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "10:25:34", + "run_id": "4c65fcfb", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:18:09.895393+00:00", + "benchmark_end_time": "2026-04-29T10:25:34.882647+00:00", + "benchmark_elapsed_minutes": 7.4, + "model_load_seconds": 58.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/result.json new file mode 100644 index 00000000..e2272bca --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2044.08, + "throughput_tokens_per_sec_per_chip": 2044.08, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2043.12, + "throughput_tokens_per_sec_per_chip": 2043.12, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2043.77, + "throughput_tokens_per_sec_per_chip": 2043.77, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2045.83, + "throughput_tokens_per_sec_per_chip": 2045.83, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 74.15, + "ttft_ms_p90": 125.13, + "ttft_ms_p99": 1721.49, + "tpot_ms_p50": 30.47, + "tpot_ms_p90": 35.85, + "tpot_ms_p99": 40.25, + "elapsed_seconds_median": 68.7, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 71.26, + "ttft_ms_p90": 85.73, + "ttft_ms_p99": 93.62, + "tpot_ms_p50": 40.73, + "tpot_ms_p90": 42.27, + "tpot_ms_p99": 46.79, + "elapsed_seconds_median": 36.0, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.68, + "ttft_ms_p90": 110.47, + "ttft_ms_p99": 158.24, + "tpot_ms_p50": 67.21, + "tpot_ms_p90": 76.53, + "tpot_ms_p99": 102.14, + "elapsed_seconds_median": 23.4, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 71.87, + "ttft_ms_p90": 100.76, + "ttft_ms_p99": 152.6, + "tpot_ms_p50": 76.16, + "tpot_ms_p90": 89.69, + "tpot_ms_p99": 138.89, + "elapsed_seconds_median": 20.4, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 273.8, + "tokens_out": 16436, + "tokens_in": 0, + "requests_completed": 92, + "ttft_ms_p50": 104.0, + "ttft_ms_p99": 2684.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.4, + "tokens_out": 19101, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 154.0 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 321.5, + "tokens_out": 19280, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.5, + "ttft_ms_p99": 85.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 309.0, + "tokens_out": 18541, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 86.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 307.8, + "tokens_out": 18477, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.5, + "tokens_out": 18930, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 72.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 320.6, + "tokens_out": 19232, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 306.6, + "tokens_out": 18403, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.6, + "ttft_ms_p99": 72.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 308.1, + "tokens_out": 18483, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 72.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 327.8, + "tokens_out": 19673, + "tokens_in": 0, + "requests_completed": 108, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.3, + "tokens_out": 18617, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.5, + "ttft_ms_p99": 86.0 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.6, + "tokens_out": 18753, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 71.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 321.0, + "tokens_out": 19255, + "tokens_in": 0, + "requests_completed": 105, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 72.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.5, + "tokens_out": 18930, + "tokens_in": 0, + "requests_completed": 106, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 71.7 + } + ], + "sustained_throughput_tokens_per_sec": 312.0, + "throttle_ratio": 0.835, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2612.8 + } + }, + "accuracy": { + "subset_score": 0.57, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "10:16:44", + "run_id": "4c65fcfb", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:11:59.573174+00:00", + "benchmark_end_time": "2026-04-29T10:16:44.166472+00:00", + "benchmark_elapsed_minutes": 27.3, + "model_load_seconds": 63.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/offline", + "online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/online", + "sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/sustained/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/sustained/result.json new file mode 100644 index 00000000..c447bccf --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 273.8, + "tokens_out": 16436, + "tokens_in": 0, + "requests_completed": 92, + "ttft_ms_p50": 104.0, + "ttft_ms_p99": 2684.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.4, + "tokens_out": 19101, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 154.0 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 321.5, + "tokens_out": 19280, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.5, + "ttft_ms_p99": 85.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 309.0, + "tokens_out": 18541, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 86.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 307.8, + "tokens_out": 18477, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.5, + "tokens_out": 18930, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 72.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 320.6, + "tokens_out": 19232, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 306.6, + "tokens_out": 18403, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.6, + "ttft_ms_p99": 72.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 308.1, + "tokens_out": 18483, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 72.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 327.8, + "tokens_out": 19673, + "tokens_in": 0, + "requests_completed": 108, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.3, + "tokens_out": 18617, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.5, + "ttft_ms_p99": 86.0 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.6, + "tokens_out": 18753, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 71.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 321.0, + "tokens_out": 19255, + "tokens_in": 0, + "requests_completed": 105, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 72.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.5, + "tokens_out": 18930, + "tokens_in": 0, + "requests_completed": 106, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 71.7 + } + ], + "sustained_throughput_tokens_per_sec": 312.0, + "throttle_ratio": 0.835, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2612.8 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "10:42:09", + "run_id": "4c65fcfb", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:27:00.056331+00:00", + "benchmark_end_time": "2026-04-29T10:42:09.480677+00:00", + "benchmark_elapsed_minutes": 15.2, + "model_load_seconds": 56.1 + } +} \ No newline at end of file diff --git a/results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/env_info.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/env_info.json similarity index 63% rename from results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/env_info.json rename to results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/env_info.json index 5690fb1d..f726f580 100644 --- a/results/community/nvidia_a100_sxm4_40gbx1_suite_D_nvidia_sglang_c43a8309_99c43b97/env_info.json +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/env_info.json @@ -1,27 +1,27 @@ { - "collected_at": "2026-05-07T06:55:48.459765+00:00", + "collected_at": "2026-04-29T10:07:24.359391+00:00", "accelerators": [ { "index": 0, - "name": "NVIDIA A100-SXM4-40GB", + "name": "NVIDIA RTX A6000", "vendor": "NVIDIA", - "memory_gb": 40, + "memory_gb": 48.0, "driver_version": "565.57.01", "firmware_version": null, - "compute_capability": "8.0", + "compute_capability": "8.6", "supports_bf16": true } ], - "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tSYS\tSYS\tSYS\t32-63,96-127\t1\t\tN/A\nNIC0\tSYS\t X \tPIX\tNODE\t\t\t\t\nNIC1\tSYS\tPIX\t X \tNODE\t\t\t\t\nNIC2\tSYS\tNODE\tNODE\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", "intra_node_interconnect": null, "cpu": { - "model": "AMD EPYC 7532 32-Core Processor", - "physical_cores": 64, - "logical_cores": 128, + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, "numa_nodes": 2 }, - "system_memory_gb": 1007.7, - "pcie_generation": "PCIe Gen 4", + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", "cpu_accelerator_bandwidth_gbs": null, "network_interfaces": [ { @@ -38,6 +38,11 @@ "name": "mlx5_2", "type": "InfiniBand/RoCE", "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null } ], "os": "Ubuntu 22.04.4 LTS", diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/result.json new file mode 100644 index 00000000..1cb7fc0a --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/result.json @@ -0,0 +1,963 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original", + "_note": "suite model_id. Each precision level uses its own quantized checkpoint." + }, + "task": { + "scenarios_run": [ + "accuracy", + "offline", + "online", + "sustained" + ], + "precision_levels_run": [ + "BF16", + "FP8", + "W8A8", + "W8A16", + "W4A16" + ], + "precision_levels_skipped": [ + "FP16" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "quantization": { + "results_by_precision": [ + { + "precision": "BF16", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "best_throughput_tokens_per_sec": 2045.83, + "accuracy_score": 0.57, + "accuracy_baseline_delta": 0.01, + "accuracy_valid": true, + "quality_efficiency": 1166.1, + "speedup_vs_bf16": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2044.08, + "throughput_tokens_per_sec_per_chip": 2044.08, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2043.12, + "throughput_tokens_per_sec_per_chip": 2043.12, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2043.77, + "throughput_tokens_per_sec_per_chip": 2043.77, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2045.83, + "throughput_tokens_per_sec_per_chip": 2045.83, + "elapsed_seconds_median": 17.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "bf16", + "effective_dtype": "bfloat16", + "quantization_method": null + }, + { + "precision": "W8A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "best_throughput_tokens_per_sec": 2231.0, + "accuracy_score": 0.58, + "accuracy_baseline_delta": -0.01, + "accuracy_valid": true, + "quality_efficiency": 1294.0, + "speedup_vs_bf16": 1.091, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2231.0, + "throughput_tokens_per_sec_per_chip": 2231.0, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2228.35, + "throughput_tokens_per_sec_per_chip": 2228.35, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2225.95, + "throughput_tokens_per_sec_per_chip": 2225.95, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2221.05, + "throughput_tokens_per_sec_per_chip": 2221.05, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W4A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "best_throughput_tokens_per_sec": 1120.82, + "accuracy_score": 0.57, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 638.9, + "speedup_vs_bf16": 0.548, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1116.9, + "throughput_tokens_per_sec_per_chip": 1116.9, + "elapsed_seconds_median": 31.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1115.69, + "throughput_tokens_per_sec_per_chip": 1115.69, + "elapsed_seconds_median": 31.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1120.82, + "throughput_tokens_per_sec_per_chip": 1120.82, + "elapsed_seconds_median": 31.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1117.8, + "throughput_tokens_per_sec_per_chip": 1117.8, + "elapsed_seconds_median": 31.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w4a16", + "effective_dtype": "auto", + "quantization_method": "gptq" + } + ] + }, + "derived": {}, + "quantization_online": { + "results_by_precision": [ + { + "precision": "BF16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 74.15, + "ttft_ms_p90": 125.13, + "ttft_ms_p99": 1721.49, + "tpot_ms_p50": 30.47, + "tpot_ms_p90": 35.85, + "tpot_ms_p99": 40.25, + "elapsed_seconds_median": 68.7, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 71.26, + "ttft_ms_p90": 85.73, + "ttft_ms_p99": 93.62, + "tpot_ms_p50": 40.73, + "tpot_ms_p90": 42.27, + "tpot_ms_p99": 46.79, + "elapsed_seconds_median": 36.0, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.68, + "ttft_ms_p90": 110.47, + "ttft_ms_p99": 158.24, + "tpot_ms_p50": 67.21, + "tpot_ms_p90": 76.53, + "tpot_ms_p99": 102.14, + "elapsed_seconds_median": 23.4, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 71.87, + "ttft_ms_p90": 100.76, + "ttft_ms_p99": 152.6, + "tpot_ms_p50": 76.16, + "tpot_ms_p90": 89.69, + "tpot_ms_p99": 138.89, + "elapsed_seconds_median": 20.4, + "sla_met": true + } + ] + }, + { + "precision": "W8A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 49.64, + "ttft_ms_p90": 136.86, + "ttft_ms_p99": 2169.34, + "tpot_ms_p50": 17.11, + "tpot_ms_p90": 23.7, + "tpot_ms_p99": 30.35, + "elapsed_seconds_median": 66.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 49.95, + "ttft_ms_p90": 60.03, + "ttft_ms_p99": 69.32, + "tpot_ms_p50": 23.82, + "tpot_ms_p90": 26.2, + "tpot_ms_p99": 32.79, + "elapsed_seconds_median": 33.9, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 63.29, + "ttft_ms_p90": 112.98, + "ttft_ms_p99": 155.72, + "tpot_ms_p50": 64.65, + "tpot_ms_p90": 72.61, + "tpot_ms_p99": 94.42, + "elapsed_seconds_median": 23.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 58.06, + "ttft_ms_p90": 115.18, + "ttft_ms_p99": 187.15, + "tpot_ms_p50": 78.68, + "tpot_ms_p90": 89.12, + "tpot_ms_p99": 118.6, + "elapsed_seconds_median": 21.5, + "sla_met": true + } + ] + }, + { + "precision": "W4A16", + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 92.08, + "ttft_ms_p90": 210.3, + "ttft_ms_p99": 1668.21, + "tpot_ms_p50": 53.34, + "tpot_ms_p90": 82.0, + "tpot_ms_p99": 90.25, + "elapsed_seconds_median": 69.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 91.46, + "ttft_ms_p90": 135.97, + "ttft_ms_p99": 154.85, + "tpot_ms_p50": 71.91, + "tpot_ms_p90": 77.39, + "tpot_ms_p99": 87.33, + "elapsed_seconds_median": 41.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 83.34, + "ttft_ms_p90": 140.37, + "ttft_ms_p99": 190.61, + "tpot_ms_p50": 85.26, + "tpot_ms_p90": 91.49, + "tpot_ms_p99": 109.69, + "elapsed_seconds_median": 28.4, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 75.89, + "ttft_ms_p90": 132.81, + "ttft_ms_p99": 204.46, + "tpot_ms_p50": 89.48, + "tpot_ms_p90": 98.79, + "tpot_ms_p99": 119.35, + "elapsed_seconds_median": 24.4, + "sla_met": true + } + ] + } + ] + }, + "quantization_sustained": { + "results_by_precision": [ + { + "precision": "BF16", + "sustained_throughput_tokens_per_sec": 312.0, + "throttle_ratio": 0.835, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2612.8, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 273.8, + "tokens_out": 16436, + "tokens_in": 0, + "requests_completed": 92, + "ttft_ms_p50": 104.0, + "ttft_ms_p99": 2684.5 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 318.4, + "tokens_out": 19101, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 154.0 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 321.5, + "tokens_out": 19280, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.5, + "ttft_ms_p99": 85.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 309.0, + "tokens_out": 18541, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 86.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 307.8, + "tokens_out": 18477, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.5, + "tokens_out": 18930, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 72.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 320.6, + "tokens_out": 19232, + "tokens_in": 0, + "requests_completed": 104, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 306.6, + "tokens_out": 18403, + "tokens_in": 0, + "requests_completed": 100, + "ttft_ms_p50": 70.6, + "ttft_ms_p99": 72.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 308.1, + "tokens_out": 18483, + "tokens_in": 0, + "requests_completed": 101, + "ttft_ms_p50": 70.2, + "ttft_ms_p99": 72.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 327.8, + "tokens_out": 19673, + "tokens_in": 0, + "requests_completed": 108, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 85.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 310.3, + "tokens_out": 18617, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.5, + "ttft_ms_p99": 86.0 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 312.6, + "tokens_out": 18753, + "tokens_in": 0, + "requests_completed": 102, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 71.8 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 321.0, + "tokens_out": 19255, + "tokens_in": 0, + "requests_completed": 105, + "ttft_ms_p50": 70.3, + "ttft_ms_p99": 72.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 315.5, + "tokens_out": 18930, + "tokens_in": 0, + "requests_completed": 106, + "ttft_ms_p50": 70.4, + "ttft_ms_p99": 71.7 + } + ] + }, + { + "precision": "W8A16", + "sustained_throughput_tokens_per_sec": 526.2, + "throttle_ratio": 0.855, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2654.1, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 461.1, + "tokens_out": 27681, + "tokens_in": 0, + "requests_completed": 153, + "ttft_ms_p50": 69.6, + "ttft_ms_p99": 2715.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 528.2, + "tokens_out": 31687, + "tokens_in": 0, + "requests_completed": 171, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 60.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 533.2, + "tokens_out": 31998, + "tokens_in": 0, + "requests_completed": 173, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 57.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 534.2, + "tokens_out": 32045, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.7, + "ttft_ms_p99": 60.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 532.3, + "tokens_out": 31952, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.6, + "ttft_ms_p99": 60.9 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 524.2, + "tokens_out": 31451, + "tokens_in": 0, + "requests_completed": 168, + "ttft_ms_p50": 43.5, + "ttft_ms_p99": 62.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 531.8, + "tokens_out": 31893, + "tokens_in": 0, + "requests_completed": 176, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 534.3, + "tokens_out": 32076, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 59.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 524.9, + "tokens_out": 31483, + "tokens_in": 0, + "requests_completed": 173, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 52.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 528.5, + "tokens_out": 31707, + "tokens_in": 0, + "requests_completed": 169, + "ttft_ms_p50": 43.4, + "ttft_ms_p99": 49.5 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 539.0, + "tokens_out": 32351, + "tokens_in": 0, + "requests_completed": 177, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 60.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 529.6, + "tokens_out": 31772, + "tokens_in": 0, + "requests_completed": 172, + "ttft_ms_p50": 43.4, + "ttft_ms_p99": 53.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 531.3, + "tokens_out": 31875, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 533.6, + "tokens_out": 32030, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.0 + } + ] + }, + { + "precision": "W4A16", + "sustained_throughput_tokens_per_sec": 653.7, + "throttle_ratio": 0.85, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2556.3, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 564.4, + "tokens_out": 33886, + "tokens_in": 0, + "requests_completed": 192, + "ttft_ms_p50": 75.0, + "ttft_ms_p99": 2609.7 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.3, + "tokens_out": 39619, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 49.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.9, + "tokens_out": 39693, + "tokens_in": 0, + "requests_completed": 227, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 53.5 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.9, + "tokens_out": 39668, + "tokens_in": 0, + "requests_completed": 221, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 52.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.8, + "tokens_out": 39353, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 54.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 659.7, + "tokens_out": 39582, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 51.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.5, + "tokens_out": 39746, + "tokens_in": 0, + "requests_completed": 223, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 51.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.2, + "tokens_out": 39737, + "tokens_in": 0, + "requests_completed": 225, + "ttft_ms_p50": 36.7, + "ttft_ms_p99": 52.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.2, + "tokens_out": 39589, + "tokens_in": 0, + "requests_completed": 219, + "ttft_ms_p50": 36.7, + "ttft_ms_p99": 54.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.2, + "tokens_out": 39332, + "tokens_in": 0, + "requests_completed": 222, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 50.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.2, + "tokens_out": 39602, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 36.5, + "ttft_ms_p99": 52.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.3, + "tokens_out": 39733, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 48.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.9, + "tokens_out": 39837, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 52.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.6, + "tokens_out": 39752, + "tokens_in": 0, + "requests_completed": 223, + "ttft_ms_p50": 36.5, + "ttft_ms_p99": 53.4 + } + ] + } + ] + } + }, + "accuracy": null, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "10:16:44", + "run_id": "4c65fcfb", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:11:59.573174+00:00", + "benchmark_end_time": "2026-04-29T10:16:44.166472+00:00", + "benchmark_elapsed_minutes": 85.7, + "model_load_seconds": 63.0, + "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", + "scenario_dirs": { + "bf16/offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/offline", + "bf16/online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/online", + "bf16/sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/bf16/sustained", + "fp8/offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/fp8/offline", + "fp8/online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/fp8/online", + "fp8/sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/fp8/sustained", + "w8a8/offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a8/offline", + "w8a8/online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a8/online", + "w8a8/sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a8/sustained", + "w8a16/offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/offline", + "w8a16/online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/online", + "w8a16/sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/sustained", + "w4a16/offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/offline", + "w4a16/online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/online", + "w4a16/sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/sustained" + }, + "precision_dirs": { + "BF16": "bf16", + "FP8": "fp8", + "W8A8": "w8a8", + "W8A16": "w8a16", + "W4A16": "w4a16" + }, + "precision_model_map": { + "BF16": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dtype_override": "bfloat16" + }, + "FP8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." + }, + "W8A8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." + }, + "W8A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights, FP16 activations. Weight-only quantization — reduces memory bandwidth, not compute dtype." + }, + "W4A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "engine_kwargs": { + "quantization": "gptq" + }, + "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization — larger memory saving than W8A16." + } + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/accuracy/accuracy.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/accuracy/accuracy.json new file mode 100644 index 00000000..e2c86fd4 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.57, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/offline/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/offline/result.json new file mode 100644 index 00000000..bb8d938c --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1116.9, + "throughput_tokens_per_sec_per_chip": 1116.9, + "elapsed_seconds_median": 31.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1115.69, + "throughput_tokens_per_sec_per_chip": 1115.69, + "elapsed_seconds_median": 31.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1120.82, + "throughput_tokens_per_sec_per_chip": 1120.82, + "elapsed_seconds_median": 31.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1117.8, + "throughput_tokens_per_sec_per_chip": 1117.8, + "elapsed_seconds_median": 31.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:39:27", + "run_id": "5daf2609", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_5daf2609", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T11:31:05.380698+00:00", + "benchmark_end_time": "2026-04-29T11:39:27.782384+00:00", + "benchmark_elapsed_minutes": 8.4, + "model_load_seconds": 47.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/online/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/online/result.json new file mode 100644 index 00000000..59f17301 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 92.08, + "ttft_ms_p90": 210.3, + "ttft_ms_p99": 1668.21, + "tpot_ms_p50": 53.34, + "tpot_ms_p90": 82.0, + "tpot_ms_p99": 90.25, + "elapsed_seconds_median": 69.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 91.46, + "ttft_ms_p90": 135.97, + "ttft_ms_p99": 154.85, + "tpot_ms_p50": 71.91, + "tpot_ms_p90": 77.39, + "tpot_ms_p99": 87.33, + "elapsed_seconds_median": 41.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 83.34, + "ttft_ms_p90": 140.37, + "ttft_ms_p99": 190.61, + "tpot_ms_p50": 85.26, + "tpot_ms_p90": 91.49, + "tpot_ms_p99": 109.69, + "elapsed_seconds_median": 28.4, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 75.89, + "ttft_ms_p90": 132.81, + "ttft_ms_p99": 204.46, + "tpot_ms_p50": 89.48, + "tpot_ms_p90": 98.79, + "tpot_ms_p99": 119.35, + "elapsed_seconds_median": 24.4, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:48:47", + "run_id": "5daf2609", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_5daf2609", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T11:40:35.967057+00:00", + "benchmark_end_time": "2026-04-29T11:48:47.468382+00:00", + "benchmark_elapsed_minutes": 8.2, + "model_load_seconds": 42.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/result.json new file mode 100644 index 00000000..4948dcc4 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1116.9, + "throughput_tokens_per_sec_per_chip": 1116.9, + "elapsed_seconds_median": 31.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1115.69, + "throughput_tokens_per_sec_per_chip": 1115.69, + "elapsed_seconds_median": 31.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1120.82, + "throughput_tokens_per_sec_per_chip": 1120.82, + "elapsed_seconds_median": 31.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1117.8, + "throughput_tokens_per_sec_per_chip": 1117.8, + "elapsed_seconds_median": 31.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 92.08, + "ttft_ms_p90": 210.3, + "ttft_ms_p99": 1668.21, + "tpot_ms_p50": 53.34, + "tpot_ms_p90": 82.0, + "tpot_ms_p99": 90.25, + "elapsed_seconds_median": 69.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 91.46, + "ttft_ms_p90": 135.97, + "ttft_ms_p99": 154.85, + "tpot_ms_p50": 71.91, + "tpot_ms_p90": 77.39, + "tpot_ms_p99": 87.33, + "elapsed_seconds_median": 41.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 83.34, + "ttft_ms_p90": 140.37, + "ttft_ms_p99": 190.61, + "tpot_ms_p50": 85.26, + "tpot_ms_p90": 91.49, + "tpot_ms_p99": 109.69, + "elapsed_seconds_median": 28.4, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 75.89, + "ttft_ms_p90": 132.81, + "ttft_ms_p99": 204.46, + "tpot_ms_p50": 89.48, + "tpot_ms_p90": 98.79, + "tpot_ms_p99": 119.35, + "elapsed_seconds_median": 24.4, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 564.4, + "tokens_out": 33886, + "tokens_in": 0, + "requests_completed": 192, + "ttft_ms_p50": 75.0, + "ttft_ms_p99": 2609.7 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.3, + "tokens_out": 39619, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 49.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.9, + "tokens_out": 39693, + "tokens_in": 0, + "requests_completed": 227, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 53.5 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.9, + "tokens_out": 39668, + "tokens_in": 0, + "requests_completed": 221, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 52.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.8, + "tokens_out": 39353, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 54.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 659.7, + "tokens_out": 39582, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 51.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.5, + "tokens_out": 39746, + "tokens_in": 0, + "requests_completed": 223, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 51.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.2, + "tokens_out": 39737, + "tokens_in": 0, + "requests_completed": 225, + "ttft_ms_p50": 36.7, + "ttft_ms_p99": 52.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.2, + "tokens_out": 39589, + "tokens_in": 0, + "requests_completed": 219, + "ttft_ms_p50": 36.7, + "ttft_ms_p99": 54.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.2, + "tokens_out": 39332, + "tokens_in": 0, + "requests_completed": 222, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 50.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.2, + "tokens_out": 39602, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 36.5, + "ttft_ms_p99": 52.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.3, + "tokens_out": 39733, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 48.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.9, + "tokens_out": 39837, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 52.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.6, + "tokens_out": 39752, + "tokens_in": 0, + "requests_completed": 223, + "ttft_ms_p50": 36.5, + "ttft_ms_p99": 53.4 + } + ], + "sustained_throughput_tokens_per_sec": 653.7, + "throttle_ratio": 0.85, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2556.3 + } + }, + "accuracy": { + "subset_score": 0.57, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "W4A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:39:27", + "run_id": "5daf2609", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_5daf2609", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T11:31:05.380698+00:00", + "benchmark_end_time": "2026-04-29T11:39:27.782384+00:00", + "benchmark_elapsed_minutes": 31.7, + "model_load_seconds": 47.3, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/offline", + "online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/online", + "sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/sustained/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/sustained/result.json new file mode 100644 index 00000000..fb371e29 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w4a16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "auto", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 564.4, + "tokens_out": 33886, + "tokens_in": 0, + "requests_completed": 192, + "ttft_ms_p50": 75.0, + "ttft_ms_p99": 2609.7 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.3, + "tokens_out": 39619, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 49.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 661.9, + "tokens_out": 39693, + "tokens_in": 0, + "requests_completed": 227, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 53.5 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.9, + "tokens_out": 39668, + "tokens_in": 0, + "requests_completed": 221, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 52.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.8, + "tokens_out": 39353, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.1, + "ttft_ms_p99": 54.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 659.7, + "tokens_out": 39582, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 51.8 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.5, + "tokens_out": 39746, + "tokens_in": 0, + "requests_completed": 223, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 51.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.2, + "tokens_out": 39737, + "tokens_in": 0, + "requests_completed": 225, + "ttft_ms_p50": 36.7, + "ttft_ms_p99": 52.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.2, + "tokens_out": 39589, + "tokens_in": 0, + "requests_completed": 219, + "ttft_ms_p50": 36.7, + "ttft_ms_p99": 54.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.2, + "tokens_out": 39332, + "tokens_in": 0, + "requests_completed": 222, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 50.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 660.2, + "tokens_out": 39602, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 36.5, + "ttft_ms_p99": 52.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.3, + "tokens_out": 39733, + "tokens_in": 0, + "requests_completed": 228, + "ttft_ms_p50": 36.6, + "ttft_ms_p99": 48.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 663.9, + "tokens_out": 39837, + "tokens_in": 0, + "requests_completed": 220, + "ttft_ms_p50": 37.0, + "ttft_ms_p99": 52.1 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 662.6, + "tokens_out": 39752, + "tokens_in": 0, + "requests_completed": 223, + "ttft_ms_p50": 36.5, + "ttft_ms_p99": 53.4 + } + ], + "sustained_throughput_tokens_per_sec": 653.7, + "throttle_ratio": 0.85, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2556.3 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "12:05:00", + "run_id": "5daf2609", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_5daf2609", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T11:49:56.190663+00:00", + "benchmark_end_time": "2026-04-29T12:05:00.544279+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 43.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/accuracy/accuracy.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/accuracy/accuracy.json new file mode 100644 index 00000000..19c9f93b --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.58, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/offline/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/offline/result.json new file mode 100644 index 00000000..7df3d894 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2231.0, + "throughput_tokens_per_sec_per_chip": 2231.0, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2228.35, + "throughput_tokens_per_sec_per_chip": 2228.35, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2225.95, + "throughput_tokens_per_sec_per_chip": 2225.95, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2221.05, + "throughput_tokens_per_sec_per_chip": 2221.05, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:01:14", + "run_id": "0ada63b0", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_0ada63b0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:56:51.187586+00:00", + "benchmark_end_time": "2026-04-29T11:01:14.359571+00:00", + "benchmark_elapsed_minutes": 4.4, + "model_load_seconds": 45.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/online/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/online/result.json new file mode 100644 index 00000000..4d99f5f0 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/online/result.json @@ -0,0 +1,180 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 49.64, + "ttft_ms_p90": 136.86, + "ttft_ms_p99": 2169.34, + "tpot_ms_p50": 17.11, + "tpot_ms_p90": 23.7, + "tpot_ms_p99": 30.35, + "elapsed_seconds_median": 66.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 49.95, + "ttft_ms_p90": 60.03, + "ttft_ms_p99": 69.32, + "tpot_ms_p50": 23.82, + "tpot_ms_p90": 26.2, + "tpot_ms_p99": 32.79, + "elapsed_seconds_median": 33.9, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 63.29, + "ttft_ms_p90": 112.98, + "ttft_ms_p99": 155.72, + "tpot_ms_p50": 64.65, + "tpot_ms_p90": 72.61, + "tpot_ms_p99": 94.42, + "elapsed_seconds_median": 23.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 58.06, + "ttft_ms_p90": 115.18, + "ttft_ms_p99": 187.15, + "tpot_ms_p50": 78.68, + "tpot_ms_p90": 89.12, + "tpot_ms_p99": 118.6, + "elapsed_seconds_median": 21.5, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:09:40", + "run_id": "0ada63b0", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_0ada63b0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T11:02:28.295508+00:00", + "benchmark_end_time": "2026-04-29T11:09:40.916415+00:00", + "benchmark_elapsed_minutes": 7.2, + "model_load_seconds": 46.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/result.json new file mode 100644 index 00000000..2f8dfa86 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 2231.0, + "throughput_tokens_per_sec_per_chip": 2231.0, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 2228.35, + "throughput_tokens_per_sec_per_chip": 2228.35, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 2225.95, + "throughput_tokens_per_sec_per_chip": 2225.95, + "elapsed_seconds_median": 15.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2221.05, + "throughput_tokens_per_sec_per_chip": 2221.05, + "elapsed_seconds_median": 16.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 50, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 49.64, + "ttft_ms_p90": 136.86, + "ttft_ms_p99": 2169.34, + "tpot_ms_p50": 17.11, + "tpot_ms_p90": 23.7, + "tpot_ms_p99": 30.35, + "elapsed_seconds_median": 66.4, + "sla_met": false + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 49.95, + "ttft_ms_p90": 60.03, + "ttft_ms_p99": 69.32, + "tpot_ms_p50": 23.82, + "tpot_ms_p90": 26.2, + "tpot_ms_p99": 32.79, + "elapsed_seconds_median": 33.9, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 63.29, + "ttft_ms_p90": 112.98, + "ttft_ms_p99": 155.72, + "tpot_ms_p50": 64.65, + "tpot_ms_p90": 72.61, + "tpot_ms_p99": 94.42, + "elapsed_seconds_median": 23.3, + "sla_met": true + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 58.06, + "ttft_ms_p90": 115.18, + "ttft_ms_p99": 187.15, + "tpot_ms_p50": 78.68, + "tpot_ms_p90": 89.12, + "tpot_ms_p99": 118.6, + "elapsed_seconds_median": 21.5, + "sla_met": true + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 461.1, + "tokens_out": 27681, + "tokens_in": 0, + "requests_completed": 153, + "ttft_ms_p50": 69.6, + "ttft_ms_p99": 2715.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 528.2, + "tokens_out": 31687, + "tokens_in": 0, + "requests_completed": 171, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 60.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 533.2, + "tokens_out": 31998, + "tokens_in": 0, + "requests_completed": 173, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 57.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 534.2, + "tokens_out": 32045, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.7, + "ttft_ms_p99": 60.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 532.3, + "tokens_out": 31952, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.6, + "ttft_ms_p99": 60.9 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 524.2, + "tokens_out": 31451, + "tokens_in": 0, + "requests_completed": 168, + "ttft_ms_p50": 43.5, + "ttft_ms_p99": 62.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 531.8, + "tokens_out": 31893, + "tokens_in": 0, + "requests_completed": 176, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 534.3, + "tokens_out": 32076, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 59.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 524.9, + "tokens_out": 31483, + "tokens_in": 0, + "requests_completed": 173, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 52.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 528.5, + "tokens_out": 31707, + "tokens_in": 0, + "requests_completed": 169, + "ttft_ms_p50": 43.4, + "ttft_ms_p99": 49.5 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 539.0, + "tokens_out": 32351, + "tokens_in": 0, + "requests_completed": 177, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 60.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 529.6, + "tokens_out": 31772, + "tokens_in": 0, + "requests_completed": 172, + "ttft_ms_p50": 43.4, + "ttft_ms_p99": 53.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 531.3, + "tokens_out": 31875, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 533.6, + "tokens_out": 32030, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.0 + } + ], + "sustained_throughput_tokens_per_sec": 526.2, + "throttle_ratio": 0.855, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2654.1 + } + }, + "accuracy": { + "subset_score": 0.58, + "baseline_delta": -0.01, + "valid": true, + "framework": "SGLang", + "precision": "W8A16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:01:14", + "run_id": "0ada63b0", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_0ada63b0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T10:56:51.187586+00:00", + "benchmark_end_time": "2026-04-29T11:01:14.359571+00:00", + "benchmark_elapsed_minutes": 26.7, + "model_load_seconds": 45.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/offline", + "online": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/online", + "sustained": "results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/sustained/result.json b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/sustained/result.json new file mode 100644 index 00000000..b3c62f33 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_4c65fcfb/w8a16/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-29T10:07:24.359391+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "auto", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 461.1, + "tokens_out": 27681, + "tokens_in": 0, + "requests_completed": 153, + "ttft_ms_p50": 69.6, + "ttft_ms_p99": 2715.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 528.2, + "tokens_out": 31687, + "tokens_in": 0, + "requests_completed": 171, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 60.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 533.2, + "tokens_out": 31998, + "tokens_in": 0, + "requests_completed": 173, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 57.7 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 534.2, + "tokens_out": 32045, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.7, + "ttft_ms_p99": 60.7 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 532.3, + "tokens_out": 31952, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.6, + "ttft_ms_p99": 60.9 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 524.2, + "tokens_out": 31451, + "tokens_in": 0, + "requests_completed": 168, + "ttft_ms_p50": 43.5, + "ttft_ms_p99": 62.5 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 531.8, + "tokens_out": 31893, + "tokens_in": 0, + "requests_completed": 176, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 534.3, + "tokens_out": 32076, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 59.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 524.9, + "tokens_out": 31483, + "tokens_in": 0, + "requests_completed": 173, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 52.6 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 528.5, + "tokens_out": 31707, + "tokens_in": 0, + "requests_completed": 169, + "ttft_ms_p50": 43.4, + "ttft_ms_p99": 49.5 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 539.0, + "tokens_out": 32351, + "tokens_in": 0, + "requests_completed": 177, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 60.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 529.6, + "tokens_out": 31772, + "tokens_in": 0, + "requests_completed": 172, + "ttft_ms_p50": 43.4, + "ttft_ms_p99": 53.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 531.3, + "tokens_out": 31875, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 533.6, + "tokens_out": 32030, + "tokens_in": 0, + "requests_completed": 174, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 61.0 + } + ], + "sustained_throughput_tokens_per_sec": 526.2, + "throttle_ratio": 0.855, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -2654.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-29", + "time": "11:26:04", + "run_id": "0ada63b0", + "run_name": "nvidia_rtx_a6000x1_suite_C_nvidia_sglang_c43a8309_0ada63b0", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-29T11:10:58.792403+00:00", + "benchmark_end_time": "2026-04-29T11:26:04.147313+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 50.4 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/accuracy/accuracy.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/accuracy/accuracy.json new file mode 100644 index 00000000..fae398ca --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.57, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/env_info.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/env_info.json new file mode 100644 index 00000000..3e4ec07e --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/interactive/result.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/interactive/result.json new file mode 100644 index 00000000..491039ca --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 5946.25, + "ttft_ms_p90": 6219.82, + "ttft_ms_p99": 6310.43, + "tpot_ms_p50": 28.33, + "tpot_ms_p90": 28.52, + "tpot_ms_p99": 28.56, + "peak_memory_gb": null, + "elapsed_seconds_median": 1230.5 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:19:14", + "run_id": "4974e840", + "run_name": "nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T08:38:13.612044+00:00", + "benchmark_end_time": "2026-04-30T09:19:14.591030+00:00", + "benchmark_elapsed_minutes": 41.0, + "model_load_seconds": 56.8 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/offline/result.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/offline/result.json new file mode 100644 index 00000000..73cebf97 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/offline/result.json @@ -0,0 +1,154 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 31.41, + "throughput_tokens_per_sec_per_chip": 31.41, + "elapsed_seconds_median": 409.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 31.38, + "throughput_tokens_per_sec_per_chip": 31.38, + "elapsed_seconds_median": 409.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:36:51", + "run_id": "4974e840", + "run_name": "nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T07:55:51.379978+00:00", + "benchmark_end_time": "2026-04-30T08:36:51.414672+00:00", + "benchmark_elapsed_minutes": 41.0, + "model_load_seconds": 54.5 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/online/result.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/online/result.json new file mode 100644 index 00000000..a60d7b8c --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/online/result.json @@ -0,0 +1,168 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 293244.79, + "ttft_ms_p90": 524501.19, + "ttft_ms_p99": 574243.71, + "tpot_ms_p50": 138.81, + "tpot_ms_p90": 212.31, + "tpot_ms_p99": 222.29, + "elapsed_seconds_median": 785.5, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 332319.49, + "ttft_ms_p90": 588621.66, + "ttft_ms_p99": 661976.7, + "tpot_ms_p50": 138.95, + "tpot_ms_p90": 212.28, + "tpot_ms_p99": 222.24, + "elapsed_seconds_median": 786.2, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 366180.69, + "ttft_ms_p90": 646640.94, + "ttft_ms_p99": 724071.12, + "tpot_ms_p50": 138.98, + "tpot_ms_p90": 212.36, + "tpot_ms_p99": 222.33, + "elapsed_seconds_median": 784.0, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "11:11:24", + "run_id": "4974e840", + "run_name": "nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:52:53.121061+00:00", + "benchmark_end_time": "2026-04-30T11:11:24.421043+00:00", + "benchmark_elapsed_minutes": 78.5, + "model_load_seconds": 52.9 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/result.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/result.json new file mode 100644 index 00000000..d0761ba6 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/result.json @@ -0,0 +1,551 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "interactive", + "sustained", + "online", + "speculative" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 2, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 31.41, + "throughput_tokens_per_sec_per_chip": 31.41, + "elapsed_seconds_median": 409.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 31.38, + "throughput_tokens_per_sec_per_chip": 31.38, + "elapsed_seconds_median": 409.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "interactive": { + "ttft_ms_p50": 5946.25, + "ttft_ms_p90": 6219.82, + "ttft_ms_p99": 6310.43, + "tpot_ms_p50": 28.33, + "tpot_ms_p90": 28.52, + "tpot_ms_p99": 28.56, + "peak_memory_gb": null, + "elapsed_seconds_median": 1230.5 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 26412.4, + "ttft_ms_p99": 43394.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29065.7, + "ttft_ms_p99": 61856.6 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 30309.3, + "ttft_ms_p99": 59762.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29560.5, + "ttft_ms_p99": 60419.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28043.7, + "ttft_ms_p99": 59182.7 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29340.5, + "ttft_ms_p99": 57627.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28189.4, + "ttft_ms_p99": 59639.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29448.8, + "ttft_ms_p99": 57746.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29401.0, + "ttft_ms_p99": 59412.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 30195.0, + "ttft_ms_p99": 59654.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 52.5, + "tokens_out": 3150, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 28708.4, + "ttft_ms_p99": 61710.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29261.0, + "ttft_ms_p99": 57521.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28671.7, + "ttft_ms_p99": 59379.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29328.2, + "ttft_ms_p99": 58135.7 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29288.6, + "ttft_ms_p99": 59155.2 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 30034.4, + "ttft_ms_p99": 59361.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29549.3, + "ttft_ms_p99": 60892.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 27805.1, + "ttft_ms_p99": 59434.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29057.9, + "ttft_ms_p99": 57483.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28687.2, + "ttft_ms_p99": 58861.7 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29461.9, + "ttft_ms_p99": 58274.5 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 52.5, + "tokens_out": 3150, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 29594.7, + "ttft_ms_p99": 60361.7 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29759.0, + "ttft_ms_p99": 60532.0 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28981.0, + "ttft_ms_p99": 59814.7 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28908.4, + "ttft_ms_p99": 57867.1 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29645.0, + "ttft_ms_p99": 58891.7 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28959.1, + "ttft_ms_p99": 58359.7 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29113.1, + "ttft_ms_p99": 59004.5 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29754.0, + "ttft_ms_p99": 59176.7 + } + ], + "sustained_throughput_tokens_per_sec": 28.1, + "throttle_ratio": 0.499, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": -2679.9 + }, + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 293244.79, + "ttft_ms_p90": 524501.19, + "ttft_ms_p99": 574243.71, + "tpot_ms_p50": 138.81, + "tpot_ms_p90": 212.31, + "tpot_ms_p99": 222.29, + "elapsed_seconds_median": 785.5, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 332319.49, + "ttft_ms_p90": 588621.66, + "ttft_ms_p99": 661976.7, + "tpot_ms_p50": 138.95, + "tpot_ms_p90": 212.28, + "tpot_ms_p99": 222.24, + "elapsed_seconds_median": 786.2, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 366180.69, + "ttft_ms_p90": 646640.94, + "ttft_ms_p99": 724071.12, + "tpot_ms_p50": 138.98, + "tpot_ms_p90": 212.36, + "tpot_ms_p99": 222.33, + "elapsed_seconds_median": 784.0, + "sla_met": false + } + ] + }, + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 24.63, + "throughput_tokens_per_sec_per_chip": 24.63, + "elapsed_seconds_median": 521.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 24.62, + "throughput_tokens_per_sec_per_chip": 24.62, + "elapsed_seconds_median": 521.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": 0.57, + "baseline_delta": 0.01, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "08:36:51", + "run_id": "4974e840", + "run_name": "nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T07:55:51.379978+00:00", + "benchmark_end_time": "2026-04-30T08:36:51.414672+00:00", + "benchmark_elapsed_minutes": 243.7, + "model_load_seconds": 54.5, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'interactive', 'sustained', 'online', 'speculative'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/offline", + "interactive": "results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/interactive", + "sustained": "results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/sustained", + "online": "results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/online", + "speculative": "results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/speculative" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/speculative/result.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/speculative/result.json new file mode 100644 index 00000000..18217c34 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/speculative/result.json @@ -0,0 +1,154 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "speculative", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "speculative": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 24.63, + "throughput_tokens_per_sec_per_chip": 24.63, + "elapsed_seconds_median": 521.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 24.62, + "throughput_tokens_per_sec_per_chip": 24.62, + "elapsed_seconds_median": 521.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "12:07:30", + "run_id": "4974e840", + "run_name": "nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T11:15:15.959930+00:00", + "benchmark_end_time": "2026-04-30T12:07:30.303239+00:00", + "benchmark_elapsed_minutes": 52.2, + "model_load_seconds": 167.3 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/sustained/result.json b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/sustained/result.json new file mode 100644 index 00000000..3afc6054 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840/sustained/result.json @@ -0,0 +1,428 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-04-30T07:51:16.080658+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPIX\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPIX\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 26412.4, + "ttft_ms_p99": 43394.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29065.7, + "ttft_ms_p99": 61856.6 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 30309.3, + "ttft_ms_p99": 59762.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29560.5, + "ttft_ms_p99": 60419.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28043.7, + "ttft_ms_p99": 59182.7 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29340.5, + "ttft_ms_p99": 57627.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28189.4, + "ttft_ms_p99": 59639.5 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29448.8, + "ttft_ms_p99": 57746.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29401.0, + "ttft_ms_p99": 59412.3 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 30195.0, + "ttft_ms_p99": 59654.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 52.5, + "tokens_out": 3150, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 28708.4, + "ttft_ms_p99": 61710.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29261.0, + "ttft_ms_p99": 57521.0 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28671.7, + "ttft_ms_p99": 59379.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29328.2, + "ttft_ms_p99": 58135.7 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29288.6, + "ttft_ms_p99": 59155.2 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 30034.4, + "ttft_ms_p99": 59361.4 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29549.3, + "ttft_ms_p99": 60892.2 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 27805.1, + "ttft_ms_p99": 59434.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29057.9, + "ttft_ms_p99": 57483.4 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28687.2, + "ttft_ms_p99": 58861.7 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29461.9, + "ttft_ms_p99": 58274.5 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 52.5, + "tokens_out": 3150, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 29594.7, + "ttft_ms_p99": 60361.7 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29759.0, + "ttft_ms_p99": 60532.0 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28981.0, + "ttft_ms_p99": 59814.7 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28908.4, + "ttft_ms_p99": 57867.1 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29645.0, + "ttft_ms_p99": 58891.7 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 28959.1, + "ttft_ms_p99": 58359.7 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.3, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29113.1, + "ttft_ms_p99": 59004.5 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 26.2, + "tokens_out": 1575, + "tokens_in": 0, + "requests_completed": 7, + "ttft_ms_p50": 29754.0, + "ttft_ms_p99": 59176.7 + } + ], + "sustained_throughput_tokens_per_sec": 28.1, + "throttle_ratio": 0.499, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": -2679.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-04-30", + "time": "09:51:34", + "run_id": "4974e840", + "run_name": "nvidia_rtx_a6000x1_suite_D_nvidia_sglang_c43a8309_4974e840", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-04-30T09:20:34.605248+00:00", + "benchmark_end_time": "2026-04-30T09:51:34.624450+00:00", + "benchmark_elapsed_minutes": 31.0, + "model_load_seconds": 53.7 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/accuracy/accuracy.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/accuracy/accuracy.json new file mode 100644 index 00000000..8825ca96 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.38, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/env_info.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/env_info.json new file mode 100644 index 00000000..a6bc323d --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/env_info.json @@ -0,0 +1,53 @@ +{ + "collected_at": "2026-05-06T11:15:40.197436+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/interactive/result.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/interactive/result.json new file mode 100644 index 00000000..b9400a20 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/interactive/result.json @@ -0,0 +1,136 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-06T11:15:40.197436+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 17.75, + "ttft_ms_p90": 19.4, + "ttft_ms_p99": 25.25, + "tpot_ms_p50": 2.22, + "tpot_ms_p90": 2.25, + "tpot_ms_p99": 2.33, + "peak_memory_gb": null, + "elapsed_seconds_median": 65.7 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-06", + "time": "11:27:14", + "run_id": "68ab5b50", + "run_name": "nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-06T11:23:54.684627+00:00", + "benchmark_end_time": "2026-05-06T11:27:14.782883+00:00", + "benchmark_elapsed_minutes": 3.3, + "model_load_seconds": 38.5 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/offline/result.json similarity index 53% rename from results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json rename to results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/offline/result.json index 24981672..81aa678f 100644 --- a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/offline/result.json @@ -1,39 +1,39 @@ { "schema_version": "1.0", "suite_id": "suite_F", - "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "implementation_id": "nvidia_sglang_c43a8309", "chip": { - "name": "MTT S4000", - "vendor": "Moore Threads", + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", "count": 1, "memory_gb": 48.0, "interconnect_intra_node": null, "interconnect_inter_node": null }, "environment": { - "collected_at": "2026-05-18T08:40:55.208034+00:00", + "collected_at": "2026-05-06T11:15:40.197436+00:00", "accelerators": [ { "index": 0, - "name": "MTT S4000", - "vendor": "Moore Threads", + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", "memory_gb": 48.0, - "driver_version": "2.7.0", + "driver_version": "565.57.01", "firmware_version": null, + "compute_capability": "8.6", "supports_bf16": true } ], - "accelerator_platform": "moorethreads", - "accelerator_topology": null, + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", "intra_node_interconnect": null, "cpu": { - "model": "Intel(R) Xeon(R) Gold 6430", - "physical_cores": 64, - "logical_cores": 128, + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, "numa_nodes": 2 }, "system_memory_gb": 1007.5, - "pcie_generation": "PCIe 16x/16x", + "pcie_generation": "PCIe Gen 1", "cpu_accelerator_bandwidth_gbs": null, "network_interfaces": [ { @@ -47,24 +47,29 @@ "bandwidth_gbps": null }, { - "name": "mlx5_bond_0", + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", "type": "InfiniBand/RoCE", "bandwidth_gbps": null } ], - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8", - "kernel_version": "5.15.0-105-generic", - "runtime_version": "Moore Threads Driver 2.7.0", - "pytorch_version": "2.2.0" + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" }, "software": { - "framework": "vllm-musa", - "framework_version": "0.4.2", - "driver_version": "2.7.0", - "runtime_version": "Moore Threads Driver 2.7.0", - "os": "Ubuntu Jammy Jellyfish (development branch)", - "python_version": "3.10.8" + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" }, "model": { "model_id": "Qwen/Qwen2.5-0.5B-Instruct", @@ -75,7 +80,7 @@ "architecture": "dense", "parameter_count_b": 0.5, "precision": "BF16", - "effective_dtype": "float16", + "effective_dtype": "bfloat16", "quantization_method": null, "model_format": "HuggingFace original" }, @@ -97,10 +102,9 @@ "results_by_concurrency": [ { "client_concurrency": 4, - "throughput_tokens_per_sec": 1994.51, - "throughput_tokens_per_sec_per_chip": 1994.51, - "throughput_tokens_per_sec_total": 3642.41, - "elapsed_seconds_median": 12.5, + "throughput_tokens_per_sec": 10477.62, + "throughput_tokens_per_sec_per_chip": 10477.62, + "elapsed_seconds_median": 4.0, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -110,10 +114,9 @@ }, { "client_concurrency": 16, - "throughput_tokens_per_sec": 1998.44, - "throughput_tokens_per_sec_per_chip": 1998.44, - "throughput_tokens_per_sec_total": 3649.59, - "elapsed_seconds_median": 12.5, + "throughput_tokens_per_sec": 11554.68, + "throughput_tokens_per_sec_per_chip": 11554.68, + "elapsed_seconds_median": 3.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -123,10 +126,9 @@ }, { "client_concurrency": 64, - "throughput_tokens_per_sec": 2004.02, - "throughput_tokens_per_sec_per_chip": 2004.02, - "throughput_tokens_per_sec_total": 3659.77, - "elapsed_seconds_median": 12.5, + "throughput_tokens_per_sec": 11509.83, + "throughput_tokens_per_sec_per_chip": 11509.83, + "elapsed_seconds_median": 3.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, @@ -144,21 +146,21 @@ "notes": "Run --scenario accuracy to check model accuracy." }, "meta": { - "submitted_by": "JuhaoLiang1997", + "submitted_by": "Gong-K", "submission_type": "individual", - "date": "2026-05-18", - "time": "16:48:27", - "run_id": "4f66d29d", - "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "date": "2026-05-06", + "time": "11:19:35", + "run_id": "68ab5b50", + "run_name": "nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50", "flagged": null, - "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", "env_info_file": "../env_info.json", "log_file": "run.log", "samples_file": "samples.jsonl", "notes": null, - "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00", - "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00", - "benchmark_elapsed_minutes": 2.5, - "model_load_seconds": 146.8 + "benchmark_start_time": "2026-05-06T11:18:46.260573+00:00", + "benchmark_end_time": "2026-05-06T11:19:35.049014+00:00", + "benchmark_elapsed_minutes": 0.8, + "model_load_seconds": 41.3 } } \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/online/result.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/online/result.json new file mode 100644 index 00000000..e69e1438 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/online/result.json @@ -0,0 +1,156 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-06T11:15:40.197436+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 21.59, + "ttft_ms_p90": 34.08, + "ttft_ms_p99": 1763.06, + "tpot_ms_p50": 3.15, + "tpot_ms_p90": 3.63, + "tpot_ms_p99": 6.7, + "elapsed_seconds_median": 32.0, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 38.68, + "ttft_ms_p90": 45.67, + "ttft_ms_p99": 51.86, + "tpot_ms_p50": 21.19, + "tpot_ms_p90": 25.29, + "tpot_ms_p99": 33.55, + "elapsed_seconds_median": 10.0, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-06", + "time": "11:22:46", + "run_id": "68ab5b50", + "run_name": "nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-06T11:20:41.202166+00:00", + "benchmark_end_time": "2026-05-06T11:22:46.287116+00:00", + "benchmark_elapsed_minutes": 2.1, + "model_load_seconds": 37.6 + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/result.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/result.json new file mode 100644 index 00000000..4c34f2be --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/result.json @@ -0,0 +1,371 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-06T11:15:40.197436+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 10477.62, + "throughput_tokens_per_sec_per_chip": 10477.62, + "elapsed_seconds_median": 4.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 11554.68, + "throughput_tokens_per_sec_per_chip": 11554.68, + "elapsed_seconds_median": 3.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 11509.83, + "throughput_tokens_per_sec_per_chip": 11509.83, + "elapsed_seconds_median": 3.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 21.59, + "ttft_ms_p90": 34.08, + "ttft_ms_p99": 1763.06, + "tpot_ms_p50": 3.15, + "tpot_ms_p90": 3.63, + "tpot_ms_p99": 6.7, + "elapsed_seconds_median": 32.0, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 38.68, + "ttft_ms_p90": 45.67, + "ttft_ms_p99": 51.86, + "tpot_ms_p50": 21.19, + "tpot_ms_p90": 25.29, + "tpot_ms_p99": 33.55, + "elapsed_seconds_median": 10.0, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 17.75, + "ttft_ms_p90": 19.4, + "ttft_ms_p99": 25.25, + "tpot_ms_p50": 2.22, + "tpot_ms_p90": 2.25, + "tpot_ms_p99": 2.33, + "peak_memory_gb": null, + "elapsed_seconds_median": 65.7 + }, + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 5992.8, + "tokens_out": 359668, + "tokens_in": 0, + "requests_completed": 1933, + "ttft_ms_p50": 28.4, + "ttft_ms_p99": 2491.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6297.3, + "tokens_out": 378010, + "tokens_in": 0, + "requests_completed": 2026, + "ttft_ms_p50": 28.1, + "ttft_ms_p99": 44.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6346.4, + "tokens_out": 380716, + "tokens_in": 0, + "requests_completed": 2040, + "ttft_ms_p50": 28.2, + "ttft_ms_p99": 44.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6281.5, + "tokens_out": 376749, + "tokens_in": 0, + "requests_completed": 2020, + "ttft_ms_p50": 27.0, + "ttft_ms_p99": 44.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6323.2, + "tokens_out": 379405, + "tokens_in": 0, + "requests_completed": 2035, + "ttft_ms_p50": 28.2, + "ttft_ms_p99": 43.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6294.6, + "tokens_out": 377666, + "tokens_in": 0, + "requests_completed": 2028, + "ttft_ms_p50": 27.9, + "ttft_ms_p99": 43.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6314.8, + "tokens_out": 379013, + "tokens_in": 0, + "requests_completed": 2037, + "ttft_ms_p50": 27.6, + "ttft_ms_p99": 45.0 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6355.4, + "tokens_out": 381323, + "tokens_in": 0, + "requests_completed": 2042, + "ttft_ms_p50": 27.8, + "ttft_ms_p99": 45.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6282.5, + "tokens_out": 377022, + "tokens_in": 0, + "requests_completed": 2016, + "ttft_ms_p50": 27.6, + "ttft_ms_p99": 45.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6287.5, + "tokens_out": 377177, + "tokens_in": 0, + "requests_completed": 2026, + "ttft_ms_p50": 27.0, + "ttft_ms_p99": 43.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6325.7, + "tokens_out": 379662, + "tokens_in": 0, + "requests_completed": 2039, + "ttft_ms_p50": 28.2, + "ttft_ms_p99": 44.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6366.5, + "tokens_out": 381732, + "tokens_in": 0, + "requests_completed": 2046, + "ttft_ms_p50": 27.1, + "ttft_ms_p99": 44.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6358.8, + "tokens_out": 381608, + "tokens_in": 0, + "requests_completed": 2043, + "ttft_ms_p50": 27.8, + "ttft_ms_p99": 43.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6361.9, + "tokens_out": 381743, + "tokens_in": 0, + "requests_completed": 2048, + "ttft_ms_p50": 27.1, + "ttft_ms_p99": 43.4 + } + ], + "sustained_throughput_tokens_per_sec": 6299.2, + "throttle_ratio": 0.941, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2447.7 + } + }, + "accuracy": { + "subset_score": 0.38, + "baseline_delta": 0.0, + "valid": true, + "framework": "SGLang", + "precision": "BF16", + "notes": "Integrated accuracy check — used same SGLang instance as benchmark." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-06", + "time": "11:19:35", + "run_id": "68ab5b50", + "run_name": "nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-06T11:18:46.260573+00:00", + "benchmark_end_time": "2026-05-06T11:19:35.049014+00:00", + "benchmark_elapsed_minutes": 21.2, + "model_load_seconds": 41.3, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/offline", + "online": "results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/online", + "interactive": "results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/interactive", + "sustained": "results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/sustained" + } + } +} \ No newline at end of file diff --git a/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/sustained/result.json b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/sustained/result.json new file mode 100644 index 00000000..23e00da2 --- /dev/null +++ b/results/community/nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50/sustained/result.json @@ -0,0 +1,278 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_sglang_c43a8309", + "chip": { + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-06T11:15:40.197436+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA RTX A6000", + "vendor": "NVIDIA", + "memory_gb": 48.0, + "driver_version": "565.57.01", + "firmware_version": null, + "compute_capability": "8.6", + "supports_bf16": true + } + ], + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tNODE\tPXB\tSYS\tSYS\t0-37,76-113\t0\t\tN/A\nNIC0\tNODE\t X \tNODE\tSYS\tSYS\t\t\t\t\nNIC1\tPXB\tNODE\t X \tSYS\tSYS\t\t\t\t\nNIC2\tSYS\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz", + "physical_cores": 76, + "logical_cores": 152, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe Gen 1", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_3", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "SGLang", + "framework_version": "0.5.6", + "driver_version": "565.57.01", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.10.20" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 5992.8, + "tokens_out": 359668, + "tokens_in": 0, + "requests_completed": 1933, + "ttft_ms_p50": 28.4, + "ttft_ms_p99": 2491.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6297.3, + "tokens_out": 378010, + "tokens_in": 0, + "requests_completed": 2026, + "ttft_ms_p50": 28.1, + "ttft_ms_p99": 44.2 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6346.4, + "tokens_out": 380716, + "tokens_in": 0, + "requests_completed": 2040, + "ttft_ms_p50": 28.2, + "ttft_ms_p99": 44.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6281.5, + "tokens_out": 376749, + "tokens_in": 0, + "requests_completed": 2020, + "ttft_ms_p50": 27.0, + "ttft_ms_p99": 44.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6323.2, + "tokens_out": 379405, + "tokens_in": 0, + "requests_completed": 2035, + "ttft_ms_p50": 28.2, + "ttft_ms_p99": 43.3 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6294.6, + "tokens_out": 377666, + "tokens_in": 0, + "requests_completed": 2028, + "ttft_ms_p50": 27.9, + "ttft_ms_p99": 43.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6314.8, + "tokens_out": 379013, + "tokens_in": 0, + "requests_completed": 2037, + "ttft_ms_p50": 27.6, + "ttft_ms_p99": 45.0 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6355.4, + "tokens_out": 381323, + "tokens_in": 0, + "requests_completed": 2042, + "ttft_ms_p50": 27.8, + "ttft_ms_p99": 45.2 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6282.5, + "tokens_out": 377022, + "tokens_in": 0, + "requests_completed": 2016, + "ttft_ms_p50": 27.6, + "ttft_ms_p99": 45.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6287.5, + "tokens_out": 377177, + "tokens_in": 0, + "requests_completed": 2026, + "ttft_ms_p50": 27.0, + "ttft_ms_p99": 43.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6325.7, + "tokens_out": 379662, + "tokens_in": 0, + "requests_completed": 2039, + "ttft_ms_p50": 28.2, + "ttft_ms_p99": 44.3 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6366.5, + "tokens_out": 381732, + "tokens_in": 0, + "requests_completed": 2046, + "ttft_ms_p50": 27.1, + "ttft_ms_p99": 44.5 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6358.8, + "tokens_out": 381608, + "tokens_in": 0, + "requests_completed": 2043, + "ttft_ms_p50": 27.8, + "ttft_ms_p99": 43.4 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 6361.9, + "tokens_out": 381743, + "tokens_in": 0, + "requests_completed": 2048, + "ttft_ms_p50": 27.1, + "ttft_ms_p99": 43.4 + } + ], + "sustained_throughput_tokens_per_sec": 6299.2, + "throttle_ratio": 0.941, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -2447.7 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "Gong-K", + "submission_type": "individual", + "date": "2026-05-06", + "time": "11:43:22", + "run_id": "68ab5b50", + "run_name": "nvidia_rtx_a6000x1_suite_F_nvidia_sglang_c43a8309_68ab5b50", + "flagged": null, + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-06T11:28:21.385518+00:00", + "benchmark_end_time": "2026-05-06T11:43:22.981588+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 39.2 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json deleted file mode 100644 index 304c3db9..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "subset_score": 0.61, - "baseline_delta": null, - "valid": true, - "framework": "1Cat-vLLM", - "precision": "FP16", - "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json deleted file mode 100644 index 52c2fdcb..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "collected_at": "2026-05-18T09:38:50.346241+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json deleted file mode 100644 index 66aeb486..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_A", - "implementation_id": "nvidia_onecat_vllm_12a253c2", - "chip": { - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 32.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T09:38:50.346241+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" - }, - "model": { - "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", - "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8.0, - "precision": "FP16", - "effective_dtype": null, - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenario": "online", - "num_runs": 3, - "warmup_runs": 1, - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - }, - "runtime_metrics": null - }, - "metrics": { - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 0.0, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5.0, - "ttft_ms_p50": 113119.0, - "ttft_ms_p90": 832380.28, - "ttft_ms_p99": 872316.46, - "tpot_ms_p50": 1274.2, - "tpot_ms_p90": 1801.34, - "tpot_ms_p99": 4289.09, - "elapsed_seconds_median": 968.7, - "sla_met": false - }, - { - "target_qps": 25, - "achieved_qps": 25.0, - "ttft_ms_p50": 130646.03, - "ttft_ms_p90": 865522.04, - "ttft_ms_p99": 901339.26, - "tpot_ms_p50": 1262.15, - "tpot_ms_p90": 1785.02, - "tpot_ms_p99": 4287.18, - "elapsed_seconds_median": 936.5, - "sla_met": false - }, - { - "target_qps": 100, - "achieved_qps": 100.0, - "ttft_ms_p50": 132710.0, - "ttft_ms_p90": 863880.66, - "ttft_ms_p99": 888527.06, - "tpot_ms_p50": 1248.86, - "tpot_ms_p90": 1740.58, - "tpot_ms_p99": 4225.34, - "elapsed_seconds_median": 921.5, - "sla_met": false - } - ] - } - }, - "accuracy": { - "subset_score": null, - "baseline_delta": null, - "valid": false, - "notes": "Run --scenario accuracy to check model accuracy." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "20:25:39", - "run_id": "4e0e6eba", - "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", - "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T10:04:46.235502+00:00", - "benchmark_end_time": "2026-05-18T12:25:39.450279+00:00", - "benchmark_elapsed_minutes": 140.9, - "model_load_seconds": 45.2 - } -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json deleted file mode 100644 index 07930da0..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json +++ /dev/null @@ -1,210 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_A", - "implementation_id": "nvidia_onecat_vllm_12a253c2", - "chip": { - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 32.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T09:38:50.346241+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" - }, - "model": { - "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", - "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 8.0, - "precision": "FP16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "online" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - } - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 8, - "throughput_tokens_per_sec": 671.32, - "throughput_tokens_per_sec_per_chip": 671.32, - "throughput_tokens_per_sec_total": 1168.67, - "elapsed_seconds_median": 51.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 32, - "throughput_tokens_per_sec": 670.99, - "throughput_tokens_per_sec_per_chip": 670.99, - "throughput_tokens_per_sec_total": 1168.09, - "elapsed_seconds_median": 51.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 128, - "throughput_tokens_per_sec": 671.43, - "throughput_tokens_per_sec_per_chip": 671.43, - "throughput_tokens_per_sec_total": 1168.44, - "elapsed_seconds_median": 51.6, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 0.0, - "results_by_qps": [ - { - "target_qps": 5, - "achieved_qps": 5.0, - "ttft_ms_p50": 113119.0, - "ttft_ms_p90": 832380.28, - "ttft_ms_p99": 872316.46, - "tpot_ms_p50": 1274.2, - "tpot_ms_p90": 1801.34, - "tpot_ms_p99": 4289.09, - "elapsed_seconds_median": 968.7, - "sla_met": false - }, - { - "target_qps": 25, - "achieved_qps": 25.0, - "ttft_ms_p50": 130646.03, - "ttft_ms_p90": 865522.04, - "ttft_ms_p99": 901339.26, - "tpot_ms_p50": 1262.15, - "tpot_ms_p90": 1785.02, - "tpot_ms_p99": 4287.18, - "elapsed_seconds_median": 936.5, - "sla_met": false - }, - { - "target_qps": 100, - "achieved_qps": 100.0, - "ttft_ms_p50": 132710.0, - "ttft_ms_p90": 863880.66, - "ttft_ms_p99": 888527.06, - "tpot_ms_p50": 1248.86, - "tpot_ms_p90": 1740.58, - "tpot_ms_p99": 4225.34, - "elapsed_seconds_median": 921.5, - "sla_met": false - } - ] - } - }, - "accuracy": { - "subset_score": 0.61, - "baseline_delta": null, - "valid": true, - "framework": "1Cat-vLLM", - "precision": "FP16", - "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "18:03:39", - "run_id": "4e0e6eba", - "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", - "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00", - "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00", - "benchmark_elapsed_minutes": 151.2, - "model_load_seconds": 47.8, - "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.", - "scenario_dirs": { - "offline": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline", - "online": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online" - } - } -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json deleted file mode 100644 index 94e55472..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "subset_score": 0.37, - "baseline_delta": 0.0, - "valid": true, - "framework": "1Cat-vLLM", - "precision": "FP16", - "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json deleted file mode 100644 index 1f8b6bd5..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "collected_at": "2026-05-18T12:26:03.593928+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json deleted file mode 100644 index f017bc27..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json +++ /dev/null @@ -1,126 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "nvidia_onecat_vllm_12a253c2", - "chip": { - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 32.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T12:26:03.593928+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "FP16", - "effective_dtype": null, - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenario": "interactive", - "num_runs": 3, - "warmup_runs": 1, - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - }, - "runtime_metrics": null - }, - "metrics": { - "interactive": { - "ttft_ms_p50": 26.76, - "ttft_ms_p90": 29.57, - "ttft_ms_p99": 40.69, - "tpot_ms_p50": 3.51, - "tpot_ms_p90": 3.76, - "tpot_ms_p99": 3.81, - "peak_memory_gb": null, - "elapsed_seconds_median": 116.9 - } - }, - "accuracy": { - "subset_score": null, - "baseline_delta": null, - "valid": false, - "notes": "Run --scenario accuracy to check model accuracy." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "20:45:36", - "run_id": "419b138c", - "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", - "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T12:39:46.224469+00:00", - "benchmark_end_time": "2026-05-18T12:45:36.498231+00:00", - "benchmark_elapsed_minutes": 5.8, - "model_load_seconds": 27.8 - } -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json deleted file mode 100644 index 170f9d0d..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "nvidia_onecat_vllm_12a253c2", - "chip": { - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 32.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T12:26:03.593928+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "FP16", - "effective_dtype": null, - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenario": "online", - "num_runs": 3, - "warmup_runs": 1, - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - }, - "runtime_metrics": null - }, - "metrics": { - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 0.0, - "results_by_qps": [ - { - "target_qps": 10, - "achieved_qps": 10.0, - "ttft_ms_p50": 6316.13, - "ttft_ms_p90": 53409.43, - "ttft_ms_p99": 67932.56, - "tpot_ms_p50": 206.23, - "tpot_ms_p90": 291.3, - "tpot_ms_p99": 636.32, - "elapsed_seconds_median": 103.3, - "sla_met": false - }, - { - "target_qps": 40, - "achieved_qps": 40.0, - "ttft_ms_p50": 19238.78, - "ttft_ms_p90": 56898.27, - "ttft_ms_p99": 75398.9, - "tpot_ms_p50": 189.24, - "tpot_ms_p90": 300.17, - "tpot_ms_p99": 582.22, - "elapsed_seconds_median": 86.3, - "sla_met": false - } - ] - } - }, - "accuracy": { - "subset_score": null, - "baseline_delta": null, - "valid": false, - "notes": "Run --scenario accuracy to check model accuracy." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "20:38:56", - "run_id": "419b138c", - "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", - "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T12:29:46.673625+00:00", - "benchmark_end_time": "2026-05-18T12:38:56.798553+00:00", - "benchmark_elapsed_minutes": 9.2, - "model_load_seconds": 28.7 - } -} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json deleted file mode 100644 index 12baab45..00000000 --- a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json +++ /dev/null @@ -1,210 +0,0 @@ -{ - "schema_version": "1.0", - "suite_id": "suite_F", - "implementation_id": "nvidia_onecat_vllm_12a253c2", - "chip": { - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "count": 1, - "memory_gb": 32.0, - "interconnect_intra_node": null, - "interconnect_inter_node": null - }, - "environment": { - "collected_at": "2026-05-18T12:26:03.593928+00:00", - "accelerators": [ - { - "index": 0, - "name": "Tesla V100-PCIE-32GB", - "vendor": "NVIDIA", - "memory_gb": 32.0, - "driver_version": "580.82.07", - "firmware_version": null, - "compute_capability": "7.0", - "supports_bf16": false - } - ], - "accelerator_platform": "nvidia", - "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", - "intra_node_interconnect": null, - "cpu": { - "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", - "physical_cores": 26, - "logical_cores": 52, - "numa_nodes": 2 - }, - "system_memory_gb": 214.5, - "pcie_generation": "PCIe Gen 3", - "cpu_accelerator_bandwidth_gbs": null, - "network_interfaces": null, - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13", - "kernel_version": "5.4.0-149-generic", - "runtime_version": "CUDA 12.8", - "pytorch_version": "2.9.1+cu128" - }, - "software": { - "framework": "1Cat-vLLM", - "framework_version": "1.0.0+flash_attn_v100-1.0.0", - "driver_version": "580.82.07", - "runtime_version": "CUDA 12.8", - "os": "Ubuntu 22.04.5 LTS", - "python_version": "3.12.13" - }, - "model": { - "model_id": "Qwen/Qwen2.5-0.5B-Instruct", - "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", - "model_name": null, - "model_note": null, - "model_source": "local", - "architecture": "dense", - "parameter_count_b": 0.5, - "precision": "FP16", - "effective_dtype": "float16", - "quantization_method": null, - "model_format": "HuggingFace original" - }, - "task": { - "scenarios_run": [ - "offline", - "online", - "interactive" - ], - "parallelism": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "expert_parallel_size": 1, - "data_parallel_size": 1 - }, - "num_runs": 3, - "extra_config": { - "tensor_parallel_size": 1, - "enforce_eager": false, - "max_num_seqs": 512, - "gpu_memory_utilization": 0.9, - "engine_kwargs": { - "enable_prefix_caching": false, - "enable_chunked_prefill": false, - "kv_cache_auto_trim_ratio": 0.0 - } - } - }, - "metrics": { - "derived": {}, - "offline": { - "results_by_concurrency": [ - { - "client_concurrency": 4, - "throughput_tokens_per_sec": 6234.82, - "throughput_tokens_per_sec_per_chip": 6234.82, - "throughput_tokens_per_sec_total": 9303.11, - "elapsed_seconds_median": 6.8, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 16, - "throughput_tokens_per_sec": 6292.79, - "throughput_tokens_per_sec_per_chip": 6292.79, - "throughput_tokens_per_sec_total": 9356.18, - "elapsed_seconds_median": 6.7, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - }, - { - "client_concurrency": 64, - "throughput_tokens_per_sec": 6243.51, - "throughput_tokens_per_sec_per_chip": 6243.51, - "throughput_tokens_per_sec_total": 9267.55, - "elapsed_seconds_median": 6.8, - "peak_memory_gb": null, - "power_watts_avg": null, - "power_watts_peak": null, - "oom": false, - "_throughput_note": "output_only", - "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." - } - ] - }, - "online": { - "sla_ttft_ms": 500, - "max_valid_qps": 0.0, - "results_by_qps": [ - { - "target_qps": 10, - "achieved_qps": 10.0, - "ttft_ms_p50": 6316.13, - "ttft_ms_p90": 53409.43, - "ttft_ms_p99": 67932.56, - "tpot_ms_p50": 206.23, - "tpot_ms_p90": 291.3, - "tpot_ms_p99": 636.32, - "elapsed_seconds_median": 103.3, - "sla_met": false - }, - { - "target_qps": 40, - "achieved_qps": 40.0, - "ttft_ms_p50": 19238.78, - "ttft_ms_p90": 56898.27, - "ttft_ms_p99": 75398.9, - "tpot_ms_p50": 189.24, - "tpot_ms_p90": 300.17, - "tpot_ms_p99": 582.22, - "elapsed_seconds_median": 86.3, - "sla_met": false - } - ] - }, - "interactive": { - "ttft_ms_p50": 26.76, - "ttft_ms_p90": 29.57, - "ttft_ms_p99": 40.69, - "tpot_ms_p50": 3.51, - "tpot_ms_p90": 3.76, - "tpot_ms_p99": 3.81, - "peak_memory_gb": null, - "elapsed_seconds_median": 116.9 - } - }, - "accuracy": { - "subset_score": 0.37, - "baseline_delta": 0.0, - "valid": true, - "framework": "1Cat-vLLM", - "precision": "FP16", - "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." - }, - "meta": { - "submitted_by": "JuhaoLiang1997", - "submission_type": "individual", - "date": "2026-05-18", - "time": "20:28:55", - "run_id": "419b138c", - "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", - "flagged": null, - "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", - "env_info_file": "../env_info.json", - "log_file": "run.log", - "samples_file": "samples.jsonl", - "notes": null, - "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00", - "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00", - "benchmark_elapsed_minutes": 16.4, - "model_load_seconds": 31.7, - "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.", - "scenario_dirs": { - "offline": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline", - "online": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online", - "interactive": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive" - } - } -} \ No newline at end of file