RIKEN-RCCS · yoshifuminakamura · Jun 12, 2026 · Jun 10, 2026 · Jun 11, 2026
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,9 @@ benchpark/
 # BenchPark workspace (generated during CI)
 benchpark-workspace/
 
+# Estimator tool checkouts prepared during CI/local smoke tests
+.benchkit_estimation_tools/
+
 # Dev mode data and config (NEVER commit)
 result_server/_dev_data/
 result_server/config/allowed_emails.json

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -30,6 +30,17 @@ variables:
   estimate_result_uuid: ""
   reestimation_reason: ""
   reestimation_trigger: ""
+  # Temporary bring-up switches for the GPU estimation integration.
+  # Remove or replace them after the real estimator runner/package flow is fixed.
+  BK_QWS_GPU_MLP_SMOKE: "true"
+  BK_QWS_GPU_MLP_SMOKE_MODE: "perftools"
+  BK_ESTIMATE_RUNNER_TAG: "fncx-estimate-python"
+  BK_GPU_MLP_PERFTOOLS_REPO: "https://github.com/masaaki-kondo/PerfTools.git"
+  BK_GPU_MLP_PERFTOOLS_REF: "main"
+  BK_GENESIS_GPU_MLP_PROFILE: "true"
+  BK_GPU_MLP_NCU_LAUNCH_COUNT: "20"
+  BK_GPU_MLP_SOURCE_GPU: "H100"
+  BK_GPU_MLP_KERNEL_COUNT: "20"
 
 # Extract system and code filters from API variables or commit message
 .filters: &filters

diff --git a/docs/cx/REESTIMATION_SPEC.md b/docs/cx/REESTIMATION_SPEC.md
@@ -199,7 +199,7 @@ However, when those artifacts do not exist on the server:
 
 - estimate result UUID で estimate JSON を返す取得 API
 - estimate JSON に記録された `source_result_uuid` に基づいて Result JSON を返す取得 API
-- `source_result_uuid` に対応する estimation input artifact を返す取得 API
+- `source_result_uuid` に対応する estimation artifact bundle を返す取得 API
 
 現時点では、再推定の shell フローと取得口自体は実装済みである。
 一方で、取得 API の公開方針と認証条件、ならびに compare UI や portal からの再推定起動導線は文書としてまだ十分に整理されていない。
@@ -216,7 +216,18 @@ Re-estimation from `estimate_result_uuid` requires retrieval paths for:
 
 - Estimate JSON by estimate-result UUID
 - Result JSON through the resolved `source_result_uuid`
-- estimation-input artifacts associated with that source result
+- estimation artifact bundles associated with that source result
+
+The canonical artifact endpoints are:
+
+- `POST /api/ingest/estimation-artifacts`
+- `GET /api/query/estimation-artifacts?uuid=<source_result_uuid>`
+
+The older `estimation-inputs` endpoints remain as compatibility aliases during
+the transition, but new clients should use `estimation-artifacts`. The artifact
+bundle may contain prepared estimation inputs, prediction outputs, and logs; it
+must not be used to duplicate large profiler archives such as PA Data or
+`*.ncu-rep`.
 
 At present, the shell-side re-estimation flow and these retrieval endpoints exist, but the exposure rules, authentication conditions, and portal-facing documentation are not yet fixed clearly enough in the documents.
 
@@ -296,23 +307,23 @@ Re-estimation in BenchKit should preferably satisfy at least:
 4. different estimation methods can coexist for the same benchmark result
 5. `weakscaling`-based minimum estimation and detailed estimation can be compared along the same comparison axis
 6. insufficient inputs can be reported explicitly as not applicable, fallback, or re-measurement required
-7. detailed re-estimation should be able to restore estimation-input artifacts associated with the source result
+7. detailed re-estimation should be able to restore estimation artifacts associated with the source result
 
-## 8.1 estimation input artifact の復元 / Restoration of Estimation Input Artifacts
+## 8.1 estimation artifact の復元 / Restoration of Estimation Artifacts
 
 当面の再推定では、artifact 復元を次の流れで扱う。
 
 1. `estimate_result_uuid` から開始し、estimate JSON から `source_result_uuid` を解決する
 2. source result JSON を取得する
-3. `received_estimation_inputs/<result-stem>/` が存在する場合は、その内容を `results/estimation_inputs/` に復元する
-4. server 側に estimation input artifact が無い場合は、artifact 不要な推定のみを許可し、必要な推定は `not_applicable` とする
+3. `received_estimation_artifacts/<result-stem>/` が存在する場合は、その内容を `results/estimation_artifacts/` に復元する
+4. server 側に estimation artifact が無い場合は、artifact 不要な推定のみを許可し、必要な推定は `not_applicable` とする
 
 Current restoration should follow this flow:
 
 1. if starting from `estimate_result_uuid`, resolve `source_result_uuid` from the estimate JSON
 2. fetch the source result JSON
-3. if `received_estimation_inputs/<result-stem>/` exists, restore its contents into `results/estimation_inputs/`
-4. if no stored estimation inputs exist, allow only methods that do not require them; otherwise terminate as `not_applicable`
+3. if `received_estimation_artifacts/<result-stem>/` exists, restore its contents into `results/estimation_artifacts/`
+4. if no stored estimation artifacts exist, allow only methods that do not require them; otherwise terminate as `not_applicable`
 
 ## 9. 次の実装候補 / Next Implementation Candidates
 
@@ -337,7 +348,7 @@ Candidate next steps include:
 - 利用者向け入口として `estimate_result_uuid` を使える
 - `estimate_result_uuid` から stored estimate JSON を取得し、そこから `source_result_uuid` を解決できる
 - source result JSON を結果サーバから再取得できる
-- `received_estimation_inputs/<result-stem>/` から detailed estimation input artifact を復元できる
+- `received_estimation_artifacts/<result-stem>/` から detailed estimation artifact bundle を復元できる
 - 復元した artifact を使って detailed re-estimation を実行できる
 - 保存済み estimate JSON に `reestimation` ブロックを持てる
 - `reestimation` の既定値として `scope=both` と `baseline_policy=reuse-recorded-baseline` を持てる

diff --git a/docs/deploy/hardening-guide.md b/docs/deploy/hardening-guide.md
@@ -10,13 +10,13 @@ The portal enforces an application-level request body limit:
 RESULT_SERVER_MAX_UPLOAD_MB=512
 ```
 
-Large estimation input archives are also checked per member:
+Large estimation artifact archives are also checked per member:
 
 ```text
 RESULT_SERVER_MAX_ARCHIVE_MEMBER_MB=1024
 ```
 
-Set these values to match the largest expected PA Data or estimation input
+Set these values to match the largest expected PA Data or estimation artifact
 archive. Keep the reverse proxy body limit at or below the Flask limit so that
 oversized uploads are rejected before they consume worker memory.
 

diff --git a/docs/guides/add-estimation-package.md b/docs/guides/add-estimation-package.md
@@ -43,6 +43,7 @@
   - `counter_papi_detailed.sh`
   - `trace_mpi_basic.sh`
   - `overlap_max_basic.sh`
+  - `gpu_kernel_mlp_v15.sh`
 
 ## 3. top-level package の責務
 
@@ -67,6 +68,55 @@ section package はもっと小さくてかまいません。
 
 ここでは「1 区間の変換規則」に集中し、Estimate JSON 全体の組み立てや current / future の side 管理は BenchKit 共通層や top-level package 側へ寄せる方が自然です。
 
+GPU kernel 単位の外部推定ツールは、通常は section package として扱います。
+たとえば `gpu_kernel_mlp_v15` は、PerfTools の `MLP_NN/v1.5` を「GPU 区間だけを変換する package」として接続します。
+top-level package は `instrumented_app_sections_dummy` などのままにして、GPU 区間にだけ `gpu_kernel_mlp_v15` を割り当てます。
+
+```bash
+bk_declare_section --side future gpu_kernel_region gpu_kernel_mlp_v15
+bk_emit_declared_section --side future gpu_kernel_region "$measured_gpu_time" results/estimation_artifacts/gpu_kernel_region_input.csv
+```
+
+PerfTools 本体は BenchKit に vendoring せず、実行時に次の環境変数で渡します。
+
+```bash
+export BK_GPU_MLP_PERFTOOLS_ROOT=/path/to/PerfTools
+export BK_GPU_MLP_PYTHON=python3
+```
+
+section artifact は PerfTools 側の static GPU spec sheet から作られた prepared CSV を想定します。
+BenchKit 実行時に GPU spec を動的採取しません。
+テストやデバッグでは、既に作成済みの prediction CSV を使えます。
+
+```bash
+export BK_GPU_MLP_ARTIFACT_MODE=prediction
+# or section-specific override:
+export BK_GPU_MLP_PREDICTION_CSV_GPU_KERNEL_REGION=/path/to/pred.csv
+```
+
+section package は prediction CSV の `Execution Time [ns]` を合算し、その section の future-side `time` にします。
+
+qws を使って CI 配管だけを確認する場合は、実際の qws が GPU 化されていなくても GPU MLP smoke test を有効にできます。
+`BK_QWS_GPU_MLP_SMOKE_MODE=prediction` では、同梱のサンプル prediction CSV を使い、run job が `gpu_kernel_region` section と prediction CSV artifact を結果に埋め込みます。
+`BK_QWS_GPU_MLP_SMOKE_MODE=perftools` では、estimate job が PerfTools repo を checkout し、`MLP_NN/examples/example_input_mixed-src_20kernels.csv` を `predict_v15.py` に渡して prediction CSV を生成します。
+どちらのモードでも、estimate job が `gpu_kernel_mlp_v15` section package を通して Estimate JSON へ変換できることを確認します。
+qws の推定スクリプト単体では既定無効ですが、GPU estimator integration の立ち上げ期間中は GitLab CI 側の既定を一時的に有効化しています。
+
+```bash
+export BK_QWS_GPU_MLP_SMOKE=true
+export BK_QWS_GPU_MLP_SMOKE_MODE=perftools
+export BK_ESTIMATE_RUNNER_TAG=<python-and-jq-estimator-runner-tag>
+export BK_GPU_MLP_PERFTOOLS_REPO=https://github.com/masaaki-kondo/PerfTools.git
+export BK_GPU_MLP_PERFTOOLS_REF=main
+```
+
+これらの変数は、GPU estimator integration の立ち上げ期間だけの暫定スイッチです。
+`BK_QWS_GPU_MLP_SMOKE` は qws を使った配管確認用、`BK_QWS_GPU_MLP_SMOKE_MODE` は prediction fixture 取り込みと PerfTools 実行の切り替え用、`BK_ESTIMATE_RUNNER_TAG` は推定用 runner/container を手動で逃がすためのものです。
+実際の GPU profiling input と推定 runner の運用が固まったら、専用の package/runner 設定へ置き換え、これらの暫定変数は削除対象として見直してください。
+
+`perftools` smoke mode は GitHub から PerfTools を取得するため、推定 runner/container には `git` と外部接続、Python 3.12 以上、numpy/pandas/torch が必要です。
+実運用では smoke mode ではなく、推定 runner/container に PerfTools checkout を用意し、section artifact として実アプリ由来の prepared input CSV を渡してください。
+
 ## 5. metadata に持たせるもの
 
 現在の実装では、package metadata がかなり重要です。

diff --git a/docs/guides/add-estimation-to-app.md b/docs/guides/add-estimation-to-app.md
@@ -192,13 +192,13 @@ bk_emit_declared_section \
 bk_emit_declared_section \
   --side future \
   compute_solver 1.03 \
-  results/estimation_inputs/compute_solver_papi.tgz \
+  results/estimation_artifacts/compute_solver_papi.tgz \
   >> results/result
 
 bk_emit_declared_overlap \
   --side future \
   compute_hopping,halo_exchange 0.23 \
-  results/estimation_inputs/compute_halo_overlap.json \
+  results/estimation_artifacts/compute_halo_overlap.json \
   >> results/result
 ```
 
@@ -288,7 +288,7 @@ app 側では、まず section 名と `estimation_package` を決めることを
 
 特に PAPI のように複数回実行が必要になる採取は、app 側に細かく書かせすぎると重くなります。package 側は「`papi` が必要」と定義し、BenchKit 側が採取や保存の共通処理を引き受ける形が自然です。
 
-現状の参照実装では `results/estimation_inputs/` を使う例がありますが、これは将来も app 側が細かく書き続けるべきという意味ではありません。
+現状の参照実装では `results/estimation_artifacts/` を使う例がありますが、これは将来も app 側が細かく書き続けるべきという意味ではありません。
 
 `bk_emit_section` や `bk_emit_overlap` は残してよく、`estimate.sh` 内の宣言と共存できます。宣言は package 割当てを先に示し、`bk_emit_*` は実際に得られた値を Result JSON に流し込む手段として使います。
 

diff --git a/docs/guides/developer-reference.md b/docs/guides/developer-reference.md
@@ -71,7 +71,7 @@ The supported baseline is that contributors can add apps, sites, and estimation
 
 `result_server/` provides:
 
-- ingest APIs for results, estimates, profiler archives, and estimation inputs
+- ingest APIs for results, estimates, profiler archives, and estimation artifacts
 - public and confidential result views
 - detailed result and estimate pages
 - usage reporting
@@ -101,6 +101,21 @@ The supported baseline is that contributors can add apps, sites, and estimation
 - `result_server/routes/admin.py`
   Admin-only user management.
 
+### Main API Endpoints
+
+The canonical estimation artifact endpoints are:
+
+- `POST /api/ingest/estimation-artifacts`
+  Upload a lightweight estimation artifact bundle associated with a source result UUID.
+- `GET /api/query/estimation-artifacts?uuid=<source_result_uuid>`
+  Download the stored estimation artifact bundle for re-estimation.
+
+The older `estimation-inputs` endpoint names remain as compatibility aliases
+only. New client code and documentation should use `estimation-artifacts`.
+Estimation artifact bundles may contain prepared estimator inputs, prediction
+outputs, and logs, but should not duplicate large profiler archives such as PA
+Data or `*.ncu-rep`.
+
 ### Main Templates
 
 - `result_server/templates/_results_base.html`

diff --git a/programs/genesis/estimate.sh b/programs/genesis/estimate.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# estimate.sh — GENESIS estimation entrypoint and run-time section metadata.
+
+genesis_declare_estimation_layout() {
+  bk_clear_estimation_defaults
+  bk_clear_estimation_declarations
+  bk_define_current_estimation_package weakscaling
+  bk_define_future_estimation_package instrumented_app_sections_dummy
+  bk_define_baseline_system "${BK_ESTIMATION_BASELINE_SYSTEM:-MiyabiG}"
+  bk_define_baseline_exp "${BK_ESTIMATION_BASELINE_EXP:-${BK_GENESIS_EXP:-p8}}"
+  bk_define_future_system "${BK_ESTIMATION_FUTURE_SYSTEM:-GPU_MLP_TARGET}"
+  bk_define_current_target_nodes "${BK_ESTIMATION_CURRENT_TARGET_NODES:-1}"
+  bk_define_future_target_nodes "${BK_ESTIMATION_FUTURE_TARGET_NODES:-1}"
+  bk_declare_section --side future gpu_kernel_region gpu_kernel_mlp_v15
+}
+
+genesis_emit_estimation_data_from_fom() {
+  local fom="$1"
+  local artifact_path="results/padata0.tgz"
+  local padata_path="$artifact_path"
+
+  case "${BK_GENESIS_GPU_MLP_PROFILE:-false}" in
+    1|true|TRUE|yes|YES|on|ON) ;;
+    *) return 0 ;;
+  esac
+
+  if [[ -n "${GENESIS_BENCHKIT_ROOT:-}" ]]; then
+    padata_path="${GENESIS_BENCHKIT_ROOT}/${artifact_path}"
+  fi
+  if [[ ! -f "$padata_path" ]]; then
+    echo "Genesis GPU MLP estimation requested but profiler archive was not found: ${padata_path}" >&2
+    return 0
+  fi
+
+  bk_emit_declared_section --side future gpu_kernel_region "$fom" "$artifact_path"
+}
+
+source scripts/bk_functions.sh
+source scripts/estimation/common.sh
+
+BK_ESTIMATION_SECTION_DEFAULT_FACTOR="${BK_ESTIMATION_SECTION_DEFAULT_FACTOR:-1.0}"
+BK_GPU_MLP_ARTIFACT_MODE="${BK_GPU_MLP_ARTIFACT_MODE:-ncu}"
+BK_GPU_MLP_SOURCE_GPU="${BK_GPU_MLP_SOURCE_GPU:-H100}"
+BK_GPU_MLP_KERNEL_COUNT="${BK_GPU_MLP_KERNEL_COUNT:-20}"
+export BK_GPU_MLP_ARTIFACT_MODE
+export BK_GPU_MLP_SOURCE_GPU
+export BK_GPU_MLP_KERNEL_COUNT
+
+genesis_declare_estimation_layout
+bk_estimation_apply_declared_defaults
+BK_ESTIMATION_PACKAGE="${BK_ESTIMATION_PACKAGE:-$BK_ESTIMATION_FUTURE_PACKAGE}"
+
+if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then
+  return 0 2>/dev/null || exit 0
+fi
+
+BK_ESTIMATION_INPUT_JSON="$1"
+
+bk_estimation_run_declared_future_package "$BK_ESTIMATION_INPUT_JSON"
+bk_estimation_run_recorded_current_with_weakscaling \
+  "${BK_ESTIMATION_BASELINE_SYSTEM:-MiyabiG}" \
+  "${BK_ESTIMATION_BASELINE_EXP:-}" \
+  "${BK_ESTIMATION_CURRENT_TARGET_NODES:-1}" \
+  "${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}"
+
+bk_estimation_write_output "results/estimate_${est_code}_0.json"
diff --git a/programs/genesis/run.sh b/programs/genesis/run.sh
@@ -8,13 +8,16 @@ nthreads="$4"
 numproc=$(( numproc_node * nodes ))
 
 source "${PWD}/scripts/bk_functions.sh"
+source "${PWD}/programs/genesis/estimate.sh"
 
 SCRIPT_DIR="${PWD}"
+export GENESIS_BENCHKIT_ROOT="$SCRIPT_DIR"
 REPO_DIR="genesis_benchmark_input"
 REPO_URL="https://github.com/genesis-release-r-ccs/${REPO_DIR}.git"
 BRANCH="main"
 dir_path="npt/genesis2.0beta_3.5fs/apoa1"
 header=p8
+exp="${BK_GENESIS_EXP:-$header}"
 input=${header}.inp
 resultsdir=${SCRIPT_DIR}/results
 artifactsdir=${SCRIPT_DIR}/artifacts
@@ -152,7 +155,15 @@ run_genesis_gh200_gpu() {
     fi
 
     genesis_profiler_tool=$(bk_get_profiler_tool "$genesis_profiler_requested") || return 1
-    genesis_profiler_level="${!profiler_level_var:-${GENESIS_PROFILER_LEVEL:-single}}"
+    local genesis_default_profiler_level="single"
+    case "${BK_GENESIS_GPU_MLP_PROFILE:-false}" in
+      1|true|TRUE|yes|YES|on|ON)
+        genesis_default_profiler_level="detailed"
+        export BK_PROFILER_NCU_RAW_CSV="${BK_PROFILER_NCU_RAW_CSV:-true}"
+        export BK_PROFILER_ARGS="${BK_PROFILER_ARGS:---launch-count ${BK_GPU_MLP_NCU_LAUNCH_COUNT:-20}}"
+        ;;
+    esac
+    genesis_profiler_level="${!profiler_level_var:-${GENESIS_PROFILER_LEVEL:-${genesis_default_profiler_level}}}"
     if [ -n "$genesis_profiler_tool" ]; then
         if [ "$genesis_profiler_tool" = "ncu" ] && ! command -v ncu >/dev/null 2>&1; then
             if [ "$genesis_profiler_explicit" -eq 1 ]; then
@@ -223,14 +234,17 @@ fom_val=$(awk -F'=' '/^[[:space:]]*dynamics[[:space:]]*=/ {
 			print $2;
 			exit
 			}' ${output})
-cd - > /dev/null
+cd "$SCRIPT_DIR" > /dev/null
 
 if [[ -z "$fom_val" ]]; then
     echo "Warning: FOM value not found in ${output}" >&2
     fom_val="nan"   # or 0.0
 fi
 
-bk_emit_result --fom "$fom_val" --nodes "$nodes" --numproc-node "$numproc_node" --nthreads "$nthreads" >> ${resultsdir}/result
+{
+    bk_emit_result --fom "$fom_val" --exp "$exp" --nodes "$nodes" --numproc-node "$numproc_node" --nthreads "$nthreads"
+    genesis_emit_estimation_data_from_fom "$fom_val"
+} >> ${resultsdir}/result
 # if information is requierd
 #printf "%-10s nodes=%2d numproc=%3d  FOM: %.3f\n" \
 #    "$system" "$nodes" "$numproc" "$fom_val" >> ../results/result