diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb159a032..bcc7de2fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -129,6 +129,7 @@ jobs: outputs: configurations: ${{ steps.compute.outputs.configurations }} run_fuzz: ${{ steps.compute.outputs.run_fuzz }} + run_reliability: ${{ steps.compute.outputs.run_reliability }} steps: - name: Debounce label events if: github.event.action == 'labeled' @@ -161,8 +162,14 @@ jobs: else echo "run_fuzz=false" >> $GITHUB_OUTPUT fi + if echo "$labels" | grep -Fq "test:reliability"; then + echo "run_reliability=true" >> $GITHUB_OUTPUT + else + echo "run_reliability=false" >> $GITHUB_OUTPUT + fi else echo "run_fuzz=false" >> $GITHUB_OUTPUT + echo "run_reliability=false" >> $GITHUB_OUTPUT fi configs="$configs]" diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 97e1eb2c8..83ce0def5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,6 +8,9 @@ variables: FORCE_BUILD: value: "" description: "Force build even if no new commits (any non-empty value)" + RUN_RELIABILITY: + value: "false" + description: "Run reliability and chaos tests. Set automatically when the test:reliability label is on the PR." MAVEN_REPOSITORY_PROXY: "https://depot-read-api-java.us1.ddbuild.io/magicmirror/magicmirror/@current/" default: @@ -153,6 +156,66 @@ jdk-integration-test: forward: pipeline_variables: true +# Generates a child pipeline YAML for reliability/chaos tests when the PR +# carries the test:reliability label (RUN_RELIABILITY=true in build.env). 
+generate-reliability-child-pipeline: + stage: reliability + tags: ["arch:amd64"] + image: $PREPARE_IMAGE + needs: + - job: prepare:start + artifacts: true + rules: + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never + - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null' + when: never + - when: on_success + script: + - | + if [ "${RUN_RELIABILITY:-}" = "true" ]; then + echo "Label test:reliability detected — enabling reliability child pipeline" + cp .gitlab/reliability/pr-child.gitlab-ci.yml generated-reliability.yml + else + cat > generated-reliability.yml << 'NOOP' + skip-reliability: + image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:latest + tags: ["arch:amd64"] + script: + - echo "Label test:reliability not set — skipping" + rules: + - when: always + NOOP + fi + artifacts: + paths: + - generated-reliability.yml + expire_in: 1 day + +run-reliability-tests: + stage: reliability + variables: + DDPROF_COMMIT_BRANCH: "$DDPROF_COMMIT_BRANCH" + DDPROF_COMMIT_SHA: "$DDPROF_COMMIT_SHA" + needs: + - job: generate-reliability-child-pipeline + artifacts: true + - job: prepare:start + artifacts: true + rules: + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never + - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null' + when: never + - when: on_success + trigger: + include: + - artifact: generated-reliability.yml + job: generate-reliability-child-pipeline + strategy: depend + forward: + pipeline_variables: true + include: - local: .gitlab/common.yml - local: .adms/python/gitlab.yaml diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml index 893322996..960383af6 100644 --- a/.gitlab/benchmarks/.gitlab-ci.yml +++ b/.gitlab/benchmarks/.gitlab-ci.yml @@ -17,13 +17,17 @@ variables: rules: - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null' when: never - - if: '$CI_PIPELINE_SOURCE == "trigger" || 
$CI_PIPELINE_SOURCE == "pipeline"' - when: on_success + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + when: never - if: '$CI_PIPELINE_SOURCE == "web"' when: manual allow_failure: true - - if: '$CI_PIPELINE_SOURCE == "push"' - when: manual + # Run automatically and non-blocking on any other source (push/trigger/api/ + # etc.) — mirrors the integration-test rules. The before_script CANCELLED + # gate skips branches with no open PR. + - when: on_success allow_failure: true script: | # setup the env @@ -36,8 +40,8 @@ variables: if [ -z "${CANDIDATE_VERSION}" ]; then echo "Missing candidate version. Skipping."; exit 0; fi # fetch the common platform scripts - git config --global url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf "https://github.com/DataDog/" - git clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR} + git -c url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf="https://github.com/DataDog/" \ + clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR} # apply the specific step scripts cp -r .gitlab/benchmarks/steps/* ${PLATFORM_DIR}/steps/ @@ -52,7 +56,6 @@ variables: ${PLATFORM_DIR}/steps/run-benchmarks.sh ${PLATFORM_DIR}/steps/analyze-results.sh ${PLATFORM_DIR}/steps/upload-results-to-s3.sh - ${PLATFORM_DIR}/steps/post-pr-comment.sh parallel: matrix: - RUN_MODE: ["cpu", "wall", "alloc", "memleak", "cpu,wall", "memleak,alloc", "cpu,wall,alloc,memleak"] @@ -76,6 +79,36 @@ benchmarks-candidate-aarch64: KUBERNETES_MEMORY_REQUEST: 200Gi KUBERNETES_MEMORY_LIMIT: 200Gi +post-benchmarks-pr-comment: + extends: .retry-config + stage: benchmarks + tags: ["arch:arm64"] + image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1 + id_tokens: + DDOCTOSTS_ID_TOKEN: + aud: dd-octo-sts + needs: + - job: prepare:start + artifacts: true + - job: 
benchmarks-candidate-amd64 + artifacts: true + - job: benchmarks-candidate-aarch64 + artifacts: true + rules: + - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null' + when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + when: never + # Always run when the candidate jobs ran, regardless of source, so results + # are posted back to the PR. + - when: always + timeout: 5m + script: + - .gitlab/benchmarks/post-pr-comment.sh reports + allow_failure: true + publish-benchmark-gh-pages: stage: benchmarks tags: ["arch:arm64"] diff --git a/.gitlab/benchmarks/post-pr-comment.sh b/.gitlab/benchmarks/post-pr-comment.sh new file mode 100755 index 000000000..de410dff9 --- /dev/null +++ b/.gitlab/benchmarks/post-pr-comment.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Post aggregated benchmark comparison results as a single PR comment. +# +# Expects all per-cell comparison-baseline-vs-candidate_*.md reports to be +# present under REPORTS_DIR (default: reports/). +# +# Required env: +# DDPROF_COMMIT_BRANCH – branch name used to locate the open PR +# Optional env: +# CI_PIPELINE_URL, DDPROF_COMMIT_SHA + +set -euo pipefail + +REPORTS_DIR="${1:-reports}" +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Aggregate all per-cell reports into a single comment body +SECTIONS="" +for md in "${REPORTS_DIR}"/comparison-baseline-vs-candidate_*.md; do + [ -f "${md}" ] || continue + label=$(basename "${md}" .md | sed 's/comparison-baseline-vs-candidate_//') + SECTIONS="${SECTIONS} +
${label} + +$(cat "${md}") + +
+" +done + +if [ -z "${SECTIONS}" ]; then + echo "No benchmark reports found under ${REPORTS_DIR} — skipping comment" + exit 0 +fi + +BODY_FILE=$(mktemp) +trap 'rm -f "${BODY_FILE}"' EXIT +cat > "${BODY_FILE}" <&2 + CRASH_MSG="Chaos harness crashed (RC=${RC})" + HS_ERR="${HERE}/../../hs_err.log" + if [ -f "${HS_ERR}" ]; then + SIG=$(grep -m1 '^siginfo:' "${HS_ERR}" 2>/dev/null | tr -d '\n' | cut -c1-120) + FRAME=$(grep -m1 'libjavaProfiler\|AsyncProfiler' "${HS_ERR}" 2>/dev/null | sed 's/^[[:space:]]*//' | tr -d '\n' | cut -c1-120) + [ -n "${SIG}" ] && CRASH_MSG="${CRASH_MSG};${SIG}" + [ -n "${FRAME}" ] && CRASH_MSG="${CRASH_MSG};${FRAME}" + fi + echo "FAIL:${CRASH_MSG}" >&2 exit 1 fi diff --git a/.gitlab/reliability/post-pr-comment.sh b/.gitlab/reliability/post-pr-comment.sh new file mode 100755 index 000000000..8be574a9c --- /dev/null +++ b/.gitlab/reliability/post-pr-comment.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Post aggregated reliability + chaos test results as a single PR comment. +# +# Reads REASON_* variables written to build.env by the reliability/chaos jobs +# and emits a ✅/❌ matrix with failure
blocks. +# +# Required env: +# DDPROF_COMMIT_BRANCH – branch name used to locate the open PR +# Optional env: +# CI_PIPELINE_URL + +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# ── Collect failures from REASON_* env vars ──────────────────────────────────── +rel_fail=0; rel_failures="" +chaos_fail=0; chaos_failures="" + +for key in $(compgen -v | grep -E '^REASON_.*X(jit|memory)$' | sort); do + reason="${!key}" + label="${key#REASON_}" + rel_fail=$((rel_fail + 1)) + detail=$(printf '%s' "${reason//\`/}" | tr ';' '\n') + rel_failures="${rel_failures} +
❌ ${label//_/ } + +\`\`\` +${detail} +\`\`\` + +
" +done + +for key in $(compgen -v | grep -E '^REASON_.*Xchaos$' | sort); do + reason="${!key}" + label="${key#REASON_}" + chaos_fail=$((chaos_fail + 1)) + detail=$(printf '%s' "${reason//\`/}" | tr ';' '\n') + chaos_failures="${chaos_failures} +
❌ chaos: ${label//_/ } + +\`\`\` +${detail} +\`\`\` + +
" +done + +# ── Assemble comment ──────────────────────────────────────────────────────────── +total_fail=$((rel_fail + chaos_fail)) +if [ "${total_fail}" -gt 0 ]; then + overall="❌ **${total_fail} failure(s) detected**" +else + overall="✅ **All reliability & chaos checks passed**" +fi + +BODY_FILE=$(mktemp) +trap 'rm -f "${BODY_FILE}"' EXIT +cat > "${BODY_FILE}" <err.log 1>out.log + - REASON=$(grep -m1 'FAIL:' err.log | cut -f2- -d':' | tr -d '\n') || true + - if [ -n "${REASON}" ]; then _key=$(printf 'REASON_%s_%s_%sX%s' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${VARIANT}" | tr '+' '_'); echo "${_key}=${REASON}" >> build.env; exit 1; fi + after_script: + - | + if [[ "$CI_JOB_STATUS" == "failed" ]]; then + _key=$(printf 'REASON_%s_%s_%sX%s' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${VARIANT}" | tr '+' '_') + grep -q "${_key}=" build.env 2>/dev/null || echo "${_key}=Unknown failure, perhaps timeout" >> build.env + fi + artifacts: + name: "results-${ARCH}" + when: always + paths: + - memwatch.log + - memwatch-trend.png + - hs_err.log + - err.log + - out.log + reports: + dotenv: build.env + expire_in: 1 day + +reliability-amd64: + extends: .reliability_pr_job + tags: ["arch:amd64"] + image: $BENCHMARK_IMAGE_AMD64 + variables: + ARCH: amd64 + +reliability-aarch64: + extends: .reliability_pr_job + tags: ["arch:arm64"] + image: $BENCHMARK_IMAGE_ARM64 + variables: + ARCH: aarch64 + +# ── Chaos ──────────────────────────────────────────────────────────────────── +# chaos_check.sh builds chaos.jar inline (via Gradle) when the artifact is +# absent, and downloads ddprof from Maven snapshots when no local jar exists. 
+ +.reliability_chaos_pr_job: + stage: test + timeout: 6h + variables: + RUNTIME: "120" + needs: + - job: get-versions + artifacts: true + rules: + - when: on_success + parallel: + matrix: + - CONFIG: ["profiler", "profiler+tracer"] + ALLOCATOR: ["gmalloc", "jemalloc", "tcmalloc"] + CHAOS_JDK: ["21.0.3-tem", "25.0.3-tem"] + script: + - set +e + - echo "runtime=${RUNTIME}, config=${CONFIG}, allocator=${ALLOCATOR}, arch=${ARCH}, jdk=${CHAOS_JDK}" + - CHAOS_JDK="${CHAOS_JDK}" .gitlab/reliability/chaos_check.sh "$RUNTIME" "$CONFIG" "$ALLOCATOR" 2>err.log 1>out.log + - REASON=$(grep -m1 'FAIL:' err.log | cut -f2- -d':' | tr -d '\n') || true + - if [ -n "${REASON}" ]; then _key=$(printf 'REASON_%s_%s_%s_%sXchaos' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${CHAOS_JDK//[.-]/_}" | tr '+' '_'); echo "${_key}=${REASON}" >> build.env; exit 1; fi + after_script: + - | + if [[ "$CI_JOB_STATUS" == "failed" ]]; then + _key=$(printf 'REASON_%s_%s_%s_%sXchaos' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${CHAOS_JDK//[.-]/_}" | tr '+' '_') + grep -q "${_key}=" build.env 2>/dev/null || echo "${_key}=Unknown failure, perhaps timeout" >> build.env + fi + artifacts: + name: "chaos-results-${ARCH}" + when: always + paths: + - hs_err.log + - err.log + - out.log + reports: + dotenv: build.env + expire_in: 1 day + +reliability-chaos-amd64: + extends: .reliability_chaos_pr_job + tags: ["arch:amd64"] + image: $BENCHMARK_IMAGE_AMD64 + variables: + ARCH: amd64 + +reliability-chaos-aarch64: + extends: .reliability_chaos_pr_job + tags: ["arch:arm64"] + image: $BENCHMARK_IMAGE_ARM64 + variables: + ARCH: aarch64 + +# ── PR comment ─────────────────────────────────────────────────────────────── + +post-reliability-pr-comment: + extends: .retry-config + stage: notify + tags: ["arch:arm64"] + image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1 + id_tokens: + DDOCTOSTS_ID_TOKEN: + aud: dd-octo-sts + needs: + - job: reliability-amd64 + artifacts: true + - job: reliability-aarch64 + artifacts: true + - 
job: reliability-chaos-amd64 + artifacts: true + - job: reliability-chaos-aarch64 + artifacts: true + rules: + - when: always + timeout: 5m + script: + - .gitlab/reliability/post-pr-comment.sh + allow_failure: true diff --git a/.gitlab/scripts/prepare.sh b/.gitlab/scripts/prepare.sh index 1b803c0c0..38f841730 100755 --- a/.gitlab/scripts/prepare.sh +++ b/.gitlab/scripts/prepare.sh @@ -22,6 +22,14 @@ if [ "${CI_PIPELINE_SOURCE}" == "push" ] || [ "${CI_PIPELINE_SOURCE}" == "trigge echo "CANCELLED=true" >> build.env exit 0 fi + # Detect PR labels and export flags for downstream jobs + if command -v jq >/dev/null 2>&1; then + if echo "${API_RESPONSE}" | jq -e '[.[0].labels[].name] | any(. == "test:reliability")' >/dev/null 2>&1; then + echo "RUN_RELIABILITY=true" >> build.env + fi + elif echo "${API_RESPONSE}" | grep -q '"test:reliability"'; then + echo "RUN_RELIABILITY=true" >> build.env + fi fi fi diff --git a/.gitlab/scripts/upsert-github-pr-comment.sh b/.gitlab/scripts/upsert-github-pr-comment.sh new file mode 100755 index 000000000..c7a60a738 --- /dev/null +++ b/.gitlab/scripts/upsert-github-pr-comment.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# Upsert a comment on the java-profiler GitHub PR for the current branch. +# +# Posts (or replaces) a single marker-tagged comment using a short-lived GitHub +# token obtained via dd-octo-sts. No pr-commenter / benchmarking-platform clone +# is required — only dd-octo-sts (present in dd-octo-sts-ci-base) plus curl/jq. +# +# Usage: +# upsert-github-pr-comment.sh <comment-id> <branch> <body-file> +# +# comment-id : unique slug used as an HTML marker to find/replace the comment +# branch : head branch name used to locate the open PR +# body-file : path to a file holding the markdown comment body +# +# Requires in CI: dd-octo-sts CLI + DDOCTOSTS_ID_TOKEN id_token, curl, jq. +# Token policy async-profiler-build.ci grants issues:write + pull_requests:read. 
+
+set -euo pipefail
+
+COMMENT_ID="${1:?comment-id required}"
+BRANCH="${2:?branch required}"
+BODY_FILE="${3:?body-file required}"
+REPO="DataDog/java-profiler"
+API="https://api.github.com/repos/${REPO}"
+
+log() { echo "[upsert-pr-comment] $*" >&2; }
+
+# gh_api <method> <url> [data] — performs a GitHub API call, capturing both the
+# response body and HTTP status. On HTTP >= 400 it logs the status and body
+# (turning opaque "curl 403" failures into actionable diagnostics) and returns 1.
+# On success the response body is written to stdout.
+gh_api() {
+  local method="$1" url="$2" data="${3:-}"
+  local args=(-sS -X "${method}"
+    -H "Authorization: Bearer ${TOKEN}"
+    -H "Accept: application/vnd.github+json"
+    -H "X-GitHub-Api-Version: 2022-11-28"
+    -H "User-Agent: java-profiler-ci"
+    -w $'\n%{http_code}')  # append the HTTP status on its own trailing line
+  [ -n "${data}" ] && args+=(-d "${data}")
+  local resp status body
+  resp=$(curl "${args[@]}" "${url}") || { log "curl failed for ${method} ${url}"; return 1; }
+  status="${resp##*$'\n'}"  # last line = status code written by -w
+  body="${resp%$'\n'*}"     # everything before it = response body
+  if [ "${status}" -ge 400 ]; then
+    log "GitHub API ${method} ${url} -> HTTP ${status}"
+    log "Response: ${body}"
+    return 1
+  fi
+  printf '%s' "${body}"
+}
+
+if [ -z "${BRANCH}" ] || [ "${BRANCH}" = "main" ] || [ "${BRANCH}" = "master" ]; then
+  log "Skipping PR comment for branch: ${BRANCH:-}"
+  exit 0
+fi
+if [ ! -s "${BODY_FILE}" ]; then
+  log "Empty body file (${BODY_FILE}) — nothing to post"
+  exit 0
+fi
+
+# 1. Obtain a GitHub token via dd-octo-sts (no stored secrets). Trim whitespace
+#    and validate the format, mirroring publish-gh-pages.sh — a token polluted
+#    with log noise/newlines produces a malformed header and a GitHub 403.
+TOKEN=$(dd-octo-sts token --scope "${REPO}" --policy async-profiler-build.ci 2>/tmp/octo-sts.err || true)
+TOKEN="${TOKEN//[$'\t\r\n ']/}"
+if [ -z "${TOKEN}" ]; then
+  log "Failed to obtain GitHub token via dd-octo-sts — skipping comment"
+  [ -s /tmp/octo-sts.err ] && log "dd-octo-sts: $(head -3 /tmp/octo-sts.err)"
+  exit 0
+fi
+if [[ ! "${TOKEN}" =~ ^(ghs_|ghp_|github_pat_|v1\.|[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.) ]]; then
+  log "dd-octo-sts returned an unexpected token format (first 8 chars: ${TOKEN:0:8}) — skipping"
+  exit 0
+fi
+
+# 2. Resolve the open PR for this branch; the branch is URL-encoded so names containing '#', '&', '+' or '%' cannot corrupt the query string.
+PR=$(gh_api GET "${API}/pulls?head=DataDog:$(jq -rn --arg b "${BRANCH}" '$b | @uri')&state=open&per_page=1" | jq -r '.[0].number // empty')
+if [ -z "${PR}" ]; then
+  log "No open PR found for branch ${BRANCH} — skipping comment"
+  exit 0
+fi
+
+# 3. Prepend a stable marker (an invisible HTML comment keyed by comment-id; an empty marker would match every comment) and build the JSON payload safely.
+MARKER="<!-- ${COMMENT_ID} -->"
+BODY="${MARKER}"$'\n'"$(cat "${BODY_FILE}")"
+PAYLOAD=$(jq -n --arg body "${BODY}" '{body: $body}')
+
+# 4. Find an existing marker comment (within the first 100 comments) and PATCH it, otherwise POST a new one.
+CID=$(gh_api GET "${API}/issues/${PR}/comments?per_page=100" \
+  | jq -r --arg m "${MARKER}" '.[] | select(.body | contains($m)) | .id' | head -n1)
+
+if [ -n "${CID}" ]; then
+  gh_api PATCH "${API}/issues/comments/${CID}" "${PAYLOAD}" >/dev/null
+  log "Updated comment ${CID} on PR #${PR}"
+else
+  gh_api POST "${API}/issues/${PR}/comments" "${PAYLOAD}" >/dev/null
+  log "Created comment on PR #${PR}"
+fi