From f90836da8dade415bea2d0670bf78c523175f17f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 14:08:04 +0100 Subject: [PATCH 01/60] CFN-6544: set Noble as default stemcell for acceptance tests --- .gitignore | 2 +- .../{bionic_test.go => jammy_test.go} | 12 +- acceptance-tests/run-local.sh | 41 ++-- ci/Dockerfile | 13 +- ci/bosh-scaled-out.yml | 3 + ci/pipeline.yml | 14 +- ci/scripts/acceptance-tests | 14 +- ci/scripts/functions-ci.sh | 14 +- ci/scripts/start-bosh.sh | 217 +++++++++++------- .../.gitkeep | 0 manifests/haproxy.yml | 2 +- 11 files changed, 194 insertions(+), 138 deletions(-) rename acceptance-tests/{bionic_test.go => jammy_test.go} (73%) create mode 100644 ci/bosh-scaled-out.yml rename ci/scripts/{stemcell-bionic => stemcell-jammy}/.gitkeep (100%) diff --git a/.gitignore b/.gitignore index 5b6f7d4b..881ead69 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ config/settings.yml releases/*.tgz releases/**/*.tgz ci/scripts/stemcell/*.tgz -ci/scripts/stemcell-bionic/*.tgz +ci/scripts/stemcell-jammy/*.tgz dev_releases blobs/* .blobs diff --git a/acceptance-tests/bionic_test.go b/acceptance-tests/jammy_test.go similarity index 73% rename from acceptance-tests/bionic_test.go rename to acceptance-tests/jammy_test.go index 1823cd98..44ffc175 100644 --- a/acceptance-tests/bionic_test.go +++ b/acceptance-tests/jammy_test.go @@ -7,14 +7,14 @@ import ( . 
"github.com/onsi/ginkgo/v2" ) -var _ = Describe("Bionic", func() { - It("Correctly proxies HTTP requests when using the Bionic stemcell", func() { +var _ = Describe("Jammy", func() { + It("Correctly proxies HTTP requests when using the Jammy stemcell", func() { - opsfileBionic := `--- -# Configure Bionic stemcell + opsfileJammy := `--- +# Configure Jammy stemcell - type: replace path: /stemcells/alias=default/os - value: ubuntu-bionic + value: ubuntu-jammy ` haproxyBackendPort := 12000 @@ -22,7 +22,7 @@ var _ = Describe("Bionic", func() { haproxyBackendPort: haproxyBackendPort, haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: deploymentNameForTestNode(), - }, []string{opsfileBionic}, map[string]interface{}{}, true) + }, []string{opsfileJammy}, map[string]interface{}{}, true) closeLocalServer, localPort := startDefaultTestServer() defer closeLocalServer() diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 391a8acd..01d402f8 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -3,21 +3,27 @@ set -eu REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)" source "${REPO_DIR}/ci/scripts/functions-ci.sh" +FOCUS="" +PARALLELISM="" KEEP_RUNNING="" usage() { - echo -e "Usage: $0 [-F ] [-k] + echo -e "Usage: $0 [-F ] [-P ] [-k] -F Focus on a particular test. Expects a Ginkgo test name. Keep bosh running afterwards. + -P Set Ginkgo parallel node count. Default is '-p' (smart parallelism). -k Keep bosh container running. Useful for debug." 
1>&2; exit 1; } -while getopts ":F:k" o; do +while getopts ":F:P:k" o; do case "${o}" in F) FOCUS=${OPTARG} KEEP_RUNNING=true ;; + P) + PARALLELISM=${OPTARG} + ;; k) KEEP_RUNNING=true ;; @@ -28,25 +34,11 @@ while getopts ":F:k" o; do done shift $((OPTIND-1)) -docker_mac_check_cgroupsv1() { - # Force cgroups v1 on Docker for Mac - # inspired by https://github.com/docker/for-mac/issues/6073#issuecomment-1018793677 - - SETTINGS=~/Library/Group\ Containers/group.com.docker/settings.json - - cgroupsV1Enabled=$(jq '.deprecatedCgroupv1' "$SETTINGS") - if [ "$cgroupsV1Enabled" != "true" ]; then - echo "deprecatedCgroupv1 should be enabled in $SETTINGS. Otherwise the acceptance tests will not run on Docker for Mac." - echo "Check in the README.md for a convenient script to set deprecatedCgroupv1 and restart Docker." - exit 1 - fi -} - check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( - ci/scripts/stemcell/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent - ci/scripts/stemcell-bionic/bosh-stemcell-*-ubuntu-bionic-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-bionic-go_agent + ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble + ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent ) for entry in "${REQUIRED_FILE_PATTERNS[@]}"; do @@ -63,9 +55,10 @@ check_required_files() { fi ( - echo "$filepattern not found, downloading latest." + echo "$filepattern not found, downloading." cd "$folder" && \ - resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1) && \ + resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') && \ + echo "Resolved URL: $resolved" && \ curl -s --remote-name --remote-header-name --location "$resolved" && \ echo "Downloaded '$url' successfully." 
&& \ ls -1lh "$folder/"$filepattern @@ -81,10 +74,6 @@ check_required_files() { check_required_files -if [ "$(uname)" == "Darwin" ]; then - docker_mac_check_cgroupsv1 -fi - build_image "${REPO_DIR}/ci" prepare_docker_scratch @@ -93,9 +82,9 @@ if [ -n "$KEEP_RUNNING" ] ; then echo echo "*** KEEP_RUNNING enabled. Please clean up docker scratch after removing containers: ${DOCKER_SCRATCH}" echo - docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="$FOCUS" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" + docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="${FOCUS}" -e PARALLELISM="${PARALLELISM}" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" else - docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" + docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" -e PARALLELISM="${PARALLELISM}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" echo "Cleaning up docker scratch: ${DOCKER_SCRATCH}" sudo rm -rf "${DOCKER_SCRATCH}" fi diff --git a/ci/Dockerfile b/ci/Dockerfile index 56c7550b..34414d78 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,18 +1,23 @@ -FROM bosh/docker-cpi:main +FROM ghcr.io/cloudfoundry/bosh/docker-cpi:latest # Install all necessary tools for haproxy testflight and dependency autobump ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y wget jq git vim nano python3-pip && \ + apt-get install -y wget jq git vim nano python3-pip python3-venv && \ apt-get clean # Set bosh env 
at login RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc +# Copy ops files +COPY bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml + # Install Python libraries needed for scripts +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:${PATH}" COPY scripts/requirements.txt /requirements.txt -RUN /usr/bin/python3 -m pip install -r /requirements.txt +RUN pip install -r /requirements.txt # Install go dependencies ENV GOBIN=/usr/local/bin -RUN go install github.com/geofffranks/spruce/cmd/spruce@latest +RUN go install github.com/geofffranks/spruce/cmd/spruce@latest \ No newline at end of file diff --git a/ci/bosh-scaled-out.yml b/ci/bosh-scaled-out.yml new file mode 100644 index 00000000..93937df3 --- /dev/null +++ b/ci/bosh-scaled-out.yml @@ -0,0 +1,3 @@ +- type: replace + path: /instance_groups/name=bosh/properties/director/workers? + value: 12 \ No newline at end of file diff --git a/ci/pipeline.yml b/ci/pipeline.yml index bc374b1f..75436cfd 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -121,7 +121,7 @@ jobs: - in_parallel: - { get: git, trigger: true, passed: [unit-tests] } - { get: stemcell } - - { get: stemcell-bionic } + - { get: stemcell-jammy } - get: haproxy-boshrelease-testflight - task: acceptance-tests privileged: true @@ -131,7 +131,7 @@ jobs: inputs: - { name: git } - { name: stemcell } - - { name: stemcell-bionic } + - { name: stemcell-jammy } run: path: ./git/ci/scripts/acceptance-tests args: [] @@ -152,7 +152,7 @@ jobs: - do: - { get: git-pull-requests, trigger: true, version: every } - { get: stemcell } - - { get: stemcell-bionic } + - { get: stemcell-jammy } - get: haproxy-boshrelease-testflight - put: git-pull-requests params: @@ -169,7 +169,7 @@ jobs: inputs: - { name: git-pull-requests } - { name: stemcell } - - { name: stemcell-bionic } + - { name: stemcell-jammy } run: path: ./git-pull-requests/ci/scripts/acceptance-tests args: [] @@ -403,15 +403,15 @@ resources: - "dependabot" - "CFN-CI" - - name: 
stemcell-bionic + - name: stemcell-jammy type: bosh-io-stemcell source: - name: bosh-warden-boshlite-ubuntu-bionic-go_agent + name: bosh-warden-boshlite-ubuntu-jammy-go_agent - name: stemcell type: bosh-io-stemcell source: - name: bosh-warden-boshlite-ubuntu-jammy-go_agent + name: bosh-warden-boshlite-ubuntu-noble - name: version type: semver diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 9cc17e83..579a864a 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -35,11 +35,19 @@ echo "----- Running tests" export PATH=$PATH:$GOPATH/bin ginkgo version -PARALLELISM="-p" -if [ -n "$FOCUS" ]; then +echo "------------------------------------------------------------------" +if [ -n "${FOCUS:-}" ]; then PARALLELISM="--nodes=1" + echo "FOCUS is set, thus PARALLELISM is set to '$PARALLELISM'" +elif [ -n "${PARALLELISM:-}" ]; then + PARALLELISM="--nodes=$PARALLELISM" + echo "PARALLELISM is set. Will run ginkgo with '$PARALLELISM'" +else + PARALLELISM="-p" + echo "PARALLELISM is not set. 
Using default '$PARALLELISM'" fi +echo "------------------------------------------------------------------" ginkgo -v "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" -keep_running_info +keep_running_info \ No newline at end of file diff --git a/ci/scripts/functions-ci.sh b/ci/scripts/functions-ci.sh index d3e64d50..fa8e5465 100755 --- a/ci/scripts/functions-ci.sh +++ b/ci/scripts/functions-ci.sh @@ -62,18 +62,18 @@ function bosh_release() { } function bosh_assets() { - stemcell_jammy_path="$START_DIR/stemcell/*.tgz" - stemcell_bionic_path="$START_DIR/stemcell-bionic/*.tgz" + stemcell_noble_path="$START_DIR/stemcell/*.tgz" + stemcell_jammy_path="$START_DIR/stemcell-jammy/*.tgz" + + echo "----- Uploading Noble stemcell" + bosh -n upload-stemcell $stemcell_noble_path echo "----- Uploading Jammy stemcell" bosh -n upload-stemcell $stemcell_jammy_path - echo "----- Uploading Bionic stemcell" - bosh -n upload-stemcell $stemcell_bionic_path - echo "----- Uploading os-conf (used for tests only)" - bosh -n upload-release --sha1 386293038ae3d00813eaa475b4acf63f8da226ef \ - https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=22.1.2 + bosh -n upload-release --sha1 sha256:efcf30754ce4c5f308aedab3329d8d679f5967b2a4c3c453204c7cb10c7c5ed9 \ + https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=23.0.0 export BOSH_PATH=$(command -v bosh) export BASE_MANIFEST_PATH="$PWD/manifests/haproxy.yml" diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 3bda28f6..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -6,44 +6,30 @@ function generate_certs() { local certs_dir certs_dir="${1}" - pushd "${certs_dir}" - - jq -ner --arg "ip" "${OUTER_CONTAINER_IP}" '{ - "variables": [ - { - "name": "docker_ca", - "type": "certificate", - "options": { - "is_ca": true, - "common_name": "ca" - } - }, - { - "name": "docker_tls", - "type": "certificate", - "options": { - 
"extended_key_usage": [ - "server_auth" - ], - "common_name": $ip, - "alternative_names": [ $ip ], - "ca": "docker_ca" - } - }, - { - "name": "client_docker_tls", - "type": "certificate", - "options": { - "extended_key_usage": [ - "client_auth" - ], - "common_name": $ip, - "alternative_names": [ $ip ], - "ca": "docker_ca" - } - } - ] - }' > ./bosh-vars.yml + pushd "${certs_dir}" > /dev/null + cat < ./bosh-vars.yml +--- +variables: +- name: docker_ca + type: certificate + options: + is_ca: true + common_name: ca +- name: docker_tls + type: certificate + options: + extended_key_usage: [server_auth] + common_name: $OUTER_CONTAINER_IP + alternative_names: [$OUTER_CONTAINER_IP] + ca: docker_ca +- name: client_docker_tls + type: certificate + options: + extended_key_usage: [client_auth] + common_name: $OUTER_CONTAINER_IP + alternative_names: [$OUTER_CONTAINER_IP] + ca: docker_ca +EOF bosh int ./bosh-vars.yml --vars-store=./certs.yml bosh int ./certs.yml --path=/docker_ca/ca > ./ca.pem @@ -51,12 +37,13 @@ function generate_certs() { bosh int ./certs.yml --path=/docker_tls/private_key > ./server-key.pem bosh int ./certs.yml --path=/client_docker_tls/certificate > ./cert.pem bosh int ./certs.yml --path=/client_docker_tls/private_key > ./key.pem - # generate certs in json format - # - ruby -e 'puts File.read("./ca.pem").split("\n").join("\\n")' > "$certs_dir/ca_json_safe.pem" - ruby -e 'puts File.read("./cert.pem").split("\n").join("\\n")' > "$certs_dir/client_certificate_json_safe.pem" - ruby -e 'puts File.read("./key.pem").split("\n").join("\\n")' > "$certs_dir/client_private_key_json_safe.pem" - popd + + # generate certs in json format + ruby -e 'puts File.read("./ca.pem").split("\n").join("\\n")' > "${certs_dir}/ca_json_safe.pem" + ruby -e 'puts File.read("./cert.pem").split("\n").join("\\n")' > "${certs_dir}/client_certificate_json_safe.pem" + ruby -e 'puts File.read("./key.pem").split("\n").join("\\n")' > "${certs_dir}/client_private_key_json_safe.pem" + + popd > 
/dev/null } function sanitize_cgroups() { @@ -64,15 +51,28 @@ function sanitize_cgroups() { mountpoint -q /sys/fs/cgroup || \ mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup + if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # cgroups v2: enable nesting (based on moby/moby hack/dind) + mkdir -p /sys/fs/cgroup/init + # Loop to handle races from concurrent process creation (e.g. docker exec) + while ! { + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || : + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do true; done + return + fi + mount -o remount,rw /sys/fs/cgroup - sed -e 1d /proc/cgroups | while read sys hierarchy num enabled; do + # shellcheck disable=SC2034 + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue fi - grouping="$(cat /proc/self/cgroup | cut -d: -f2 | grep "\\<$sys\\>")" + grouping="$(cut -d: -f2 < /proc/self/cgroup | grep "\\<$sys\\>")" if [ -z "$grouping" ]; then # subsystem not mounted anywhere; mount it on its own grouping="$sys" @@ -102,17 +102,32 @@ function sanitize_cgroups() { source "ci/scripts/functions-ci.sh" function start_docker() { - generate_certs "$1" - local mtu + local certs_dir + certs_dir="${1}" + + export DNS_IP="8.8.8.8" + + # docker will fail starting with the new iptables. it throws: + # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... + update-alternatives --set iptables /usr/sbin/iptables-legacy + + generate_certs "${certs_dir}" + mkdir -p /var/log mkdir -p /var/run sanitize_cgroups + echo "Sanitized cgroups for docker" >&2 + + # systemd inside nested Docker containers requires shared mount propagation + mount --make-rshared / - # ensure systemd cgroup is present - mkdir -p /sys/fs/cgroup/systemd - if ! 
mountpoint -q /sys/fs/cgroup/systemd ; then - mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd + # ensure systemd cgroup is present (cgroups v1 only) + if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then + mkdir -p /sys/fs/cgroup/systemd + if ! mountpoint -q /sys/fs/cgroup/systemd ; then + mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd + fi fi # check for /proc/sys being mounted readonly, as systemd does @@ -120,12 +135,13 @@ function start_docker() { mount -o remount,rw /proc/sys fi - mtu=$(cat /sys/class/net/$(ip route get 8.8.8.8|awk '{ print $5 }')/mtu) + local mtu + mtu=$(cat "/sys/class/net/$(ip route get ${DNS_IP} | awk '{ print $5 }')/mtu") [[ ! -d /etc/docker ]] && mkdir /etc/docker cat < /etc/docker/daemon.json { - "hosts": ["${DOCKER_HOST}","unix:///var/run/docker.sock"], + "hosts": ["${DOCKER_HOST}"], "tls": true, "tlscert": "${certs_dir}/server-cert.pem", "tlskey": "${certs_dir}/server-key.pem", @@ -137,15 +153,14 @@ function start_docker() { EOF service docker start - - export DOCKER_TLS_VERIFY=1 - export DOCKER_CERT_PATH=$1 + echo "Started docker service" >&2 rc=1 - for i in $(seq 1 10); do - echo waiting for docker to come up... - sleep 10 + for i in $(seq 1 100); do + echo "waiting for docker to come up... (${i})" + sleep 1 set +e + echo "Docker started, checking if it's responsive..." docker info rc=$? set -e @@ -165,66 +180,102 @@ EOF if [ -z "${KEEP_RUNNING}" ] ; then trap stop_docker ERR fi - echo "$certs_dir" + + echo "${certs_dir}" } function main() { - export OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list - .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? 
} - .map { |addr| addr.ip_address }.first') - - export DOCKER_HOST="tcp://${OUTER_CONTAINER_IP}:4243" + # ".first" - original code could return multiple IPs (e.g., container IP + docker0 bridge IP) + # which breaks the docker_tls JSON variable formatting + OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list + .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } + .map { |addr| addr.ip_address }.first') + export OUTER_CONTAINER_IP + echo "Determined OUTER_CONTAINER_IP: ${OUTER_CONTAINER_IP}" >&2 local certs_dir certs_dir=$(mktemp -d) - start_docker "${certs_dir}" local local_bosh_dir local_bosh_dir="/tmp/local-bosh/director" + mkdir -p ${local_bosh_dir} + + cat < "${local_bosh_dir}/docker-env" +export DOCKER_HOST="tcp://${OUTER_CONTAINER_IP}:4243" +export DOCKER_TLS_VERIFY=1 +export DOCKER_CERT_PATH="${certs_dir}" +EOF + echo "Source '${local_bosh_dir}/docker-env' to run docker" >&2 + source "${local_bosh_dir}/docker-env" - if ! docker network ls | grep director_network; then - docker network create -d bridge --subnet=10.245.0.0/16 director_network + start_docker "${certs_dir}" + echo "Docker is up and running with TLS configured" >&2 + + local docker_network_name="director_network" + local docker_network_cidr="10.245.0.0/16" + if docker network ls | grep -q "${docker_network_name}"; then + echo "A docker network named '${docker_network_name}' already exists, skipping creation" >&2 + else + docker network create -d bridge --subnet=${docker_network_cidr} "${docker_network_name}" + echo "Created docker network '${docker_network_name}' with subnet '${docker_network_cidr}'" >&2 fi - compilation_ops="$PWD/ci/compilation.yml" pushd "${BOSH_DEPLOYMENT_PATH:-/usr/local/bosh-deployment}" > /dev/null + echo "Current directory: $(pwd)" >&2 + export BOSH_DIRECTOR_IP="10.245.0.3" export BOSH_ENVIRONMENT="docker-director" - mkdir -p ${local_bosh_dir} + cat < "${local_bosh_dir}/docker_tls.json" +{ + "ca": "$(cat "${certs_dir}/ca_json_safe.pem")", + 
"certificate": "$(cat "${certs_dir}/client_certificate_json_safe.pem")", + "private_key": "$(cat "${certs_dir}/client_private_key_json_safe.pem")" +} +EOF - command bosh int bosh.yml \ + echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 + bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ + -o /usr/local/local-releases.yml \ + -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ - -v internal_cidr=10.245.0.0/16 \ + -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ -v internal_ip="${BOSH_DIRECTOR_IP}" \ -v docker_host="${DOCKER_HOST}" \ - -v network=director_network \ - -v docker_tls="{\"ca\": \"$(cat "${certs_dir}"/ca_json_safe.pem)\",\"certificate\": \"$(cat "${certs_dir}"/client_certificate_json_safe.pem)\",\"private_key\": \"$(cat "${certs_dir}"/client_private_key_json_safe.pem)\"}" \ - ${@} > "${local_bosh_dir}/bosh-director.yml" + -v network="${docker_network_name}" \ + -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ + "${@}" > "${local_bosh_dir}/bosh-director.yml" - command bosh create-env "${local_bosh_dir}/bosh-director.yml" \ - --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" + echo "Creating BOSH director environment..." >&2 + bosh create-env "${local_bosh_dir}/bosh-director.yml" \ + --vars-store="${local_bosh_dir}/creds.yml" \ + --state="${local_bosh_dir}/state.json" + echo "Extracting BOSH director credentials and CA certificate..." >&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" + bosh_client_secret="$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password)" + + echo "Setting up BOSH CLI environment..." 
>&2 bosh -e "${BOSH_DIRECTOR_IP}" --ca-cert "${local_bosh_dir}/ca.crt" alias-env "${BOSH_ENVIRONMENT}" cat < "${local_bosh_dir}/env" + export BOSH_DIRECTOR_IP="${BOSH_DIRECTOR_IP}" export BOSH_ENVIRONMENT="${BOSH_ENVIRONMENT}" export BOSH_CLIENT=admin - export BOSH_CLIENT_SECRET=$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password) + export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" - EOF + echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" - bosh -n update-cloud-config docker/cloud-config.yml -v network=director_network -o "${compilation_ops}" + echo "Updating BOSH cloud config with Docker network..." >&2 + bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" popd > /dev/null } echo "----- Starting BOSH" -main $@ +main "${@}" \ No newline at end of file diff --git a/ci/scripts/stemcell-bionic/.gitkeep b/ci/scripts/stemcell-jammy/.gitkeep similarity index 100% rename from ci/scripts/stemcell-bionic/.gitkeep rename to ci/scripts/stemcell-jammy/.gitkeep diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 92a99154..16fcb911 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -31,7 +31,7 @@ update: stemcells: - alias: default - os: ubuntu-jammy + os: ubuntu-noble version: latest releases: From 81054066ccce34145f852941609e2487a34ce7d3 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 17:32:52 +0100 Subject: [PATCH 02/60] CFN-6544: bump to the last bpm version --- manifests/haproxy.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 16fcb911..3b09636e 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -36,9 +36,9 @@ stemcells: releases: - name: bpm - version: 1.2.14 - url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.2.14 - sha1: 1e357a533654e2067e15231dd8ac5bad2e697dff + version: 1.4.26 + url: 
https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.4.26 + sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 - name: haproxy version: 16.4.0+3.2.13 url: https://github.com/cloudfoundry/haproxy-boshrelease/releases/download/v16.4.0+3.2.13/haproxy-16.4.0+3.2.13.tgz From 14394dc3264c38ba90cf96d2fd40272b5fee46ff Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 5 Mar 2026 10:51:42 +0100 Subject: [PATCH 03/60] CFN-6544: fix Docker CPI image in pipeline --- ci/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 75436cfd..8b58f69e 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -453,7 +453,7 @@ resources: - name: docker-cpi-image type: docker-image source: - repository: bosh/docker-cpi + repository: ghcr.io/cloudfoundry/bosh/docker-cpi - name: git-ci type: git From c07aa62d072cd68bd179f7679b9a77c2b8f1f682 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 13:11:17 +0100 Subject: [PATCH 04/60] CFN-6544: fixes --- acceptance-tests/run-local.sh | 18 ++++++++++++++++++ ci/Dockerfile | 1 + ci/scripts/start-bosh.sh | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 01d402f8..33946376 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -34,6 +34,20 @@ while getopts ":F:P:k" o; do done shift $((OPTIND-1)) +docker_mac_check_cgroupsv1() { + # Force cgroups v1 on Docker for Mac + # inspired by https://github.com/docker/for-mac/issues/6073#issuecomment-1018793677 + + SETTINGS=~/Library/Group\ Containers/group.com.docker/settings.json + + cgroupsV1Enabled=$(jq '.deprecatedCgroupv1' "$SETTINGS") + if [ "$cgroupsV1Enabled" != "true" ]; then + echo "deprecatedCgroupv1 should be enabled in $SETTINGS. Otherwise the acceptance tests will not run on Docker for Mac." 
+ echo "Check in the README.md for a convenient script to set deprecatedCgroupv1 and restart Docker." + exit 1 + fi +} + check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( @@ -74,6 +88,10 @@ check_required_files() { check_required_files +if [ "$(uname)" == "Darwin" ]; then + docker_mac_check_cgroupsv1 +fi + build_image "${REPO_DIR}/ci" prepare_docker_scratch diff --git a/ci/Dockerfile b/ci/Dockerfile index 34414d78..c61ec60d 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -7,6 +7,7 @@ RUN apt-get update && \ apt-get clean # Set bosh env at login +RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..d1aacf61 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do + sed -e 1d /proc/cgroups | while read -r sys enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From 6094de114922c06bc14d7194fb70cd258d502808 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 13:50:47 +0100 Subject: [PATCH 05/60] CFN-6544: rollback cgroup check change --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index d1aacf61..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys enabled; do + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From 024e286e85d80284c40ff9993c1c96866327e6ff Mon Sep 17 00:00:00 2001 From: 
Mike Yeromko Date: Mon, 9 Mar 2026 15:42:49 +0100 Subject: [PATCH 06/60] CFN-6544: new ops file (start with systemd) --- ci/Dockerfile | 3 ++- ci/{ => ops}/bosh-scaled-out.yml | 0 ci/{ => ops}/compilation.yml | 0 ci/ops/noble-support.yml | 6 ++++++ ci/scripts/start-bosh.sh | 1 + 5 files changed, 9 insertions(+), 1 deletion(-) rename ci/{ => ops}/bosh-scaled-out.yml (100%) rename ci/{ => ops}/compilation.yml (100%) create mode 100644 ci/ops/noble-support.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index c61ec60d..537fdaf8 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,7 +11,8 @@ RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml +COPY ops/noble-support.yml /usr/local/bosh-deployment/noble-support.yml +COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv diff --git a/ci/bosh-scaled-out.yml b/ci/ops/bosh-scaled-out.yml similarity index 100% rename from ci/bosh-scaled-out.yml rename to ci/ops/bosh-scaled-out.yml diff --git a/ci/compilation.yml b/ci/ops/compilation.yml similarity index 100% rename from ci/compilation.yml rename to ci/ops/compilation.yml diff --git a/ci/ops/noble-support.yml b/ci/ops/noble-support.yml new file mode 100644 index 00000000..beec1cbb --- /dev/null +++ b/ci/ops/noble-support.yml @@ -0,0 +1,6 @@ +- type: replace + path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? + value: true +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? 
+ value: true diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..cfa57dd3 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -239,6 +239,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$PWD/noble-support.yml" \ -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ From 6ce8e764e072408ef1622ff501347128523fac67 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 17:13:59 +0100 Subject: [PATCH 07/60] CFN-6544: rollback new ops file (start with systemd) and add verbosity --- ci/Dockerfile | 1 - ci/ops/noble-support.yml | 6 ------ ci/scripts/acceptance-tests | 12 +++++++++++- ci/scripts/start-bosh.sh | 1 - 4 files changed, 11 insertions(+), 9 deletions(-) delete mode 100644 ci/ops/noble-support.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 537fdaf8..da6d1ff3 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,7 +11,6 @@ RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY ops/noble-support.yml /usr/local/bosh-deployment/noble-support.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml # Install Python libraries needed for scripts diff --git a/ci/ops/noble-support.yml b/ci/ops/noble-support.yml deleted file mode 100644 index beec1cbb..00000000 --- a/ci/ops/noble-support.yml +++ /dev/null @@ -1,6 +0,0 @@ -- type: replace - path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? - value: true -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? 
- value: true diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 579a864a..589a46d7 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -48,6 +48,16 @@ else fi echo "------------------------------------------------------------------" -ginkgo -v "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" +echo "------------------------------------------------------------------" +if [ "${VERBOSITY:-}" = "vv" ]; then + VERBOSITY_FLAG="-vv" + echo "VERBOSITY is set to 'vv'. Will run ginkgo with '$VERBOSITY_FLAG'" +else + VERBOSITY_FLAG="-v" + echo "VERBOSITY is not set or unrecognised. Using default '$VERBOSITY_FLAG'" +fi +echo "------------------------------------------------------------------" + +ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" keep_running_info \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index cfa57dd3..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -239,7 +239,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/noble-support.yml" \ -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ From e78b590427cf47ec8d42e16cef984f12d7129cee Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 19:28:57 +0100 Subject: [PATCH 08/60] CFN-6544: debug cgroups --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..d1aacf61 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do + sed -e 1d /proc/cgroups | while 
read -r sys enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From c05baf1d18d6a8e44ab310fa85b4c3208d58c7bd Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 19:36:16 +0100 Subject: [PATCH 09/60] CFN-6544: rollback cgroups --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index d1aacf61..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys enabled; do + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From 34dca32e4e365b6537afe2214c850e4efa2be4b2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 08:52:29 +0100 Subject: [PATCH 10/60] CFN-6544: debugging --- acceptance-tests/bosh_helpers.go | 31 +++++++++++++++++++++++++++++-- ci/scripts/start-bosh.sh | 1 + 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index 8197b47c..645723b7 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -5,6 +5,7 @@ import ( "fmt" "io/ioutil" "os/exec" + "regexp" "strings" "time" @@ -152,10 +153,14 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) Expect(err).NotTo(HaveOccurred()) + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) + if expectSuccess { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) + if session.ExitCode() != 0 { + dumpBoshTaskDebug(session) + Fail(fmt.Sprintf("bosh deploy exited with code %d", session.ExitCode())) + } } else { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) 
Expect(session.ExitCode()).NotTo(BeZero()) } @@ -173,6 +178,28 @@ func dumpCmd(cmd *exec.Cmd) { writeLog("------------------------------------") } +// dumpBoshTaskDebug extracts the BOSH task number from session output and runs +// "bosh tasks --debug" to stream the full debug log into GinkgoWriter. +func dumpBoshTaskDebug(session *gexec.Session) { + combined := string(session.Out.Contents()) + string(session.Err.Contents()) + // Lines like: "Task 67 | 19:24:12 | ..." + re := regexp.MustCompile(`(?m)^\s*Task (\d+) \|`) + matches := re.FindStringSubmatch(combined) + if len(matches) < 2 { + writeLog("(could not extract BOSH task number from output for debug dump)") + return + } + taskNumber := matches[1] + By(fmt.Sprintf("Dumping BOSH task %s debug log", taskNumber)) + cmd := config.boshCmd("", "task", taskNumber, "--debug") + debugSession, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) + if err != nil { + writeLog(fmt.Sprintf("Failed to start bosh task debug: %s", err)) + return + } + Eventually(debugSession, 2*time.Minute, time.Second).Should(gexec.Exit()) +} + func dumpHAProxyConfig(haproxyInfo haproxyInfo) { By("Checking /var/vcap/jobs/haproxy/config/haproxy.config") haProxyConfig, _, err := runOnRemote(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, "cat /var/vcap/jobs/haproxy/config/haproxy.config") diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..82ea4080 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -267,6 +267,7 @@ EOF export BOSH_CLIENT=admin export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" + export BOSH_LOG_LEVEL=debug EOF echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" From afa66fe80177133c02e157013c6f4c7bb25c600f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 10:50:29 +0100 Subject: [PATCH 11/60] CFN-6544: debugging one test --- acceptance-tests/healthcheck_test.go | 12 
+++++++----- ci/scripts/acceptance-tests | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/acceptance-tests/healthcheck_test.go b/acceptance-tests/healthcheck_test.go index 93c20a75..60e74ea7 100644 --- a/acceptance-tests/healthcheck_test.go +++ b/acceptance-tests/healthcheck_test.go @@ -62,13 +62,15 @@ var _ = Describe("HTTP Health Check", func() { haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: backendDeploymentName, }, []string{}, map[string]interface{}{}, true) - defer deleteDeployment(backendDeploymentName) + // defer deleteDeployment(backendDeploymentName) - closeLocalServer, backendLocalPort := startDefaultTestServer() - defer closeLocalServer() + _, backendLocalPort := startDefaultTestServer() + //closeLocalServer, backendLocalPort := startDefaultTestServer() + // defer closeLocalServer() - closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) - defer closeTunnel() + _ := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + //closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + // defer closeTunnel() // Now deploy test HAProxy with 'haproxy-backend' configured as backend haproxyInfo, _ := deployHAProxy(baseManifestVars{ diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 589a46d7..e47fa0f3 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -4,6 +4,9 @@ set -e source "${REPO_ROOT}/ci/scripts/functions-ci.sh" START_DIR="${PWD}" # Differs for CI and manual execution + +FOCUS="Correctly starts if there is a healthy backend" + if [ -n "$FOCUS" ]; then echo "------------------------------------------------------------------" echo "FOCUS is set. 
Will only run tests matching '$FOCUS'" From bef1960588516597b1e3e9c56cc7d342f1e0957f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 11:05:15 +0100 Subject: [PATCH 12/60] CFN-6544: debugging one test, fixes --- acceptance-tests/healthcheck_test.go | 2 +- ci/scripts/start-bosh.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/acceptance-tests/healthcheck_test.go b/acceptance-tests/healthcheck_test.go index 60e74ea7..29aedc25 100644 --- a/acceptance-tests/healthcheck_test.go +++ b/acceptance-tests/healthcheck_test.go @@ -68,7 +68,7 @@ var _ = Describe("HTTP Health Check", func() { //closeLocalServer, backendLocalPort := startDefaultTestServer() // defer closeLocalServer() - _ := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) //closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) // defer closeTunnel() diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 82ea4080..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -267,7 +267,6 @@ EOF export BOSH_CLIENT=admin export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" - export BOSH_LOG_LEVEL=debug EOF echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" From e4898b7e6aa22abf41cd7601d8379c8c89f88353 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 11:35:09 +0100 Subject: [PATCH 13/60] CFN-6544: rollback task log dumping --- acceptance-tests/bosh_helpers.go | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index 645723b7..8197b47c 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -5,7 +5,6 @@ import ( "fmt" 
"io/ioutil" "os/exec" - "regexp" "strings" "time" @@ -153,14 +152,10 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) Expect(err).NotTo(HaveOccurred()) - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) - if expectSuccess { - if session.ExitCode() != 0 { - dumpBoshTaskDebug(session) - Fail(fmt.Sprintf("bosh deploy exited with code %d", session.ExitCode())) - } + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) } else { + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) Expect(session.ExitCode()).NotTo(BeZero()) } @@ -178,28 +173,6 @@ func dumpCmd(cmd *exec.Cmd) { writeLog("------------------------------------") } -// dumpBoshTaskDebug extracts the BOSH task number from session output and runs -// "bosh tasks --debug" to stream the full debug log into GinkgoWriter. -func dumpBoshTaskDebug(session *gexec.Session) { - combined := string(session.Out.Contents()) + string(session.Err.Contents()) - // Lines like: "Task 67 | 19:24:12 | ..." 
- re := regexp.MustCompile(`(?m)^\s*Task (\d+) \|`) - matches := re.FindStringSubmatch(combined) - if len(matches) < 2 { - writeLog("(could not extract BOSH task number from output for debug dump)") - return - } - taskNumber := matches[1] - By(fmt.Sprintf("Dumping BOSH task %s debug log", taskNumber)) - cmd := config.boshCmd("", "task", taskNumber, "--debug") - debugSession, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) - if err != nil { - writeLog(fmt.Sprintf("Failed to start bosh task debug: %s", err)) - return - } - Eventually(debugSession, 2*time.Minute, time.Second).Should(gexec.Exit()) -} - func dumpHAProxyConfig(haproxyInfo haproxyInfo) { By("Checking /var/vcap/jobs/haproxy/config/haproxy.config") haProxyConfig, _, err := runOnRemote(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, "cat /var/vcap/jobs/haproxy/config/haproxy.config") From 30e7a0cc8c7138658d97106309cb9048d0b333fb Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 12:51:48 +0100 Subject: [PATCH 14/60] CFN-6544: outbound fixes --- ci/scripts/start-bosh.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..b90f9ee3 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -105,11 +105,14 @@ function start_docker() { local certs_dir certs_dir="${1}" - export DNS_IP="8.8.8.8" + # Raise inotify limits so nested containers running systemd don't exhaust + # file descriptors. Systemd and containerd's cgroup-v2 event monitor both + # use inotify; the default max_user_instances (128) was too low. + sysctl -w fs.inotify.max_user_instances=1024 + sysctl -w fs.inotify.max_user_watches=524288 + sysctl -w net.ipv4.ip_forward=1 - # docker will fail starting with the new iptables. it throws: - # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... 
- update-alternatives --set iptables /usr/sbin/iptables-legacy + export DNS_IP="8.8.8.8" generate_certs "${certs_dir}" @@ -147,8 +150,10 @@ function start_docker() { "tlskey": "${certs_dir}/server-key.pem", "tlscacert": "${certs_dir}/ca.pem", "mtu": ${mtu}, + "dns": ["8.8.8.8", "8.8.4.4"], "data-root": "/scratch/docker", - "tlsverify": true + "tlsverify": true, + "ip-forward-no-drop": true } EOF From 6a20cd30423c243f9caed8c3406d7959b6905fdc Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 13:07:09 +0100 Subject: [PATCH 15/60] CFN-6544: outbound fixes --- ci/Dockerfile | 5 +- ci/ops/bosh-dns.yml | 98 ++++++++++++++++++++++++++++++++++++++++ ci/scripts/start-bosh.sh | 10 +++- 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 ci/ops/bosh-dns.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index da6d1ff3..8835c870 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,7 +11,10 @@ RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml +RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease +COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml +COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml +COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv diff --git a/ci/ops/bosh-dns.yml b/ci/ops/bosh-dns.yml new file mode 100644 index 00000000..11113091 --- /dev/null +++ b/ci/ops/bosh-dns.yml @@ -0,0 +1,98 @@ +- type: replace + path: /addons?/name=bosh-dns-systemd? 
+ value: + include: + stemcell: + - os: ubuntu-noble + jobs: + - name: bosh-dns + properties: + configure_systemd_resolved: true + disable_recursors: true + override_nameserver: false + api: + client: + tls: ((dns_api_client_tls)) + server: + tls: ((dns_api_server_tls)) + health: + client: + tls: ((dns_healthcheck_client_tls)) + enabled: true + server: + tls: ((dns_healthcheck_server_tls)) + cache: + enabled: true + release: bosh-dns + name: bosh-dns-systemd +- type: replace + path: /releases/name=bosh-dns? + value: + name: bosh-dns + sha1: 494d9e6ff68909a3aaddf146464dd4599f9f16a8 + url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 + version: 1.39.21 +- type: replace + path: /variables/name=dns_healthcheck_tls_ca? + value: + name: dns_healthcheck_tls_ca + options: + common_name: dns-healthcheck-tls-ca + is_ca: true + type: certificate +- type: replace + path: /variables/name=dns_healthcheck_server_tls? + value: + name: dns_healthcheck_server_tls + options: + alternative_names: + - health.bosh-dns + ca: dns_healthcheck_tls_ca + common_name: health.bosh-dns + extended_key_usage: + - server_auth + type: certificate +- type: replace + path: /variables/name=dns_healthcheck_client_tls? + value: + name: dns_healthcheck_client_tls + options: + alternative_names: + - health.bosh-dns + ca: dns_healthcheck_tls_ca + common_name: health.bosh-dns + extended_key_usage: + - client_auth + type: certificate +- type: replace + path: /variables/name=dns_api_tls_ca? + value: + name: dns_api_tls_ca + options: + common_name: dns-api-tls-ca + is_ca: true + type: certificate +- type: replace + path: /variables/name=dns_api_server_tls? + value: + name: dns_api_server_tls + options: + alternative_names: + - api.bosh-dns + ca: dns_api_tls_ca + common_name: api.bosh-dns + extended_key_usage: + - server_auth + type: certificate +- type: replace + path: /variables/name=dns_api_client_tls? 
+ value: + name: dns_api_client_tls + options: + alternative_names: + - api.bosh-dns + ca: dns_api_tls_ca + common_name: api.bosh-dns + extended_key_usage: + - client_auth + type: certificate \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index b90f9ee3..e4877ec1 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -239,12 +239,15 @@ EOF } EOF + local ops_files_dir="$PWD/haproxy-boshrelease" + echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/bosh-scaled-out.yml" \ + -o "$ops_files_dir/bosh-dns.yml" \ + -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ @@ -277,7 +280,10 @@ EOF source "${local_bosh_dir}/env" echo "Updating BOSH cloud config with Docker network..." >&2 - bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" + bosh -n update-cloud-config \ + docker/cloud-config.yml \ + -o "$ops_files_dir/compilation.yml" \ + -v network="${docker_network_name}" popd > /dev/null } From a3a8866b3f7fc6a9d09c43f344909d4c3b7b3a89 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 16:13:50 +0100 Subject: [PATCH 16/60] CFN-6544: increase canary watch timeout --- manifests/haproxy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 3b09636e..a80b5430 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -25,8 +25,8 @@ instance_groups: update: canaries: 1 max_in_flight: 1 - canary_watch_time: 1000-30000 - update_watch_time: 1000-30000 + canary_watch_time: 1000-60000 + update_watch_time: 1000-60000 serial: false stemcells: From 65c6e0c892ce2da34788d1cd8d8e57419a97f685 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 16:32:10 +0100 
Subject: [PATCH 17/60] CFN-6544: increase canary watch timeout --- manifests/haproxy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index a80b5430..e0de40a1 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -25,8 +25,8 @@ instance_groups: update: canaries: 1 max_in_flight: 1 - canary_watch_time: 1000-60000 - update_watch_time: 1000-60000 + canary_watch_time: 1000-120000 + update_watch_time: 1000-120000 serial: false stemcells: From 369fc83faa7aa7277652d4915711e6f4fbecb454 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 19:43:55 +0100 Subject: [PATCH 18/60] CFN-6544: increase canary watch timeout for Bosh deployment --- ci/Dockerfile | 1 + ci/ops/bosh-watch-time.yml | 6 ++++++ ci/scripts/start-bosh.sh | 1 + 3 files changed, 8 insertions(+) create mode 100644 ci/ops/bosh-watch-time.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 8835c870..9647e2f1 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -14,6 +14,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml +COPY ops/bosh-watch-time.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-watch-time.yml COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml new file mode 100644 index 00000000..cae9c925 --- /dev/null +++ b/ci/ops/bosh-watch-time.yml @@ -0,0 +1,6 @@ +- type: replace + path: /update?/canary_watch_time? + value: 60000-1200000 +- type: replace + path: /update?/update_watch_time? 
+ value: 60000-1200000 diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e4877ec1..4055c8c7 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -248,6 +248,7 @@ EOF -o /usr/local/local-releases.yml \ -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ + -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 267a285788e00d4c394a9eae5dfa1d1410413992 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 21:07:34 +0100 Subject: [PATCH 19/60] CFN-6544: increase canary watch timeout for Bosh deployment --- ci/ops/bosh-watch-time.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml index cae9c925..249f65f2 100644 --- a/ci/ops/bosh-watch-time.yml +++ b/ci/ops/bosh-watch-time.yml @@ -1,6 +1,9 @@ - type: replace path: /update?/canary_watch_time? - value: 60000-1200000 + value: 60000-600000 - type: replace path: /update?/update_watch_time? - value: 60000-1200000 + value: 60000-600000 +- type: replace + path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? 
+ value: 60 \ No newline at end of file From bf45ecddd49c62b7bd63641a119433f26591e81e Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 23:43:45 +0100 Subject: [PATCH 20/60] CFN-6544: rollback dns ops file --- ci/scripts/start-bosh.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 4055c8c7..d151f693 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,7 +246,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ From c22ca235f13b3de1fd3fe73451abf9c85a6d9ee2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 00:01:44 +0100 Subject: [PATCH 21/60] CFN-6544: increase director db connection timeout --- ci/ops/bosh-watch-time.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml index 249f65f2..6234b1c5 100644 --- a/ci/ops/bosh-watch-time.yml +++ b/ci/ops/bosh-watch-time.yml @@ -6,4 +6,4 @@ value: 60000-600000 - type: replace path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? 
- value: 60 \ No newline at end of file + value: 120 \ No newline at end of file From 30d5013956c0a8ed9c0d139231202a1a1f2ddb63 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 09:13:13 +0100 Subject: [PATCH 22/60] CFN-6544: extra param --- ci/scripts/acceptance-tests | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index e47fa0f3..92c0a600 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -61,6 +61,15 @@ else fi echo "------------------------------------------------------------------" -ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" +echo "------------------------------------------------------------------" +if [ -n "${FLAKE_ATTEMPTS:-}" ]; then + echo "FLAKE_ATTEMPTS is set. Will run ginkgo with '--flake-attempts=$FLAKE_ATTEMPTS'" +else + FLAKE_ATTEMPTS=5 + echo "FLAKE_ATTEMPTS is not set. 
Using default '$FLAKE_ATTEMPTS'" +fi +echo "------------------------------------------------------------------" + +ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts "$FLAKE_ATTEMPTS" "${ADDITIONAL_ARGS[@]}" keep_running_info \ No newline at end of file From f1d2cd0302690bace8bc7e35ce5d6d40ce24163f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 11:10:11 +0100 Subject: [PATCH 23/60] CFN-6544: cgroupns mode --- ci/Dockerfile | 1 + ci/ops/bosh-cgroup.yml | 14 ++++++++++++++ ci/ops/bosh-watch-time.yml | 5 ++++- ci/scripts/start-bosh.sh | 2 ++ 4 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 ci/ops/bosh-cgroup.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 9647e2f1..301c0c43 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,6 +12,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease +COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml COPY ops/bosh-watch-time.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-watch-time.yml diff --git a/ci/ops/bosh-cgroup.yml b/ci/ops/bosh-cgroup.yml new file mode 100644 index 00000000..b5238806 --- /dev/null +++ b/ci/ops/bosh-cgroup.yml @@ -0,0 +1,14 @@ +# When the Concourse worker host uses cgroups v2 (unified hierarchy), +# Docker containers started by the Docker CPI default to a private cgroup +# namespace. 
BPM's runc then tries to create scopes under +# /sys/fs/cgroup/systemd/ (a cgroups v1 path) which does not exist in +# the container, causing: +# openat2 /sys/fs/cgroup/systemd/.../cgroup.procs: no such file or directory +# +# Setting cgroupns_mode to "host" makes the BOSH director container share +# the host cgroup namespace so that /sys/fs/cgroup/systemd/ is visible +# and BPM/runc can write to it. +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/container?/cgroupns_mode? + value: host + diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml index 6234b1c5..ac774784 100644 --- a/ci/ops/bosh-watch-time.yml +++ b/ci/ops/bosh-watch-time.yml @@ -6,4 +6,7 @@ value: 60000-600000 - type: replace path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? - value: 120 \ No newline at end of file + value: 120 +- type: replace + path: /instance_groups/name=bosh/properties/director/db/connection_options?/connect_timeout? + value: 30 \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index d151f693..e1c75390 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,6 +246,8 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$ops_files_dir/bosh-cgroup.yml" \ + -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ From 803e842443d6a4b72c0e6d1e51f64f6b7665e0b1 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 12:21:51 +0100 Subject: [PATCH 24/60] CFN-6544: privileged --- ci/ops/bosh-cgroup.yml | 14 +++----------- ci/scripts/start-bosh.sh | 1 - 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/ci/ops/bosh-cgroup.yml b/ci/ops/bosh-cgroup.yml index b5238806..1796f4a0 100644 --- a/ci/ops/bosh-cgroup.yml +++ b/ci/ops/bosh-cgroup.yml @@ -1,14 +1,6 @@ -# When the Concourse worker host uses cgroups v2 
(unified hierarchy), -# Docker containers started by the Docker CPI default to a private cgroup -# namespace. BPM's runc then tries to create scopes under -# /sys/fs/cgroup/systemd/ (a cgroups v1 path) which does not exist in -# the container, causing: -# openat2 /sys/fs/cgroup/systemd/.../cgroup.procs: no such file or directory -# -# Setting cgroupns_mode to "host" makes the BOSH director container share -# the host cgroup namespace so that /sys/fs/cgroup/systemd/ is visible -# and BPM/runc can write to it. - type: replace path: /instance_groups/name=bosh/properties/docker_cpi/container?/cgroupns_mode? value: host - +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/container?/privileged? + value: true diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e1c75390..c07d4363 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -247,7 +247,6 @@ EOF -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ -o "$ops_files_dir/bosh-cgroup.yml" \ - -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ From f4553ce057aa0a81c072f35145cda60786299ff7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 14:05:41 +0100 Subject: [PATCH 25/60] CFN-6544: rollback ops files --- ci/scripts/start-bosh.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index c07d4363..402b6626 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,9 +246,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$ops_files_dir/bosh-cgroup.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ - -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 8c85a98c07395927189ad27481c2dc82b538f0e6 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 
2026 16:33:39 +0100 Subject: [PATCH 26/60] CFN-6544: cgroupns mode for all VMs --- ci/Dockerfile | 1 + ci/ops/cloud-config-cgroup.yml | 7 +++++++ ci/scripts/start-bosh.sh | 2 ++ 3 files changed, 10 insertions(+) create mode 100644 ci/ops/cloud-config-cgroup.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 301c0c43..0d8d7c92 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,6 +12,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease +COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml diff --git a/ci/ops/cloud-config-cgroup.yml b/ci/ops/cloud-config-cgroup.yml new file mode 100644 index 00000000..442254b2 --- /dev/null +++ b/ci/ops/cloud-config-cgroup.yml @@ -0,0 +1,7 @@ +- type: replace + path: /vm_types/name=default/cloud_properties/privileged? + value: true +- type: replace + path: /vm_types/name=default/cloud_properties/cgroupns_mode? 
+ value: host + diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 402b6626..887dd11c 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,6 +246,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$ops_files_dir/bosh-cgroup.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ @@ -282,6 +283,7 @@ EOF bosh -n update-cloud-config \ docker/cloud-config.yml \ -o "$ops_files_dir/compilation.yml" \ + -o "$ops_files_dir/cloud-config-cgroup.yml" \ -v network="${docker_network_name}" popd > /dev/null From 3fcf4bc6d501fa9e0af085c36868d4cd26721196 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 17:40:39 +0100 Subject: [PATCH 27/60] CFN-6544: cgroupfs driver --- ci/Dockerfile | 4 +- ci/ops/bosh-dns.yml | 98 -------------------------------------- ci/ops/bosh-watch-time.yml | 12 ----- ci/scripts/start-bosh.sh | 3 +- 4 files changed, 3 insertions(+), 114 deletions(-) delete mode 100644 ci/ops/bosh-dns.yml delete mode 100644 ci/ops/bosh-watch-time.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 0d8d7c92..c1eb1a1f 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,11 +12,9 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease -COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml -COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml -COPY ops/bosh-watch-time.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-watch-time.yml +COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/compilation.yml 
/usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-dns.yml b/ci/ops/bosh-dns.yml deleted file mode 100644 index 11113091..00000000 --- a/ci/ops/bosh-dns.yml +++ /dev/null @@ -1,98 +0,0 @@ -- type: replace - path: /addons?/name=bosh-dns-systemd? - value: - include: - stemcell: - - os: ubuntu-noble - jobs: - - name: bosh-dns - properties: - configure_systemd_resolved: true - disable_recursors: true - override_nameserver: false - api: - client: - tls: ((dns_api_client_tls)) - server: - tls: ((dns_api_server_tls)) - health: - client: - tls: ((dns_healthcheck_client_tls)) - enabled: true - server: - tls: ((dns_healthcheck_server_tls)) - cache: - enabled: true - release: bosh-dns - name: bosh-dns-systemd -- type: replace - path: /releases/name=bosh-dns? - value: - name: bosh-dns - sha1: 494d9e6ff68909a3aaddf146464dd4599f9f16a8 - url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 - version: 1.39.21 -- type: replace - path: /variables/name=dns_healthcheck_tls_ca? - value: - name: dns_healthcheck_tls_ca - options: - common_name: dns-healthcheck-tls-ca - is_ca: true - type: certificate -- type: replace - path: /variables/name=dns_healthcheck_server_tls? - value: - name: dns_healthcheck_server_tls - options: - alternative_names: - - health.bosh-dns - ca: dns_healthcheck_tls_ca - common_name: health.bosh-dns - extended_key_usage: - - server_auth - type: certificate -- type: replace - path: /variables/name=dns_healthcheck_client_tls? - value: - name: dns_healthcheck_client_tls - options: - alternative_names: - - health.bosh-dns - ca: dns_healthcheck_tls_ca - common_name: health.bosh-dns - extended_key_usage: - - client_auth - type: certificate -- type: replace - path: /variables/name=dns_api_tls_ca? - value: - name: dns_api_tls_ca - options: - common_name: dns-api-tls-ca - is_ca: true - type: certificate -- type: replace - path: /variables/name=dns_api_server_tls? 
- value: - name: dns_api_server_tls - options: - alternative_names: - - api.bosh-dns - ca: dns_api_tls_ca - common_name: api.bosh-dns - extended_key_usage: - - server_auth - type: certificate -- type: replace - path: /variables/name=dns_api_client_tls? - value: - name: dns_api_client_tls - options: - alternative_names: - - api.bosh-dns - ca: dns_api_tls_ca - common_name: api.bosh-dns - extended_key_usage: - - client_auth - type: certificate \ No newline at end of file diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml deleted file mode 100644 index ac774784..00000000 --- a/ci/ops/bosh-watch-time.yml +++ /dev/null @@ -1,12 +0,0 @@ -- type: replace - path: /update?/canary_watch_time? - value: 60000-600000 -- type: replace - path: /update?/update_watch_time? - value: 60000-600000 -- type: replace - path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? - value: 120 -- type: replace - path: /instance_groups/name=bosh/properties/director/db/connection_options?/connect_timeout? 
- value: 30 \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 887dd11c..a9e92491 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -153,7 +153,8 @@ function start_docker() { "dns": ["8.8.8.8", "8.8.4.4"], "data-root": "/scratch/docker", "tlsverify": true, - "ip-forward-no-drop": true + "ip-forward-no-drop": true, + "exec-opts": ["native.cgroupdriver=cgroupfs"] } EOF From 3adbce5c5fb783bd741302df859a7500e6dfcdc7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 10:03:31 +0100 Subject: [PATCH 28/60] CFN-6544: run all tests --- ci/ops/cloud-config-cgroup.yml | 4 ++-- ci/scripts/acceptance-tests | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ci/ops/cloud-config-cgroup.yml b/ci/ops/cloud-config-cgroup.yml index 442254b2..2b5d3c58 100644 --- a/ci/ops/cloud-config-cgroup.yml +++ b/ci/ops/cloud-config-cgroup.yml @@ -1,7 +1,7 @@ - type: replace - path: /vm_types/name=default/cloud_properties/privileged? + path: /vm_types/name=default/cloud_properties?/privileged? value: true - type: replace - path: /vm_types/name=default/cloud_properties/cgroupns_mode? + path: /vm_types/name=default/cloud_properties?/cgroupns_mode? value: host diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 92c0a600..d6e35394 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -4,9 +4,6 @@ set -e source "${REPO_ROOT}/ci/scripts/functions-ci.sh" START_DIR="${PWD}" # Differs for CI and manual execution - -FOCUS="Correctly starts if there is a healthy backend" - if [ -n "$FOCUS" ]; then echo "------------------------------------------------------------------" echo "FOCUS is set. 
Will only run tests matching '$FOCUS'" From 41a9b2107ecece6990765913fd362a5bb9ac4447 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 13:28:26 +0100 Subject: [PATCH 29/60] CFN-6544: add attempts of bosh env creation --- acceptance-tests/healthcheck_test.go | 12 +++++------- ci/scripts/start-bosh.sh | 27 ++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/acceptance-tests/healthcheck_test.go b/acceptance-tests/healthcheck_test.go index 29aedc25..93c20a75 100644 --- a/acceptance-tests/healthcheck_test.go +++ b/acceptance-tests/healthcheck_test.go @@ -62,15 +62,13 @@ var _ = Describe("HTTP Health Check", func() { haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: backendDeploymentName, }, []string{}, map[string]interface{}{}, true) - // defer deleteDeployment(backendDeploymentName) + defer deleteDeployment(backendDeploymentName) - _, backendLocalPort := startDefaultTestServer() - //closeLocalServer, backendLocalPort := startDefaultTestServer() - // defer closeLocalServer() + closeLocalServer, backendLocalPort := startDefaultTestServer() + defer closeLocalServer() - setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) - //closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) - // defer closeTunnel() + closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + defer closeTunnel() // Now deploy test HAProxy with 'haproxy-backend' configured as backend haproxyInfo, _ := deployHAProxy(baseManifestVars{ diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index a9e92491..50fe829b 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -259,9 +259,30 @@ EOF "${@}" > "${local_bosh_dir}/bosh-director.yml" echo "Creating BOSH director environment..." 
>&2 - bosh create-env "${local_bosh_dir}/bosh-director.yml" \ - --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" + local create_env_rc=1 + local max_attempts=${FLAKE_ATTEMPTS:-5} + local attempt_interval=30 + for attempt in $(seq 1 $max_attempts); do + echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 + bosh create-env "${local_bosh_dir}/bosh-director.yml" \ + --vars-store="${local_bosh_dir}/creds.yml" \ + --state="${local_bosh_dir}/state.json" + + create_env_rc=$? + if [ "${create_env_rc}" -eq "0" ]; then + echo "bosh create-env succeeded on attempt ${attempt}" >&2 + break + fi + echo "bosh create-env failed on attempt ${attempt} (exit code ${create_env_rc})" >&2 + if [ "${attempt}" -lt "${max_attempts}" ]; then + echo "Retrying in ${attempt_interval} seconds..." >&2 + sleep ${attempt_interval} + fi + done + if [ "${create_env_rc}" -ne "0" ]; then + echo "bosh create-env failed after ${max_attempts} attempts. Exiting." >&2 + exit 1 + fi echo "Extracting BOSH director credentials and CA certificate..." 
>&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" From a646ccfa0a7d907171cfcbbfc3aa7f22e74934f2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 14:44:56 +0100 Subject: [PATCH 30/60] CFN-6544: add attempts of bosh env creation, fix; remove unnecessary ops files --- ci/Dockerfile | 2 -- ci/ops/bosh-cgroup.yml | 6 ------ ci/ops/cloud-config-cgroup.yml | 7 ------- ci/scripts/start-bosh.sh | 6 +++--- 4 files changed, 3 insertions(+), 18 deletions(-) delete mode 100644 ci/ops/bosh-cgroup.yml delete mode 100644 ci/ops/cloud-config-cgroup.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index c1eb1a1f..b3bd5c48 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,9 +12,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease -COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml -COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-cgroup.yml b/ci/ops/bosh-cgroup.yml deleted file mode 100644 index 1796f4a0..00000000 --- a/ci/ops/bosh-cgroup.yml +++ /dev/null @@ -1,6 +0,0 @@ -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/container?/cgroupns_mode? - value: host -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/container?/privileged? - value: true diff --git a/ci/ops/cloud-config-cgroup.yml b/ci/ops/cloud-config-cgroup.yml deleted file mode 100644 index 2b5d3c58..00000000 --- a/ci/ops/cloud-config-cgroup.yml +++ /dev/null @@ -1,7 +0,0 @@ -- type: replace - path: /vm_types/name=default/cloud_properties?/privileged? 
- value: true -- type: replace - path: /vm_types/name=default/cloud_properties?/cgroupns_mode? - value: host - diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 50fe829b..ca6b8a9c 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -247,7 +247,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$ops_files_dir/bosh-cgroup.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ @@ -264,11 +263,13 @@ EOF local attempt_interval=30 for attempt in $(seq 1 $max_attempts); do echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 + set +e. # disables abort-on-error bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" - create_env_rc=$? + set -e + if [ "${create_env_rc}" -eq "0" ]; then echo "bosh create-env succeeded on attempt ${attempt}" >&2 break @@ -305,7 +306,6 @@ EOF bosh -n update-cloud-config \ docker/cloud-config.yml \ -o "$ops_files_dir/compilation.yml" \ - -o "$ops_files_dir/cloud-config-cgroup.yml" \ -v network="${docker_network_name}" popd > /dev/null From 485ed5be2dca2837d3eae1dc0d7d4fd1470c5a6a Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 14:54:32 +0100 Subject: [PATCH 31/60] CFN-6544: typo --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ca6b8a9c..2f84f6b6 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -263,7 +263,7 @@ EOF local attempt_interval=30 for attempt in $(seq 1 $max_attempts); do echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 - set +e. 
# disables abort-on-error + set +e # disables abort-on-error bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" From c0aa7bd07f828414d51606e83b8faf3b45ba88d0 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 17:25:29 +0100 Subject: [PATCH 32/60] CFN-6544: redeploy after error to protect against flakiness --- .../acceptance_tests_suite_test.go | 7 --- acceptance-tests/bosh_helpers.go | 50 ++++++++++++++++--- acceptance-tests/config.go | 14 ++++++ acceptance-tests/log_helpers.go | 15 ++++++ 4 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 acceptance-tests/log_helpers.go diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index 93e2ca2b..ab937416 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -247,10 +247,3 @@ func checkNetOpErr(err error, expectString string) { Expect(errors.As(tlsErr, &opErr)).To(BeTrue()) Expect(opErr.Err.Error()).To(ContainSubstring(expectString)) } - -func writeLog(s string) { - ginkgoConfig, _ := GinkgoConfiguration() - for _, line := range strings.Split(s, "\n") { - fmt.Printf("node %d/%d: %s\n", ginkgoConfig.ParallelProcess, ginkgoConfig.ParallelTotal, line) - } -} diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index 8197b47c..f6fa4022 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -149,13 +149,9 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c cmd, varsStoreReader := deployBaseManifestCmd(baseManifestVars.deploymentName, opsfiles, manifestVars) dumpCmd(cmd) - session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) - Expect(err).NotTo(HaveOccurred()) + session := deployWithRetry(baseManifestVars.deploymentName, cmd, 20*time.Minute, expectSuccess) - if expectSuccess { - 
Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) - } else { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) + if !expectSuccess { Expect(session.ExitCode()).NotTo(BeZero()) } @@ -167,6 +163,48 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c return haproxyInfo, varsStoreReader } +// deployWithRetry runs a bosh deploy command and retries up to config.FlakeAttempts times. +// On each failed attempt the deployment is deleted before retrying, so the next attempt starts clean. +// If expectSuccess is false the command is run once without retrying (failure is expected by the caller). +func deployWithRetry(boshDeployment string, cmd *exec.Cmd, timeout time.Duration, expectSuccess bool) *gexec.Session { + var session *gexec.Session + var err error + + for attempt := 1; attempt <= config.FlakeAttempts; attempt++ { + if attempt > 1 { + writeLog(fmt.Sprintf("Deployment attempt %d/%d failed, deleting deployment before retry...", attempt-1, config.FlakeAttempts)) + deleteDeployment(boshDeployment) + + writeLog(fmt.Sprintf("Retrying deployment (attempt %d/%d)...", attempt, config.FlakeAttempts)) + newCmd := exec.Command(cmd.Path, cmd.Args[1:]...) 
+ newCmd.Env = cmd.Env + cmd = newCmd + } + + session, err = gexec.Start(cmd, GinkgoWriter, GinkgoWriter) + Expect(err).NotTo(HaveOccurred()) + + // Wait for the process to exit without asserting the exit code + Eventually(session, timeout, time.Second).Should(gexec.Exit()) + + if !expectSuccess { + // caller expects failure — return immediately without retrying + return session + } + + if session.ExitCode() == 0 { + writeLog(fmt.Sprintf("Deployment succeeded on attempt %d/%d", attempt, config.FlakeAttempts)) + return session + } + + writeLog(fmt.Sprintf("Deployment failed on attempt %d/%d (exit code %d)", attempt, config.FlakeAttempts, session.ExitCode())) + } + + // All attempts exhausted — fail the test with a clear message + Expect(session.ExitCode()).To(BeZero(), fmt.Sprintf("Deployment failed after %d attempt(s)", config.FlakeAttempts)) + return session +} + func dumpCmd(cmd *exec.Cmd) { writeLog("---------- Command to run ----------") writeLog(cmd.String()) diff --git a/acceptance-tests/config.go b/acceptance-tests/config.go index 5d3c6bce..aa27d4c8 100644 --- a/acceptance-tests/config.go +++ b/acceptance-tests/config.go @@ -4,10 +4,13 @@ import ( "fmt" "os" "os/exec" + "strconv" ) var config Config +const DEFAULT_FLAKE_ATTEMPTS = 5 + type Config struct { ReleaseRepoPath string `json:"releaseRepoPath"` ReleaseVersion string `json:"releaseVersion"` @@ -18,6 +21,7 @@ type Config struct { BoshPath string `json:"boshPath"` BaseManifestPath string `json:"baseManifestPath"` HomePath string `json:"homePath"` + FlakeAttempts int `json:"flakeAttempts"` } func loadConfig() (Config, error) { @@ -67,6 +71,15 @@ func loadConfig() (Config, error) { return Config{}, err } + flakeAttempts := DEFAULT_FLAKE_ATTEMPTS + if val := os.Getenv("FLAKE_ATTEMPTS"); val != "" { + if flakeAttemptsFromEnv, err := strconv.Atoi(val); err == nil && flakeAttemptsFromEnv > 0 { + flakeAttempts = flakeAttemptsFromEnv + } else { + writeLog(fmt.Sprintf("FLAKE_ATTEMPTS must be a positive integer, 
but got: %s, so defaulting test suite's flakeAttempts to %d", val, DEFAULT_FLAKE_ATTEMPTS)) + } + } + return Config{ ReleaseRepoPath: releaseRepoPath, ReleaseVersion: releaseVersion, @@ -77,6 +90,7 @@ func loadConfig() (Config, error) { BoshPath: boshPath, BaseManifestPath: baseManifestPath, HomePath: homePath, + FlakeAttempts: flakeAttempts, }, nil } diff --git a/acceptance-tests/log_helpers.go b/acceptance-tests/log_helpers.go new file mode 100644 index 00000000..d134aca5 --- /dev/null +++ b/acceptance-tests/log_helpers.go @@ -0,0 +1,15 @@ +package acceptance_tests + +import ( + "fmt" + "strings" + + . "github.com/onsi/ginkgo/v2" +) + +func writeLog(s string) { + ginkgoConfig, _ := GinkgoConfiguration() + for _, line := range strings.Split(s, "\n") { + fmt.Printf("node %d/%d: %s\n", ginkgoConfig.ParallelProcess, ginkgoConfig.ParallelTotal, line) + } +} From 36dcd2a972087cfb8ae6da6ffc343cd6b3e55987 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 18:34:20 +0100 Subject: [PATCH 33/60] CFN-6544: workaround to make bosh start --- ci/scripts/start-bosh.sh | 84 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 2f84f6b6..a2b62f62 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -260,10 +260,10 @@ EOF echo "Creating BOSH director environment..." >&2 local create_env_rc=1 local max_attempts=${FLAKE_ATTEMPTS:-5} - local attempt_interval=30 + local bpm_restart_timeout=120 for attempt in $(seq 1 $max_attempts); do echo "bosh create-env attempt ${attempt}/${max_attempts}..." 
>&2 - set +e # disables abort-on-error + set +e bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" @@ -274,12 +274,84 @@ EOF echo "bosh create-env succeeded on attempt ${attempt}" >&2 break fi - echo "bosh create-env failed on attempt ${attempt} (exit code ${create_env_rc})" >&2 - if [ "${attempt}" -lt "${max_attempts}" ]; then - echo "Retrying in ${attempt_interval} seconds..." >&2 - sleep ${attempt_interval} + + echo "bosh create-env failed on attempt ${attempt}/${max_attempts} (exit code ${create_env_rc})" >&2 + if [ "${attempt}" -ge "${max_attempts}" ]; then + break + fi + + # The VM already exists but its jobs are in 'failing' state. + # Re-running create-env from scratch wastes ~6 minutes recompiling packages. + # Instead, find the director container and restart BPM inside it so monit + # can bring the jobs back up — then let create-env verify the running state. + local director_container + director_container=$(docker ps --format "{{.ID}}" | head -1) + if [ -n "${director_container}" ]; then + echo "Found director container ${director_container}, restarting BPM jobs..." >&2 + set +e + # runc delete --force fails because the container cgroup scope dirs + # (system.slice/runc-*.scope) are owned by the host systemd and cannot + # be rmdir-d from inside the nested container, even when they are empty. + # + # BPM only needs the runc state dir to be gone before it can re-create + # the container. So: remove the state dir directly, bypassing runc delete. + # The orphaned cgroup scope dirs will be cleaned up by the host systemd + # garbage collector once there are no more references to them. 
+ docker exec "${director_container}" bash -c ' + runc_bin=/var/vcap/packages/bpm/bin/runc + runc_root=/var/vcap/sys/run/bpm-runc + + for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do + # postgres must keep running — the director depends on it + [ "${container_id}" = "bpm-postgres" ] && continue + echo "Cleaning up runc container: ${container_id}" >&2 + rm -rf "${runc_root:?}/${container_id}" + done + ' + # Restart all monitored jobs except postgres (which must keep running + # as the director database — restarting it would cause data loss risk + # and break the director on the next attempt). + docker exec "${director_container}" bash -c ' + /var/vcap/bosh/bin/monit summary | awk "/Process/{print \$2}" | tr -d "'"'"'" | \ + while read -r job; do + [ "${job}" = "postgres" ] && continue + echo "Restarting monit job: ${job}" >&2 + /var/vcap/bosh/bin/monit restart "${job}" || true + done + ' + set -e + + echo "Waiting up to ${bpm_restart_timeout}s for director jobs to recover..." >&2 + local elapsed=0 + local recovered=false + while [ "${elapsed}" -lt "${bpm_restart_timeout}" ]; do + sleep 10 + elapsed=$((elapsed + 10)) + set +e + local status + status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) + set -e + local failing + failing=$(echo "${status}" | grep -c "not monitored\|does not exist\|failed\|stopped" || true) + if [ "${failing}" -eq "0" ]; then + echo "All director jobs are running after BPM restart (${elapsed}s)" >&2 + recovered=true + break + fi + echo "Still waiting for jobs... (${elapsed}s)" >&2 + done + + if [ "${recovered}" = "true" ]; then + echo "BPM recovery succeeded, re-running create-env to verify state..." >&2 + else + echo "BPM recovery did not complete within ${bpm_restart_timeout}s, re-running create-env anyway..." >&2 + fi + else + echo "Director container not found at ${BOSH_DIRECTOR_IP}, re-running create-env from scratch..." 
>&2 + sleep 10 fi done + if [ "${create_env_rc}" -ne "0" ]; then echo "bosh create-env failed after ${max_attempts} attempts. Exiting." >&2 exit 1 From e47b2485a8d1b9059d1cc7904bf9d159f0fa7437 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 09:09:52 +0100 Subject: [PATCH 34/60] CFN-6544: rollback the workaround making bosh start --- ci/scripts/start-bosh.sh | 101 ++------------------------------------- 1 file changed, 3 insertions(+), 98 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index a2b62f62..11723d61 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -258,104 +258,9 @@ EOF "${@}" > "${local_bosh_dir}/bosh-director.yml" echo "Creating BOSH director environment..." >&2 - local create_env_rc=1 - local max_attempts=${FLAKE_ATTEMPTS:-5} - local bpm_restart_timeout=120 - for attempt in $(seq 1 $max_attempts); do - echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 - set +e - bosh create-env "${local_bosh_dir}/bosh-director.yml" \ - --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" - create_env_rc=$? - set -e - - if [ "${create_env_rc}" -eq "0" ]; then - echo "bosh create-env succeeded on attempt ${attempt}" >&2 - break - fi - - echo "bosh create-env failed on attempt ${attempt}/${max_attempts} (exit code ${create_env_rc})" >&2 - if [ "${attempt}" -ge "${max_attempts}" ]; then - break - fi - - # The VM already exists but its jobs are in 'failing' state. - # Re-running create-env from scratch wastes ~6 minutes recompiling packages. - # Instead, find the director container and restart BPM inside it so monit - # can bring the jobs back up — then let create-env verify the running state. - local director_container - director_container=$(docker ps --format "{{.ID}}" | head -1) - if [ -n "${director_container}" ]; then - echo "Found director container ${director_container}, restarting BPM jobs..." 
>&2 - set +e - # runc delete --force fails because the container cgroup scope dirs - # (system.slice/runc-*.scope) are owned by the host systemd and cannot - # be rmdir-d from inside the nested container, even when they are empty. - # - # BPM only needs the runc state dir to be gone before it can re-create - # the container. So: remove the state dir directly, bypassing runc delete. - # The orphaned cgroup scope dirs will be cleaned up by the host systemd - # garbage collector once there are no more references to them. - docker exec "${director_container}" bash -c ' - runc_bin=/var/vcap/packages/bpm/bin/runc - runc_root=/var/vcap/sys/run/bpm-runc - - for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do - # postgres must keep running — the director depends on it - [ "${container_id}" = "bpm-postgres" ] && continue - echo "Cleaning up runc container: ${container_id}" >&2 - rm -rf "${runc_root:?}/${container_id}" - done - ' - # Restart all monitored jobs except postgres (which must keep running - # as the director database — restarting it would cause data loss risk - # and break the director on the next attempt). - docker exec "${director_container}" bash -c ' - /var/vcap/bosh/bin/monit summary | awk "/Process/{print \$2}" | tr -d "'"'"'" | \ - while read -r job; do - [ "${job}" = "postgres" ] && continue - echo "Restarting monit job: ${job}" >&2 - /var/vcap/bosh/bin/monit restart "${job}" || true - done - ' - set -e - - echo "Waiting up to ${bpm_restart_timeout}s for director jobs to recover..." 
>&2 - local elapsed=0 - local recovered=false - while [ "${elapsed}" -lt "${bpm_restart_timeout}" ]; do - sleep 10 - elapsed=$((elapsed + 10)) - set +e - local status - status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) - set -e - local failing - failing=$(echo "${status}" | grep -c "not monitored\|does not exist\|failed\|stopped" || true) - if [ "${failing}" -eq "0" ]; then - echo "All director jobs are running after BPM restart (${elapsed}s)" >&2 - recovered=true - break - fi - echo "Still waiting for jobs... (${elapsed}s)" >&2 - done - - if [ "${recovered}" = "true" ]; then - echo "BPM recovery succeeded, re-running create-env to verify state..." >&2 - else - echo "BPM recovery did not complete within ${bpm_restart_timeout}s, re-running create-env anyway..." >&2 - fi - else - echo "Director container not found at ${BOSH_DIRECTOR_IP}, re-running create-env from scratch..." >&2 - sleep 10 - fi - done - - if [ "${create_env_rc}" -ne "0" ]; then - echo "bosh create-env failed after ${max_attempts} attempts. Exiting." >&2 - exit 1 - fi + bosh create-env "${local_bosh_dir}/bosh-director.yml" \ + --vars-store="${local_bosh_dir}/creds.yml" \ + --state="${local_bosh_dir}/state.json" echo "Extracting BOSH director credentials and CA certificate..." 
>&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" From ed0e720619d4f738fa07ba2b0ba335d61e32aff7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 09:21:58 +0100 Subject: [PATCH 35/60] CFN-6544: tuned timeouts and fixed path --- ci/pipeline.yml | 7 ++++++- ci/scripts/acceptance-tests | 20 +++++++++++++++++++- ci/scripts/start-bosh.sh | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 8b58f69e..1fb84148 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -125,6 +125,7 @@ jobs: - get: haproxy-boshrelease-testflight - task: acceptance-tests privileged: true + timeout: 4h image: haproxy-boshrelease-testflight config: platform: linux @@ -137,7 +138,8 @@ jobs: args: [] params: REPO_ROOT: git - on_failure: + SUITE_TIMEOUT: 3h + GRACE_PERIOD: 10m put: notify params: channel: "#haproxy-boshrelease" @@ -163,6 +165,7 @@ jobs: list_changed_files: true - task: acceptance-tests privileged: true + timeout: 4h image: haproxy-boshrelease-testflight config: platform: linux @@ -175,6 +178,8 @@ jobs: args: [] params: REPO_ROOT: git-pull-requests + SUITE_TIMEOUT: 3h + GRACE_PERIOD: 10m on_success: put: git-pull-requests params: diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index d6e35394..7f8b6951 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -67,6 +67,24 @@ else fi echo "------------------------------------------------------------------" -ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts "$FLAKE_ATTEMPTS" "${ADDITIONAL_ARGS[@]}" +echo "------------------------------------------------------------------" +if [ -n "${SUITE_TIMEOUT:-}" ]; then + echo "SUITE_TIMEOUT is set. Will run ginkgo with '--timeout=$SUITE_TIMEOUT'" +else + SUITE_TIMEOUT="1h" + echo "SUITE_TIMEOUT is not set. 
Using default '$SUITE_TIMEOUT'" +fi +echo "------------------------------------------------------------------" + +echo "------------------------------------------------------------------" +if [ -n "${GRACE_PERIOD:-}" ]; then + echo "GRACE_PERIOD is set. Will run ginkgo with '--grace-period=$GRACE_PERIOD'" +else + GRACE_PERIOD="30s" + echo "GRACE_PERIOD is not set. Using default '$GRACE_PERIOD'" +fi +echo "------------------------------------------------------------------" + +ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts "$FLAKE_ATTEMPTS" --timeout "$SUITE_TIMEOUT" --grace-period "$GRACE_PERIOD" "${ADDITIONAL_ARGS[@]}" keep_running_info \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 11723d61..e2f1f03a 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,7 +246,7 @@ EOF bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ - -o /usr/local/local-releases.yml \ + -o /usr/local/ops-files/local-releases.yml \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ From 8fc3455024f99951184515e93666aae250a2ac78 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 09:30:29 +0100 Subject: [PATCH 36/60] CFN-6544: tuned timeouts --- ci/Dockerfile | 1 + ci/ops/bosh-timeouts.yml | 3 +++ ci/scripts/start-bosh.sh | 1 + 3 files changed, 5 insertions(+) create mode 100644 ci/ops/bosh-timeouts.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index b3bd5c48..051cdbe2 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -13,6 +13,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml +COPY ops/bosh-timeouts.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-timeouts.yml COPY ops/compilation.yml 
/usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-timeouts.yml b/ci/ops/bosh-timeouts.yml new file mode 100644 index 00000000..d86c9578 --- /dev/null +++ b/ci/ops/bosh-timeouts.yml @@ -0,0 +1,3 @@ +- type: replace + path: /instance_groups/name=bosh/properties/director/db/connection_wait_timeout? + value: 60 diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e2f1f03a..fc6f79e7 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -248,6 +248,7 @@ EOF -o jumpbox-user.yml \ -o /usr/local/ops-files/local-releases.yml \ -o "$ops_files_dir/bosh-scaled-out.yml" \ + -o "$ops_files_dir/bosh-timeouts.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 4856516fd08fc582e1a30140f6084c35906ad837 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 09:50:28 +0100 Subject: [PATCH 37/60] CFN-6544: refactored the waiting logic --- acceptance-tests/acceptance_tests_suite_test.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index ab937416..c96388ad 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -154,11 +154,13 @@ func setupTunnelFromHaproxyIPToTestServerIP(haproxyInfo haproxyInfo, haproxyBack err := startReverseSSHPortAndIPForwarder(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, haproxyBackendIP, haproxyBackendPort, localIP, localPort, ctx) Expect(err).NotTo(HaveOccurred()) - By("Waiting a few seconds so that HAProxy can detect the backend server is listening") - // HAProxy backend health check interval is 1 second - // So we wait five seconds here to ensure that HAProxy - // has time to verify that the backend is now up - time.Sleep(5 * time.Second) + By("Waiting for 
HAProxy to detect the backend server is listening") + // HAProxy backend health check interval is 1 second. + // Poll until the backend port is reachable from the HAProxy VM + // instead of blindly sleeping. + Eventually(func() error { + return checkListening(fmt.Sprintf("%s:%d", haproxyBackendIP, haproxyBackendPort)) + }, 2*time.Minute, time.Second).ShouldNot(HaveOccurred()) return cancelFunc } From af86c313b6615ccbbbfe928f1e95eaf119e7fae4 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 12:26:00 +0100 Subject: [PATCH 38/60] CFN-6544: fix in waiting logic --- acceptance-tests/acceptance_tests_suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index c96388ad..8267b3ff 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -159,7 +159,7 @@ func setupTunnelFromHaproxyIPToTestServerIP(haproxyInfo haproxyInfo, haproxyBack // Poll until the backend port is reachable from the HAProxy VM // instead of blindly sleeping. 
Eventually(func() error { - return checkListening(fmt.Sprintf("%s:%d", haproxyBackendIP, haproxyBackendPort)) + return checkListening(fmt.Sprintf("%s:%d", haproxyInfo.PublicIP, "80")) }, 2*time.Minute, time.Second).ShouldNot(HaveOccurred()) return cancelFunc From 657d3581313e4270f2e8e9ee2ad81a219bb8c43b Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 13:35:11 +0100 Subject: [PATCH 39/60] CFN-6544: workaround for bosh to start --- ci/scripts/fix-bosh-instance.sh | 80 +++++++++++++++++++++++++++++++++ ci/scripts/start-bosh.sh | 21 +++++++++ 2 files changed, 101 insertions(+) create mode 100644 ci/scripts/fix-bosh-instance.sh diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh new file mode 100644 index 00000000..7732027a --- /dev/null +++ b/ci/scripts/fix-bosh-instance.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +# This script runs in parallel with bosh create-env. +# It waits until the director container appears and the postgres job successfully starts, then, +# if director job failed, it fixes issue with runc state directories during BPM restart of failed jobs. +# (see https://github.com/cloudfoundry/bpm-release/issues/208). +# +# BPM cannot clean up before restarting job. The cleanup means to delete +# the runc container for the job and it cannot be deleted because +# the container cgroup scope dirs (system.slice/runc-*.scope) are owned +# by the host systemd and cannot be deleted from inside the nested container, +# even when they are empty. +# +# We will find the director container and clean runc state dirs so monit can bring +# the jobs back up via BPM restart. + +set +e + +director_container="" +echo "Waiting for director container to appear..." 
>&2 +while true; do + director_container=$(docker ps --format "{{.ID}}" | head -1) # At this point there should only be one container, the director + if [ -n "${director_container}" ]; then + echo "Director container appeared: ${director_container}" >&2 + break + fi + echo "Director container not yet running, waiting..." >&2 + sleep 5 +done + +echo "Waiting for postgres job to be running in director container..." >&2 +while true; do + status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) + if echo "${status}" | grep -q "postgres.*running"; then + echo "postgres is running" >&2 + break + fi + echo "postgres not yet running, waiting..." >&2 + sleep 5 +done + +echo "Monitoring director job until it is running or needs fixing..." >&2 +while [ ! -f "${CREATE_ENV_DONE_FILE:-/tmp/create-env-done}" ]; do + status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) + + if echo "${status}" | grep -q "director.*running"; then + echo "director is running, no fix needed. Exiting." >&2 + exit 0 + fi + if echo "${status}" | grep -qE "director.*(Execution failed)"; then + echo "director job is failing, proceeding with fix..." >&2 + break + fi +done + +# BPM only needs the runc state dir to be gone before it can re-create +# the container. The orphaned cgroup scope dirs will be cleaned up by the +# host systemd garbage collector once there are no more references to them. 
+docker exec "${director_container}" bash -c ' + runc_bin=/var/vcap/packages/bpm/bin/runc + runc_root=/var/vcap/sys/run/bpm-runc + + for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do + # postgres must keep running — the director depends on it + [ "${container_id}" = "bpm-postgres" ] && continue + echo "Cleaning up runc container: ${container_id}" >&2 + rm -rf "${runc_root:?}/${container_id}" + done + + /var/vcap/bosh/bin/monit summary | awk "/Process/{print \$2}" | tr -d "'"'"'" | \ + while read -r job; do + # Restart all monitored jobs except postgres (which must keep running + # as the director database — its restart is slow and will cause the same + # failure for director jobs, which depend on it + [ "${job}" = "postgres" ] && continue + echo "Restarting monit job: ${job}" >&2 + /var/vcap/bosh/bin/monit restart "${job}" || true + done +' 2>/dev/null || true + diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index fc6f79e7..a34cb007 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -259,9 +259,30 @@ EOF "${@}" > "${local_bosh_dir}/bosh-director.yml" echo "Creating BOSH director environment..." >&2 + + # Run fix-bosh-instance.sh in background in parallel with create-env. + # It will wait for the director container to run and then, if needed, fix + # BPM/runc state so create-env can succeed without getting stuck on failed jobs. + local fix_pid="" + bash "ci/scripts/fix-bosh-instance.sh" & + fix_pid=$! + + set +e bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" + local create_env_rc=$? + + if [ -n "${fix_pid}" ]; then + kill -9 "${fix_pid}" 2>/dev/null || true + wait "${fix_pid}" 2>/dev/null || true + fi + set -e + + if [ "${create_env_rc}" -ne "0" ]; then + echo "bosh create-env failed (exit code ${create_env_rc}). Exiting." >&2 + exit 1 + fi echo "Extracting BOSH director credentials and CA certificate..." 
>&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" From 7658f5e15ca1cc77b4feb4ac290e1b9a31079fe2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 13:49:59 +0100 Subject: [PATCH 40/60] CFN-6544: added log --- ci/scripts/start-bosh.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index a34cb007..f77902ee 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -263,8 +263,8 @@ EOF # Run fix-bosh-instance.sh in background in parallel with create-env. # It will wait for the director container to run and then, if needed, fix # BPM/runc state so create-env can succeed without getting stuck on failed jobs. - local fix_pid="" - bash "ci/scripts/fix-bosh-instance.sh" & + local fix_log="/tmp/fix-bosh-instance.log" + bash "ci/scripts/fix-bosh-instance.sh" >"${fix_log}" 2>&1 & fix_pid=$! set +e @@ -276,6 +276,9 @@ EOF if [ -n "${fix_pid}" ]; then kill -9 "${fix_pid}" 2>/dev/null || true wait "${fix_pid}" 2>/dev/null || true + echo "===== fix-bosh-instance.sh log =====" >&2 + cat "${fix_log}" >&2 + echo "===== end fix-bosh-instance.sh log =====" >&2 fi set -e From 95849e9e96f7b84b062d0e52a7db98e1f4b0bd01 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 13:59:06 +0100 Subject: [PATCH 41/60] CFN-6544: fix path --- ci/scripts/start-bosh.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index f77902ee..7eb43c8f 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -2,6 +2,8 @@ set -eo pipefail +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") + function generate_certs() { local certs_dir certs_dir="${1}" @@ -264,7 +266,7 @@ EOF # It will wait for the director container to run and then, if needed, fix # BPM/runc state so create-env can succeed without getting stuck on failed jobs. 
local fix_log="/tmp/fix-bosh-instance.log" - bash "ci/scripts/fix-bosh-instance.sh" >"${fix_log}" 2>&1 & + bash "${SCRIPT_DIR}/fix-bosh-instance.sh" >"${fix_log}" 2>&1 & fix_pid=$! set +e From 8df8ed560b48c800b4c8222e144298ad73c370bb Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 14:03:08 +0100 Subject: [PATCH 42/60] CFN-6544: fix path --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 7eb43c8f..e5820fb4 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -2,7 +2,7 @@ set -eo pipefail -SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")") function generate_certs() { local certs_dir From 458697dbf86de2c98b053d769d74041db09a45e6 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 14:28:21 +0100 Subject: [PATCH 43/60] CFN-6544: fix port --- acceptance-tests/acceptance_tests_suite_test.go | 2 +- ci/scripts/fix-bosh-instance.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index 8267b3ff..5a43a48c 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -159,7 +159,7 @@ func setupTunnelFromHaproxyIPToTestServerIP(haproxyInfo haproxyInfo, haproxyBack // Poll until the backend port is reachable from the HAProxy VM // instead of blindly sleeping. 
Eventually(func() error { - return checkListening(fmt.Sprintf("%s:%d", haproxyInfo.PublicIP, "80")) + return checkListening(fmt.Sprintf("%s:%d", haproxyInfo.PublicIP, 80)) }, 2*time.Minute, time.Second).ShouldNot(HaveOccurred()) return cancelFunc diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index 7732027a..b2b9b3a9 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -51,6 +51,8 @@ while [ ! -f "${CREATE_ENV_DONE_FILE:-/tmp/create-env-done}" ]; do echo "director job is failing, proceeding with fix..." >&2 break fi + echo "director not yet running nor failing, waiting..." >&2 + sleep 5 done # BPM only needs the runc state dir to be gone before it can re-create From a47fb5f56e0fd6b004e1cddf4a4109424d200888 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 14:51:46 +0100 Subject: [PATCH 44/60] CFN-6544: fix workaround logic and lua test --- acceptance-tests/bosh_helpers.go | 29 +++++++++++++ acceptance-tests/lua_test.go | 3 ++ ci/scripts/fix-bosh-instance.sh | 72 ++++++++++++++++++-------------- 3 files changed, 72 insertions(+), 32 deletions(-) diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index f6fa4022..d1d5e001 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -336,6 +336,35 @@ func deleteDeployment(boshDeployment string) { Eventually(session, 10*time.Minute, time.Second).Should(gexec.Exit(0)) } +func restartAllJobsOnDeployment(boshDeployment string) { + By(fmt.Sprintf("Restarting all jobs on deployment '%s'", boshDeployment)) + + bashCode := ` +runc_bin=/var/vcap/packages/bpm/bin/runc +runc_root=/var/vcap/sys/run/bpm-runc + +for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do + echo "Cleaning up runc container: ${container_id}" >&2 + rm -rf "${runc_root:?}/${container_id}" +done + +/var/vcap/bosh/bin/monit summary | awk '/Process/{print $2}' | tr -d "'" | \ +while read -r job; do + 
echo "Restarting monit job: ${job}" >&2 + /var/vcap/bosh/bin/monit restart "${job}" || true +done +` + + instances := boshInstances(boshDeployment) + for _, instance := range instances { + writeLog(fmt.Sprintf("Running runc cleanup + monit restart on %s", instance.Instance)) + cmd := config.boshCmd(boshDeployment, "ssh", instance.Instance, "-c", bashCode) + session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) + Expect(err).NotTo(HaveOccurred()) + Eventually(session, time.Minute, time.Second).Should(gexec.Exit(0)) + } +} + func waitForHAProxyListening(haproxyInfo haproxyInfo) { Eventually(func() error { return checkListening(fmt.Sprintf("%s:443", haproxyInfo.PublicIP)) diff --git a/acceptance-tests/lua_test.go b/acceptance-tests/lua_test.go index e690b41e..760d6e1c 100644 --- a/acceptance-tests/lua_test.go +++ b/acceptance-tests/lua_test.go @@ -61,6 +61,9 @@ core.register_service("lua_test", "http", lua_test) closeTunnel := setupTunnelFromHaproxyToTestServer(haproxyInfo, haproxyBackendPort, localPort) defer closeTunnel() + // TODO: remove this test once the issue https://github.com/cloudfoundry/bpm-release/issues/208 is solved + restartAllJobsOnDeployment(deploymentNameForTestNode()) + By("Waiting monit to report HAProxy is now healthy (the lua script was uploaded after start).") // Since the backend is now listening, HAProxy healthcheck should start returning healthy // and monit should in turn start reporting a healthy process diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index b2b9b3a9..d92bf9eb 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -13,6 +13,10 @@ # # We will find the director container and clean runc state dirs so monit can bring # the jobs back up via BPM restart. +# +# BPM only needs the runc state dir to be gone before it can re-create +# the container. 
The orphaned cgroup scope dirs will be cleaned up by the +# host systemd garbage collector once there are no more references to them. set +e @@ -39,44 +43,48 @@ while true; do sleep 5 done -echo "Monitoring director job until it is running or needs fixing..." >&2 -while [ ! -f "${CREATE_ENV_DONE_FILE:-/tmp/create-env-done}" ]; do +echo "Monitoring jobs until all are running or create-env reaches its timeout..." >&2 +while true; do status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) - if echo "${status}" | grep -q "director.*running"; then - echo "director is running, no fix needed. Exiting." >&2 + # Collect names of all failed jobs (any status that is not 'running') + # monit summary lines look like: "Process 'job-name' running" + failed_jobs=$(echo "${status}" | awk "/Process/{print \$2}" | tr -d "'" | \ + while read -r job; do + if ! echo "${status}" | grep -q "'${job}'.*running"; then + echo "${job}" + fi + done) + + if [ -z "${failed_jobs}" ]; then + echo "All jobs are running, no fix needed. Exiting." >&2 exit 0 fi - if echo "${status}" | grep -qE "director.*(Execution failed)"; then - echo "director job is failing, proceeding with fix..." >&2 - break - fi - echo "director not yet running nor failing, waiting..." >&2 - sleep 5 -done -# BPM only needs the runc state dir to be gone before it can re-create -# the container. The orphaned cgroup scope dirs will be cleaned up by the -# host systemd garbage collector once there are no more references to them. -docker exec "${director_container}" bash -c ' - runc_bin=/var/vcap/packages/bpm/bin/runc - runc_root=/var/vcap/sys/run/bpm-runc + echo "Failed jobs detected:" >&2 + echo "${failed_jobs}" >&2 + echo "Applying fix..." 
>&2 - for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do - # postgres must keep running — the director depends on it - [ "${container_id}" = "bpm-postgres" ] && continue - echo "Cleaning up runc container: ${container_id}" >&2 - rm -rf "${runc_root:?}/${container_id}" + # For each failed job: remove its runc state dir so BPM can re-create it, + # then restart it via monit. + echo "${failed_jobs}" | while read -r job; do + # Map monit job name to runc container id (BPM uses "bpm-" convention, + # dots in sub-process names are encoded as ".2e") + runc_id="bpm-${job}" + docker exec "${director_container}" bash -c " + runc_root=/var/vcap/sys/run/bpm-runc + runc_id='${runc_id}' + if [ -d \"\${runc_root}/\${runc_id}\" ]; then + echo \"Removing runc state dir for \${runc_id}\" >&2 + rm -rf \"\${runc_root:?}/\${runc_id}\" + fi + echo \"Restarting monit job: ${job}\" >&2 + /var/vcap/bosh/bin/monit restart '${job}' || true + " 2>/dev/null || true done - /var/vcap/bosh/bin/monit summary | awk "/Process/{print \$2}" | tr -d "'"'"'" | \ - while read -r job; do - # Restart all monitored jobs except postgres (which must keep running - # as the director database — its restart is slow and will cause the same - # failure for director jobs, which depend on it - [ "${job}" = "postgres" ] && continue - echo "Restarting monit job: ${job}" >&2 - /var/vcap/bosh/bin/monit restart "${job}" || true - done -' 2>/dev/null || true + echo "Fix applied, waiting 10s before re-checking..." >&2 + sleep 10 +done +echo "create-env completed, fix-bosh-instance exiting." 
>&2 From c4e2ba626d2ce8df848dff238d28415875c8700c Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 15:09:05 +0100 Subject: [PATCH 45/60] CFN-6544: fix workaround logic --- ci/scripts/fix-bosh-instance.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index d92bf9eb..f3cd51f0 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -35,7 +35,7 @@ done echo "Waiting for postgres job to be running in director container..." >&2 while true; do status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) - if echo "${status}" | grep -q "postgres.*running"; then + if echo "${status}" | grep -qE "'postgres'.*running"; then echo "postgres is running" >&2 break fi @@ -51,7 +51,7 @@ while true; do # monit summary lines look like: "Process 'job-name' running" failed_jobs=$(echo "${status}" | awk "/Process/{print \$2}" | tr -d "'" | \ while read -r job; do - if ! echo "${status}" | grep -q "'${job}'.*running"; then + if echo "${status}" | grep -qE "'${job}'.*(Execution failed)"; then echo "${job}" fi done) From 4723e0d7f5da0675ce3309e70d4b26c367071f5c Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 15:24:40 +0100 Subject: [PATCH 46/60] CFN-6544: fix workaround logic --- ci/scripts/fix-bosh-instance.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index f3cd51f0..ec51875e 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -35,6 +35,8 @@ done echo "Waiting for postgres job to be running in director container..." 
>&2 while true; do status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) + echo "Monit status in director container:" >&2 + echo "${status}" >&2 if echo "${status}" | grep -qE "'postgres'.*running"; then echo "postgres is running" >&2 break @@ -46,6 +48,8 @@ done echo "Monitoring jobs until all are running or create-env reaches its timeout..." >&2 while true; do status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) + echo "Monit status in director container:" >&2 + echo "${status}" >&2 # Collect names of all failed jobs (any status that is not 'running') # monit summary lines look like: "Process 'job-name' running" From 737077f401c302bad73bdbf85b37f0a1a654f2f7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 15:38:25 +0100 Subject: [PATCH 47/60] CFN-6544: fix workaround logic --- ci/scripts/fix-bosh-instance.sh | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index ec51875e..57807368 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -20,6 +20,7 @@ set +e +monit_bin=/var/vcap/bosh/bin/monit director_container="" echo "Waiting for director container to appear..." >&2 while true; do @@ -34,10 +35,8 @@ done echo "Waiting for postgres job to be running in director container..." >&2 while true; do - status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) - echo "Monit status in director container:" >&2 - echo "${status}" >&2 - if echo "${status}" | grep -qE "'postgres'.*running"; then + status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) + if echo "${status}" | grep -q "'postgres'.*running"; then echo "postgres is running" >&2 break fi @@ -46,10 +45,21 @@ while true; do done echo "Monitoring jobs until all are running or create-env reaches its timeout..." 
>&2 +iteration=0 while true; do - status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) - echo "Monit status in director container:" >&2 - echo "${status}" >&2 + # Recheck the director container ID every N iterations — bosh create-env may + # recreate the container before starting jobs there, giving it a new ID. + # Stale ID causes all docker exec calls to fail silently. + if [ $((iteration % 4)) -eq 0 ]; then + new_container=$(docker ps --format "{{.ID}}" | head -1) + if [ -n "${new_container}" ] && [ "${new_container}" != "${director_container}" ]; then + echo "Director container ID changed: ${director_container} -> ${new_container}" >&2 + director_container="${new_container}" + fi + fi + iteration=$((iteration + 1)) + + status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) # Collect names of all failed jobs (any status that is not 'running') # monit summary lines look like: "Process 'job-name' running" @@ -77,13 +87,14 @@ while true; do runc_id="bpm-${job}" docker exec "${director_container}" bash -c " runc_root=/var/vcap/sys/run/bpm-runc + monit_bin=${monit_bin} runc_id='${runc_id}' if [ -d \"\${runc_root}/\${runc_id}\" ]; then echo \"Removing runc state dir for \${runc_id}\" >&2 rm -rf \"\${runc_root:?}/\${runc_id}\" fi echo \"Restarting monit job: ${job}\" >&2 - /var/vcap/bosh/bin/monit restart '${job}' || true + \${monit_bin} restart '${job}' || true " 2>/dev/null || true done From e7143685f14de989a3c67f95a39b9ff859a147f7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 15:43:56 +0100 Subject: [PATCH 48/60] CFN-6544: fix workaround logic --- ci/scripts/fix-bosh-instance.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index 57807368..b1fe6a7c 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -30,21 +30,10 @@ while true; do break fi 
echo "Director container not yet running, waiting..." >&2 - sleep 5 + sleep 10 done echo "Waiting for postgres job to be running in director container..." >&2 -while true; do - status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) - if echo "${status}" | grep -q "'postgres'.*running"; then - echo "postgres is running" >&2 - break - fi - echo "postgres not yet running, waiting..." >&2 - sleep 5 -done - -echo "Monitoring jobs until all are running or create-env reaches its timeout..." >&2 iteration=0 while true; do # Recheck the director container ID every N iterations — bosh create-env may @@ -59,6 +48,17 @@ while true; do fi iteration=$((iteration + 1)) + status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) + if echo "${status}" | grep -q "'postgres'.*running"; then + echo "postgres is running" >&2 + break + fi + echo "postgres not yet running, waiting..." >&2 + sleep 3 +done + +echo "Monitoring jobs until all are running or create-env reaches its timeout..." >&2 +while true; do status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) # Collect names of all failed jobs (any status that is not 'running') From 295a8af724d2c70a7a05d30f112bee24fa4171bc Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 16:00:29 +0100 Subject: [PATCH 49/60] CFN-6544: fix workaround logic --- ci/scripts/fix-bosh-instance.sh | 56 ++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh index b1fe6a7c..893f2214 100644 --- a/ci/scripts/fix-bosh-instance.sh +++ b/ci/scripts/fix-bosh-instance.sh @@ -61,29 +61,55 @@ echo "Monitoring jobs until all are running or create-env reaches its timeout... 
while true; do status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) - # Collect names of all failed jobs (any status that is not 'running') - # monit summary lines look like: "Process 'job-name' running" - failed_jobs=$(echo "${status}" | awk "/Process/{print \$2}" | tr -d "'" | \ - while read -r job; do - if echo "${status}" | grep -qE "'${job}'.*(Execution failed)"; then - echo "${job}" - fi - done) + # Categorise jobs into three states: + # running — 'running' + # failed — 'Execution failed' (BPM could not start/restart the process) + # pending — anything else (Does not exist, not monitored, initializing, restart pending, etc.) + # monit summary lines look like: "Process 'job-name' " + all_jobs=$(echo "${status}" | awk "/Process/{print \$2}" | tr -d "'") - if [ -z "${failed_jobs}" ]; then - echo "All jobs are running, no fix needed. Exiting." >&2 + failed_jobs="" + pending_jobs="" + all_running=true + + while read -r job; do + [ -z "${job}" ] && continue + # Extract the status portion after the job name, then strip any trailing + # " - " (e.g. "Execution failed - restart pending" -> "Execution failed") + job_status=$(echo "${status}" | grep "'${job}'" | sed "s/.*'${job}'[[:space:]]*//" | sed "s/[[:space:]]*-[[:space:]].*//" | xargs) + if echo "${job_status}" | grep -qiw "running"; then + : # running — good + elif echo "${job_status}" | grep -qi "execution failed"; then + failed_jobs="${failed_jobs} ${job}" + all_running=false + else + pending_jobs="${pending_jobs} ${job}" + all_running=false + fi + done <<< "${all_jobs}" + + if [ "${all_running}" = "true" ]; then + echo "All jobs are running. Exiting." >&2 exit 0 fi - echo "Failed jobs detected:" >&2 - echo "${failed_jobs}" >&2 + if [ -n "${pending_jobs}" ]; then + echo "Pending jobs (waiting for them to settle):${pending_jobs}" >&2 + fi + + if [ -z "${failed_jobs}" ]; then + echo "No failed jobs, only pending — skipping fix, will recheck..." 
>&2 + sleep 10 + continue + fi + + echo "Failed jobs detected:${failed_jobs}" >&2 echo "Applying fix..." >&2 # For each failed job: remove its runc state dir so BPM can re-create it, # then restart it via monit. - echo "${failed_jobs}" | while read -r job; do - # Map monit job name to runc container id (BPM uses "bpm-" convention, - # dots in sub-process names are encoded as ".2e") + for job in ${failed_jobs}; do + # Map monit job name to runc container id (BPM uses "bpm-" convention) runc_id="bpm-${job}" docker exec "${director_container}" bash -c " runc_root=/var/vcap/sys/run/bpm-runc From 503f26589feecd6be3deb15be7594aec8fd75447 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 16:13:33 +0100 Subject: [PATCH 50/60] CFN-6544: fix deployment name --- acceptance-tests/acceptance_tests_suite_test.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index 5a43a48c..9093c394 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -26,6 +26,12 @@ func deploymentNameForTestNode() string { return fmt.Sprintf("haproxy%d", GinkgoParallelProcess()) } +func deploymentBasicNameForSuite() string { + // TODO: set back to the thread name when https://github.com/cloudfoundry/bpm-release/issues/208 is solved + // return deploymentNameForTestNode() + return "haproxy-basic" +} + func TestAcceptanceTests(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "AcceptanceTests Suite") @@ -46,7 +52,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { deployHAProxy(baseManifestVars{ haproxyBackendPort: 12000, haproxyBackendServers: []string{"127.0.0.1"}, - deploymentName: deploymentNameForTestNode(), + deploymentName: deploymentBasicNameForSuite(), }, []string{}, map[string]interface{}{}, true) configBytes, err := json.Marshal(&config) @@ -61,7 +67,7 @@ var _ = 
SynchronizedBeforeSuite(func() []byte { var _ = SynchronizedAfterSuite(func() { // Clean up deployments on each thread - deleteDeployment(deploymentNameForTestNode()) + deleteDeployment(deploymentBasicNameForSuite()) }, func() {}) type TestServerOption func(*httptest.Server) From 6b0a941af13c82f7c1b01ea4110ac5c26beabab4 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 17:53:56 +0100 Subject: [PATCH 51/60] CFN-6544: patch for BPM release --- .../acceptance_tests_suite_test.go | 12 ++--- ci/Dockerfile | 17 ++++++- ci/local-releases/bpm-del-runc-state.patch | 41 ++++++++++++++++ ci/ops/local-releases.yml | 3 ++ ci/scripts/start-bosh.sh | 49 ++++++++++--------- 5 files changed, 90 insertions(+), 32 deletions(-) create mode 100644 ci/local-releases/bpm-del-runc-state.patch create mode 100644 ci/ops/local-releases.yml diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index 9093c394..10201ecf 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -160,13 +160,11 @@ func setupTunnelFromHaproxyIPToTestServerIP(haproxyInfo haproxyInfo, haproxyBack err := startReverseSSHPortAndIPForwarder(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, haproxyBackendIP, haproxyBackendPort, localIP, localPort, ctx) Expect(err).NotTo(HaveOccurred()) - By("Waiting for HAProxy to detect the backend server is listening") - // HAProxy backend health check interval is 1 second. - // Poll until the backend port is reachable from the HAProxy VM - // instead of blindly sleeping. 
- Eventually(func() error { - return checkListening(fmt.Sprintf("%s:%d", haproxyInfo.PublicIP, 80)) - }, 2*time.Minute, time.Second).ShouldNot(HaveOccurred()) + //By("Waiting a few seconds for HAProxy to detect the backend server is listening") + //Eventually(func() error { + // return checkListening(fmt.Sprintf("%s:%d", haproxyInfo.PublicIP, 80)) + //}, 2*time.Minute, time.Second).ShouldNot(HaveOccurred()) + time.Sleep(5 * time.Second) return cancelFunc } diff --git a/ci/Dockerfile b/ci/Dockerfile index 051cdbe2..2daeb3ae 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -4,17 +4,32 @@ FROM ghcr.io/cloudfoundry/bosh/docker-cpi:latest ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install -y wget jq git vim nano python3-pip python3-venv && \ + apt-get install -y build-essential zlib1g-dev ruby ruby-dev openssl libxslt1-dev libxml2-dev libssl-dev libreadline-dev libyaml-dev libsqlite3-dev sqlite3 && \ apt-get clean +# Install bosh cli v2 +RUN wget -O ./bosh https://github.com/cloudfoundry/bosh-cli/releases/download/v7.9.18/bosh-cli-7.9.18-linux-amd64 && \ + chmod +x ./bosh && \ + mv ./bosh /usr/local/bin/bosh && \ + bosh -v + # Set bosh env at login RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc -# Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease + +# Make local BPM release +RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../local-releases/bpm-del-runc-state.patch +RUN bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz +COPY bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz +RUN cd .. 
&& rm -rf bpm-release + +# Copy ops files COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml COPY ops/bosh-timeouts.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-timeouts.yml COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml +COPY ops/local-releases.yml /usr/local/bosh-deployment/haproxy-boshrelease/local-releases.yml # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv diff --git a/ci/local-releases/bpm-del-runc-state.patch b/ci/local-releases/bpm-del-runc-state.patch new file mode 100644 index 00000000..17eeb1c2 --- /dev/null +++ b/ci/local-releases/bpm-del-runc-state.patch @@ -0,0 +1,41 @@ +diff --git a/src/bpm/runc/client/client.go b/src/bpm/runc/client/client.go +index f9a2591c..e50a9390 100644 +--- a/src/bpm/runc/client/client.go ++++ b/src/bpm/runc/client/client.go +@@ -228,6 +228,12 @@ func (c *RuncClient) DeleteContainer(containerID string) error { + return runcCmd.Run() + } + ++func (c *RuncClient) DeleteState(containerID string) error { ++ args := []string{"-rf", fmt.Sprintf("%s/%s", c.runcRoot, containerID)} ++ cmd := exec.Command("rm", args...) 
++ return cmd.Run() ++} ++ + func (*RuncClient) DestroyBundle(bundlePath string) error { + return os.RemoveAll(bundlePath) + } +diff --git a/src/bpm/runc/lifecycle/lifecycle.go b/src/bpm/runc/lifecycle/lifecycle.go +index 0621e2d0..cf25b158 100644 +--- a/src/bpm/runc/lifecycle/lifecycle.go ++++ b/src/bpm/runc/lifecycle/lifecycle.go +@@ -74,6 +74,7 @@ type RuncClient interface { + ContainerState(containerID string) (*specs.State, error) + ListContainers() ([]client.ContainerState, error) + SignalContainer(containerID string, signal client.Signal) error ++ DeleteState(containerID string) error + DeleteContainer(containerID string) error + DestroyBundle(bundlePath string) error + } +@@ -274,6 +275,11 @@ func (j *RuncLifecycle) StopProcess(logger lager.Logger, cfg *config.BPMConfig, + } + + func (j *RuncLifecycle) RemoveProcess(logger lager.Logger, cfg *config.BPMConfig) error { ++ logger.Info("forcefully-deleting-runc-state") ++ if err := j.runcClient.DeleteState(cfg.ContainerID()); err != nil { ++ return err ++ } ++ + logger.Info("forcefully-deleting-container") + if err := j.runcClient.DeleteContainer(cfg.ContainerID()); err != nil { + return err diff --git a/ci/ops/local-releases.yml b/ci/ops/local-releases.yml new file mode 100644 index 00000000..dbca5229 --- /dev/null +++ b/ci/ops/local-releases.yml @@ -0,0 +1,3 @@ +- type: replace + path: /releases/name=bpm/url + value: file:///usr/local/bosh-deployment/haproxy-boshrelease/local-releases.yml \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e5820fb4..98052b03 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -251,6 +251,7 @@ EOF -o /usr/local/ops-files/local-releases.yml \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-timeouts.yml" \ + -o "$ops_files_dir/local-releases.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ @@ -261,33 +262,33 @@ EOF "${@}" > 
"${local_bosh_dir}/bosh-director.yml" echo "Creating BOSH director environment..." >&2 - - # Run fix-bosh-instance.sh in background in parallel with create-env. - # It will wait for the director container to run and then, if needed, fix - # BPM/runc state so create-env can succeed without getting stuck on failed jobs. - local fix_log="/tmp/fix-bosh-instance.log" - bash "${SCRIPT_DIR}/fix-bosh-instance.sh" >"${fix_log}" 2>&1 & - fix_pid=$! - - set +e +# +# # Run fix-bosh-instance.sh in background in parallel with create-env. +# # It will wait for the director container to run and then, if needed, fix +# # BPM/runc state so create-env can succeed without getting stuck on failed jobs. +# local fix_log="/tmp/fix-bosh-instance.log" +# bash "${SCRIPT_DIR}/fix-bosh-instance.sh" >"${fix_log}" 2>&1 & +# fix_pid=$! +# +# set +e bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" - local create_env_rc=$? - - if [ -n "${fix_pid}" ]; then - kill -9 "${fix_pid}" 2>/dev/null || true - wait "${fix_pid}" 2>/dev/null || true - echo "===== fix-bosh-instance.sh log =====" >&2 - cat "${fix_log}" >&2 - echo "===== end fix-bosh-instance.sh log =====" >&2 - fi - set -e - - if [ "${create_env_rc}" -ne "0" ]; then - echo "bosh create-env failed (exit code ${create_env_rc}). Exiting." >&2 - exit 1 - fi +# local create_env_rc=$? +# +# if [ -n "${fix_pid}" ]; then +# kill -9 "${fix_pid}" 2>/dev/null || true +# wait "${fix_pid}" 2>/dev/null || true +# echo "===== fix-bosh-instance.sh log =====" >&2 +# cat "${fix_log}" >&2 +# echo "===== end fix-bosh-instance.sh log =====" >&2 +# fi +# set -e +# +# if [ "${create_env_rc}" -ne "0" ]; then +# echo "bosh create-env failed (exit code ${create_env_rc}). Exiting." >&2 +# exit 1 +# fi echo "Extracting BOSH director credentials and CA certificate..." 
>&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" From 0db66e84b0b71000af2a4c6c4371dea54e1cee19 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 18:27:12 +0100 Subject: [PATCH 52/60] CFN-6544: fixed Dockerfile --- ci/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 2daeb3ae..5af9c924 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -20,7 +20,8 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease # Make local BPM release -RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../local-releases/bpm-del-runc-state.patch +COPY local-releases/bpm-del-runc-state.patch ./bpm-del-runc-state.patch +RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../bpm-del-runc-state.patch RUN bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz COPY bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz RUN cd .. 
&& rm -rf bpm-release From b7df65410388d9e2d270d22b83ecc880727f9d09 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 18:31:13 +0100 Subject: [PATCH 53/60] CFN-6544: fixed Dockerfile --- ci/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 5af9c924..118b1be6 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -21,10 +21,10 @@ RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease # Make local BPM release COPY local-releases/bpm-del-runc-state.patch ./bpm-del-runc-state.patch -RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../bpm-del-runc-state.patch -RUN bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz -COPY bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz -RUN cd .. && rm -rf bpm-release +RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../bpm-del-runc-state.patch && \ + bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz && \ + bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ + cd .. 
&& rm -rf bpm-release # Copy ops files COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml From d0e5f0236d71ad91865755c6fb5dbe786c9d3695 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 18:33:44 +0100 Subject: [PATCH 54/60] CFN-6544: fixed Dockerfile --- ci/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 118b1be6..19c6b182 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -23,7 +23,7 @@ RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease COPY local-releases/bpm-del-runc-state.patch ./bpm-del-runc-state.patch RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../bpm-del-runc-state.patch && \ bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz && \ - bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ + mv bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ cd .. 
&& rm -rf bpm-release # Copy ops files From 28bf0d51ca98cae675378d5dcfeb94cc4cf3a241 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 18:35:54 +0100 Subject: [PATCH 55/60] CFN-6544: fixed ops file --- ci/ops/local-releases.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ops/local-releases.yml b/ci/ops/local-releases.yml index dbca5229..f99e4907 100644 --- a/ci/ops/local-releases.yml +++ b/ci/ops/local-releases.yml @@ -1,3 +1,3 @@ - type: replace path: /releases/name=bpm/url - value: file:///usr/local/bosh-deployment/haproxy-boshrelease/local-releases.yml \ No newline at end of file + value: file:///usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz \ No newline at end of file From 91c419e098c0fd68837856f1fa8c40320ecfa7bf Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 19:17:09 +0100 Subject: [PATCH 56/60] CFN-6544: rollback of workarounds --- .../acceptance_tests_suite_test.go | 18 +-- acceptance-tests/bosh_helpers.go | 79 +---------- acceptance-tests/config.go | 14 -- acceptance-tests/lua_test.go | 3 - ci/scripts/fix-bosh-instance.sh | 131 ------------------ ci/scripts/start-bosh.sh | 24 ---- manifests/haproxy.yml | 2 +- 7 files changed, 13 insertions(+), 258 deletions(-) delete mode 100644 ci/scripts/fix-bosh-instance.sh diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index 10201ecf..ab937416 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -26,12 +26,6 @@ func deploymentNameForTestNode() string { return fmt.Sprintf("haproxy%d", GinkgoParallelProcess()) } -func deploymentBasicNameForSuite() string { - // TODO: set back to the thread name when https://github.com/cloudfoundry/bpm-release/issues/208 is solved - // return deploymentNameForTestNode() - return "haproxy-basic" -} - func TestAcceptanceTests(t *testing.T) { RegisterFailHandler(Fail) 
RunSpecs(t, "AcceptanceTests Suite") @@ -52,7 +46,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { deployHAProxy(baseManifestVars{ haproxyBackendPort: 12000, haproxyBackendServers: []string{"127.0.0.1"}, - deploymentName: deploymentBasicNameForSuite(), + deploymentName: deploymentNameForTestNode(), }, []string{}, map[string]interface{}{}, true) configBytes, err := json.Marshal(&config) @@ -67,7 +61,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { var _ = SynchronizedAfterSuite(func() { // Clean up deployments on each thread - deleteDeployment(deploymentBasicNameForSuite()) + deleteDeployment(deploymentNameForTestNode()) }, func() {}) type TestServerOption func(*httptest.Server) @@ -160,10 +154,10 @@ func setupTunnelFromHaproxyIPToTestServerIP(haproxyInfo haproxyInfo, haproxyBack err := startReverseSSHPortAndIPForwarder(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, haproxyBackendIP, haproxyBackendPort, localIP, localPort, ctx) Expect(err).NotTo(HaveOccurred()) - //By("Waiting a few seconds for HAProxy to detect the backend server is listening") - //Eventually(func() error { - // return checkListening(fmt.Sprintf("%s:%d", haproxyInfo.PublicIP, 80)) - //}, 2*time.Minute, time.Second).ShouldNot(HaveOccurred()) + By("Waiting a few seconds so that HAProxy can detect the backend server is listening") + // HAProxy backend health check interval is 1 second + // So we wait five seconds here to ensure that HAProxy + // has time to verify that the backend is now up time.Sleep(5 * time.Second) return cancelFunc diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index d1d5e001..8197b47c 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -149,9 +149,13 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c cmd, varsStoreReader := deployBaseManifestCmd(baseManifestVars.deploymentName, opsfiles, manifestVars) dumpCmd(cmd) - session := 
deployWithRetry(baseManifestVars.deploymentName, cmd, 20*time.Minute, expectSuccess) + session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) + Expect(err).NotTo(HaveOccurred()) - if !expectSuccess { + if expectSuccess { + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) + } else { + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) Expect(session.ExitCode()).NotTo(BeZero()) } @@ -163,48 +167,6 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c return haproxyInfo, varsStoreReader } -// deployWithRetry runs a bosh deploy command and retries up to config.FlakeAttempts times. -// On each failed attempt the deployment is deleted before retrying, so the next attempt starts clean. -// If expectSuccess is false the command is run once without retrying (failure is expected by the caller). -func deployWithRetry(boshDeployment string, cmd *exec.Cmd, timeout time.Duration, expectSuccess bool) *gexec.Session { - var session *gexec.Session - var err error - - for attempt := 1; attempt <= config.FlakeAttempts; attempt++ { - if attempt > 1 { - writeLog(fmt.Sprintf("Deployment attempt %d/%d failed, deleting deployment before retry...", attempt-1, config.FlakeAttempts)) - deleteDeployment(boshDeployment) - - writeLog(fmt.Sprintf("Retrying deployment (attempt %d/%d)...", attempt, config.FlakeAttempts)) - newCmd := exec.Command(cmd.Path, cmd.Args[1:]...) 
- newCmd.Env = cmd.Env - cmd = newCmd - } - - session, err = gexec.Start(cmd, GinkgoWriter, GinkgoWriter) - Expect(err).NotTo(HaveOccurred()) - - // Wait for the process to exit without asserting the exit code - Eventually(session, timeout, time.Second).Should(gexec.Exit()) - - if !expectSuccess { - // caller expects failure — return immediately without retrying - return session - } - - if session.ExitCode() == 0 { - writeLog(fmt.Sprintf("Deployment succeeded on attempt %d/%d", attempt, config.FlakeAttempts)) - return session - } - - writeLog(fmt.Sprintf("Deployment failed on attempt %d/%d (exit code %d)", attempt, config.FlakeAttempts, session.ExitCode())) - } - - // All attempts exhausted — fail the test with a clear message - Expect(session.ExitCode()).To(BeZero(), fmt.Sprintf("Deployment failed after %d attempt(s)", config.FlakeAttempts)) - return session -} - func dumpCmd(cmd *exec.Cmd) { writeLog("---------- Command to run ----------") writeLog(cmd.String()) @@ -336,35 +298,6 @@ func deleteDeployment(boshDeployment string) { Eventually(session, 10*time.Minute, time.Second).Should(gexec.Exit(0)) } -func restartAllJobsOnDeployment(boshDeployment string) { - By(fmt.Sprintf("Restarting all jobs on deployment '%s'", boshDeployment)) - - bashCode := ` -runc_bin=/var/vcap/packages/bpm/bin/runc -runc_root=/var/vcap/sys/run/bpm-runc - -for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do - echo "Cleaning up runc container: ${container_id}" >&2 - rm -rf "${runc_root:?}/${container_id}" -done - -/var/vcap/bosh/bin/monit summary | awk '/Process/{print $2}' | tr -d "'" | \ -while read -r job; do - echo "Restarting monit job: ${job}" >&2 - /var/vcap/bosh/bin/monit restart "${job}" || true -done -` - - instances := boshInstances(boshDeployment) - for _, instance := range instances { - writeLog(fmt.Sprintf("Running runc cleanup + monit restart on %s", instance.Instance)) - cmd := config.boshCmd(boshDeployment, "ssh", instance.Instance, "-c", 
bashCode) - session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) - Expect(err).NotTo(HaveOccurred()) - Eventually(session, time.Minute, time.Second).Should(gexec.Exit(0)) - } -} - func waitForHAProxyListening(haproxyInfo haproxyInfo) { Eventually(func() error { return checkListening(fmt.Sprintf("%s:443", haproxyInfo.PublicIP)) diff --git a/acceptance-tests/config.go b/acceptance-tests/config.go index aa27d4c8..5d3c6bce 100644 --- a/acceptance-tests/config.go +++ b/acceptance-tests/config.go @@ -4,13 +4,10 @@ import ( "fmt" "os" "os/exec" - "strconv" ) var config Config -const DEFAULT_FLAKE_ATTEMPTS = 5 - type Config struct { ReleaseRepoPath string `json:"releaseRepoPath"` ReleaseVersion string `json:"releaseVersion"` @@ -21,7 +18,6 @@ type Config struct { BoshPath string `json:"boshPath"` BaseManifestPath string `json:"baseManifestPath"` HomePath string `json:"homePath"` - FlakeAttempts int `json:"flakeAttempts"` } func loadConfig() (Config, error) { @@ -71,15 +67,6 @@ func loadConfig() (Config, error) { return Config{}, err } - flakeAttempts := DEFAULT_FLAKE_ATTEMPTS - if val := os.Getenv("FLAKE_ATTEMPTS"); val != "" { - if flakeAttemptsFromEnv, err := strconv.Atoi(val); err == nil && flakeAttemptsFromEnv > 0 { - flakeAttempts = flakeAttemptsFromEnv - } else { - writeLog(fmt.Sprintf("FLAKE_ATTEMPTS must be a positive integer, but got: %s, so defaulting test suite's flakeAttempts to %d", val, DEFAULT_FLAKE_ATTEMPTS)) - } - } - return Config{ ReleaseRepoPath: releaseRepoPath, ReleaseVersion: releaseVersion, @@ -90,7 +77,6 @@ func loadConfig() (Config, error) { BoshPath: boshPath, BaseManifestPath: baseManifestPath, HomePath: homePath, - FlakeAttempts: flakeAttempts, }, nil } diff --git a/acceptance-tests/lua_test.go b/acceptance-tests/lua_test.go index 760d6e1c..e690b41e 100644 --- a/acceptance-tests/lua_test.go +++ b/acceptance-tests/lua_test.go @@ -61,9 +61,6 @@ core.register_service("lua_test", "http", lua_test) closeTunnel := 
setupTunnelFromHaproxyToTestServer(haproxyInfo, haproxyBackendPort, localPort) defer closeTunnel() - // TODO: remove this test once the issue https://github.com/cloudfoundry/bpm-release/issues/208 is solved - restartAllJobsOnDeployment(deploymentNameForTestNode()) - By("Waiting monit to report HAProxy is now healthy (the lua script was uploaded after start).") // Since the backend is now listening, HAProxy healthcheck should start returning healthy // and monit should in turn start reporting a healthy process diff --git a/ci/scripts/fix-bosh-instance.sh b/ci/scripts/fix-bosh-instance.sh deleted file mode 100644 index 893f2214..00000000 --- a/ci/scripts/fix-bosh-instance.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env bash - -# This script runs in parallel with bosh create-env. -# It waits until the director container appears and the postgres job successfully starts, then, -# if director job failed, it fixes issue with runc state directories during BPM restart of failed jobs. -# (see https://github.com/cloudfoundry/bpm-release/issues/208). -# -# BPM cannot cleanup before restarting job. The cleanup means to delete -# the runc container for the job and it cannot be deleted because because -# the container cgroup scope dirs (system.slice/runc-*.scope) are owned -# by the host systemd and cannot be deleted from inside the nested container, -# even when they are empty. -# -# We will find the director container and clean runc state dirs so monit can bring -# the jobs back up via BPM restart. -# -# BPM only needs the runc state dir to be gone before it can re-create -# the container. The orphaned cgroup scope dirs will be cleaned up by the -# host systemd garbage collector once there are no more references to them. - -set +e - -monit_bin=/var/vcap/bosh/bin/monit -director_container="" -echo "Waiting for director container to appear..." 
>&2 -while true; do - director_container=$(docker ps --format "{{.ID}}" | head -1) # At this point there should only be one container, the director - if [ -n "${director_container}" ]; then - echo "Director container appeared: ${director_container}" >&2 - break - fi - echo "Director container not yet running, waiting..." >&2 - sleep 10 -done - -echo "Waiting for postgres job to be running in director container..." >&2 -iteration=0 -while true; do - # Recheck the director container ID every N iterations — bosh create-env may - # recreate the container before starting jobs there, giving it a new ID. - # Stale ID causes all docker exec calls to fail silently. - if [ $((iteration % 4)) -eq 0 ]; then - new_container=$(docker ps --format "{{.ID}}" | head -1) - if [ -n "${new_container}" ] && [ "${new_container}" != "${director_container}" ]; then - echo "Director container ID changed: ${director_container} -> ${new_container}" >&2 - director_container="${new_container}" - fi - fi - iteration=$((iteration + 1)) - - status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) - if echo "${status}" | grep -q "'postgres'.*running"; then - echo "postgres is running" >&2 - break - fi - echo "postgres not yet running, waiting..." >&2 - sleep 3 -done - -echo "Monitoring jobs until all are running or create-env reaches its timeout..." >&2 -while true; do - status=$(docker exec "${director_container}" ${monit_bin} summary 2>/dev/null) - - # Categorise jobs into three states: - # running — 'running' - # failed — 'Execution failed' (BPM could not start/restart the process) - # pending — anything else (Does not exist, not monitored, initializing, restart pending, etc.) 
- # monit summary lines look like: "Process 'job-name' " - all_jobs=$(echo "${status}" | awk "/Process/{print \$2}" | tr -d "'") - - failed_jobs="" - pending_jobs="" - all_running=true - - while read -r job; do - [ -z "${job}" ] && continue - # Extract the status portion after the job name, then strip any trailing - # " - " (e.g. "Execution failed - restart pending" -> "Execution failed") - job_status=$(echo "${status}" | grep "'${job}'" | sed "s/.*'${job}'[[:space:]]*//" | sed "s/[[:space:]]*-[[:space:]].*//" | xargs) - if echo "${job_status}" | grep -qiw "running"; then - : # running — good - elif echo "${job_status}" | grep -qi "execution failed"; then - failed_jobs="${failed_jobs} ${job}" - all_running=false - else - pending_jobs="${pending_jobs} ${job}" - all_running=false - fi - done <<< "${all_jobs}" - - if [ "${all_running}" = "true" ]; then - echo "All jobs are running. Exiting." >&2 - exit 0 - fi - - if [ -n "${pending_jobs}" ]; then - echo "Pending jobs (waiting for them to settle):${pending_jobs}" >&2 - fi - - if [ -z "${failed_jobs}" ]; then - echo "No failed jobs, only pending — skipping fix, will recheck..." >&2 - sleep 10 - continue - fi - - echo "Failed jobs detected:${failed_jobs}" >&2 - echo "Applying fix..." >&2 - - # For each failed job: remove its runc state dir so BPM can re-create it, - # then restart it via monit. - for job in ${failed_jobs}; do - # Map monit job name to runc container id (BPM uses "bpm-" convention) - runc_id="bpm-${job}" - docker exec "${director_container}" bash -c " - runc_root=/var/vcap/sys/run/bpm-runc - monit_bin=${monit_bin} - runc_id='${runc_id}' - if [ -d \"\${runc_root}/\${runc_id}\" ]; then - echo \"Removing runc state dir for \${runc_id}\" >&2 - rm -rf \"\${runc_root:?}/\${runc_id}\" - fi - echo \"Restarting monit job: ${job}\" >&2 - \${monit_bin} restart '${job}' || true - " 2>/dev/null || true - done - - echo "Fix applied, waiting 10s before re-checking..." 
>&2 - sleep 10 -done - -echo "create-env completed, fix-bosh-instance exiting." >&2 diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 98052b03..599005bd 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -262,33 +262,9 @@ EOF "${@}" > "${local_bosh_dir}/bosh-director.yml" echo "Creating BOSH director environment..." >&2 -# -# # Run fix-bosh-instance.sh in background in parallel with create-env. -# # It will wait for the director container to run and then, if needed, fix -# # BPM/runc state so create-env can succeed without getting stuck on failed jobs. -# local fix_log="/tmp/fix-bosh-instance.log" -# bash "${SCRIPT_DIR}/fix-bosh-instance.sh" >"${fix_log}" 2>&1 & -# fix_pid=$! -# -# set +e bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" -# local create_env_rc=$? -# -# if [ -n "${fix_pid}" ]; then -# kill -9 "${fix_pid}" 2>/dev/null || true -# wait "${fix_pid}" 2>/dev/null || true -# echo "===== fix-bosh-instance.sh log =====" >&2 -# cat "${fix_log}" >&2 -# echo "===== end fix-bosh-instance.sh log =====" >&2 -# fi -# set -e -# -# if [ "${create_env_rc}" -ne "0" ]; then -# echo "bosh create-env failed (exit code ${create_env_rc}). Exiting." >&2 -# exit 1 -# fi echo "Extracting BOSH director credentials and CA certificate..." 
>&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index e0de40a1..fb1fc947 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -37,7 +37,7 @@ stemcells: releases: - name: bpm version: 1.4.26 - url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.4.26 + url: file:///usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 - name: haproxy version: 16.4.0+3.2.13 From ed50ee488e064fd50637e1857ab2aff4e62ee555 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 19:39:56 +0100 Subject: [PATCH 57/60] CFN-6544: Dockerfile fix --- ci/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 19c6b182..4d1010d5 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -3,7 +3,7 @@ FROM ghcr.io/cloudfoundry/bosh/docker-cpi:latest # Install all necessary tools for haproxy testflight and dependency autobump ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y wget jq git vim nano python3-pip python3-venv && \ + apt-get install -y tar wget jq git vim nano python3-pip python3-venv && \ apt-get install -y build-essential zlib1g-dev ruby ruby-dev openssl libxslt1-dev libxml2-dev libssl-dev libreadline-dev libyaml-dev libsqlite3-dev sqlite3 && \ apt-get clean @@ -24,6 +24,7 @@ COPY local-releases/bpm-del-runc-state.patch ./bpm-del-runc-state.patch RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../bpm-del-runc-state.patch && \ bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz && \ mv bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ + chmod 644 /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ cd .. 
&& rm -rf bpm-release # Copy ops files From 54f4755bb8d86224da61e9776d85883ee428fac5 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 20:02:44 +0100 Subject: [PATCH 58/60] CFN-6544: Upload local dev releases beforehand --- ci/scripts/start-bosh.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 599005bd..cca9b746 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -242,16 +242,16 @@ EOF } EOF - local ops_files_dir="$PWD/haproxy-boshrelease" + local customization_dir="$PWD/haproxy-boshrelease" echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/ops-files/local-releases.yml \ - -o "$ops_files_dir/bosh-scaled-out.yml" \ - -o "$ops_files_dir/bosh-timeouts.yml" \ - -o "$ops_files_dir/local-releases.yml" \ + -o "$customization_dir/bosh-scaled-out.yml" \ + -o "$customization_dir/bosh-timeouts.yml" \ + -o "$customization_dir/local-releases.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ @@ -286,9 +286,16 @@ EOF echo "Updating BOSH cloud config with Docker network..." >&2 bosh -n update-cloud-config \ docker/cloud-config.yml \ - -o "$ops_files_dir/compilation.yml" \ + -o "$customization_dir/compilation.yml" \ -v network="${docker_network_name}" + echo "Upload local releases..." 
>&2 + for release_tgz in "${customization_dir}"/*.tgz; do + [ -f "${release_tgz}" ] || continue + echo "Uploading release: ${release_tgz}" >&2 + bosh -n upload-release "${release_tgz}" + done + popd > /dev/null } From 6e83b1bda5553744d65cee3a171422b4eacc8941 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 13 Mar 2026 20:10:11 +0100 Subject: [PATCH 59/60] CFN-6544: Upload local dev releases beforehand --- ci/Dockerfile | 2 +- ci/ops/local-releases.yml | 7 +++++-- manifests/haproxy.yml | 4 +--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 4d1010d5..54d292d4 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -22,7 +22,7 @@ RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease # Make local BPM release COPY local-releases/bpm-del-runc-state.patch ./bpm-del-runc-state.patch RUN git clone https://github.com/cloudfoundry/bpm-release.git && cd bpm-release && git apply ../bpm-del-runc-state.patch && \ - bosh create-release --force --timestamp-version --tarball=bpm-patched-dev-release.tgz && \ + bosh create-release --force --version=1.4.26+patch.runc.state --tarball=bpm-patched-dev-release.tgz && \ mv bpm-patched-dev-release.tgz /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ chmod 644 /usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz && \ cd .. && rm -rf bpm-release diff --git a/ci/ops/local-releases.yml b/ci/ops/local-releases.yml index f99e4907..c6e3803e 100644 --- a/ci/ops/local-releases.yml +++ b/ci/ops/local-releases.yml @@ -1,3 +1,6 @@ - type: replace - path: /releases/name=bpm/url - value: file:///usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz \ No newline at end of file + path: /releases/name=bpm? 
+ value: + name: bpm + version: 1.4.26+patch.runc.state + url: file:///usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz \ No newline at end of file diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index fb1fc947..01bb6103 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -36,9 +36,7 @@ stemcells: releases: - name: bpm - version: 1.4.26 - url: file:///usr/local/bosh-deployment/haproxy-boshrelease/bpm-patched-dev-release.tgz - sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 + version: 1.4.26+patch.runc.state - name: haproxy version: 16.4.0+3.2.13 url: https://github.com/cloudfoundry/haproxy-boshrelease/releases/download/v16.4.0+3.2.13/haproxy-16.4.0+3.2.13.tgz From b0e85ed56dcc55e8bf763982b51c2811008bc8ff Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Sat, 14 Mar 2026 01:29:43 +0100 Subject: [PATCH 60/60] CFN-6544: New BPM patch --- ci/local-releases/bpm-del-runc-state.patch | 37 ++++++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/ci/local-releases/bpm-del-runc-state.patch b/ci/local-releases/bpm-del-runc-state.patch index 17eeb1c2..432bac56 100644 --- a/ci/local-releases/bpm-del-runc-state.patch +++ b/ci/local-releases/bpm-del-runc-state.patch @@ -1,12 +1,35 @@ diff --git a/src/bpm/runc/client/client.go b/src/bpm/runc/client/client.go -index f9a2591c..e50a9390 100644 +index f9a2591c..06c15119 100644 --- a/src/bpm/runc/client/client.go +++ b/src/bpm/runc/client/client.go -@@ -228,6 +228,12 @@ func (c *RuncClient) DeleteContainer(containerID string) error { +@@ -228,6 +228,35 @@ func (c *RuncClient) DeleteContainer(containerID string) error { return runcCmd.Run() } -+func (c *RuncClient) DeleteState(containerID string) error { ++func (c *RuncClient) CleanupState(containerID string) error { ++ // kill all lingering processes in the job's cgroup scope dirs. 
++ scopeName := fmt.Sprintf("runc-%s.scope", containerID) ++ cgroupDirs, _ := filepath.Glob(fmt.Sprintf("/sys/fs/cgroup/*/system.slice/%s", scopeName)) ++ cgroupV2Dirs, _ := filepath.Glob(fmt.Sprintf("/sys/fs/cgroup/system.slice/%s", scopeName)) ++ cgroupDirs = append(cgroupDirs, cgroupV2Dirs...) ++ killPidsFromFile := func(path string) { ++ if data, err := os.ReadFile(path); err == nil && len(data) > 0 { ++ for _, line := range strings.Split(string(data), "\n") { ++ var pid int ++ if _, err := fmt.Sscanf(strings.TrimSpace(line), "%d", &pid); err == nil && pid > 0 { ++ _ = syscall.Kill(pid, syscall.SIGKILL) ++ } ++ } ++ } ++ } ++ for _, cgDir := range cgroupDirs { ++ if info, err := os.Stat(cgDir); err == nil && info.IsDir() { ++ killPidsFromFile(filepath.Join(cgDir, "cgroup.procs")) ++ killPidsFromFile(filepath.Join(cgDir, "tasks")) ++ } ++ } ++ ++ // remove the runc state dir so BPM can re-create the container. + args := []string{"-rf", fmt.Sprintf("%s/%s", c.runcRoot, containerID)} + cmd := exec.Command("rm", args...) 
+ return cmd.Run() @@ -16,14 +39,14 @@ index f9a2591c..e50a9390 100644 return os.RemoveAll(bundlePath) } diff --git a/src/bpm/runc/lifecycle/lifecycle.go b/src/bpm/runc/lifecycle/lifecycle.go -index 0621e2d0..cf25b158 100644 +index 0621e2d0..bd00d40d 100644 --- a/src/bpm/runc/lifecycle/lifecycle.go +++ b/src/bpm/runc/lifecycle/lifecycle.go @@ -74,6 +74,7 @@ type RuncClient interface { ContainerState(containerID string) (*specs.State, error) ListContainers() ([]client.ContainerState, error) SignalContainer(containerID string, signal client.Signal) error -+ DeleteState(containerID string) error ++ CleanupState(containerID string) error DeleteContainer(containerID string) error DestroyBundle(bundlePath string) error } @@ -31,8 +54,8 @@ index 0621e2d0..cf25b158 100644 } func (j *RuncLifecycle) RemoveProcess(logger lager.Logger, cfg *config.BPMConfig) error { -+ logger.Info("forcefully-deleting-runc-state") -+ if err := j.runcClient.DeleteState(cfg.ContainerID()); err != nil { ++ logger.Info("forcefully-cleanup-runc-state") ++ if err := j.runcClient.CleanupState(cfg.ContainerID()); err != nil { + return err + } +