From 25c21f9fab3c720118359b38f76285497c3ad725 Mon Sep 17 00:00:00 2001
From: Nik Weidenbacher <nik@malbeclabs.com>
Date: Tue, 2 Jun 2026 13:56:19 +0000
Subject: [PATCH 1/7] tools/stress: local containerized harness script

Adds a script that brings up the e2e ledger / manager / controller /
funder via dev/dzctl, layers a stress-test device on top of the e2e
device image (with the agent daemon removed and SSH enabled), creates
the device + prepaid access passes onchain, then launches the
device-orchestrator and device-observer against it. Supports a
--no-agent smoke-test path that exercises the onchain sweep + observer
sampling without driving the SSH agent runner.
---
 tools/stress/docker/device/Dockerfile       |  18 +
 tools/stress/docker/device/agent-wrapper.sh |  21 +
 tools/stress/scripts/README.md              |  67 +++
 tools/stress/scripts/run-stress-local.sh    | 436 ++++++++++++++++++++
 4 files changed, 542 insertions(+)
 create mode 100644 tools/stress/docker/device/Dockerfile
 create mode 100755 tools/stress/docker/device/agent-wrapper.sh
 create mode 100644 tools/stress/scripts/README.md
 create mode 100755 tools/stress/scripts/run-stress-local.sh

diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile
new file mode 100644
index 000000000..2ae1422c3
--- /dev/null
+++ b/tools/stress/docker/device/Dockerfile
@@ -0,0 +1,18 @@
+# Stress-test device image.
+#
+# Layers on top of the e2e device image (dz-local/device by default) and
+# replaces the default `doublezero-agent` binary path with a wrapper that the
+# orchestrator's SSH runner can invoke as `doublezero-agent`. The wrapper
+# injects the per-device --pubkey (read from /etc/doublezero/agent/pubkey,
+# populated by run-stress-local.sh) and enables prometheus metrics so the
+# observer can scrape them.
+#
+# This image deliberately does NOT bake an EOS startup-config that runs the
+# agent as a daemon. The orchestrator owns the agent lifecycle.
+ARG DZ_DEVICE_IMAGE=dz-local/device:dev
+FROM ${DZ_DEVICE_IMAGE}
+
+COPY agent-wrapper.sh /usr/local/bin/doublezero-agent
+RUN chmod +x /usr/local/bin/doublezero-agent
+
+EXPOSE 22
diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh
new file mode 100755
index 000000000..dce5cdcfe
--- /dev/null
+++ b/tools/stress/docker/device/agent-wrapper.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Wrapper invoked over SSH by the orchestrator as `doublezero-agent`.
+# The orchestrator's SSH command is hardcoded today as:
+#   doublezero-agent -verbose [-controller HOST:PORT]
+# It does not pass -pubkey or enable metrics. This wrapper supplies both so
+# the agent can fetch its config and the observer can scrape its counters.
+set -eu
+
+PUBKEY_FILE="/etc/doublezero/agent/pubkey"
+PUBKEY=""
+if [ -r "$PUBKEY_FILE" ]; then
+    PUBKEY="$(tr -d '[:space:]' < "$PUBKEY_FILE")"
+fi
+
+EXTRA_ARGS=()
+if [ -n "$PUBKEY" ]; then
+    EXTRA_ARGS+=(-pubkey "$PUBKEY")
+fi
+EXTRA_ARGS+=(-metrics-enable -metrics-addr ":9100")
+
+exec /mnt/flash/doublezero-agent "${EXTRA_ARGS[@]}" "$@"
diff --git a/tools/stress/scripts/README.md b/tools/stress/scripts/README.md
new file mode 100644
index 000000000..464a87b57
--- /dev/null
+++ b/tools/stress/scripts/README.md
@@ -0,0 +1,67 @@
+# Local stress-test harness
+
+`run-stress-local.sh` brings up a containerized devnet (ledger + manager +
+controller + funder, via `dev/dzctl start`), adds one custom **stress device**
+container whose EOS startup config does NOT include the `doublezero-agent`
+daemon, then launches the orchestrator and observer against it. The
+orchestrator owns the agent lifecycle (starts it over SSH); the observer
+samples the device.
+
+Components:
+
+| Piece                                | Image / source                                                 |
+| ------------------------------------ | -------------------------------------------------------------- |
+| Ledger / manager / controller        | e2e harness images (`dz-local/{ledger,manager,controller}`)    |
+| Stress device                        | `tools/stress/docker/device/Dockerfile` (extends e2e device)   |
+| Agent invocation wrapper             | `tools/stress/docker/device/agent-wrapper.sh` (in stress image)|
+| Stress orchestrator                  | `tools/stress/device-orchestrator/cmd/device-orchestrator`     |
+| Stress observer                      | `tools/stress/device-observer/cmd/device-observer`             |
+
+## Quick start
+
+```bash
+# Full build + run (creates a 4-user sweep with 30s holds by default)
+tools/stress/scripts/run-stress-local.sh --clean
+
+# Skip docker build on subsequent runs
+tools/stress/scripts/run-stress-local.sh --no-build
+
+# Tweak the sweep
+tools/stress/scripts/run-stress-local.sh --target-users 8 --hold 60
+```
+
+The script ends by printing the orchestrator/observer PIDs and the run
+working directory (under `dev/.deploy/dz-local/stress/run/<UTC timestamp>/`).
+Both processes keep running in the background. Stop them with the
+`kill $(cat …)` snippet the script prints.
+
+## How the stress device differs from the e2e device
+
+It is the same cEOS base, but the startup config (rendered at run time by the
+script) drops the `daemon doublezero-agent` and `daemon doublezero-telemetry`
+blocks. SSH access for `admin` is enabled via the orchestrator's
+auto-generated ed25519 key, and the admin login shell in `/etc/passwd` is
+flipped to `/bin/bash` after EOS boot so the SSH-exec'd
+`doublezero-agent …` command runs through bash instead of the Cli parser.
+
+## Caveats / known issues
+
+- The orchestrator's hardcoded SSH command is
+  `doublezero-agent -verbose [-controller HOST:PORT]`. It does not pass
+  `-pubkey` or `-metrics-enable`. The stress image works around this with
+  the wrapper at `/usr/local/bin/doublezero-agent`, which injects
+  `-pubkey` from `/etc/doublezero/agent/pubkey` and turns on metrics on
+  `:9100`.
+- Use `--no-agent` to skip the SSH agent entirely; the orchestrator will
+  only drive the onchain sweep and the observer will only see passive
+  device state (no agent-log / metrics rows). Useful as a first smoke test.
+- The observer's `agent_silence` and `apply_config_errors` triggers depend
+  on the agent's metrics endpoint being reachable — they stay quiet under
+  `--no-agent`.
+
+## Teardown
+
+```bash
+dev/dzctl destroy -y
+docker rm -f dz-local-device-dzstress 2>/dev/null
+```
diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh
new file mode 100755
index 000000000..8cb7c37e9
--- /dev/null
+++ b/tools/stress/scripts/run-stress-local.sh
@@ -0,0 +1,436 @@
+#!/usr/bin/env bash
+# Bring up a containerized devnet plus a stress-test device, then run the
+# orchestrator and observer against it. Reuses the e2e ledger / manager /
+# controller stack (via `dev/dzctl start`) and adds one custom stress device
+# whose EOS config does NOT run a doublezero-agent daemon — the orchestrator
+# starts the agent over SSH instead.
+#
+# Usage:
+#   tools/stress/scripts/run-stress-local.sh                # default run
+#   tools/stress/scripts/run-stress-local.sh --clean        # destroy first
+#   tools/stress/scripts/run-stress-local.sh --no-build     # skip docker build
+#   tools/stress/scripts/run-stress-local.sh --target-users 4 --hold 60
+#
+# After it returns, the orchestrator and observer keep running in the
+# background. Their PIDs, log files, and working directory are printed.
+# Stop them with: kill $(cat <working-dir>/orchestrator.pid <working-dir>/observer.pid)
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+WORKSPACE_DIR="$(cd -- "${SCRIPT_DIR}/../../.." &> /dev/null && pwd)"
+
+DEPLOY_ID="${DZ_DEPLOY_ID:-dz-local}"
+STRESS_IMAGE="${DZ_STRESS_DEVICE_IMAGE:-dz-local/device-stress:dev}"
+BASE_DEVICE_IMAGE="${DZ_DEVICE_IMAGE:-dz-local/device:dev}"
+
+DEVICE_CODE="${DZ_STRESS_DEVICE_CODE:-dzstress}"
+DEVICE_LOCATION="ewr"
+DEVICE_EXCHANGE="xewr"
+DEVICE_HOST_ID="${DZ_STRESS_DEVICE_HOST_ID:-50}"   # offset inside the CYOA /24
+
+CONTAINER_NAME="${DEPLOY_ID}-device-${DEVICE_CODE}"
+DEFAULT_NETWORK="${DEPLOY_ID}-default"
+CYOA_NETWORK="${DEPLOY_ID}-cyoa"
+
+DEPLOY_DIR="${WORKSPACE_DIR}/dev/.deploy/${DEPLOY_ID}/stress"
+WORKING_DIR="${DZ_STRESS_WORKING_DIR:-${DEPLOY_DIR}/run}"
+SSH_KEY_PATH="${DEPLOY_DIR}/orchestrator_ed25519"
+
+TARGET_USERS="${DZ_STRESS_TARGET_USERS:-4}"
+USERS_PER_BATCH="${DZ_STRESS_USERS_PER_BATCH:-2}"
+HOLD_SECONDS="${DZ_STRESS_HOLD_SECONDS:-30}"
+SAMPLE_INTERVAL="${DZ_STRESS_SAMPLE_INTERVAL:-10s}"
+# The serviceability program rejects user creates whose client_ip isn't
+# globally routable (rejects CGNAT 100.64.0.0/10 and friends). Pin the
+# orchestrator's IP allocator to a global-unicast /16 instead of its CGNAT
+# default.
+CLIENT_IP_BASE="${DZ_STRESS_CLIENT_IP_BASE:-9.200.0.0}"
+
+CLEAN=false
+NO_BUILD=false
+NO_AGENT=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --clean) CLEAN=true; shift ;;
+        --no-build) NO_BUILD=true; shift ;;
+        --no-agent) NO_AGENT=true; shift ;;
+        --target-users) TARGET_USERS="$2"; shift 2 ;;
+        --users-per-batch) USERS_PER_BATCH="$2"; shift 2 ;;
+        --hold) HOLD_SECONDS="$2"; shift 2 ;;
+        --sample-interval) SAMPLE_INTERVAL="$2"; shift 2 ;;
+        -h|--help) sed -n '1,/^set -euo/p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
+        *) echo "unknown flag: $1" >&2; exit 2 ;;
+    esac
+done
+
+log() { printf '\033[1;36m[stress]\033[0m %s\n' "$*" >&2; }
+
+require() {
+    command -v "$1" >/dev/null 2>&1 || { echo "missing required tool: $1" >&2; exit 1; }
+}
+require docker
+require jq
+require go
+require ssh-keygen
+require python3   # for IP-in-subnet math
+
+mkdir -p "$DEPLOY_DIR" "$WORKING_DIR"
+
+# ---------------------------------------------------------------------------
+# Phase 1: bring up the core devnet
+# ---------------------------------------------------------------------------
+if [ "$CLEAN" = true ]; then
+    log "destroying any prior devnet ($DEPLOY_ID)"
+    "${WORKSPACE_DIR}/dev/dzctl" destroy -y || true
+    docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+fi
+
+if [ "$NO_BUILD" = false ]; then
+    log "building e2e docker images via dzctl"
+    "${WORKSPACE_DIR}/dev/dzctl" build
+fi
+
+log "starting core devnet (ledger, manager, controller, funder)"
+# dzctl start currently fails at "doublezero geolocation init" because that
+# subcommand was removed from the CLI. The earlier steps (ledger up,
+# serviceability deploy, smart-contract init) succeed and that's all we
+# need. Re-check post-failure that the smart contract is initialized and
+# continue.
+if ! "${WORKSPACE_DIR}/dev/dzctl" start --no-build; then
+    log "dzctl start exited non-zero; verifying chain state"
+    if ! docker exec "${DEPLOY_ID}-manager" \
+            doublezero global-config get >/dev/null 2>&1; then
+        echo "dzctl start failed AND smart-contract is not initialized" >&2
+        exit 1
+    fi
+    log "smart contract is initialized; continuing despite dzctl failure"
+fi
+
+# ---------------------------------------------------------------------------
+# Phase 2: build the stress device image
+# ---------------------------------------------------------------------------
+log "building stress device image: $STRESS_IMAGE"
+docker build \
+    --build-arg "DZ_DEVICE_IMAGE=${BASE_DEVICE_IMAGE}" \
+    -t "$STRESS_IMAGE" \
+    "${WORKSPACE_DIR}/tools/stress/docker/device"
+
+# ---------------------------------------------------------------------------
+# Phase 3: discover network / address state
+# ---------------------------------------------------------------------------
+log "inspecting networks"
+CYOA_SUBNET="$(docker network inspect "$CYOA_NETWORK" \
+    --format '{{(index .IPAM.Config 0).Subnet}}' 2>/dev/null || true)"
+if [ -z "$CYOA_SUBNET" ]; then
+    # dzctl bailed before creating the CYOA network. Make it ourselves.
+    log "creating $CYOA_NETWORK (dzctl skipped it)"
+    docker network create \
+        --driver bridge \
+        --subnet 9.128.0.0/24 \
+        --label "dz.malbeclabs.com/type=devnet" \
+        --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
+        "$CYOA_NETWORK" >/dev/null
+    CYOA_SUBNET="9.128.0.0/24"
+fi
+CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \
+    --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)"
+LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \
+    --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")"
+
+# Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a
+# globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors
+# the rules in e2e/internal/devnet/device.go.
+read -r CYOA_IP DZ_PREFIX < <(python3 - <<PY
+import ipaddress
+net = ipaddress.ip_network("$CYOA_SUBNET")
+host_id = $DEVICE_HOST_ID
+ip = net.network_address + host_id
+last = (host_id + 128) if (host_id + 128) < 256 else ((host_id - 128) // 8) * 8
+prefix = ipaddress.ip_address(int(net.network_address) + last)
+print(ip, f"{prefix}/29")
+PY
+)
+
+log "device CYOA IP=$CYOA_IP  dz_prefix=$DZ_PREFIX  controller=${CONTROLLER_IP:-<none>}"
+
+# ---------------------------------------------------------------------------
+# Phase 4: SSH keypair for the orchestrator
+# ---------------------------------------------------------------------------
+if [ ! -f "$SSH_KEY_PATH" ]; then
+    log "generating orchestrator SSH keypair: $SSH_KEY_PATH"
+    ssh-keygen -t ed25519 -f "$SSH_KEY_PATH" -N '' -C 'doublezero-stress-orchestrator' >/dev/null
+fi
+SSH_PUBKEY="$(cat "${SSH_KEY_PATH}.pub")"
+
+# ---------------------------------------------------------------------------
+# Phase 5: render EOS startup-config (no doublezero-agent daemon)
+# ---------------------------------------------------------------------------
+STARTUP_CONFIG_PATH="${WORKING_DIR}/startup-config"
+CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}"
+
+# Note: the orchestrator's SSH runner exec's `doublezero-agent` over SSH. cEOS
+# routes non-EOS-CLI commands through bash for privilege-15 admin users, so
+# our /usr/local/bin/doublezero-agent wrapper runs. The `protocol http` /
+# eos-sdk-rpc / Loopback0 blocks mirror the e2e device so the agent's
+# hardcoded 127.0.0.1:9543 endpoint works.
+cat > "$STARTUP_CONFIG_PATH" <<EOF
+! stress-test device startup-config (no doublezero-agent daemon)
+!
+no aaa root
+!
+username admin privilege 15 role network-admin secret sha512 \$6\$hb.8VFI7A9D/0zi2\$sZady959HlXHgFdWU9r01VDwmbM2CrhDYIXBJzHb3scDP8/t/4ozwxpZbwEgDxbWL.mHYtie0rSO8fRSZ5D0T1
+username admin sshkey ${SSH_PUBKEY}
+!
+service configuration session commit merge
+!
+vrf instance vrf1
+ip routing vrf vrf1
+!
+ip access-list standard allow-all
+   permit any
+!
+management api http-commands
+   protocol http
+   ip access-group allow-all
+   no shutdown
+!
+management api eos-sdk-rpc
+   transport grpc foo
+      localhost loopback
+      service all
+      no disabled
+!
+management api gnmi
+   transport grpc gnmi
+!
+management ssh
+   no shutdown
+!
+hostname ${DEVICE_CODE}
+!
+no service interface inactive port-id allocation disabled
+!
+transceiver qsfp default-mode 4x10G
+!
+service routing protocols model multi-agent
+!
+agent PowerManager shutdown
+agent LedPolicy shutdown
+agent Thermostat shutdown
+agent PowerFuse shutdown
+agent StandbyCpld shutdown
+agent LicenseManager shutdown
+!
+spanning-tree mode mstp
+!
+system l1
+   unsupported speed action error
+   unsupported error-correction action error
+!
+interface Loopback0
+  vrf vrf1
+  ip address 8.8.8.8/32
+!
+interface Ethernet1
+   no switchport
+   ip address ${CYOA_IP}/${CYOA_CIDR_PREFIX}
+!
+interface Management0
+   no shutdown
+!
+ip routing
+!
+router bgp 65342
+   router-id 10.10.10.10
+   vrf vrf1
+     network 8.8.8.8/32 route-map e2e
+!
+route-map e2e permit 10
+   set community 21682:1200
+!
+end
+EOF
+log "rendered startup-config: $STARTUP_CONFIG_PATH"
+
+# ---------------------------------------------------------------------------
+# Phase 6: start the stress device container
+# ---------------------------------------------------------------------------
+if docker inspect "$CONTAINER_NAME" >/dev/null 2>&1; then
+    log "removing existing stress device container"
+    docker rm -f "$CONTAINER_NAME" >/dev/null
+fi
+
+log "starting stress device container: $CONTAINER_NAME"
+docker run -d \
+    --name "$CONTAINER_NAME" \
+    --hostname "device-${DEVICE_CODE}" \
+    --privileged \
+    --network "$DEFAULT_NETWORK" \
+    --label "dz.malbeclabs.com/type=devnet" \
+    --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
+    "$STRESS_IMAGE" >/dev/null
+
+# Attach to the CYOA network with the assigned IP (matches e2e ordering:
+# default first → eth0, cyoa second → eth1). Then drop in the startup-config
+# so the wait loop in the entrypoint can proceed.
+docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME"
+docker exec "$CONTAINER_NAME" mkdir -p /etc/doublezero/agent
+docker cp "$STARTUP_CONFIG_PATH" "${CONTAINER_NAME}:/etc/doublezero/agent/startup-config"
+
+log "waiting for stress device to become healthy"
+for _ in $(seq 1 60); do
+    status="$(docker inspect "$CONTAINER_NAME" --format '{{.State.Health.Status}}' 2>/dev/null || echo starting)"
+    if [ "$status" = "healthy" ]; then
+        break
+    fi
+    sleep 5
+done
+if [ "$status" != "healthy" ]; then
+    echo "stress device did not become healthy (last status: $status)" >&2
+    docker logs --tail 50 "$CONTAINER_NAME" >&2 || true
+    exit 1
+fi
+
+# Force admin's login shell to bash. cEOS provisions admin with /usr/bin/Cli,
+# which doesn't recognize external binaries like `doublezero-agent`. With bash
+# as the login shell, the orchestrator's SSH exec runs the wrapper directly.
+docker exec "$CONTAINER_NAME" \
+    sed -i 's|^\(admin:[^:]*:[^:]*:[^:]*:[^:]*:[^:]*:\).*$|\1/bin/bash|' /etc/passwd
+
+# ---------------------------------------------------------------------------
+# Phase 7: create the device onchain
+# ---------------------------------------------------------------------------
+log "creating device onchain (code=${DEVICE_CODE})"
+docker exec "${DEPLOY_ID}-manager" bash -c "
+    set -e
+    if ! doublezero device get --code ${DEVICE_CODE} >/dev/null 2>&1; then
+        doublezero device create \
+            --contributor co01 \
+            --code ${DEVICE_CODE} \
+            --location ${DEVICE_LOCATION} \
+            --exchange ${DEVICE_EXCHANGE} \
+            --public-ip ${CYOA_IP} \
+            --dz-prefixes ${DZ_PREFIX} \
+            --mgmt-vrf mgmt
+        DEVICE_PK=\$(doublezero device get --code ${DEVICE_CODE} --json | jq -r .account)
+        doublezero device update --pubkey \"\$DEVICE_PK\" --max-users 128 --desired-status activated
+    fi
+"
+
+DEVICE_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \
+    bash -c "doublezero device get --code ${DEVICE_CODE} --json" | jq -er .account)" || {
+    echo "no onchain account for device ${DEVICE_CODE}" >&2; exit 1; }  # jq -e fails on null
+log "device onchain pubkey: $DEVICE_PUBKEY"
+
+# Plant the pubkey on the device so the agent wrapper can supply --pubkey.
+printf '%s' "$DEVICE_PUBKEY" | docker exec -i "$CONTAINER_NAME" bash -c 'cat > /etc/doublezero/agent/pubkey'
+
+PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \
+    solana address -k /etc/doublezero/manager/dz-program-keypair.json | tr -d '[:space:]')"
+log "serviceability program id: $PROGRAM_ID"
+
+KEYPAIR_LOCAL="${DEPLOY_DIR}/manager-keypair.json"
+docker cp "${DEPLOY_ID}-manager:/root/.config/doublezero/id.json" "$KEYPAIR_LOCAL"
+# docker cp preserves the container's file mode (000 for keypairs); make it
+# readable by the orchestrator running as the host user.
+chmod 600 "$KEYPAIR_LOCAL"
+
+# Each user the orchestrator provisions onchain needs a prepaid access pass
+# keyed on (client_ip, user_payer). The orchestrator signs as the manager, so
+# user_payer is the manager's pubkey. Walk CLIENT_IP_BASE + 0..N-1 within its
+# /16; the C-style loop runs zero times for N=0 (BSD `seq 0 -1` counts down).
+PAYER_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \
+    solana-keygen pubkey /root/.config/doublezero/id.json | tr -d '[:space:]')"
+IFS=. read -r b1 b2 b3 b4 <<<"$CLIENT_IP_BASE"
+log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY)"
+for ((i = 0; i < TARGET_USERS; i++)); do
+    host=$(( (b3 << 8) + b4 + i ))
+    # Stop rather than wrap: past the /16 the masked octets would repeat earlier IPs.
+    (( host <= 0xffff )) || { echo "client IP range overflows ${b1}.${b2}.0.0/16 at offset $i" >&2; exit 1; }
+    client_ip="${b1}.${b2}.$(( (host >> 8) & 0xff )).$(( host & 0xff ))"
+    docker exec "${DEPLOY_ID}-manager" \
+        doublezero access-pass set \
+            --accesspass-type prepaid \
+            --client-ip "$client_ip" \
+            --user-payer "$PAYER_PUBKEY" >/dev/null
+done
+
+# ---------------------------------------------------------------------------
+# Phase 8: build orchestrator + observer
+# ---------------------------------------------------------------------------
+log "building orchestrator + observer binaries"
+ORCH_BIN="${DEPLOY_DIR}/device-orchestrator"
+OBS_BIN="${DEPLOY_DIR}/device-observer"
+( cd "$WORKSPACE_DIR" && \
+  go build -o "$ORCH_BIN" ./tools/stress/device-orchestrator/cmd/device-orchestrator && \
+  go build -o "$OBS_BIN"  ./tools/stress/device-observer/cmd/device-observer )
+
+# ---------------------------------------------------------------------------
+# Phase 9: launch orchestrator + observer
+# ---------------------------------------------------------------------------
+RUN_DIR="${WORKING_DIR}/$(date -u +%Y%m%dT%H%M%SZ)"
+mkdir -p "$RUN_DIR"
+log "run working-dir: $RUN_DIR"
+
+ORCH_ARGS=(
+    --dut-pubkey "$DEVICE_PUBKEY"
+    --rpc-url "http://${LEDGER_IP}:8899"
+    --program-id "$PROGRAM_ID"
+    --keypair "$KEYPAIR_LOCAL"
+    --working-dir "$RUN_DIR"
+    --abort-file "${RUN_DIR}/abort"
+    --target-user-count "$TARGET_USERS"
+    --users-per-batch "$USERS_PER_BATCH"
+    --hold-seconds "$HOLD_SECONDS"
+    --client-ip-base "$CLIENT_IP_BASE"
+    --log-level info
+)
+if [ -n "$CONTROLLER_IP" ]; then
+    ORCH_ARGS+=(--controller "${CONTROLLER_IP}:7000")
+fi
+if [ "$NO_AGENT" = true ]; then
+    ORCH_ARGS+=(--no-agent)
+else
+    ORCH_ARGS+=(
+        --dut-ssh-host "${CYOA_IP}:22"
+        --dut-ssh-key  "$SSH_KEY_PATH"
+        --dut-ssh-user admin
+    )
+fi
+
+log "launching orchestrator (background)"
+nohup "$ORCH_BIN" "${ORCH_ARGS[@]}" \
+    > "${RUN_DIR}/orchestrator.stdout" \
+    2> "${RUN_DIR}/orchestrator.stderr" &
+ORCH_PID=$!
+echo "$ORCH_PID" > "${RUN_DIR}/orchestrator.pid"
+
+log "launching observer (background)"
+nohup "$OBS_BIN" \
+    --dut-host "$CYOA_IP" \
+    --eapi-user admin --eapi-pass admin \
+    --agent-metrics-url "http://${CYOA_IP}:9100/metrics" \
+    --working-dir "$RUN_DIR" \
+    --abort-file "${RUN_DIR}/abort" \
+    --sample-interval "$SAMPLE_INTERVAL" \
+    --force \
+    > "${RUN_DIR}/observer.stdout" \
+    2> "${RUN_DIR}/observer.stderr" &
+OBS_PID=$!
+echo "$OBS_PID" > "${RUN_DIR}/observer.pid"
+
+cat <<EOF
+
+==> stress test launched
+    orchestrator pid : $ORCH_PID  (logs: ${RUN_DIR}/orchestrator.std{out,err})
+    observer     pid : $OBS_PID  (logs: ${RUN_DIR}/observer.std{out,err})
+    working dir      : ${RUN_DIR}
+    abort sentinel   : ${RUN_DIR}/abort
+
+To stop both: kill \$(cat ${RUN_DIR}/orchestrator.pid ${RUN_DIR}/observer.pid)
+To follow:    tail -F ${RUN_DIR}/orchestrator.stderr ${RUN_DIR}/observer.stderr
+To tear down: ${WORKSPACE_DIR}/dev/dzctl destroy -y && docker rm -f ${CONTAINER_NAME}
+EOF

From 1bd6ac7d7788b545d2eb83afd91738c3554a84fd Mon Sep 17 00:00:00 2001
From: Nik Weidenbacher <nik@malbeclabs.com>
Date: Tue, 2 Jun 2026 14:42:38 +0000
Subject: [PATCH 2/7] tools/stress: enable SSH agent path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two adjustments to make the SSH-driven agent path work end-to-end:

- Add a `stress` system user with /bin/bash to the stress device image and
  plant the orchestrator's pubkey into its authorized_keys post-boot. cEOS
  pins admin's NSS shell to /usr/bin/RunCli, which intercepts SSH-exec'd
  commands and feeds them to the EOS Cli parser — the orchestrator's
  `doublezero-agent -verbose …` is not valid EOS Cli. The orchestrator
  now connects as `stress` so SSH-exec runs through bash and the
  /usr/local/bin/doublezero-agent wrapper executes.
- Render the device's default-network IP, prefix, and gateway into
  Management0 after the container starts (so the agent can route to the
  controller container), and permit the agent's prometheus port in the
  control-plane ACL. The startup-config render now happens after the
  container is started and inspected — the device's entrypoint blocks
  until the config file appears, so this ordering is safe.
---
 tools/stress/docker/device/Dockerfile    |   9 ++
 tools/stress/scripts/run-stress-local.sh | 110 ++++++++++++++++-------
 2 files changed, 85 insertions(+), 34 deletions(-)

diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile
index 2ae1422c3..d29867264 100644
--- a/tools/stress/docker/device/Dockerfile
+++ b/tools/stress/docker/device/Dockerfile
@@ -15,4 +15,13 @@ FROM ${DZ_DEVICE_IMAGE}
 COPY agent-wrapper.sh /usr/local/bin/doublezero-agent
 RUN chmod +x /usr/local/bin/doublezero-agent
 
+# cEOS provisions admin via NSS with shell /usr/bin/RunCli, which intercepts
+# SSH-exec'd commands and feeds them to the EOS Cli parser. The orchestrator's
+# hardcoded SSH command (`doublezero-agent -verbose …`) is not valid EOS Cli,
+# so we add a separate system user with /bin/bash for the orchestrator to use.
+# run-stress-local.sh plants the orchestrator's pubkey into this user's
+# authorized_keys at runtime (the keypair is generated per devnet, so we can't
+# bake it in).
+RUN useradd -m -s /bin/bash stress
+
 EXPOSE 22
diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh
index 8cb7c37e9..9050df004 100755
--- a/tools/stress/scripts/run-stress-local.sh
+++ b/tools/stress/scripts/run-stress-local.sh
@@ -166,16 +166,54 @@ fi
 SSH_PUBKEY="$(cat "${SSH_KEY_PATH}.pub")"
 
 # ---------------------------------------------------------------------------
-# Phase 5: render EOS startup-config (no doublezero-agent daemon)
+# Phase 5: start the stress device container (it will block on its
+# entrypoint's wait-for-startup-config loop until phase 6 copies the config)
+# ---------------------------------------------------------------------------
+if docker inspect "$CONTAINER_NAME" >/dev/null 2>&1; then
+    log "removing existing stress device container"
+    docker rm -f "$CONTAINER_NAME" >/dev/null
+fi
+
+log "starting stress device container: $CONTAINER_NAME"
+docker run -d \
+    --name "$CONTAINER_NAME" \
+    --hostname "device-${DEVICE_CODE}" \
+    --privileged \
+    --network "$DEFAULT_NETWORK" \
+    --label "dz.malbeclabs.com/type=devnet" \
+    --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
+    "$STRESS_IMAGE" >/dev/null
+
+# Network ordering matters with containerized EOS: the first network
+# attached is the management interface (Management0/eth0), then subsequent
+# networks correspond to Ethernet1+ in order. So default → eth0, cyoa → eth1.
+docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME"
+
+# Inspect for the default-network IP / gateway / prefix Docker assigned —
+# the agent's path to the controller goes through Management0, so the EOS
+# config needs all three.
+DEFAULT_IP="$(docker inspect "$CONTAINER_NAME" \
+    --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")"
+DEFAULT_PREFIX="$(docker inspect "$CONTAINER_NAME" \
+    --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPPrefixLen}}")"
+DEFAULT_GATEWAY="$(docker inspect "$CONTAINER_NAME" \
+    --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").Gateway}}")"
+log "default network: ip=${DEFAULT_IP}/${DEFAULT_PREFIX} gw=${DEFAULT_GATEWAY}"
+
+# ---------------------------------------------------------------------------
+# Phase 6: render EOS startup-config and unblock the container's init
 # ---------------------------------------------------------------------------
 STARTUP_CONFIG_PATH="${WORKING_DIR}/startup-config"
 CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}"
 
-# Note: the orchestrator's SSH runner exec's `doublezero-agent` over SSH. cEOS
-# routes non-EOS-CLI commands through bash for privilege-15 admin users, so
-# our /usr/local/bin/doublezero-agent wrapper runs. The `protocol http` /
-# eos-sdk-rpc / Loopback0 blocks mirror the e2e device so the agent's
-# hardcoded 127.0.0.1:9543 endpoint works.
+# Note: the orchestrator's SSH runner exec's `doublezero-agent`. cEOS pins
+# `admin`'s NSS shell to /usr/bin/RunCli, so SSH-exec'd commands hit the EOS
+# Cli parser. We connect as a separate /bin/bash system user (`stress`,
+# added in the stress device image) and plant the orchestrator's pubkey
+# into its authorized_keys post-boot. The `protocol http` / eos-sdk-rpc /
+# Loopback0 blocks mirror the e2e device so the agent's hardcoded
+# 127.0.0.1:9543 endpoint works. The 999-line ACL exception lets the
+# observer scrape `:9100` (agent prometheus metrics).
 cat > "$STARTUP_CONFIG_PATH" <<EOF
 ! stress-test device startup-config (no doublezero-agent daemon)
 !
@@ -209,6 +247,20 @@ management api gnmi
 management ssh
    no shutdown
 !
+ip access-list MAIN-CONTROL-PLANE-ACL-MGMT
+   counters per-entry
+   10 permit icmp any any
+   20 permit ip any any tracked
+   30 permit tcp any any eq ssh telnet www snmp bgp https
+   40 permit udp any any eq bootps bootpc snmp ntp
+   50 permit tcp any eq bgp any
+   60 permit ahp any any
+   70 permit pim any any
+   80 permit igmp any any
+   90 permit ospf any any
+   100 permit vrrp any any
+   999 permit tcp any any eq 9100
+!
 hostname ${DEVICE_CODE}
 !
 no service interface inactive port-id allocation disabled
@@ -240,9 +292,12 @@ interface Ethernet1
 !
 interface Management0
    no shutdown
+   ip address ${DEFAULT_IP}/${DEFAULT_PREFIX}
 !
 ip routing
 !
+ip route 0.0.0.0/0 ${DEFAULT_GATEWAY}
+!
 router bgp 65342
    router-id 10.10.10.10
    vrf vrf1
@@ -255,28 +310,6 @@ end
 EOF
 log "rendered startup-config: $STARTUP_CONFIG_PATH"
 
-# ---------------------------------------------------------------------------
-# Phase 6: start the stress device container
-# ---------------------------------------------------------------------------
-if docker inspect "$CONTAINER_NAME" >/dev/null 2>&1; then
-    log "removing existing stress device container"
-    docker rm -f "$CONTAINER_NAME" >/dev/null
-fi
-
-log "starting stress device container: $CONTAINER_NAME"
-docker run -d \
-    --name "$CONTAINER_NAME" \
-    --hostname "device-${DEVICE_CODE}" \
-    --privileged \
-    --network "$DEFAULT_NETWORK" \
-    --label "dz.malbeclabs.com/type=devnet" \
-    --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
-    "$STRESS_IMAGE" >/dev/null
-
-# Attach to the CYOA network with the assigned IP (matches e2e ordering:
-# default first → eth0, cyoa second → eth1). Then drop in the startup-config
-# so the wait loop in the entrypoint can proceed.
-docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME"
 docker exec "$CONTAINER_NAME" mkdir -p /etc/doublezero/agent
 docker cp "$STARTUP_CONFIG_PATH" "${CONTAINER_NAME}:/etc/doublezero/agent/startup-config"
 
@@ -294,11 +327,20 @@ if [ "$status" != "healthy" ]; then
     exit 1
 fi
 
-# Force admin's login shell to bash. cEOS provisions admin with /usr/bin/Cli,
-# which doesn't recognize external binaries like `doublezero-agent`. With bash
-# as the login shell, the orchestrator's SSH exec runs the wrapper directly.
-docker exec "$CONTAINER_NAME" \
-    sed -i 's|^\(admin:[^:]*:[^:]*:[^:]*:[^:]*:[^:]*:\).*$|\1/bin/bash|' /etc/passwd
+# Plant the orchestrator's SSH pubkey into the `stress` system user's
+# authorized_keys. The user is created in the Dockerfile with /bin/bash so
+# SSH-exec'd commands run through bash rather than EOS Cli. The pubkey can
+# only be installed at runtime because the keypair is generated per devnet.
+docker exec "$CONTAINER_NAME" bash -c '
+    mkdir -p /home/stress/.ssh &&
+    chown stress:stress /home/stress/.ssh &&
+    chmod 700 /home/stress/.ssh
+'
+docker exec -i "$CONTAINER_NAME" bash -c '
+    cat > /home/stress/.ssh/authorized_keys &&
+    chown stress:stress /home/stress/.ssh/authorized_keys &&
+    chmod 600 /home/stress/.ssh/authorized_keys
+' < "${SSH_KEY_PATH}.pub"
 
 # ---------------------------------------------------------------------------
 # Phase 7: create the device onchain
@@ -397,7 +439,7 @@ else
     ORCH_ARGS+=(
         --dut-ssh-host "${CYOA_IP}:22"
         --dut-ssh-key  "$SSH_KEY_PATH"
-        --dut-ssh-user admin
+        --dut-ssh-user stress
     )
 fi
 

From c66b2fe4ea90e9c4621021de60662f10f1f48d6f Mon Sep 17 00:00:00 2001
From: Nik Weidenbacher <nik@malbeclabs.com>
Date: Tue, 2 Jun 2026 15:10:08 +0000
Subject: [PATCH 3/7] tools/stress: agent metrics on a controller-permitted
 port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The controller-pushed device config fully redefines
MAIN-CONTROL-PLANE-ACL on every agent apply (it starts with
`no ip access-list MAIN-CONTROL-PLANE-ACL`), so any port permit we
add to the startup-config gets wiped on the first apply. The
controller's MAIN-CONTROL-PLANE-ACL is the ACL that's actually
bound to `system control-plane in` (our MAIN-CONTROL-PLANE-ACL-MGMT
is defined but unbound, so adding our permit there had no effect).

That bound ACL does permit TCP 50000-50100 by default, so move the
agent's prometheus listener to :50100 and point the observer's
`--agent-metrics-url` at the same port. Drop the now-unused
MAIN-CONTROL-PLANE-ACL-MGMT block from the startup-config — it was
never bound to anything.
---
 tools/stress/docker/device/agent-wrapper.sh |  7 +++++-
 tools/stress/scripts/run-stress-local.sh    | 25 +++++++--------------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh
index dce5cdcfe..b7abe55c0 100755
--- a/tools/stress/docker/device/agent-wrapper.sh
+++ b/tools/stress/docker/device/agent-wrapper.sh
@@ -16,6 +16,11 @@ EXTRA_ARGS=()
 if [ -n "$PUBKEY" ]; then
     EXTRA_ARGS+=(-pubkey "$PUBKEY")
 fi
-EXTRA_ARGS+=(-metrics-enable -metrics-addr ":9100")
+# Pick a port the controller-pushed MAIN-CONTROL-PLANE-ACL already permits.
+# That ACL binds `system control-plane in` and the controller fully redefines
+# it on every apply (`no ip access-list MAIN-CONTROL-PLANE-ACL` + recreate),
+# so anything we add gets wiped on the agent's next tick. The default ACL
+# does permit TCP 50000-50100, so park the metrics endpoint there.
+EXTRA_ARGS+=(-metrics-enable -metrics-addr ":50100")
 
 exec /mnt/flash/doublezero-agent "${EXTRA_ARGS[@]}" "$@"
diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh
index 9050df004..1ce2f7c15 100755
--- a/tools/stress/scripts/run-stress-local.sh
+++ b/tools/stress/scripts/run-stress-local.sh
@@ -212,8 +212,13 @@ CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}"
 # added in the stress device image) and plant the orchestrator's pubkey
 # into its authorized_keys post-boot. The `protocol http` / eos-sdk-rpc /
 # Loopback0 blocks mirror the e2e device so the agent's hardcoded
-# 127.0.0.1:9543 endpoint works. The 999-line ACL exception lets the
-# observer scrape `:9100` (agent prometheus metrics).
+# 127.0.0.1:9543 endpoint works.
+#
+# The agent's prometheus metrics listen on :50100 (set by agent-wrapper.sh)
+# because the controller-pushed MAIN-CONTROL-PLANE-ACL — which is the one
+# actually bound to `system control-plane in` — permits TCP 50000-50100 by
+# default. The controller fully redefines that ACL on every apply, so we
+# can't add a port-9100 permit ourselves and have it survive.
 cat > "$STARTUP_CONFIG_PATH" <<EOF
 ! stress-test device startup-config (no doublezero-agent daemon)
 !
@@ -247,20 +252,6 @@ management api gnmi
 management ssh
    no shutdown
 !
-ip access-list MAIN-CONTROL-PLANE-ACL-MGMT
-   counters per-entry
-   10 permit icmp any any
-   20 permit ip any any tracked
-   30 permit tcp any any eq ssh telnet www snmp bgp https
-   40 permit udp any any eq bootps bootpc snmp ntp
-   50 permit tcp any eq bgp any
-   60 permit ahp any any
-   70 permit pim any any
-   80 permit igmp any any
-   90 permit ospf any any
-   100 permit vrrp any any
-   999 permit tcp any any eq 9100
-!
 hostname ${DEVICE_CODE}
 !
 no service interface inactive port-id allocation disabled
@@ -454,7 +445,7 @@ log "launching observer (background)"
 nohup "$OBS_BIN" \
     --dut-host "$CYOA_IP" \
     --eapi-user admin --eapi-pass admin \
-    --agent-metrics-url "http://${CYOA_IP}:9100/metrics" \
+    --agent-metrics-url "http://${CYOA_IP}:50100/metrics" \
     --working-dir "$RUN_DIR" \
     --abort-file "${RUN_DIR}/abort" \
     --sample-interval "$SAMPLE_INTERVAL" \

From 31eccb6f948b2eb8b15a995ba1ff248854998892 Mon Sep 17 00:00:00 2001
From: Nik Weidenbacher <nik@malbeclabs.com>
Date: Tue, 2 Jun 2026 15:40:29 +0000
Subject: [PATCH 4/7] tools/stress: start controller + register loopbacks if
 missing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes that make the harness self-sufficient against a fresh
devnet:

- If `dzctl start` bails at the (broken upstream) `doublezero
  geolocation init` step, the controller container is never started.
  When the script detects the controller is missing post-dzctl, start
  it with the same env the e2e harness uses
  (DZ_LEDGER_URL=http://ledger:8899, DZ_SERVICEABILITY_PROGRAM_ID
  resolved from the manager's keypair).
- After creating the device onchain, register Loopback255 (vpnv4) and
  Loopback256 (ipv4). Without these the controller reports "device
  has pathology" every poll and returns an empty config — the agent
  runs but has nothing useful to apply. Interface types mirror the
  e2e harness.

Also update the README to document why the agent's prometheus port
lives at :50100 (the controller-pushed MAIN-CONTROL-PLANE-ACL
overwrites our startup-config ACL on every apply, and that pushed ACL
permits TCP 50000-50100 by default).
---
 tools/stress/scripts/README.md           | 42 ++++++++++++++--------
 tools/stress/scripts/run-stress-local.sh | 46 ++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/tools/stress/scripts/README.md b/tools/stress/scripts/README.md
index 464a87b57..c14b9f7fc 100644
--- a/tools/stress/scripts/README.md
+++ b/tools/stress/scripts/README.md
@@ -37,27 +37,39 @@ Both processes keep running in the background. Stop them with the
 
 ## What the stress device differs from the e2e device
 
-It is the same cEOS base, but the startup config (rendered at run time by the
-script) drops the `daemon doublezero-agent` and `daemon doublezero-telemetry`
-blocks. SSH access for `admin` is enabled via the orchestrator's
-auto-generated ed25519 key, and the admin login shell in `/etc/passwd` is
-flipped to `/bin/bash` after EOS boot so the SSH-exec'd
-`doublezero-agent …` command runs through bash instead of the Cli parser.
+It is the same cEOS base, but the startup config (rendered at run time
+by the script) drops the `daemon doublezero-agent` and
+`daemon doublezero-telemetry` blocks. cEOS pins admin's NSS shell to
+`/usr/bin/RunCli` (the EOS Cli wrapper), so the image adds a separate
+`stress` system user with `/bin/bash`; the script plants the
+orchestrator's pubkey into its authorized_keys at runtime, and the
+orchestrator's SSH session connects as `stress`.
+
+## Agent metrics port: why 50100, not 9100
+
+The agent's prometheus listener is parked on `:50100`, not the default
+`:9100`. The cEOS device's `system control-plane` binds
+`MAIN-CONTROL-PLANE-ACL` (no `-MGMT` suffix), and the
+doublezero-controller's pushed device config fully redefines that ACL
+on every apply (starting with `no ip access-list
+MAIN-CONTROL-PLANE-ACL`). Any port permit we add via our startup-config
+is wiped on the first agent apply. The controller's default ACL does
+permit TCP `50000-50100`, so the wrapper at
+`/usr/local/bin/doublezero-agent` sets `-metrics-addr :50100` and the
+script points the observer's `--agent-metrics-url` at the same port.
 
 ## Caveats / known issues
 
 - The orchestrator's hardcoded SSH command is
   `doublezero-agent -verbose [-controller HOST:PORT]`. It does not pass
-  `-pubkey` or `-metrics-enable`. The stress image works around this with
-  the wrapper at `/usr/local/bin/doublezero-agent`, which injects
+  `-pubkey` or `-metrics-enable`. The stress image works around this
+  with the wrapper at `/usr/local/bin/doublezero-agent`, which injects
   `-pubkey` from `/etc/doublezero/agent/pubkey` and turns on metrics on
-  `:9100`.
-- Use `--no-agent` to skip the SSH agent entirely; the orchestrator will
-  only drive the onchain sweep and the observer will only see passive
-  device state (no agent-log / metrics rows). Useful as a first smoke test.
-- The observer's `agent_silence` and `apply_config_errors` triggers depend
-  on the agent's metrics endpoint being reachable — they stay quiet under
-  `--no-agent`.
+  `:50100`.
+- Use `--no-agent` to skip the SSH agent entirely; the orchestrator
+  will only drive the onchain sweep and the observer will only see
+  passive device state (no agent-log / metrics rows). Useful as a
+  first smoke test.
 
 ## Teardown
 
diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh
index 1ce2f7c15..eb1531f55 100755
--- a/tools/stress/scripts/run-stress-local.sh
+++ b/tools/stress/scripts/run-stress-local.sh
@@ -140,6 +140,35 @@ CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \
 LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \
     --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")"
 
+# dzctl bails on `doublezero geolocation init` before starting the
+# controller, so start it ourselves if missing. The agent's config-fetch
+# path goes through it; without it the agent's polling loop errors and
+# the orchestrator never sees a healthy apply.
+CONTROLLER_NAME="${DEPLOY_ID}-controller"
+if [ -z "$CONTROLLER_IP" ]; then
+    log "starting $CONTROLLER_NAME (dzctl skipped it)"
+    SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \
+        solana address -k /etc/doublezero/manager/dz-program-keypair.json \
+        | tr -d '[:space:]')"
+    docker run -d \
+        --name "$CONTROLLER_NAME" \
+        --hostname controller \
+        --network "$DEFAULT_NETWORK" \
+        --label "dz.malbeclabs.com/type=devnet" \
+        --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
+        -e "DZ_LEDGER_URL=http://ledger:8899" \
+        -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \
+        "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" >/dev/null
+    # Re-inspect for the IP.
+    for _ in $(seq 1 10); do
+        CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \
+            --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)"
+        [ -n "$CONTROLLER_IP" ] && break
+        sleep 1
+    done
+    [ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; }
+fi
+
 # Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a
 # globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors
 # the rules in e2e/internal/devnet/device.go.
@@ -361,6 +390,23 @@ log "device onchain pubkey: $DEVICE_PUBKEY"
 echo -n "$DEVICE_PUBKEY" | docker exec -i "$CONTAINER_NAME" \
     bash -c 'cat > /etc/doublezero/agent/pubkey'
 
+# Register VPNv4/IPv4 loopback interfaces onchain. Without these, the
+# controller reports "device has pathology" every poll and returns an
+# empty config — the agent runs but has nothing to apply. The interface
+# names + types mirror the e2e harness.
+for entry in "Loopback255:vpnv4" "Loopback256:ipv4"; do
+    iface="${entry%:*}"
+    iftype="${entry#*:}"
+    out=$(docker exec "${DEPLOY_ID}-manager" \
+        doublezero device interface create "$DEVICE_CODE" "$iface" \
+            --loopback-type "$iftype" --bandwidth 10G 2>&1) || true
+    if echo "$out" | grep -q "already exists"; then
+        log "loopback ${iface} (${iftype}) already exists onchain"
+    else
+        log "registered loopback ${iface} (${iftype})"
+    fi
+done
+
 PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \
     solana address -k /etc/doublezero/manager/dz-program-keypair.json | tr -d '[:space:]')"
 log "serviceability program id: $PROGRAM_ID"

From 843cac7bef58656efc18db11df9191db6160507e Mon Sep 17 00:00:00 2001
From: Nik Weidenbacher <nik@malbeclabs.com>
Date: Tue, 2 Jun 2026 15:59:13 +0000
Subject: [PATCH 5/7] tools/stress: run the agent as root via sudo in the
 wrapper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The doublezero-agent shells out to
`ip netns exec default /usr/bin/Cli -p 15 -c "show session-config
named X diffs"` to inspect the staged config session, and `ip netns
exec` requires CAP_SYS_ADMIN. The orchestrator's SSH session lands as
the unprivileged `stress` user, so this command fails with "Operation
not permitted" every apply cycle and the agent never settles — every
sweep aborts on apply_config_errors / diff_timeout.

Grant `stress` passwordless sudo and re-exec the wrapper through sudo
so the agent runs as root regardless of which user SSH lands. The
sudo rule has to go in /etc/sudoers.Eos because cEOS overlays that
template onto /etc/sudoers at boot and does not source
/etc/sudoers.d/, so a rule written to /etc/sudoers is overwritten and
one dropped in /etc/sudoers.d/ is never read.

With this fix a 4-user sweep completes cleanly: 28 runlog rows, 0
agent errors, 728 prometheus metric rows captured by the observer.
---
 tools/stress/docker/device/Dockerfile       | 13 ++++++++++++-
 tools/stress/docker/device/agent-wrapper.sh | 10 ++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile
index d29867264..1f488ca32 100644
--- a/tools/stress/docker/device/Dockerfile
+++ b/tools/stress/docker/device/Dockerfile
@@ -22,6 +22,17 @@ RUN chmod +x /usr/local/bin/doublezero-agent
 # run-stress-local.sh plants the orchestrator's pubkey into this user's
 # authorized_keys at runtime (the keypair is generated per devnet, so we can't
 # bake it in).
-RUN useradd -m -s /bin/bash stress
+#
+# The agent shells out to `ip netns exec default /usr/bin/Cli -p 15 -c "show
+# session-config named X diffs"` to inspect the staged config session, and
+# `ip netns exec` requires CAP_SYS_ADMIN. Give `stress` passwordless sudo so
+# the wrapper can run the agent as root; without this every apply cycle ends
+# in "Operation not permitted" from netns_exec and the agent never settles.
+# Append to /etc/sudoers.Eos because cEOS overlays that template onto
+# /etc/sudoers at boot (and it does not source /etc/sudoers.d/), so a rule
+# in /etc/sudoers is overwritten and one in /etc/sudoers.d/ is never read.
+RUN useradd -m -s /bin/bash stress \
+    && echo 'stress ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers.Eos \
+    && visudo -cf /etc/sudoers.Eos
 
 EXPOSE 22
diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh
index b7abe55c0..b84c85578 100755
--- a/tools/stress/docker/device/agent-wrapper.sh
+++ b/tools/stress/docker/device/agent-wrapper.sh
@@ -4,8 +4,18 @@
 #   doublezero-agent -verbose [-controller HOST:PORT]
 # It does not pass -pubkey or enable metrics. This wrapper supplies both so
 # the agent can fetch its config and the observer can scrape its counters.
+#
+# The agent must run as root: it shells out to `ip netns exec default
+# /usr/bin/Cli` to inspect staged configure-session diffs, and that requires
+# CAP_SYS_ADMIN. We invoke ourselves through sudo so the agent runs with
+# the privilege it needs even when SSH lands the orchestrator as the `stress`
+# user. (The Dockerfile grants `stress` passwordless sudo.)
 set -eu
 
+if [ "$(id -u)" -ne 0 ]; then
+    exec sudo -E -- "$0" "$@"
+fi
+
 PUBKEY_FILE="/etc/doublezero/agent/pubkey"
 PUBKEY=""
 if [ -r "$PUBKEY_FILE" ]; then

From 162f3aae7ad1d292077a1cf6b52d715be0e662d1 Mon Sep 17 00:00:00 2001
From: Nik Weidenbacher <nik@malbeclabs.com>
Date: Tue, 2 Jun 2026 20:57:15 +0000
Subject: [PATCH 6/7] tools/stress: parallelize access-pass setup and pass
 max-tunnel-slots

Two adjustments for high-user-count runs:

- Fan out the access-pass setup loop via xargs -P (default 16
  concurrent) so the per-user serial CLI roundtrip doesn't bottleneck
  setup at high TARGET_USERS. At 1024 users a serial loop is ~12
  minutes of sustained txn submission and reliably knocks the local
  validator over; the parallel version completes in well under a
  minute.
- Always (re)start the controller container ourselves with
  -max-user-tunnel-slots set to TARGET_USERS (floor 128). The
  controller defaults the per-device slot count to 128; past that, it
  silently truncates the device config to the first 128 tunnels and
  the agent never applies the rest. dzctl currently can't start the
  controller anyway (its broken geolocation-init step bails first), so
  this also stops being a fallback and becomes the canonical start
  path. The entrypoint override preserves the original ledger-readiness
  wait.

Knobs: DZ_STRESS_ACCESS_PASS_PARALLEL (default 16),
DZ_STRESS_CONTROLLER_MAX_SLOTS (default TARGET_USERS).
---
 tools/stress/scripts/run-stress-local.sh | 92 ++++++++++++++++--------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh
index eb1531f55..5f7d4a8d3 100755
--- a/tools/stress/scripts/run-stress-local.sh
+++ b/tools/stress/scripts/run-stress-local.sh
@@ -140,34 +140,59 @@ CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \
 LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \
     --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")"
 
-# dzctl bails on `doublezero geolocation init` before starting the
-# controller, so start it ourselves if missing. The agent's config-fetch
-# path goes through it; without it the agent's polling loop errors and
-# the orchestrator never sees a healthy apply.
+# The controller defaults `-max-user-tunnel-slots` to 128 — beyond that the
+# controller renders only the first 128 tunnels in the device config and
+# the agent never knows about higher-index users (the orchestrator's
+# onchain provisions succeed regardless, so the bug is silent). Always
+# restart the controller with our slot count derived from TARGET_USERS so
+# stress sweeps past 128 users actually exercise the device. This also
+# replaces any controller dzctl may have started before failing on its
+# broken geolocation init step (we override the entrypoint to keep the
+# ledger-readiness wait and inject the flag).
 CONTROLLER_NAME="${DEPLOY_ID}-controller"
-if [ -z "$CONTROLLER_IP" ]; then
-    log "starting $CONTROLLER_NAME (dzctl skipped it)"
-    SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \
-        solana address -k /etc/doublezero/manager/dz-program-keypair.json \
-        | tr -d '[:space:]')"
-    docker run -d \
-        --name "$CONTROLLER_NAME" \
-        --hostname controller \
-        --network "$DEFAULT_NETWORK" \
-        --label "dz.malbeclabs.com/type=devnet" \
-        --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
-        -e "DZ_LEDGER_URL=http://ledger:8899" \
-        -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \
-        "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" >/dev/null
-    # Re-inspect for the IP.
-    for _ in $(seq 1 10); do
-        CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \
-            --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)"
-        [ -n "$CONTROLLER_IP" ] && break
-        sleep 1
-    done
-    [ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; }
+CONTROLLER_MAX_SLOTS="${DZ_STRESS_CONTROLLER_MAX_SLOTS:-$TARGET_USERS}"
+if [ "$CONTROLLER_MAX_SLOTS" -lt 128 ]; then
+    CONTROLLER_MAX_SLOTS=128
 fi
+log "starting $CONTROLLER_NAME (max-user-tunnel-slots=$CONTROLLER_MAX_SLOTS)"
+docker rm -f "$CONTROLLER_NAME" >/dev/null 2>&1 || true
+SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \
+    solana address -k /etc/doublezero/manager/dz-program-keypair.json \
+    | tr -d '[:space:]')"
+docker run -d \
+    --name "$CONTROLLER_NAME" \
+    --hostname controller \
+    --network "$DEFAULT_NETWORK" \
+    --label "dz.malbeclabs.com/type=devnet" \
+    --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \
+    -e "DZ_LEDGER_URL=http://ledger:8899" \
+    -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \
+    --entrypoint bash \
+    "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" \
+    -c "
+        while ! curl -sf -X POST -H 'Content-Type: application/json' \\
+            --data '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"getHealth\"}' \\
+            \"\${DZ_LEDGER_URL}\" | grep -q '\"result\":\"ok\"'; do
+            echo 'Waiting for solana validator to be ready...'
+            sleep 1
+        done
+        exec doublezero-controller start \\
+            -listen-addr 0.0.0.0 -listen-port 7000 \\
+            -program-id \"\${DZ_SERVICEABILITY_PROGRAM_ID}\" \\
+            -solana-rpc-endpoint \"\${DZ_LEDGER_URL}\" \\
+            -device-local-asn 65342 \\
+            -max-user-tunnel-slots ${CONTROLLER_MAX_SLOTS} \\
+            -no-hardware
+    " >/dev/null
+# Re-inspect for the IP.
+CONTROLLER_IP=""
+for _ in $(seq 1 10); do
+    CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \
+        --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)"
+    [ -n "$CONTROLLER_IP" ] && break
+    sleep 1
+done
+[ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; }
 
 # Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a
 # globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors
@@ -421,11 +446,20 @@ chmod 600 "$KEYPAIR_LOCAL"
 # keyed on (client_ip, user_payer). The orchestrator signs as the manager, so
 # user_payer is the manager's pubkey. Sweep CLIENT_IP_BASE + 0..N to cover
 # every IP the orchestrator might use.
+#
+# The set-access-pass calls fan out via xargs -P so high user counts don't
+# bottleneck on serial CLI roundtrips — at 1024 users a serial loop is 12+
+# minutes of sustained txn submission and can knock the local validator
+# over, while batches of ACCESS_PASS_PARALLEL concurrent calls complete in
+# well under a minute.
 PAYER_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \
     solana-keygen pubkey /root/.config/doublezero/id.json | tr -d '[:space:]')"
 IFS=. read -r b1 b2 b3 b4 <<<"$CLIENT_IP_BASE"
-log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY)"
-for i in $(seq 0 $((TARGET_USERS - 1))); do
+ACCESS_PASS_PARALLEL="${DZ_STRESS_ACCESS_PASS_PARALLEL:-16}"
+log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY, parallel=$ACCESS_PASS_PARALLEL)"
+export DEPLOY_ID PAYER_PUBKEY b1 b2 b3 b4
+seq 0 $((TARGET_USERS - 1)) | xargs -P "$ACCESS_PASS_PARALLEL" -I{} bash -c '
+    i=$1
     host=$(( (b3 << 8) + b4 + i ))
     octet3=$(( (host >> 8) & 0xff ))
     octet4=$(( host & 0xff ))
@@ -435,7 +469,7 @@ for i in $(seq 0 $((TARGET_USERS - 1))); do
             --accesspass-type prepaid \
             --client-ip "$client_ip" \
             --user-payer "$PAYER_PUBKEY" >/dev/null
-done
+' _ {}
 
 # ---------------------------------------------------------------------------
 # Phase 8: build orchestrator + observer

From ab48f61966a451ab94d0f3f8eedf7b7ed82ace1e Mon Sep 17 00:00:00 2001
From: nikw9944 <nik@malbeclabs.com>
Date: Wed, 3 Jun 2026 18:10:55 +0000
Subject: [PATCH 7/7] tools/stress: snap dz_prefix down to /29 boundary

---
 tools/stress/scripts/run-stress-local.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh
index 5f7d4a8d3..fc1718731 100755
--- a/tools/stress/scripts/run-stress-local.sh
+++ b/tools/stress/scripts/run-stress-local.sh
@@ -196,13 +196,15 @@ done
 
 # Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a
 # globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors
-# the rules in e2e/internal/devnet/device.go.
+# the rules in e2e/internal/devnet/device.go. Snap to a /29 boundary so the
+# prefix is network-aligned in case `--dz-prefixes` ever validates alignment.
 read -r CYOA_IP DZ_PREFIX < <(python3 - <<PY
 import ipaddress
 net = ipaddress.ip_network("$CYOA_SUBNET")
 host_id = $DEVICE_HOST_ID
 ip = net.network_address + host_id
-last = (host_id + 128) if (host_id + 128) < 256 else ((host_id - 128) // 8) * 8
+last = (host_id + 128) if (host_id + 128) < 256 else (host_id - 128)
+last &= ~0x7
 prefix = ipaddress.ip_address(int(net.network_address) + last)
 print(ip, f"{prefix}/29")
 PY