From 25c21f9fab3c720118359b38f76285497c3ad725 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Tue, 2 Jun 2026 13:56:19 +0000 Subject: [PATCH 1/7] tools/stress: local containerized harness script Adds a script that brings up the e2e ledger / manager / controller / funder via dev/dzctl, layers a stress-test device on top of the e2e device image (with the agent daemon removed and SSH enabled), creates the device + prepaid access passes onchain, then launches the device-orchestrator and device-observer against it. Supports a --no-agent smoke-test path that exercises the onchain sweep + observer sampling without driving the SSH agent runner. --- tools/stress/docker/device/Dockerfile | 18 + tools/stress/docker/device/agent-wrapper.sh | 21 + tools/stress/scripts/README.md | 67 +++ tools/stress/scripts/run-stress-local.sh | 436 ++++++++++++++++++++ 4 files changed, 542 insertions(+) create mode 100644 tools/stress/docker/device/Dockerfile create mode 100755 tools/stress/docker/device/agent-wrapper.sh create mode 100644 tools/stress/scripts/README.md create mode 100755 tools/stress/scripts/run-stress-local.sh diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile new file mode 100644 index 000000000..2ae1422c3 --- /dev/null +++ b/tools/stress/docker/device/Dockerfile @@ -0,0 +1,18 @@ +# Stress-test device image. +# +# Layers on top of the e2e device image (dz-local/device by default) and +# replaces the default `doublezero-agent` binary path with a wrapper that the +# orchestrator's SSH runner can invoke as `doublezero-agent`. The wrapper +# injects the per-device --pubkey (read from /etc/doublezero/agent/pubkey, +# populated by run-stress-local.sh) and enables prometheus metrics so the +# observer can scrape them. +# +# This image deliberately does NOT bake an EOS startup-config that runs the +# agent as a daemon. The orchestrator owns the agent lifecycle. 
+ARG DZ_DEVICE_IMAGE=dz-local/device:dev +FROM ${DZ_DEVICE_IMAGE} + +COPY agent-wrapper.sh /usr/local/bin/doublezero-agent +RUN chmod +x /usr/local/bin/doublezero-agent + +EXPOSE 22 diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh new file mode 100755 index 000000000..dce5cdcfe --- /dev/null +++ b/tools/stress/docker/device/agent-wrapper.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Wrapper invoked over SSH by the orchestrator as `doublezero-agent`. +# The orchestrator's SSH command is hardcoded today as: +# doublezero-agent -verbose [-controller HOST:PORT] +# It does not pass -pubkey or enable metrics. This wrapper supplies both so +# the agent can fetch its config and the observer can scrape its counters. +set -eu + +PUBKEY_FILE="/etc/doublezero/agent/pubkey" +PUBKEY="" +if [ -r "$PUBKEY_FILE" ]; then + PUBKEY="$(tr -d '[:space:]' < "$PUBKEY_FILE")" +fi + +EXTRA_ARGS=() +if [ -n "$PUBKEY" ]; then + EXTRA_ARGS+=(-pubkey "$PUBKEY") +fi +EXTRA_ARGS+=(-metrics-enable -metrics-addr ":9100") + +exec /mnt/flash/doublezero-agent "${EXTRA_ARGS[@]}" "$@" diff --git a/tools/stress/scripts/README.md b/tools/stress/scripts/README.md new file mode 100644 index 000000000..464a87b57 --- /dev/null +++ b/tools/stress/scripts/README.md @@ -0,0 +1,67 @@ +# Local stress-test harness + +`run-stress-local.sh` brings up a containerized devnet (ledger + manager + +controller + funder, via `dev/dzctl start`), adds one custom **stress device** +container whose EOS startup config does NOT include the `doublezero-agent` +daemon, then launches the orchestrator and observer against it. The +orchestrator owns the agent lifecycle (starts it over SSH); the observer +samples the device. 
+ +Components: + +| Piece | Image / source | +| ------------------------------------ | -------------------------------------------------------------- | +| Ledger / manager / controller | e2e harness images (`dz-local/{ledger,manager,controller}`) | +| Stress device | `tools/stress/docker/device/Dockerfile` (extends e2e device) | +| Agent invocation wrapper | `tools/stress/docker/device/agent-wrapper.sh` (in stress image)| +| Stress orchestrator | `tools/stress/device-orchestrator/cmd/device-orchestrator` | +| Stress observer | `tools/stress/device-observer/cmd/device-observer` | + +## Quick start + +```bash +# Full build + run (creates a 4-user sweep with 30s holds by default) +tools/stress/scripts/run-stress-local.sh --clean + +# Skip docker build on subsequent runs +tools/stress/scripts/run-stress-local.sh --no-build + +# Tweak the sweep +tools/stress/scripts/run-stress-local.sh --target-users 8 --hold 60 +``` + +The script ends by printing the orchestrator/observer PIDs and the run +working directory (under `dev/.deploy/dz-local/stress/run/TIMESTAMP/`). +Both processes keep running in the background. Stop them with the +`kill $(cat …)` snippet the script prints. + +## What the stress device differs from the e2e device + +It is the same cEOS base, but the startup config (rendered at run time by the +script) drops the `daemon doublezero-agent` and `daemon doublezero-telemetry` +blocks. SSH access for `admin` is enabled via the orchestrator's +auto-generated ed25519 key, and the admin login shell in `/etc/passwd` is +flipped to `/bin/bash` after EOS boot so the SSH-exec'd +`doublezero-agent …` command runs through bash instead of the Cli parser. + +## Caveats / known issues + +- The orchestrator's hardcoded SSH command is + `doublezero-agent -verbose [-controller HOST:PORT]`. It does not pass + `-pubkey` or `-metrics-enable`.
The stress image works around this with + the wrapper at `/usr/local/bin/doublezero-agent`, which injects + `-pubkey` from `/etc/doublezero/agent/pubkey` and turns on metrics on + `:9100`. +- Use `--no-agent` to skip the SSH agent entirely; the orchestrator will + only drive the onchain sweep and the observer will only see passive + device state (no agent-log / metrics rows). Useful as a first smoke test. +- The observer's `agent_silence` and `apply_config_errors` triggers depend + on the agent's metrics endpoint being reachable — they stay quiet under + `--no-agent`. + +## Teardown + +```bash +dev/dzctl destroy -y +docker rm -f dz-local-device-dzstress 2>/dev/null +``` diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh new file mode 100755 index 000000000..8cb7c37e9 --- /dev/null +++ b/tools/stress/scripts/run-stress-local.sh @@ -0,0 +1,436 @@ +#!/usr/bin/env bash +# Bring up a containerized devnet plus a stress-test device, then run the +# orchestrator and observer against it. Reuses the e2e ledger / manager / +# controller stack (via `dev/dzctl start`) and adds one custom stress device +# whose EOS config does NOT run a doublezero-agent daemon — the orchestrator +# starts the agent over SSH instead. +# +# Usage: +# tools/stress/scripts/run-stress-local.sh # default run +# tools/stress/scripts/run-stress-local.sh --clean # destroy first +# tools/stress/scripts/run-stress-local.sh --no-build # skip docker build +# tools/stress/scripts/run-stress-local.sh --target-users 4 --hold 60 +# +# After it returns, the orchestrator and observer keep running in the +# background. Their PIDs, log files, and working directory are printed. 
+# Stop them with: kill $(cat RUN_DIR/orchestrator.pid RUN_DIR/observer.pid) +set -euo pipefail + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +WORKSPACE_DIR="$(cd -- "${SCRIPT_DIR}/../../.." &> /dev/null && pwd)" + +DEPLOY_ID="${DZ_DEPLOY_ID:-dz-local}" +STRESS_IMAGE="${DZ_STRESS_DEVICE_IMAGE:-dz-local/device-stress:dev}" +BASE_DEVICE_IMAGE="${DZ_DEVICE_IMAGE:-dz-local/device:dev}" + +DEVICE_CODE="${DZ_STRESS_DEVICE_CODE:-dzstress}" +DEVICE_LOCATION="ewr" +DEVICE_EXCHANGE="xewr" +DEVICE_HOST_ID="${DZ_STRESS_DEVICE_HOST_ID:-50}" # offset inside the CYOA /24 + +CONTAINER_NAME="${DEPLOY_ID}-device-${DEVICE_CODE}" +DEFAULT_NETWORK="${DEPLOY_ID}-default" +CYOA_NETWORK="${DEPLOY_ID}-cyoa" + +DEPLOY_DIR="${WORKSPACE_DIR}/dev/.deploy/${DEPLOY_ID}/stress" +WORKING_DIR="${DZ_STRESS_WORKING_DIR:-${DEPLOY_DIR}/run}" +SSH_KEY_PATH="${DEPLOY_DIR}/orchestrator_ed25519" + +TARGET_USERS="${DZ_STRESS_TARGET_USERS:-4}" +USERS_PER_BATCH="${DZ_STRESS_USERS_PER_BATCH:-2}" +HOLD_SECONDS="${DZ_STRESS_HOLD_SECONDS:-30}" +SAMPLE_INTERVAL="${DZ_STRESS_SAMPLE_INTERVAL:-10s}" +# The serviceability program rejects user creates whose client_ip isn't +# globally routable (rejects CGNAT 100.64.0.0/10 and friends). Pin the +# orchestrator's IP allocator to a global-unicast /16 instead of its CGNAT +# default.
+CLIENT_IP_BASE="${DZ_STRESS_CLIENT_IP_BASE:-9.200.0.0}" + +CLEAN=false +NO_BUILD=false +NO_AGENT=false +while [[ $# -gt 0 ]]; do + case "$1" in + --clean) CLEAN=true; shift ;; + --no-build) NO_BUILD=true; shift ;; + --no-agent) NO_AGENT=true; shift ;; + --target-users) TARGET_USERS="$2"; shift 2 ;; + --users-per-batch) USERS_PER_BATCH="$2"; shift 2 ;; + --hold) HOLD_SECONDS="$2"; shift 2 ;; + --sample-interval) SAMPLE_INTERVAL="$2"; shift 2 ;; + -h|--help) sed -n '1,/^set -euo/p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;; + *) echo "unknown flag: $1" >&2; exit 2 ;; + esac +done + +log() { printf '\033[1;36m[stress]\033[0m %s\n' "$*" >&2; } + +require() { + command -v "$1" >/dev/null 2>&1 || { echo "missing required tool: $1" >&2; exit 1; } +} +require docker +require jq +require go +require ssh-keygen +require python3 # for IP-in-subnet math + +mkdir -p "$DEPLOY_DIR" "$WORKING_DIR" + +# --------------------------------------------------------------------------- +# Phase 1: bring up the core devnet +# --------------------------------------------------------------------------- +if [ "$CLEAN" = true ]; then + log "destroying any prior devnet ($DEPLOY_ID)" + "${WORKSPACE_DIR}/dev/dzctl" destroy -y || true + docker rm -f "$CONTAINER_NAME" 2>/dev/null || true +fi + +if [ "$NO_BUILD" = false ]; then + log "building e2e docker images via dzctl" + "${WORKSPACE_DIR}/dev/dzctl" build +fi + +log "starting core devnet (ledger, manager, controller, funder)" +# dzctl start currently fails at "doublezero geolocation init" because that +# subcommand was removed from the CLI. The earlier steps (ledger up, +# serviceability deploy, smart-contract init) succeed and that's all we +# need. Re-check post-failure that the smart contract is initialized and +# continue. +if ! "${WORKSPACE_DIR}/dev/dzctl" start --no-build; then + log "dzctl start exited non-zero; verifying chain state" + if ! 
docker exec "${DEPLOY_ID}-manager" \ + doublezero global-config get >/dev/null 2>&1; then + echo "dzctl start failed AND smart-contract is not initialized" >&2 + exit 1 + fi + log "smart contract is initialized; continuing despite dzctl failure" +fi + +# --------------------------------------------------------------------------- +# Phase 2: build the stress device image +# --------------------------------------------------------------------------- +log "building stress device image: $STRESS_IMAGE" +docker build \ + --build-arg "DZ_DEVICE_IMAGE=${BASE_DEVICE_IMAGE}" \ + -t "$STRESS_IMAGE" \ + "${WORKSPACE_DIR}/tools/stress/docker/device" + +# --------------------------------------------------------------------------- +# Phase 3: discover network / address state +# --------------------------------------------------------------------------- +log "inspecting networks" +CYOA_SUBNET="$(docker network inspect "$CYOA_NETWORK" \ + --format '{{(index .IPAM.Config 0).Subnet}}' 2>/dev/null || true)" +if [ -z "$CYOA_SUBNET" ]; then + # dzctl bailed before creating the CYOA network. Make it ourselves. + log "creating $CYOA_NETWORK (dzctl skipped it)" + docker network create \ + --driver bridge \ + --subnet 9.128.0.0/24 \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + "$CYOA_NETWORK" >/dev/null + CYOA_SUBNET="9.128.0.0/24" +fi +CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)" +LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")" + +# Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a +# globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors +# the rules in e2e/internal/devnet/device.go. 
+read -r CYOA_IP DZ_PREFIX < <(python3 - <}" + +# --------------------------------------------------------------------------- +# Phase 4: SSH keypair for the orchestrator +# --------------------------------------------------------------------------- +if [ ! -f "$SSH_KEY_PATH" ]; then + log "generating orchestrator SSH keypair: $SSH_KEY_PATH" + ssh-keygen -t ed25519 -f "$SSH_KEY_PATH" -N '' -C 'doublezero-stress-orchestrator' >/dev/null +fi +SSH_PUBKEY="$(cat "${SSH_KEY_PATH}.pub")" + +# --------------------------------------------------------------------------- +# Phase 5: render EOS startup-config (no doublezero-agent daemon) +# --------------------------------------------------------------------------- +STARTUP_CONFIG_PATH="${WORKING_DIR}/startup-config" +CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}" + +# Note: the orchestrator's SSH runner exec's `doublezero-agent` over SSH. cEOS +# routes non-EOS-CLI commands through bash for privilege-15 admin users, so +# our /usr/local/bin/doublezero-agent wrapper runs. The `protocol http` / +# eos-sdk-rpc / Loopback0 blocks mirror the e2e device so the agent's +# hardcoded 127.0.0.1:9543 endpoint works. +cat > "$STARTUP_CONFIG_PATH" </dev/null 2>&1; then + log "removing existing stress device container" + docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +log "starting stress device container: $CONTAINER_NAME" +docker run -d \ + --name "$CONTAINER_NAME" \ + --hostname "device-${DEVICE_CODE}" \ + --privileged \ + --network "$DEFAULT_NETWORK" \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + "$STRESS_IMAGE" >/dev/null + +# Attach to the CYOA network with the assigned IP (matches e2e ordering: +# default first → eth0, cyoa second → eth1). Then drop in the startup-config +# so the wait loop in the entrypoint can proceed. 
+docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME" +docker exec "$CONTAINER_NAME" mkdir -p /etc/doublezero/agent +docker cp "$STARTUP_CONFIG_PATH" "${CONTAINER_NAME}:/etc/doublezero/agent/startup-config" + +log "waiting for stress device to become healthy" +for _ in $(seq 1 60); do + status="$(docker inspect "$CONTAINER_NAME" --format '{{.State.Health.Status}}' 2>/dev/null || echo starting)" + if [ "$status" = "healthy" ]; then + break + fi + sleep 5 +done +if [ "$status" != "healthy" ]; then + echo "stress device did not become healthy (last status: $status)" >&2 + docker logs --tail 50 "$CONTAINER_NAME" >&2 || true + exit 1 +fi + +# Force admin's login shell to bash. cEOS provisions admin with /usr/bin/Cli, +# which doesn't recognize external binaries like `doublezero-agent`. With bash +# as the login shell, the orchestrator's SSH exec runs the wrapper directly. +docker exec "$CONTAINER_NAME" \ + sed -i 's|^\(admin:[^:]*:[^:]*:[^:]*:[^:]*:[^:]*:\).*$|\1/bin/bash|' /etc/passwd + +# --------------------------------------------------------------------------- +# Phase 7: create the device onchain +# --------------------------------------------------------------------------- +log "creating device onchain (code=${DEVICE_CODE})" +docker exec "${DEPLOY_ID}-manager" bash -c " + set -e + if ! 
doublezero device get --code ${DEVICE_CODE} >/dev/null 2>&1; then + doublezero device create \ + --contributor co01 \ + --code ${DEVICE_CODE} \ + --location ${DEVICE_LOCATION} \ + --exchange ${DEVICE_EXCHANGE} \ + --public-ip ${CYOA_IP} \ + --dz-prefixes ${DZ_PREFIX} \ + --mgmt-vrf mgmt + DEVICE_PK=\$(doublezero device get --code ${DEVICE_CODE} --json | jq -r .account) + doublezero device update --pubkey \"\$DEVICE_PK\" --max-users 128 --desired-status activated + fi +" + +DEVICE_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \ + bash -c "doublezero device get --code ${DEVICE_CODE} --json" | jq -r .account)" +log "device onchain pubkey: $DEVICE_PUBKEY" + +# Plant the pubkey on the device so the agent wrapper can supply --pubkey. +echo -n "$DEVICE_PUBKEY" | docker exec -i "$CONTAINER_NAME" \ + bash -c 'cat > /etc/doublezero/agent/pubkey' + +PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ + solana address -k /etc/doublezero/manager/dz-program-keypair.json | tr -d '[:space:]')" +log "serviceability program id: $PROGRAM_ID" + +KEYPAIR_LOCAL="${DEPLOY_DIR}/manager-keypair.json" +docker cp "${DEPLOY_ID}-manager:/root/.config/doublezero/id.json" "$KEYPAIR_LOCAL" +# docker cp preserves the container's file mode (000 for keypairs); make it +# readable by the orchestrator running as the host user. +chmod 600 "$KEYPAIR_LOCAL" + +# Each user the orchestrator provisions onchain needs a prepaid access pass +# keyed on (client_ip, user_payer). The orchestrator signs as the manager, so +# user_payer is the manager's pubkey. Sweep CLIENT_IP_BASE + 0..N to cover +# every IP the orchestrator might use. +PAYER_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \ + solana-keygen pubkey /root/.config/doublezero/id.json | tr -d '[:space:]')" +IFS=. 
read -r b1 b2 b3 b4 <<<"$CLIENT_IP_BASE" +log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY)" +for i in $(seq 0 $((TARGET_USERS - 1))); do + host=$(( (b3 << 8) + b4 + i )) + octet3=$(( (host >> 8) & 0xff )) + octet4=$(( host & 0xff )) + client_ip="${b1}.${b2}.${octet3}.${octet4}" + docker exec "${DEPLOY_ID}-manager" \ + doublezero access-pass set \ + --accesspass-type prepaid \ + --client-ip "$client_ip" \ + --user-payer "$PAYER_PUBKEY" >/dev/null +done + +# --------------------------------------------------------------------------- +# Phase 8: build orchestrator + observer +# --------------------------------------------------------------------------- +log "building orchestrator + observer binaries" +ORCH_BIN="${DEPLOY_DIR}/device-orchestrator" +OBS_BIN="${DEPLOY_DIR}/device-observer" +( cd "$WORKSPACE_DIR" && \ + go build -o "$ORCH_BIN" ./tools/stress/device-orchestrator/cmd/device-orchestrator && \ + go build -o "$OBS_BIN" ./tools/stress/device-observer/cmd/device-observer ) + +# --------------------------------------------------------------------------- +# Phase 9: launch orchestrator + observer +# --------------------------------------------------------------------------- +RUN_DIR="${WORKING_DIR}/$(date -u +%Y%m%dT%H%M%SZ)" +mkdir -p "$RUN_DIR" +log "run working-dir: $RUN_DIR" + +ORCH_ARGS=( + --dut-pubkey "$DEVICE_PUBKEY" + --rpc-url "http://${LEDGER_IP}:8899" + --program-id "$PROGRAM_ID" + --keypair "$KEYPAIR_LOCAL" + --working-dir "$RUN_DIR" + --abort-file "${RUN_DIR}/abort" + --target-user-count "$TARGET_USERS" + --users-per-batch "$USERS_PER_BATCH" + --hold-seconds "$HOLD_SECONDS" + --client-ip-base "$CLIENT_IP_BASE" + --log-level info +) +if [ -n "$CONTROLLER_IP" ]; then + ORCH_ARGS+=(--controller "${CONTROLLER_IP}:7000") +fi +if [ "$NO_AGENT" = true ]; then + ORCH_ARGS+=(--no-agent) +else + ORCH_ARGS+=( + --dut-ssh-host "${CYOA_IP}:22" + --dut-ssh-key "$SSH_KEY_PATH" + --dut-ssh-user admin + ) +fi + +log 
"launching orchestrator (background)" +nohup "$ORCH_BIN" "${ORCH_ARGS[@]}" \ + > "${RUN_DIR}/orchestrator.stdout" \ + 2> "${RUN_DIR}/orchestrator.stderr" & +ORCH_PID=$! +echo "$ORCH_PID" > "${RUN_DIR}/orchestrator.pid" + +log "launching observer (background)" +nohup "$OBS_BIN" \ + --dut-host "$CYOA_IP" \ + --eapi-user admin --eapi-pass admin \ + --agent-metrics-url "http://${CYOA_IP}:9100/metrics" \ + --working-dir "$RUN_DIR" \ + --abort-file "${RUN_DIR}/abort" \ + --sample-interval "$SAMPLE_INTERVAL" \ + --force \ + > "${RUN_DIR}/observer.stdout" \ + 2> "${RUN_DIR}/observer.stderr" & +OBS_PID=$! +echo "$OBS_PID" > "${RUN_DIR}/observer.pid" + +cat < stress test launched + orchestrator pid : $ORCH_PID (logs: ${RUN_DIR}/orchestrator.std{out,err}) + observer pid : $OBS_PID (logs: ${RUN_DIR}/observer.std{out,err}) + working dir : ${RUN_DIR} + abort sentinel : ${RUN_DIR}/abort + +To stop both: kill \$(cat ${RUN_DIR}/orchestrator.pid ${RUN_DIR}/observer.pid) +To follow: tail -F ${RUN_DIR}/orchestrator.stderr ${RUN_DIR}/observer.stderr +To tear down: ${WORKSPACE_DIR}/dev/dzctl destroy -y && docker rm -f ${CONTAINER_NAME} +EOF From 1bd6ac7d7788b545d2eb83afd91738c3554a84fd Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Tue, 2 Jun 2026 14:42:38 +0000 Subject: [PATCH 2/7] tools/stress: enable SSH agent path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two adjustments to make the SSH-driven agent path work end-to-end: - Add a `stress` system user with /bin/bash to the stress device image and plant the orchestrator's pubkey into its authorized_keys post-boot. cEOS pins admin's NSS shell to /usr/bin/RunCli, which intercepts SSH-exec'd commands and feeds them to the EOS Cli parser — the orchestrator's `doublezero-agent -verbose …` is not valid EOS Cli. The orchestrator now connects as `stress` so SSH-exec runs through bash and the /usr/local/bin/doublezero-agent wrapper executes. 
- Render the device's default-network IP, prefix, and gateway into Management0 after the container starts (so the agent can route to the controller container), and permit the agent's prometheus port in the control-plane ACL. The startup-config render now happens after the container is started and inspected — the device's entrypoint blocks until the config file appears, so this ordering is safe. --- tools/stress/docker/device/Dockerfile | 9 ++ tools/stress/scripts/run-stress-local.sh | 110 ++++++++++++++++------- 2 files changed, 85 insertions(+), 34 deletions(-) diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile index 2ae1422c3..d29867264 100644 --- a/tools/stress/docker/device/Dockerfile +++ b/tools/stress/docker/device/Dockerfile @@ -15,4 +15,13 @@ FROM ${DZ_DEVICE_IMAGE} COPY agent-wrapper.sh /usr/local/bin/doublezero-agent RUN chmod +x /usr/local/bin/doublezero-agent +# cEOS provisions admin via NSS with shell /usr/bin/RunCli, which intercepts +# SSH-exec'd commands and feeds them to the EOS Cli parser. The orchestrator's +# hardcoded SSH command (`doublezero-agent -verbose …`) is not valid EOS Cli, +# so we add a separate system user with /bin/bash for the orchestrator to use. +# run-stress-local.sh plants the orchestrator's pubkey into this user's +# authorized_keys at runtime (the keypair is generated per devnet, so we can't +# bake it in). 
+RUN useradd -m -s /bin/bash stress + EXPOSE 22 diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh index 8cb7c37e9..9050df004 100755 --- a/tools/stress/scripts/run-stress-local.sh +++ b/tools/stress/scripts/run-stress-local.sh @@ -166,16 +166,54 @@ fi SSH_PUBKEY="$(cat "${SSH_KEY_PATH}.pub")" # --------------------------------------------------------------------------- -# Phase 5: render EOS startup-config (no doublezero-agent daemon) +# Phase 5: start the stress device container (it will block on its +# entrypoint's wait-for-startup-config loop until phase 6 copies the config) +# --------------------------------------------------------------------------- +if docker inspect "$CONTAINER_NAME" >/dev/null 2>&1; then + log "removing existing stress device container" + docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +log "starting stress device container: $CONTAINER_NAME" +docker run -d \ + --name "$CONTAINER_NAME" \ + --hostname "device-${DEVICE_CODE}" \ + --privileged \ + --network "$DEFAULT_NETWORK" \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + "$STRESS_IMAGE" >/dev/null + +# Network ordering matters with containerized EOS: the first network +# attached is the management interface (Management0/eth0), then subsequent +# networks correspond to Ethernet1+ in order. So default → eth0, cyoa → eth1. +docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME" + +# Inspect for the default-network IP / gateway / prefix Docker assigned — +# the agent's path to the controller goes through Management0, so the EOS +# config needs all three. 
+DEFAULT_IP="$(docker inspect "$CONTAINER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")" +DEFAULT_PREFIX="$(docker inspect "$CONTAINER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPPrefixLen}}")" +DEFAULT_GATEWAY="$(docker inspect "$CONTAINER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").Gateway}}")" +log "default network: ip=${DEFAULT_IP}/${DEFAULT_PREFIX} gw=${DEFAULT_GATEWAY}" + +# --------------------------------------------------------------------------- +# Phase 6: render EOS startup-config and unblock the container's init # --------------------------------------------------------------------------- STARTUP_CONFIG_PATH="${WORKING_DIR}/startup-config" CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}" -# Note: the orchestrator's SSH runner exec's `doublezero-agent` over SSH. cEOS -# routes non-EOS-CLI commands through bash for privilege-15 admin users, so -# our /usr/local/bin/doublezero-agent wrapper runs. The `protocol http` / -# eos-sdk-rpc / Loopback0 blocks mirror the e2e device so the agent's -# hardcoded 127.0.0.1:9543 endpoint works. +# Note: the orchestrator's SSH runner exec's `doublezero-agent`. cEOS pins +# `admin`'s NSS shell to /usr/bin/RunCli, so SSH-exec'd commands hit the EOS +# Cli parser. We connect as a separate /bin/bash system user (`stress`, +# added in the stress device image) and plant the orchestrator's pubkey +# into its authorized_keys post-boot. The `protocol http` / eos-sdk-rpc / +# Loopback0 blocks mirror the e2e device so the agent's hardcoded +# 127.0.0.1:9543 endpoint works. The 999-line ACL exception lets the +# observer scrape `:9100` (agent prometheus metrics). 
cat > "$STARTUP_CONFIG_PATH" </dev/null 2>&1; then - log "removing existing stress device container" - docker rm -f "$CONTAINER_NAME" >/dev/null -fi - -log "starting stress device container: $CONTAINER_NAME" -docker run -d \ - --name "$CONTAINER_NAME" \ - --hostname "device-${DEVICE_CODE}" \ - --privileged \ - --network "$DEFAULT_NETWORK" \ - --label "dz.malbeclabs.com/type=devnet" \ - --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ - "$STRESS_IMAGE" >/dev/null - -# Attach to the CYOA network with the assigned IP (matches e2e ordering: -# default first → eth0, cyoa second → eth1). Then drop in the startup-config -# so the wait loop in the entrypoint can proceed. -docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME" docker exec "$CONTAINER_NAME" mkdir -p /etc/doublezero/agent docker cp "$STARTUP_CONFIG_PATH" "${CONTAINER_NAME}:/etc/doublezero/agent/startup-config" @@ -294,11 +327,20 @@ if [ "$status" != "healthy" ]; then exit 1 fi -# Force admin's login shell to bash. cEOS provisions admin with /usr/bin/Cli, -# which doesn't recognize external binaries like `doublezero-agent`. With bash -# as the login shell, the orchestrator's SSH exec runs the wrapper directly. -docker exec "$CONTAINER_NAME" \ - sed -i 's|^\(admin:[^:]*:[^:]*:[^:]*:[^:]*:[^:]*:\).*$|\1/bin/bash|' /etc/passwd +# Plant the orchestrator's SSH pubkey into the `stress` system user's +# authorized_keys. The user is created in the Dockerfile with /bin/bash so +# SSH-exec'd commands run through bash rather than EOS Cli. The pubkey can +# only be installed at runtime because the keypair is generated per devnet. 
+docker exec "$CONTAINER_NAME" bash -c ' + mkdir -p /home/stress/.ssh && + chown stress:stress /home/stress/.ssh && + chmod 700 /home/stress/.ssh +' +docker exec -i "$CONTAINER_NAME" bash -c ' + cat > /home/stress/.ssh/authorized_keys && + chown stress:stress /home/stress/.ssh/authorized_keys && + chmod 600 /home/stress/.ssh/authorized_keys +' < "${SSH_KEY_PATH}.pub" # --------------------------------------------------------------------------- # Phase 7: create the device onchain @@ -397,7 +439,7 @@ else ORCH_ARGS+=( --dut-ssh-host "${CYOA_IP}:22" --dut-ssh-key "$SSH_KEY_PATH" - --dut-ssh-user admin + --dut-ssh-user stress ) fi From c66b2fe4ea90e9c4621021de60662f10f1f48d6f Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Tue, 2 Jun 2026 15:10:08 +0000 Subject: [PATCH 3/7] tools/stress: agent metrics on a controller-permitted port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The controller-pushed device config fully redefines MAIN-CONTROL-PLANE-ACL on every agent apply (it starts with `no ip access-list MAIN-CONTROL-PLANE-ACL`), so any port permit we add to the startup-config gets wiped on the first apply. The controller's MAIN-CONTROL-PLANE-ACL is the ACL that's actually bound to `system control-plane in` (our MAIN-CONTROL-PLANE-ACL-MGMT is defined but unbound, so adding our permit there had no effect). That bound ACL does permit TCP 50000-50100 by default, so move the agent's prometheus listener to :50100 and point the observer's `--agent-metrics-url` at the same port. Drop the now-unused MAIN-CONTROL-PLANE-ACL-MGMT block from the startup-config — it was never bound to anything. 
--- tools/stress/docker/device/agent-wrapper.sh | 7 +++++- tools/stress/scripts/run-stress-local.sh | 25 +++++++-------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh index dce5cdcfe..b7abe55c0 100755 --- a/tools/stress/docker/device/agent-wrapper.sh +++ b/tools/stress/docker/device/agent-wrapper.sh @@ -16,6 +16,11 @@ EXTRA_ARGS=() if [ -n "$PUBKEY" ]; then EXTRA_ARGS+=(-pubkey "$PUBKEY") fi -EXTRA_ARGS+=(-metrics-enable -metrics-addr ":9100") +# Pick a port the controller-pushed MAIN-CONTROL-PLANE-ACL already permits. +# That ACL binds `system control-plane in` and the controller fully redefines +# it on every apply (`no ip access-list MAIN-CONTROL-PLANE-ACL` + recreate), +# so anything we add gets wiped on the agent's next tick. The default ACL +# does permit TCP 50000-50100, so park the metrics endpoint there. +EXTRA_ARGS+=(-metrics-enable -metrics-addr ":50100") exec /mnt/flash/doublezero-agent "${EXTRA_ARGS[@]}" "$@" diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh index 9050df004..1ce2f7c15 100755 --- a/tools/stress/scripts/run-stress-local.sh +++ b/tools/stress/scripts/run-stress-local.sh @@ -212,8 +212,13 @@ CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}" # added in the stress device image) and plant the orchestrator's pubkey # into its authorized_keys post-boot. The `protocol http` / eos-sdk-rpc / # Loopback0 blocks mirror the e2e device so the agent's hardcoded -# 127.0.0.1:9543 endpoint works. The 999-line ACL exception lets the -# observer scrape `:9100` (agent prometheus metrics). +# 127.0.0.1:9543 endpoint works. +# +# The agent's prometheus metrics listen on :50100 (set by agent-wrapper.sh) +# because the controller-pushed MAIN-CONTROL-PLANE-ACL — which is the one +# actually bound to `system control-plane in` — permits TCP 50000-50100 by +# default. 
The controller fully redefines that ACL on every apply, so we +# can't add a port-9100 permit ourselves and have it survive. cat > "$STARTUP_CONFIG_PATH" < Date: Tue, 2 Jun 2026 15:40:29 +0000 Subject: [PATCH 4/7] tools/stress: start controller + register loopbacks if missing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes that make the harness self-sufficient against a fresh devnet: - If `dzctl start` bails at the (broken upstream) `doublezero geolocation init` step, the controller container is never started. When the script detects the controller is missing post-dzctl, start it with the same env the e2e harness uses (DZ_LEDGER_URL=http://ledger:8899, DZ_SERVICEABILITY_PROGRAM_ID resolved from the manager's keypair). - After creating the device onchain, register Loopback255 (vpnv4) and Loopback256 (ipv4). Without these the controller reports "device has pathology" every poll and returns an empty config — the agent runs but has nothing useful to apply. Interface types mirror the e2e harness. Also update the README to document why the agent's prometheus port lives at :50100 (the controller-pushed MAIN-CONTROL-PLANE-ACL overwrites our startup-config ACL on every apply, and that pushed ACL permits TCP 50000-50100 by default). --- tools/stress/scripts/README.md | 42 ++++++++++++++-------- tools/stress/scripts/run-stress-local.sh | 46 ++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 15 deletions(-) diff --git a/tools/stress/scripts/README.md b/tools/stress/scripts/README.md index 464a87b57..c14b9f7fc 100644 --- a/tools/stress/scripts/README.md +++ b/tools/stress/scripts/README.md @@ -37,27 +37,39 @@ Both processes keep running in the background. Stop them with the ## What the stress device differs from the e2e device -It is the same cEOS base, but the startup config (rendered at run time by the -script) drops the `daemon doublezero-agent` and `daemon doublezero-telemetry` -blocks. 
SSH access for `admin` is enabled via the orchestrator's -auto-generated ed25519 key, and the admin login shell in `/etc/passwd` is -flipped to `/bin/bash` after EOS boot so the SSH-exec'd -`doublezero-agent …` command runs through bash instead of the Cli parser. +It is the same cEOS base, but the startup config (rendered at run time +by the script) drops the `daemon doublezero-agent` and +`daemon doublezero-telemetry` blocks. cEOS pins admin's NSS shell to +`/usr/bin/RunCli` (the EOS Cli wrapper), so the image adds a separate +`stress` system user with `/bin/bash`; the script plants the +orchestrator's pubkey into its authorized_keys at runtime, and the +orchestrator's SSH session connects as `stress`. + +## Agent metrics port: why 50100, not 9100 + +The agent's prometheus listener is parked on `:50100`, not the default +`:9100`. The cEOS device's `system control-plane` binds +`MAIN-CONTROL-PLANE-ACL` (no `-MGMT` suffix), and the +doublezero-controller's pushed device config fully redefines that ACL +on every apply (starting with `no ip access-list +MAIN-CONTROL-PLANE-ACL`). Any port permit we add via our startup-config +is wiped on the first agent apply. The controller's default ACL does +permit TCP `50000-50100`, so the wrapper at +`/usr/local/bin/doublezero-agent` sets `-metrics-addr :50100` and the +script points the observer's `--agent-metrics-url` at the same port. ## Caveats / known issues - The orchestrator's hardcoded SSH command is `doublezero-agent -verbose [-controller HOST:PORT]`. It does not pass - `-pubkey` or `-metrics-enable`. The stress image works around this with - the wrapper at `/usr/local/bin/doublezero-agent`, which injects + `-pubkey` or `-metrics-enable`. The stress image works around this + with the wrapper at `/usr/local/bin/doublezero-agent`, which injects `-pubkey` from `/etc/doublezero/agent/pubkey` and turns on metrics on - `:9100`. 
-- Use `--no-agent` to skip the SSH agent entirely; the orchestrator will - only drive the onchain sweep and the observer will only see passive - device state (no agent-log / metrics rows). Useful as a first smoke test. -- The observer's `agent_silence` and `apply_config_errors` triggers depend - on the agent's metrics endpoint being reachable — they stay quiet under - `--no-agent`. + `:50100`. +- Use `--no-agent` to skip the SSH agent entirely; the orchestrator + will only drive the onchain sweep and the observer will only see + passive device state (no agent-log / metrics rows). Useful as a + first smoke test. ## Teardown diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh index 1ce2f7c15..eb1531f55 100755 --- a/tools/stress/scripts/run-stress-local.sh +++ b/tools/stress/scripts/run-stress-local.sh @@ -140,6 +140,35 @@ CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \ LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \ --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")" +# dzctl bails on `doublezero geolocation init` before starting the +# controller, so start it ourselves if missing. The agent's config-fetch +# path goes through it; without it the agent's polling loop errors and +# the orchestrator never sees a healthy apply. 
+CONTROLLER_NAME="${DEPLOY_ID}-controller" +if [ -z "$CONTROLLER_IP" ]; then + log "starting $CONTROLLER_NAME (dzctl skipped it)" + SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ + solana address -k /etc/doublezero/manager/dz-program-keypair.json \ + | tr -d '[:space:]')" + docker run -d \ + --name "$CONTROLLER_NAME" \ + --hostname controller \ + --network "$DEFAULT_NETWORK" \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + -e "DZ_LEDGER_URL=http://ledger:8899" \ + -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \ + "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" >/dev/null + # Re-inspect for the IP. + for _ in $(seq 1 10); do + CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)" + [ -n "$CONTROLLER_IP" ] && break + sleep 1 + done + [ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; } +fi + # Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a # globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors # the rules in e2e/internal/devnet/device.go. @@ -361,6 +390,23 @@ log "device onchain pubkey: $DEVICE_PUBKEY" echo -n "$DEVICE_PUBKEY" | docker exec -i "$CONTAINER_NAME" \ bash -c 'cat > /etc/doublezero/agent/pubkey' +# Register VPNv4/IPv4 loopback interfaces onchain. Without these, the +# controller reports "device has pathology" every poll and returns an +# empty config — the agent runs but has nothing to apply. The interface +# names + types mirror the e2e harness. 
+for entry in "Loopback255:vpnv4" "Loopback256:ipv4"; do + iface="${entry%:*}" + iftype="${entry#*:}" + out=$(docker exec "${DEPLOY_ID}-manager" \ + doublezero device interface create "$DEVICE_CODE" "$iface" \ + --loopback-type "$iftype" --bandwidth 10G 2>&1) || true + if echo "$out" | grep -q "already exists"; then + log "loopback ${iface} (${iftype}) already exists onchain" + else + log "registered loopback ${iface} (${iftype})" + fi +done + PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ solana address -k /etc/doublezero/manager/dz-program-keypair.json | tr -d '[:space:]')" log "serviceability program id: $PROGRAM_ID" From 843cac7bef58656efc18db11df9191db6160507e Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Tue, 2 Jun 2026 15:59:13 +0000 Subject: [PATCH 5/7] tools/stress: run the agent as root via sudo in the wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doublezero-agent shells out to `ip netns exec default /usr/bin/Cli -p 15 -c "show session-config named X diffs"` to inspect the staged config session, and `ip netns exec` requires CAP_SYS_ADMIN. The orchestrator's SSH session lands as the unprivileged `stress` user, so this command fails with "Operation not permitted" every apply cycle and the agent never settles — every sweep aborts on apply_config_errors / diff_timeout. Grant `stress` passwordless sudo and re-exec the wrapper through sudo so the agent runs as root regardless of which user the SSH session lands as. The sudo rule has to go in /etc/sudoers.Eos because cEOS overlays that template onto /etc/sudoers at boot and does not source /etc/sudoers.d/, so anything we'd write to either of those gets silently wiped. With this fix a 4-user sweep completes cleanly: 28 runlog rows, 0 agent errors, 728 prometheus metric rows captured by the observer. 
--- tools/stress/docker/device/Dockerfile | 13 ++++++++++++- tools/stress/docker/device/agent-wrapper.sh | 10 ++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile index d29867264..1f488ca32 100644 --- a/tools/stress/docker/device/Dockerfile +++ b/tools/stress/docker/device/Dockerfile @@ -22,6 +22,17 @@ RUN chmod +x /usr/local/bin/doublezero-agent # run-stress-local.sh plants the orchestrator's pubkey into this user's # authorized_keys at runtime (the keypair is generated per devnet, so we can't # bake it in). -RUN useradd -m -s /bin/bash stress +# +# The agent shells out to `ip netns exec default /usr/bin/Cli -p 15 -c "show +# session-config named X diffs"` to inspect the staged config session, and +# `ip netns exec` requires CAP_SYS_ADMIN. Give `stress` passwordless sudo so +# the wrapper can run the agent as root; without this every apply cycle ends +# in "Operation not permitted" from netns_exec and the agent never settles. +# Append to /etc/sudoers.Eos because cEOS overlays that template onto +# /etc/sudoers at boot (and it does not source /etc/sudoers.d/), so any +# rule we'd add to /etc/sudoers or /etc/sudoers.d/ gets wiped. +RUN useradd -m -s /bin/bash stress \ + && echo 'stress ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers.Eos \ + && visudo -cf /etc/sudoers.Eos EXPOSE 22 diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh index b7abe55c0..b84c85578 100755 --- a/tools/stress/docker/device/agent-wrapper.sh +++ b/tools/stress/docker/device/agent-wrapper.sh @@ -4,8 +4,18 @@ # doublezero-agent -verbose [-controller HOST:PORT] # It does not pass -pubkey or enable metrics. This wrapper supplies both so # the agent can fetch its config and the observer can scrape its counters. 
+# +# The agent must run as root: it shells out to `ip netns exec default +# /usr/bin/Cli` to inspect staged configure-session diffs, and that requires +# CAP_SYS_ADMIN. We invoke ourselves through sudo so the agent runs with +# the privilege it needs even when SSH lands the orchestrator as the `stress` +# user. (The Dockerfile grants `stress` passwordless sudo.) set -eu +if [ "$(id -u)" -ne 0 ]; then + exec sudo -E -- "$0" "$@" +fi + PUBKEY_FILE="/etc/doublezero/agent/pubkey" PUBKEY="" if [ -r "$PUBKEY_FILE" ]; then From 162f3aae7ad1d292077a1cf6b52d715be0e662d1 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Tue, 2 Jun 2026 20:57:15 +0000 Subject: [PATCH 6/7] tools/stress: parallelize access-pass setup and pass max-tunnel-slots Two adjustments for high-user-count runs: - Fan out the access-pass setup loop via xargs -P (default 16 concurrent) so the per-user serial CLI roundtrip doesn't bottleneck setup at high TARGET_USERS. At 1024 users a serial loop is ~12 minutes of sustained txn submission and reliably knocks the local validator over; the parallel version completes in well under a minute. - Always (re)start the controller container ourselves with -max-user-tunnel-slots set to TARGET_USERS (floor 128). The controller defaults the per-device slot count to 128; past that, it silently truncates the device config to the first 128 tunnels and the agent never applies the rest. dzctl currently can't start the controller anyway (its broken geolocation-init step bails first), so this also stops being a fallback and becomes the canonical start path. The entrypoint override preserves the original ledger-readiness wait. Knobs: DZ_STRESS_ACCESS_PASS_PARALLEL (default 16), DZ_STRESS_CONTROLLER_MAX_SLOTS (default TARGET_USERS). 
--- tools/stress/scripts/run-stress-local.sh | 92 ++++++++++++++++-------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh index eb1531f55..5f7d4a8d3 100755 --- a/tools/stress/scripts/run-stress-local.sh +++ b/tools/stress/scripts/run-stress-local.sh @@ -140,34 +140,59 @@ CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \ LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \ --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")" -# dzctl bails on `doublezero geolocation init` before starting the -# controller, so start it ourselves if missing. The agent's config-fetch -# path goes through it; without it the agent's polling loop errors and -# the orchestrator never sees a healthy apply. +# The controller defaults `-max-user-tunnel-slots` to 128 — beyond that the +# controller renders only the first 128 tunnels in the device config and +# the agent never knows about higher-index users (the orchestrator's +# onchain provisions succeed regardless, so the bug is silent). Always +# restart the controller with our slot count derived from TARGET_USERS so +# stress sweeps past 128 users actually exercise the device. This also +# replaces any controller dzctl may have started before failing on its +# broken geolocation init step (we override the entrypoint to keep the +# ledger-readiness wait and inject the flag). 
CONTROLLER_NAME="${DEPLOY_ID}-controller" -if [ -z "$CONTROLLER_IP" ]; then - log "starting $CONTROLLER_NAME (dzctl skipped it)" - SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ - solana address -k /etc/doublezero/manager/dz-program-keypair.json \ - | tr -d '[:space:]')" - docker run -d \ - --name "$CONTROLLER_NAME" \ - --hostname controller \ - --network "$DEFAULT_NETWORK" \ - --label "dz.malbeclabs.com/type=devnet" \ - --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ - -e "DZ_LEDGER_URL=http://ledger:8899" \ - -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \ - "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" >/dev/null - # Re-inspect for the IP. - for _ in $(seq 1 10); do - CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \ - --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)" - [ -n "$CONTROLLER_IP" ] && break - sleep 1 - done - [ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; } +CONTROLLER_MAX_SLOTS="${DZ_STRESS_CONTROLLER_MAX_SLOTS:-$TARGET_USERS}" +if [ "$CONTROLLER_MAX_SLOTS" -lt 128 ]; then + CONTROLLER_MAX_SLOTS=128 fi +log "starting $CONTROLLER_NAME (max-user-tunnel-slots=$CONTROLLER_MAX_SLOTS)" +docker rm -f "$CONTROLLER_NAME" >/dev/null 2>&1 || true +SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ + solana address -k /etc/doublezero/manager/dz-program-keypair.json \ + | tr -d '[:space:]')" +docker run -d \ + --name "$CONTROLLER_NAME" \ + --hostname controller \ + --network "$DEFAULT_NETWORK" \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + -e "DZ_LEDGER_URL=http://ledger:8899" \ + -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \ + --entrypoint bash \ + "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" \ + -c " + while ! 
curl -sf -X POST -H 'Content-Type: application/json' \\ + --data '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"getHealth\"}' \\ + \"\${DZ_LEDGER_URL}\" | grep -q '\"result\":\"ok\"'; do + echo 'Waiting for solana validator to be ready...' + sleep 1 + done + exec doublezero-controller start \\ + -listen-addr 0.0.0.0 -listen-port 7000 \\ + -program-id \"\${DZ_SERVICEABILITY_PROGRAM_ID}\" \\ + -solana-rpc-endpoint \"\${DZ_LEDGER_URL}\" \\ + -device-local-asn 65342 \\ + -max-user-tunnel-slots ${CONTROLLER_MAX_SLOTS} \\ + -no-hardware + " >/dev/null +# Re-inspect for the IP. +CONTROLLER_IP="" +for _ in $(seq 1 10); do + CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)" + [ -n "$CONTROLLER_IP" ] && break + sleep 1 +done +[ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; } # Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a # globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors @@ -421,11 +446,20 @@ chmod 600 "$KEYPAIR_LOCAL" # keyed on (client_ip, user_payer). The orchestrator signs as the manager, so # user_payer is the manager's pubkey. Sweep CLIENT_IP_BASE + 0..N to cover # every IP the orchestrator might use. +# +# The set-access-pass calls fan out via xargs -P so high user counts don't +# bottleneck on serial CLI roundtrips — at 1024 users a serial loop is 12+ +# minutes of sustained txn submission and can knock the local +# validator over, while batches of ACCESS_PASS_PARALLEL concurrent calls +# complete in well under a minute. PAYER_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \ solana-keygen pubkey /root/.config/doublezero/id.json | tr -d '[:space:]')" IFS=. 
read -r b1 b2 b3 b4 <<<"$CLIENT_IP_BASE" -log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY)" -for i in $(seq 0 $((TARGET_USERS - 1))); do +ACCESS_PASS_PARALLEL="${DZ_STRESS_ACCESS_PASS_PARALLEL:-16}" +log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY, parallel=$ACCESS_PASS_PARALLEL)" +export DEPLOY_ID PAYER_PUBKEY b1 b2 b3 b4 +seq 0 $((TARGET_USERS - 1)) | xargs -P "$ACCESS_PASS_PARALLEL" -I{} bash -c ' + i=$1 host=$(( (b3 << 8) + b4 + i )) octet3=$(( (host >> 8) & 0xff )) octet4=$(( host & 0xff )) @@ -435,7 +469,7 @@ for i in $(seq 0 $((TARGET_USERS - 1))); do --accesspass-type prepaid \ --client-ip "$client_ip" \ --user-payer "$PAYER_PUBKEY" >/dev/null -done +' _ {} # --------------------------------------------------------------------------- # Phase 8: build orchestrator + observer From ab48f61966a451ab94d0f3f8eedf7b7ed82ace1e Mon Sep 17 00:00:00 2001 From: nikw9944 Date: Wed, 3 Jun 2026 18:10:55 +0000 Subject: [PATCH 7/7] tools/stress: snap dz_prefix down to /29 boundary --- tools/stress/scripts/run-stress-local.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh index 5f7d4a8d3..fc1718731 100755 --- a/tools/stress/scripts/run-stress-local.sh +++ b/tools/stress/scripts/run-stress-local.sh @@ -196,13 +196,15 @@ done # Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a # globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors -# the rules in e2e/internal/devnet/device.go. +# the rules in e2e/internal/devnet/device.go. Snap to a /29 boundary so the +# prefix is network-aligned in case `--dz-prefixes` ever validates alignment. read -r CYOA_IP DZ_PREFIX < <(python3 - <