diff --git a/tools/stress/docker/device/Dockerfile b/tools/stress/docker/device/Dockerfile new file mode 100644 index 000000000..1f488ca32 --- /dev/null +++ b/tools/stress/docker/device/Dockerfile @@ -0,0 +1,38 @@ +# Stress-test device image. +# +# Layers on top of the e2e device image (dz-local/device by default) and +# replaces the default `doublezero-agent` binary path with a wrapper that the +# orchestrator's SSH runner can invoke as `doublezero-agent`. The wrapper +# injects the per-device --pubkey (read from /etc/doublezero/agent/pubkey, +# populated by run-stress-local.sh) and enables prometheus metrics so the +# observer can scrape them. +# +# This image deliberately does NOT bake an EOS startup-config that runs the +# agent as a daemon. The orchestrator owns the agent lifecycle. +ARG DZ_DEVICE_IMAGE=dz-local/device:dev +FROM ${DZ_DEVICE_IMAGE} + +COPY agent-wrapper.sh /usr/local/bin/doublezero-agent +RUN chmod +x /usr/local/bin/doublezero-agent + +# cEOS provisions admin via NSS with shell /usr/bin/RunCli, which intercepts +# SSH-exec'd commands and feeds them to the EOS Cli parser. The orchestrator's +# hardcoded SSH command (`doublezero-agent -verbose …`) is not valid EOS Cli, +# so we add a separate system user with /bin/bash for the orchestrator to use. +# run-stress-local.sh plants the orchestrator's pubkey into this user's +# authorized_keys at runtime (the keypair is generated per devnet, so we can't +# bake it in). +# +# The agent shells out to `ip netns exec default /usr/bin/Cli -p 15 -c "show +# session-config named X diffs"` to inspect the staged config session, and +# `ip netns exec` requires CAP_SYS_ADMIN. Give `stress` passwordless sudo so +# the wrapper can run the agent as root; without this every apply cycle ends +# in "Operation not permitted" from netns_exec and the agent never settles. +# Append to /etc/sudoers.Eos because cEOS overlays that template onto +# /etc/sudoers at boot (and it does not source /etc/sudoers.d/), so any +# rule we'd add to /etc/sudoers or /etc/sudoers.d/ gets wiped. +RUN useradd -m -s /bin/bash stress \ + && echo 'stress ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers.Eos \ + && visudo -cf /etc/sudoers.Eos + +EXPOSE 22 diff --git a/tools/stress/docker/device/agent-wrapper.sh b/tools/stress/docker/device/agent-wrapper.sh new file mode 100755 index 000000000..b84c85578 --- /dev/null +++ b/tools/stress/docker/device/agent-wrapper.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Wrapper invoked over SSH by the orchestrator as `doublezero-agent`. +# The orchestrator's SSH command is hardcoded today as: +# doublezero-agent -verbose [-controller HOST:PORT] +# It does not pass -pubkey or enable metrics. This wrapper supplies both so +# the agent can fetch its config and the observer can scrape its counters. +# +# The agent must run as root: it shells out to `ip netns exec default +# /usr/bin/Cli` to inspect staged configure-session diffs, and that requires +# CAP_SYS_ADMIN. We invoke ourselves through sudo so the agent runs with +# the privilege it needs even when SSH lands the orchestrator as the `stress` +# user. (The Dockerfile grants `stress` passwordless sudo.) +set -eu + +if [ "$(id -u)" -ne 0 ]; then + exec sudo -E -- "$0" "$@" +fi + +PUBKEY_FILE="/etc/doublezero/agent/pubkey" +PUBKEY="" +if [ -r "$PUBKEY_FILE" ]; then + PUBKEY="$(tr -d '[:space:]' < "$PUBKEY_FILE")" +fi + +EXTRA_ARGS=() +if [ -n "$PUBKEY" ]; then + EXTRA_ARGS+=(-pubkey "$PUBKEY") +fi +# Pick a port the controller-pushed MAIN-CONTROL-PLANE-ACL already permits. +# That ACL binds `system control-plane in` and the controller fully redefines +# it on every apply (`no ip access-list MAIN-CONTROL-PLANE-ACL` + recreate), +# so anything we add gets wiped on the agent's next tick. The default ACL +# does permit TCP 50000-50100, so park the metrics endpoint there. +EXTRA_ARGS+=(-metrics-enable -metrics-addr ":50100") + +exec /mnt/flash/doublezero-agent "${EXTRA_ARGS[@]}" "$@" diff --git a/tools/stress/scripts/README.md b/tools/stress/scripts/README.md new file mode 100644 index 000000000..c14b9f7fc --- /dev/null +++ b/tools/stress/scripts/README.md @@ -0,0 +1,79 @@ +# Local stress-test harness + +`run-stress-local.sh` brings up a containerized devnet (ledger + manager + +controller + funder, via `dev/dzctl start`), adds one custom **stress device** +container whose EOS startup config does NOT include the `doublezero-agent` +daemon, then launches the orchestrator and observer against it. The +orchestrator owns the agent lifecycle (starts it over SSH); the observer +samples the device. + +Components: + +| Piece | Image / source | +| ------------------------------------ | -------------------------------------------------------------- | +| Ledger / manager / controller | e2e harness images (`dz-local/{ledger,manager,controller}`) | +| Stress device | `tools/stress/docker/device/Dockerfile` (extends e2e device) | +| Agent invocation wrapper | `tools/stress/docker/device/agent-wrapper.sh` (in stress image)| +| Stress orchestrator | `tools/stress/device-orchestrator/cmd/device-orchestrator` | +| Stress observer | `tools/stress/device-observer/cmd/device-observer` | + +## Quick start + +```bash +# Full build + run (creates a 4-user sweep with 30s holds by default) +tools/stress/scripts/run-stress-local.sh --clean + +# Skip docker build on subsequent runs +tools/stress/scripts/run-stress-local.sh --no-build + +# Tweak the sweep +tools/stress/scripts/run-stress-local.sh --target-users 8 --hold 60 +``` + +The script ends by printing the orchestrator/observer PIDs and the run +working directory (under `dev/.deploy/dz-local/stress/run//`). +Both processes keep running in the background. Stop them with the +`kill $(cat …)` snippet the script prints. + +## What the stress device differs from the e2e device + +It is the same cEOS base, but the startup config (rendered at run time +by the script) drops the `daemon doublezero-agent` and +`daemon doublezero-telemetry` blocks. cEOS pins admin's NSS shell to +`/usr/bin/RunCli` (the EOS Cli wrapper), so the image adds a separate +`stress` system user with `/bin/bash`; the script plants the +orchestrator's pubkey into its authorized_keys at runtime, and the +orchestrator's SSH session connects as `stress`. + +## Agent metrics port: why 50100, not 9100 + +The agent's prometheus listener is parked on `:50100`, not the default +`:9100`. The cEOS device's `system control-plane` binds +`MAIN-CONTROL-PLANE-ACL` (no `-MGMT` suffix), and the +doublezero-controller's pushed device config fully redefines that ACL +on every apply (starting with `no ip access-list +MAIN-CONTROL-PLANE-ACL`). Any port permit we add via our startup-config +is wiped on the first agent apply. The controller's default ACL does +permit TCP `50000-50100`, so the wrapper at +`/usr/local/bin/doublezero-agent` sets `-metrics-addr :50100` and the +script points the observer's `--agent-metrics-url` at the same port. + +## Caveats / known issues + +- The orchestrator's hardcoded SSH command is + `doublezero-agent -verbose [-controller HOST:PORT]`. It does not pass + `-pubkey` or `-metrics-enable`. The stress image works around this + with the wrapper at `/usr/local/bin/doublezero-agent`, which injects + `-pubkey` from `/etc/doublezero/agent/pubkey` and turns on metrics on + `:50100`. +- Use `--no-agent` to skip the SSH agent entirely; the orchestrator + will only drive the onchain sweep and the observer will only see + passive device state (no agent-log / metrics rows). Useful as a + first smoke test. + +## Teardown + +```bash +dev/dzctl destroy -y +docker rm -f dz-local-device-dzstress 2>/dev/null +``` diff --git a/tools/stress/scripts/run-stress-local.sh b/tools/stress/scripts/run-stress-local.sh new file mode 100755 index 000000000..fc1718731 --- /dev/null +++ b/tools/stress/scripts/run-stress-local.sh @@ -0,0 +1,551 @@ +#!/usr/bin/env bash +# Bring up a containerized devnet plus a stress-test device, then run the +# orchestrator and observer against it. Reuses the e2e ledger / manager / +# controller stack (via `dev/dzctl start`) and adds one custom stress device +# whose EOS config does NOT run a doublezero-agent daemon — the orchestrator +# starts the agent over SSH instead. +# +# Usage: +# tools/stress/scripts/run-stress-local.sh # default run +# tools/stress/scripts/run-stress-local.sh --clean # destroy first +# tools/stress/scripts/run-stress-local.sh --no-build # skip docker build +# tools/stress/scripts/run-stress-local.sh --target-users 4 --hold 60 +# +# After it returns, the orchestrator and observer keep running in the +# background. Their PIDs, log files, and working directory are printed. +# Stop them with: kill $(cat /orchestrator.pid /observer.pid) +set -euo pipefail + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +WORKSPACE_DIR="$(cd -- "${SCRIPT_DIR}/../../.." &> /dev/null && pwd)" + +DEPLOY_ID="${DZ_DEPLOY_ID:-dz-local}" +STRESS_IMAGE="${DZ_STRESS_DEVICE_IMAGE:-dz-local/device-stress:dev}" +BASE_DEVICE_IMAGE="${DZ_DEVICE_IMAGE:-dz-local/device:dev}" + +DEVICE_CODE="${DZ_STRESS_DEVICE_CODE:-dzstress}" +DEVICE_LOCATION="ewr" +DEVICE_EXCHANGE="xewr" +DEVICE_HOST_ID="${DZ_STRESS_DEVICE_HOST_ID:-50}" # offset inside the CYOA /24 + +CONTAINER_NAME="${DEPLOY_ID}-device-${DEVICE_CODE}" +DEFAULT_NETWORK="${DEPLOY_ID}-default" +CYOA_NETWORK="${DEPLOY_ID}-cyoa" + +DEPLOY_DIR="${WORKSPACE_DIR}/dev/.deploy/${DEPLOY_ID}/stress" +WORKING_DIR="${DZ_STRESS_WORKING_DIR:-${DEPLOY_DIR}/run}" +SSH_KEY_PATH="${DEPLOY_DIR}/orchestrator_ed25519" + +TARGET_USERS="${DZ_STRESS_TARGET_USERS:-4}" +USERS_PER_BATCH="${DZ_STRESS_USERS_PER_BATCH:-2}" +HOLD_SECONDS="${DZ_STRESS_HOLD_SECONDS:-30}" +SAMPLE_INTERVAL="${DZ_STRESS_SAMPLE_INTERVAL:-10s}" +# The serviceability program rejects user creates whose client_ip isn't +# globally routable (rejects CGNAT 100.64.0.0/10 and friends). Pin the +# orchestrator's IP allocator to a global-unicast /16 instead of its CGNAT +# default. +CLIENT_IP_BASE="${DZ_STRESS_CLIENT_IP_BASE:-9.200.0.0}" + +CLEAN=false +NO_BUILD=false +NO_AGENT=false +while [[ $# -gt 0 ]]; do + case "$1" in + --clean) CLEAN=true; shift ;; + --no-build) NO_BUILD=true; shift ;; + --no-agent) NO_AGENT=true; shift ;; + --target-users) TARGET_USERS="$2"; shift 2 ;; + --users-per-batch) USERS_PER_BATCH="$2"; shift 2 ;; + --hold) HOLD_SECONDS="$2"; shift 2 ;; + --sample-interval) SAMPLE_INTERVAL="$2"; shift 2 ;; + -h|--help) sed -n '1,/^set -euo/p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;; + *) echo "unknown flag: $1" >&2; exit 2 ;; + esac +done + +log() { printf '\033[1;36m[stress]\033[0m %s\n' "$*" >&2; } + +require() { + command -v "$1" >/dev/null 2>&1 || { echo "missing required tool: $1" >&2; exit 1; } +} +require docker +require jq +require go +require ssh-keygen +require python3 # for IP-in-subnet math + +mkdir -p "$DEPLOY_DIR" "$WORKING_DIR" + +# --------------------------------------------------------------------------- +# Phase 1: bring up the core devnet +# --------------------------------------------------------------------------- +if [ "$CLEAN" = true ]; then + log "destroying any prior devnet ($DEPLOY_ID)" + "${WORKSPACE_DIR}/dev/dzctl" destroy -y || true + docker rm -f "$CONTAINER_NAME" 2>/dev/null || true +fi + +if [ "$NO_BUILD" = false ]; then + log "building e2e docker images via dzctl" + "${WORKSPACE_DIR}/dev/dzctl" build +fi + +log "starting core devnet (ledger, manager, controller, funder)" +# dzctl start currently fails at "doublezero geolocation init" because that +# subcommand was removed from the CLI. The earlier steps (ledger up, +# serviceability deploy, smart-contract init) succeed and that's all we +# need. Re-check post-failure that the smart contract is initialized and +# continue. +if ! "${WORKSPACE_DIR}/dev/dzctl" start --no-build; then + log "dzctl start exited non-zero; verifying chain state" + if ! docker exec "${DEPLOY_ID}-manager" \ + doublezero global-config get >/dev/null 2>&1; then + echo "dzctl start failed AND smart-contract is not initialized" >&2 + exit 1 + fi + log "smart contract is initialized; continuing despite dzctl failure" +fi + +# --------------------------------------------------------------------------- +# Phase 2: build the stress device image +# --------------------------------------------------------------------------- +log "building stress device image: $STRESS_IMAGE" +docker build \ + --build-arg "DZ_DEVICE_IMAGE=${BASE_DEVICE_IMAGE}" \ + -t "$STRESS_IMAGE" \ + "${WORKSPACE_DIR}/tools/stress/docker/device" + +# --------------------------------------------------------------------------- +# Phase 3: discover network / address state +# --------------------------------------------------------------------------- +log "inspecting networks" +CYOA_SUBNET="$(docker network inspect "$CYOA_NETWORK" \ + --format '{{(index .IPAM.Config 0).Subnet}}' 2>/dev/null || true)" +if [ -z "$CYOA_SUBNET" ]; then + # dzctl bailed before creating the CYOA network. Make it ourselves. + log "creating $CYOA_NETWORK (dzctl skipped it)" + docker network create \ + --driver bridge \ + --subnet 9.128.0.0/24 \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + "$CYOA_NETWORK" >/dev/null + CYOA_SUBNET="9.128.0.0/24" +fi +CONTROLLER_IP="$(docker inspect "${DEPLOY_ID}-controller" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)" +LEDGER_IP="$(docker inspect "${DEPLOY_ID}-ledger" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")" + +# The controller defaults `-max-user-tunnel-slots` to 128 — beyond that the +# controller renders only the first 128 tunnels in the device config and +# the agent never knows about higher-index users (the orchestrator's +# onchain provisions succeed regardless, so the bug is silent). Always +# restart the controller with our slot count derived from TARGET_USERS so +# stress sweeps past 128 users actually exercise the device. This also +# replaces any controller dzctl may have started before failing on its +# broken geolocation init step (we override the entrypoint to keep the +# ledger-readiness wait and inject the flag). +CONTROLLER_NAME="${DEPLOY_ID}-controller" +CONTROLLER_MAX_SLOTS="${DZ_STRESS_CONTROLLER_MAX_SLOTS:-$TARGET_USERS}" +if [ "$CONTROLLER_MAX_SLOTS" -lt 128 ]; then + CONTROLLER_MAX_SLOTS=128 +fi +log "starting $CONTROLLER_NAME (max-user-tunnel-slots=$CONTROLLER_MAX_SLOTS)" +docker rm -f "$CONTROLLER_NAME" >/dev/null 2>&1 || true +SERVICEABILITY_PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ + solana address -k /etc/doublezero/manager/dz-program-keypair.json \ + | tr -d '[:space:]')" +docker run -d \ + --name "$CONTROLLER_NAME" \ + --hostname controller \ + --network "$DEFAULT_NETWORK" \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + -e "DZ_LEDGER_URL=http://ledger:8899" \ + -e "DZ_SERVICEABILITY_PROGRAM_ID=${SERVICEABILITY_PROGRAM_ID}" \ + --entrypoint bash \ + "${DZ_CONTROLLER_IMAGE:-dz-local/controller:dev}" \ + -c " + while ! curl -sf -X POST -H 'Content-Type: application/json' \\ + --data '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"getHealth\"}' \\ + \"\${DZ_LEDGER_URL}\" | grep -q '\"result\":\"ok\"'; do + echo 'Waiting for solana validator to be ready...' + sleep 1 + done + exec doublezero-controller start \\ + -listen-addr 0.0.0.0 -listen-port 7000 \\ + -program-id \"\${DZ_SERVICEABILITY_PROGRAM_ID}\" \\ + -solana-rpc-endpoint \"\${DZ_LEDGER_URL}\" \\ + -device-local-asn 65342 \\ + -max-user-tunnel-slots ${CONTROLLER_MAX_SLOTS} \\ + -no-hardware + " >/dev/null +# Re-inspect for the IP. +CONTROLLER_IP="" +for _ in $(seq 1 10); do + CONTROLLER_IP="$(docker inspect "$CONTROLLER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}" 2>/dev/null || true)" + [ -n "$CONTROLLER_IP" ] && break + sleep 1 +done +[ -n "$CONTROLLER_IP" ] || { echo "controller did not get a default-network IP" >&2; exit 1; } + +# Derive an IP inside the CYOA subnet (host octet = DEVICE_HOST_ID) and a +# globally-routable /29 dz_prefix at a non-overlapping host offset. Mirrors +# the rules in e2e/internal/devnet/device.go. Snap to a /29 boundary so the +# prefix is network-aligned in case `--dz-prefixes` ever validates alignment. +read -r CYOA_IP DZ_PREFIX < <(python3 - <}" + +# --------------------------------------------------------------------------- +# Phase 4: SSH keypair for the orchestrator +# --------------------------------------------------------------------------- +if [ ! -f "$SSH_KEY_PATH" ]; then + log "generating orchestrator SSH keypair: $SSH_KEY_PATH" + ssh-keygen -t ed25519 -f "$SSH_KEY_PATH" -N '' -C 'doublezero-stress-orchestrator' >/dev/null +fi +SSH_PUBKEY="$(cat "${SSH_KEY_PATH}.pub")" + +# --------------------------------------------------------------------------- +# Phase 5: start the stress device container (it will block on its +# entrypoint's wait-for-startup-config loop until phase 6 copies the config) +# --------------------------------------------------------------------------- +if docker inspect "$CONTAINER_NAME" >/dev/null 2>&1; then + log "removing existing stress device container" + docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +log "starting stress device container: $CONTAINER_NAME" +docker run -d \ + --name "$CONTAINER_NAME" \ + --hostname "device-${DEVICE_CODE}" \ + --privileged \ + --network "$DEFAULT_NETWORK" \ + --label "dz.malbeclabs.com/type=devnet" \ + --label "dz.malbeclabs.com/deploy-id=${DEPLOY_ID}" \ + "$STRESS_IMAGE" >/dev/null + +# Network ordering matters with containerized EOS: the first network +# attached is the management interface (Management0/eth0), then subsequent +# networks correspond to Ethernet1+ in order. So default → eth0, cyoa → eth1. +docker network connect --ip "$CYOA_IP" "$CYOA_NETWORK" "$CONTAINER_NAME" + +# Inspect for the default-network IP / gateway / prefix Docker assigned — +# the agent's path to the controller goes through Management0, so the EOS +# config needs all three. +DEFAULT_IP="$(docker inspect "$CONTAINER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPAddress}}")" +DEFAULT_PREFIX="$(docker inspect "$CONTAINER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").IPPrefixLen}}")" +DEFAULT_GATEWAY="$(docker inspect "$CONTAINER_NAME" \ + --format "{{(index .NetworkSettings.Networks \"${DEFAULT_NETWORK}\").Gateway}}")" +log "default network: ip=${DEFAULT_IP}/${DEFAULT_PREFIX} gw=${DEFAULT_GATEWAY}" + +# --------------------------------------------------------------------------- +# Phase 6: render EOS startup-config and unblock the container's init +# --------------------------------------------------------------------------- +STARTUP_CONFIG_PATH="${WORKING_DIR}/startup-config" +CYOA_CIDR_PREFIX="${CYOA_SUBNET##*/}" + +# Note: the orchestrator's SSH runner exec's `doublezero-agent`. cEOS pins +# `admin`'s NSS shell to /usr/bin/RunCli, so SSH-exec'd commands hit the EOS +# Cli parser. We connect as a separate /bin/bash system user (`stress`, +# added in the stress device image) and plant the orchestrator's pubkey +# into its authorized_keys post-boot. The `protocol http` / eos-sdk-rpc / +# Loopback0 blocks mirror the e2e device so the agent's hardcoded +# 127.0.0.1:9543 endpoint works. +# +# The agent's prometheus metrics listen on :50100 (set by agent-wrapper.sh) +# because the controller-pushed MAIN-CONTROL-PLANE-ACL — which is the one +# actually bound to `system control-plane in` — permits TCP 50000-50100 by +# default. The controller fully redefines that ACL on every apply, so we +# can't add a port-9100 permit ourselves and have it survive. +cat > "$STARTUP_CONFIG_PATH" </dev/null || echo starting)" + if [ "$status" = "healthy" ]; then + break + fi + sleep 5 +done +if [ "$status" != "healthy" ]; then + echo "stress device did not become healthy (last status: $status)" >&2 + docker logs --tail 50 "$CONTAINER_NAME" >&2 || true + exit 1 +fi + +# Plant the orchestrator's SSH pubkey into the `stress` system user's +# authorized_keys. The user is created in the Dockerfile with /bin/bash so +# SSH-exec'd commands run through bash rather than EOS Cli. The pubkey can +# only be installed at runtime because the keypair is generated per devnet. +docker exec "$CONTAINER_NAME" bash -c ' + mkdir -p /home/stress/.ssh && + chown stress:stress /home/stress/.ssh && + chmod 700 /home/stress/.ssh +' +docker exec -i "$CONTAINER_NAME" bash -c ' + cat > /home/stress/.ssh/authorized_keys && + chown stress:stress /home/stress/.ssh/authorized_keys && + chmod 600 /home/stress/.ssh/authorized_keys +' < "${SSH_KEY_PATH}.pub" + +# --------------------------------------------------------------------------- +# Phase 7: create the device onchain +# --------------------------------------------------------------------------- +log "creating device onchain (code=${DEVICE_CODE})" +docker exec "${DEPLOY_ID}-manager" bash -c " + set -e + if ! doublezero device get --code ${DEVICE_CODE} >/dev/null 2>&1; then + doublezero device create \ + --contributor co01 \ + --code ${DEVICE_CODE} \ + --location ${DEVICE_LOCATION} \ + --exchange ${DEVICE_EXCHANGE} \ + --public-ip ${CYOA_IP} \ + --dz-prefixes ${DZ_PREFIX} \ + --mgmt-vrf mgmt + DEVICE_PK=\$(doublezero device get --code ${DEVICE_CODE} --json | jq -r .account) + doublezero device update --pubkey \"\$DEVICE_PK\" --max-users 128 --desired-status activated + fi +" + +DEVICE_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \ + bash -c "doublezero device get --code ${DEVICE_CODE} --json" | jq -r .account)" +log "device onchain pubkey: $DEVICE_PUBKEY" + +# Plant the pubkey on the device so the agent wrapper can supply --pubkey. +echo -n "$DEVICE_PUBKEY" | docker exec -i "$CONTAINER_NAME" \ + bash -c 'cat > /etc/doublezero/agent/pubkey' + +# Register VPNv4/IPv4 loopback interfaces onchain. Without these, the +# controller reports "device has pathology" every poll and returns an +# empty config — the agent runs but has nothing to apply. The interface +# names + types mirror the e2e harness. +for entry in "Loopback255:vpnv4" "Loopback256:ipv4"; do + iface="${entry%:*}" + iftype="${entry#*:}" + out=$(docker exec "${DEPLOY_ID}-manager" \ + doublezero device interface create "$DEVICE_CODE" "$iface" \ + --loopback-type "$iftype" --bandwidth 10G 2>&1) || true + if echo "$out" | grep -q "already exists"; then + log "loopback ${iface} (${iftype}) already exists onchain" + else + log "registered loopback ${iface} (${iftype})" + fi +done + +PROGRAM_ID="$(docker exec "${DEPLOY_ID}-manager" \ + solana address -k /etc/doublezero/manager/dz-program-keypair.json | tr -d '[:space:]')" +log "serviceability program id: $PROGRAM_ID" + +KEYPAIR_LOCAL="${DEPLOY_DIR}/manager-keypair.json" +docker cp "${DEPLOY_ID}-manager:/root/.config/doublezero/id.json" "$KEYPAIR_LOCAL" +# docker cp preserves the container's file mode (000 for keypairs); make it +# readable by the orchestrator running as the host user. +chmod 600 "$KEYPAIR_LOCAL" + +# Each user the orchestrator provisions onchain needs a prepaid access pass +# keyed on (client_ip, user_payer). The orchestrator signs as the manager, so +# user_payer is the manager's pubkey. Sweep CLIENT_IP_BASE + 0..N to cover +# every IP the orchestrator might use. +# +# The set-access-pass calls fan out via xargs -P so high user counts don't +# bottleneck on serial CLI roundtrips — at 1024 users a serial loop is 12+ +# minutes of sustained txn submission per second and can knock the local +# validator over, while batches of ACCESS_PASS_PARALLEL concurrent calls +# complete in well under a minute. +PAYER_PUBKEY="$(docker exec "${DEPLOY_ID}-manager" \ + solana-keygen pubkey /root/.config/doublezero/id.json | tr -d '[:space:]')" +IFS=. read -r b1 b2 b3 b4 <<<"$CLIENT_IP_BASE" +ACCESS_PASS_PARALLEL="${DZ_STRESS_ACCESS_PASS_PARALLEL:-16}" +log "creating access passes for ${CLIENT_IP_BASE}+0..$((TARGET_USERS-1)) (payer=$PAYER_PUBKEY, parallel=$ACCESS_PASS_PARALLEL)" +export DEPLOY_ID PAYER_PUBKEY b1 b2 b3 b4 +seq 0 $((TARGET_USERS - 1)) | xargs -P "$ACCESS_PASS_PARALLEL" -I{} bash -c ' + i=$1 + host=$(( (b3 << 8) + b4 + i )) + octet3=$(( (host >> 8) & 0xff )) + octet4=$(( host & 0xff )) + client_ip="${b1}.${b2}.${octet3}.${octet4}" + docker exec "${DEPLOY_ID}-manager" \ + doublezero access-pass set \ + --accesspass-type prepaid \ + --client-ip "$client_ip" \ + --user-payer "$PAYER_PUBKEY" >/dev/null +' _ {} + +# --------------------------------------------------------------------------- +# Phase 8: build orchestrator + observer +# --------------------------------------------------------------------------- +log "building orchestrator + observer binaries" +ORCH_BIN="${DEPLOY_DIR}/device-orchestrator" +OBS_BIN="${DEPLOY_DIR}/device-observer" +( cd "$WORKSPACE_DIR" && \ + go build -o "$ORCH_BIN" ./tools/stress/device-orchestrator/cmd/device-orchestrator && \ + go build -o "$OBS_BIN" ./tools/stress/device-observer/cmd/device-observer ) + +# --------------------------------------------------------------------------- +# Phase 9: launch orchestrator + observer +# --------------------------------------------------------------------------- +RUN_DIR="${WORKING_DIR}/$(date -u +%Y%m%dT%H%M%SZ)" +mkdir -p "$RUN_DIR" +log "run working-dir: $RUN_DIR" + +ORCH_ARGS=( + --dut-pubkey "$DEVICE_PUBKEY" + --rpc-url "http://${LEDGER_IP}:8899" + --program-id "$PROGRAM_ID" + --keypair "$KEYPAIR_LOCAL" + --working-dir "$RUN_DIR" + --abort-file "${RUN_DIR}/abort" + --target-user-count "$TARGET_USERS" + --users-per-batch "$USERS_PER_BATCH" + --hold-seconds "$HOLD_SECONDS" + --client-ip-base "$CLIENT_IP_BASE" + --log-level info +) +if [ -n "$CONTROLLER_IP" ]; then + ORCH_ARGS+=(--controller "${CONTROLLER_IP}:7000") +fi +if [ "$NO_AGENT" = true ]; then + ORCH_ARGS+=(--no-agent) +else + ORCH_ARGS+=( + --dut-ssh-host "${CYOA_IP}:22" + --dut-ssh-key "$SSH_KEY_PATH" + --dut-ssh-user stress + ) +fi + +log "launching orchestrator (background)" +nohup "$ORCH_BIN" "${ORCH_ARGS[@]}" \ + > "${RUN_DIR}/orchestrator.stdout" \ + 2> "${RUN_DIR}/orchestrator.stderr" & +ORCH_PID=$! +echo "$ORCH_PID" > "${RUN_DIR}/orchestrator.pid" + +log "launching observer (background)" +nohup "$OBS_BIN" \ + --dut-host "$CYOA_IP" \ + --eapi-user admin --eapi-pass admin \ + --agent-metrics-url "http://${CYOA_IP}:50100/metrics" \ + --working-dir "$RUN_DIR" \ + --abort-file "${RUN_DIR}/abort" \ + --sample-interval "$SAMPLE_INTERVAL" \ + --force \ + > "${RUN_DIR}/observer.stdout" \ + 2> "${RUN_DIR}/observer.stderr" & +OBS_PID=$! +echo "$OBS_PID" > "${RUN_DIR}/observer.pid" + +cat < stress test launched + orchestrator pid : $ORCH_PID (logs: ${RUN_DIR}/orchestrator.std{out,err}) + observer pid : $OBS_PID (logs: ${RUN_DIR}/observer.std{out,err}) + working dir : ${RUN_DIR} + abort sentinel : ${RUN_DIR}/abort + +To stop both: kill \$(cat ${RUN_DIR}/orchestrator.pid ${RUN_DIR}/observer.pid) +To follow: tail -F ${RUN_DIR}/orchestrator.stderr ${RUN_DIR}/observer.stderr +To tear down: ${WORKSPACE_DIR}/dev/dzctl destroy -y && docker rm -f ${CONTAINER_NAME} +EOF