diff --git a/.github/workflows/dependencies.yaml b/.github/workflows/dependencies.yaml index ed2baa8..8cc7cb5 100644 --- a/.github/workflows/dependencies.yaml +++ b/.github/workflows/dependencies.yaml @@ -36,7 +36,7 @@ jobs: - name: Install tools uses: taiki-e/install-action@v2 with: - tool: cargo-deny,cargo-unmaintained,mise@2026.6.5,osv-scanner + tool: cargo-deny,cargo-unmaintained,coreutils,mise@2026.6.5,osv-scanner,ripgrep - name: Trust mise config run: mise trust diff --git a/.mise/config.dart.toml b/.mise/config.dart.toml index 536c5b4..5934e59 100644 --- a/.mise/config.dart.toml +++ b/.mise/config.dart.toml @@ -3,8 +3,8 @@ [vars] # Google dart-archive base URLs — the release tree and the bucket listing — # referenced by [tools.dart] below so its url/version_list_url stay one line. -dart_release = "https://storage.googleapis.com/dart-archive/channels/stable/release" dart_bucket = "https://storage.googleapis.com/storage/v1/b/dart-archive/o" +dart_release = "https://storage.googleapis.com/dart-archive/channels/stable/release" [tools] # cargo: backend -- dart-typegen has no prebuilt binary (crates.io only). @@ -14,14 +14,14 @@ dart_bucket = "https://storage.googleapis.com/storage/v1/b/dart-archive/o" # archive, discovering the latest stable from the bucket listing. `{{ version }}` # etc. resolve in the tool-url context; the static host/path prefix is a var. 
[tools.dart] -version = "latest" url = "{{ vars.dart_release }}/{{ version }}/sdk/dartsdk-{{ os() }}-{{ arch() }}-release.zip" -version_list_url = "{{ vars.dart_bucket }}?prefix=channels/stable/release/&delimiter=/" +version = "latest" version_expr = ''' fromJSON(body).prefixes | filter({ # matches "^channels/stable/release/(\\d+\\.\\d+\\.\\d+)/$" }) | map({split(#, "/")[3]}) | sortVersions()''' +version_list_url = "{{ vars.dart_bucket }}?prefix=channels/stable/release/&delimiter=/" [tasks.dart-pub-get] # `dart format` and `dart analyze` walk up to find pubspec.yaml *and* read diff --git a/.mise/config.linux.toml b/.mise/config.linux.toml index bf30ccb..87c762c 100644 --- a/.mise/config.linux.toml +++ b/.mise/config.linux.toml @@ -1,6 +1,6 @@ # Linux-only mise config. Shared vars (rpath_flag, pylib_flag, conda_openssl, -# py313_unix, …) come from config.toml, which loads first; PYO3_PYTHON keeps its -# config.toml default (py313_unix). +# py3_unix, …) come from config.toml, which loads first; PYO3_PYTHON keeps its +# config.toml default (py3_unix). [tools] # Ships the C `libclang.so` (+ clang resource headers) bindgen needs to build the @@ -44,19 +44,39 @@ LIBCLANG_PATH = "{{ vars.a_conda_clangxx }}/lib" [tasks.preinstall] # Preinstall: the shared cross-platform base (via _setup_all), then verify the -# system build prerequisites mise can't supply -- the C/C++ toolchain, gpg and -# archive tools the Dockerfile installs via apt. A workstation missing any is -# told to install them with its package manager (CI/Docker already have them). +# build prerequisites mise can't supply. The list has a single source of truth -- +# the Dockerfile's APT_PACKAGES: the Docker build exposes it as an env var (the +# ARG is in scope for the preinstall RUN), and a workstation reads it back from +# the Dockerfile in the checkout. 
Each package is probed for presence -- a command +# for the CLI tools (gpg for gnupg, xz for xz-utils, else the package name), a key +# file for ca-certificates and libc6-dev, and the versioned shared library on disk +# for libicu (a file check, not ldconfig, so it doesn't hinge on the ld.so cache +# being refreshed). An empty list is a hard error, not a silent skip. depends = ["_setup_all"] description = "Preinstall: shared base + verify system build prerequisites" run = """ -pkgs="bzip2 ca-certificates curl g++ gcc git gnupg libc6-dev libicu74 make unzip xz-utils" +pkgs="${APT_PACKAGES:-}" +if [ -z "$pkgs" ]; then + dockerfile="{{ config_root }}/Dockerfile" + pkgs="$(rg '^ARG APT_PACKAGES=' "$dockerfile" 2>/dev/null | coreutils cut -d'"' -f2 || true)" +fi +if [ -z "$pkgs" ]; then + echo "preinstall: could not read the APT_PACKAGES prerequisite list" >&2 + exit 1 +fi missing="" -for cmd in gcc g++ make git curl gpg unzip bzip2 xz; do - command -v "$cmd" >/dev/null 2>&1 || missing="$missing $cmd" +for pkg in $pkgs; do + case "$pkg" in + gnupg) command -v gpg >/dev/null 2>&1 ;; + xz-utils) command -v xz >/dev/null 2>&1 ;; + ca-certificates) [ -r /etc/ssl/certs/ca-certificates.crt ] ;; + libc6-dev) [ -r /usr/include/stdio.h ] ;; + libicu*) coreutils ls /usr/lib/*/libicui18n.so.${pkg#libicu} >/dev/null 2>&1 ;; + *) command -v "$pkg" >/dev/null 2>&1 ;; + esac || missing="$missing $pkg" done if [ -n "$missing" ]; then - echo "preinstall: missing required system tools:$missing" >&2 + echo "preinstall: missing required build prerequisites:$missing" >&2 echo "Install them with your package manager. On Debian/Ubuntu:" >&2 echo " sudo apt-get install -y $pkgs" >&2 exit 1 diff --git a/.mise/config.macos.toml b/.mise/config.macos.toml index dd1b3b4..10dfeae 100644 --- a/.mise/config.macos.toml +++ b/.mise/config.macos.toml @@ -1,6 +1,6 @@ -# macOS-only mise config. Shared vars (rpath_flag, pylib_flag, py313_unix, …) +# macOS-only mise config. 
Shared vars (rpath_flag, pylib_flag, py3_unix, …) # come from config.toml, which loads first; PYO3_PYTHON keeps its config.toml -# default (py313_unix). +# default (py3_unix). [tools] # The LLD linker (ships ld64.lld) the RUSTFLAGS below use; Apple's /usr/bin/clang diff --git a/.mise/config.python.toml b/.mise/config.python.toml index a52eda3..b9b060e 100644 --- a/.mise/config.python.toml +++ b/.mise/config.python.toml @@ -7,6 +7,10 @@ "pipx:datamodel-code-generator" = { version = "latest", extras = "ruff" } "pipx:openapi-python-client" = "0.29.0" "pipx:pytest" = "latest" +# PyTorch, for the et-ws-pyo3-runner `torch_inference` test. Python-only (not in +# the always-loaded config) -- it's a large optional dependency, and the test +# skips itself when torch isn't among the mise-installed packages on sys.path. +"pipx:torch" = "latest" ruff = "latest" # Use the GitHub release tarball, not `npm:pyodide`. The npm package is only @@ -18,13 +22,10 @@ ruff = "latest" # top-level `pyodide/` directory the modules service picks up — see # `default_modules_folders` in libs/edge-toolkit/src/config.rs. [tools."http:pyodide"] -version = "0.29.3" url = "https://github.com/pyodide/pyodide/releases/download/{{ version }}/pyodide-{{ version }}.tar.bz2" +version = "0.29.3" [vars] -# Directories the ruff tasks lint/format (the generated componentize-py dirs -# under them are excluded by ruff.toml's `extend-exclude`). -py_dirs = "services/ws-modules/ generated/python-ws/ generated/python-rest/" # Shared WIT dir for the wasi-graphics-info componentize-py build (relative to # that module's dir); used by both the bindings and componentize steps. 
wit_dir = "../../../generated/specs/wit" @@ -34,9 +35,13 @@ wit_dir = "../../../generated/specs/wit" RUFF_CACHE_DIR = "{{ config_root }}/target/ruff-cache" [tasks] -ruff-check = "ruff check {{ vars.py_dirs }}" -ruff-fmt = "ruff format {{ vars.py_dirs }}" -ruff-fmt-check = "ruff format --check {{ vars.py_dirs }}" +# No path args: ruff walks from the repo root, honouring .gitignore (so +# target/ is skipped) and ruff.toml's `exclude`/`extend-exclude`. Every +# tracked *.py already lives under a dir we want linted, so discovery covers +# them without an explicit include list -- scope is narrowed by excludes only. +ruff-check = "ruff check" +ruff-fmt = "ruff format" +ruff-fmt-check = "ruff format --check" # Namespaced aggregators picked up by the default config's globbed # `check`/`fmt`/`test`. @@ -57,15 +62,8 @@ dir = "services/ws-modules/pyface1" run = "uv run pytest" [tasks.test-pyface1.env] -# See the long-form comment in `[tasks."prefetch:python".env]` for why this is -# a per-OS tera template: `mise which python3.13` works on Linux/macOS and -# disambiguates from pyodide's `python` wrapper; on Windows mise's -# python-build-standalone install only exposes `python.exe`, so we fall -# back to `mise which python` (pyodide doesn't shim on Windows, so -# there's no ambiguity to resolve). -UV_PYTHON = """\ -{% if os() == 'windows' %}{{ exec(command='mise where python') }}/python.exe\ -{% else %}{{ exec(command='mise which python3.13') }}{% endif %}""" +# Same uv-interpreter pin as `[tasks."prefetch:python".env]`; see there. +UV_PYTHON = "{% if os() == 'windows' %}{{ vars.py3_win }}{% else %}{{ vars.py3_unix }}{% endif %}" [tasks.build-et-ws-wheel] depends = ["build-et-cli", "gen:python-ws"] @@ -152,39 +150,33 @@ uv sync --directory services/ws-modules/pyface1 shell = "bash -euo pipefail -c" [tasks."prefetch:python".env] -# UV_PYTHON pins uv to the mise-managed regular CPython, not Pyodide's -# emscripten-wasm32 interpreter. 
mise's `http-pyodide` install ships a -# `python` wrapper script in its install dir that uv's auto-discovery -# finds and prefers — it reports CPython 3.13.2, so bare version requests -# (`3.13`, `python3.13`, `cpython@3.13`) don't disambiguate. Resolve the -# absolute path via mise's `exec()` template rather than a shell `export` in -# the `run` body: the template is evaluated by mise (shell-agnostic), so it -# works under `cmd` on Windows, which has no `export`. Matches the same idiom -# used by `[tasks.test-pyface1.env]` above. -# -# Windows quirk: mise's python-build-standalone install on Windows only -# exposes `python.exe` (no `python3.13.exe`), so `mise which python3.13` -# errors with "not a mise bin". And `mise which python` resolves to -# pyodide's bundled `python.exe` (uv fails to inspect it — the pyodide -# shim's `sys.path` injection is not a valid uv interpreter). Use -# `mise where python` instead — that returns the python plugin's -# install dir specifically — and append `python.exe`, the bin name -# python-build-standalone places at the install root on Windows. -UV_PYTHON = """\ -{% if os() == 'windows' %}{{ exec(command='mise where python') }}/python.exe\ -{% else %}{{ exec(command='mise which python3.13') }}{% endif %}""" +# Pin uv to the mise-managed regular CPython, not Pyodide's emscripten-wasm32 +# interpreter. uv's auto-discovery otherwise finds and prefers the `python` +# wrapper mise's `http:pyodide` install ships (it reports CPython 3.13.2, so +# bare version requests like `3.13` / `python3.13` don't disambiguate). Point +# uv straight at the absolute interpreter path via the py3_unix / py3_win +# vars -- the same interpreter PYO3_PYTHON resolves to -- so the pyodide wrapper +# never enters discovery and no `mise` subprocess runs per task. Windows needs +# its own var because python-build-standalone lays the binary out at the install +# root (`python.exe`), not under `bin/`. 
+UV_PYTHON = "{% if os() == 'windows' %}{{ vars.py3_win }}{% else %}{{ vars.py3_unix }}{% endif %}" [tasks."gen:python-rest"] depends = ["gen:ws-spec"] description = "Emit the typed Python REST client via openapi-python-client (consumes generated/specs/rest.yaml)" run = """ +# Codegen; committed and regenerated/verified on Linux. Skip on Windows, where +# the workspace `ruff` (a mise tool) isn't on the busybox-ash PATH -- same as +# gen:python-ws. Regenerating there would only risk clobbering committed source. +[ "${OS:-}" = "Windows_NT" ] && { echo "gen:python-rest: skipped on Windows (regenerated on Linux)"; exit 0; } mkdir -p generated/python-rest/et_rest_client -openapi-python-client generate \ - --config config/openapi-python-client.yaml \ - --path generated/specs/rest.yaml \ - --meta none \ - --overwrite \ - --output-path generated/python-rest/et_rest_client +# None of these args move into config/openapi-python-client.yaml -- path / meta +# / overwrite / output-path are CLI-only `generate` options. Build them up in a +# shell var (split over two lines) so neither line needs a backslash +# continuation or runs past 120 cols; unquoted $args word-splits into argv. +args="--config config/openapi-python-client.yaml --path generated/specs/rest.yaml" +args="$args --meta none --overwrite --output-path generated/python-rest/et_rest_client" +openapi-python-client generate $args # openapi-python-client / our follow-up ruff drop a .ruff_cache next to # the generated source; we don't want that committed. rm -rf generated/python-rest/et_rest_client/.ruff_cache @@ -203,38 +195,26 @@ shell = "bash -euo pipefail -c" [tasks."gen:python-ws"] depends = ["gen:ws-spec"] description = "Emit Pydantic models for the WS protocol via datamodel-code-generator" +# Runs from generated/python-ws/ so datamodel-codegen discovers that dir's +# pyproject.toml [tool.datamodel-codegen] (model-shape + style flags); only the +# input/output paths are passed on the CLI. 
+dir = "generated/python-ws" run = """ -mkdir -p generated/python-ws/et_ws -# `--custom-file-header "#"` is the only way to suppress datamodel-codegen's -# "generated by datamodel-codegen / filename: …" banner (an empty string is -# treated as "no override" and the default banner returns). Strip the -# remaining lone `#` line below so the file starts cleanly. -# `--formatters ruff-format ruff-check` opts in to the new ruff-based -# formatter stack — silences the FutureWarning about black/isort being -# replaced and gives us a single formatter (ruff) end-to-end. The `[ruff]` -# extra on the pipx install (in [tools]) puts ruff in datamodel-codegen's -# venv so it can find it. -datamodel-codegen \ - --input target/int-gen/ws.schema.json \ - --input-file-type jsonschema \ - --output generated/python-ws/et_ws/messages.py \ - --output-model-type pydantic_v2.BaseModel \ - --target-python-version 3.10 \ - --use-schema-description \ - --use-title-as-name \ - --use-double-quotes \ - --use-union-operator \ - --field-constraints \ - --disable-timestamp \ - --formatters ruff-format ruff-check \ - --custom-file-header '#' -sed -i.bak -e '1{/^#$/d;}' -e '2{/^$/d;}' generated/python-ws/et_ws/messages.py -rm generated/python-ws/et_ws/messages.py.bak -# datamodel-codegen's ruff formatter runs without our repo ruff.toml, so -# line-length defaults differ and imports aren't isort-grouped. Re-run -# ruff from the repo to apply both — `check --fix` for import sorting -# ("I" rules in ruff.toml), then `format` for whitespace. -ruff check --fix generated/python-ws/et_ws/messages.py -ruff format generated/python-ws/et_ws/messages.py +# Codegen; the output is committed and regenerated/verified on Linux. Skip on +# Windows, where this task's tools (coreutils/ruff) aren't on the busybox-ash +# PATH -- regenerating there would only risk clobbering the committed source. 
+[ "${OS:-}" = "Windows_NT" ] && { echo "gen:python-ws: skipped on Windows (regenerated on Linux)"; exit 0; } +mkdir -p et_ws +datamodel-codegen --input ../../target/int-gen/ws.schema.json --output et_ws/messages.py +# Drop the 2-line header (a lone `#` + the blank after it) that +# custom-file-header = "#" emits, so the file starts cleanly. tail, not sed: +# datamodel-codegen always emits exactly those two lines, so a fixed offset is safe. +coreutils tail -n +3 et_ws/messages.py > et_ws/messages.py.tmp +coreutils mv et_ws/messages.py.tmp et_ws/messages.py +# datamodel-codegen's bundled ruff runs with its own defaults, not our repo +# ruff.toml, so re-run repo ruff: `check --fix` for import sorting ("I" rules), +# then `format` for whitespace/line-length. +ruff check --fix et_ws/messages.py +ruff format et_ws/messages.py """ shell = "bash -euo pipefail -c" diff --git a/.mise/config.toml b/.mise/config.toml index 458a098..7b88b0f 100644 --- a/.mise/config.toml +++ b/.mise/config.toml @@ -23,9 +23,9 @@ task.run_auto_install = false [tools] action-validator = "latest" "aqua:EmbarkStudios/cargo-deny" = "latest" +"aqua:rustwasm/wasm-pack" = "latest" ast-grep = "latest" cargo-binstall = "latest" -"aqua:rustwasm/wasm-pack" = "latest" "cargo:cargo-expand" = "latest" taplo = "latest" watchexec = "latest" @@ -37,8 +37,22 @@ watchexec = "latest" cmake = "latest" "conda:openssl" = "3" conftest = "latest" +# uutils coreutils: the Rust multicall binary, invoked as `coreutils ` in tasks. +coreutils = "latest" dprint = "latest" editorconfig-checker = "latest" +# ripgrep (rg): the Rust grep that tasks use in place of the host's grep. +ripgrep = "latest" +# uutils findutils (find + xargs), pinned to 0.8.0 (0.9.0 ships no binaries). +# Separate binaries, not a multicall, so the mise shims shadow the host's find / +# xargs in tasks. Every platform uses the 0.8.0 prebuilt via github: ... +"github:uutils/findutils" = { version = "0.8.0", os = ["linux/x64", "macos", "windows"] } +# ... 
except aarch64-linux, whose 0.8.0 release has no asset, so it builds from +# source via cargo: (same 0.8.0 pin, so every platform runs the same version). +"cargo:findutils" = { version = "0.8.0", os = ["linux/arm64"] } +# goawk: the awk that tasks invoke as `goawk` in place of the host's awk. Go, not +# Rust, but it's the one with prebuilt binaries for every platform we target. +"github:benhoyt/goawk" = "latest" # Retries a command with exponential backoff + jitter; wraps the model fetches # below (see the `retry` var) so a transient HTTP 429 doesn't fail the build. "github:dbohdan/recur" = "latest" @@ -56,6 +70,10 @@ osv-scanner = "latest" # `python -m pip install pipx` separately so the `pipx:*` entries below # still resolve through whatever pipx is already on PATH. pipx = { version = "latest", os = ["linux", "macos"] } +# Pure-Python fixture for the et-ws-pyo3-runner `cowsay` test: proves a +# mise-installed python package lands on the embedded interpreter's sys.path +# (see `edge_toolkit::config::mise_python_site_packages`). +"pipx:cowsay" = "latest" # semgrep lints Cargo.toml style (see `semgrep-check`), so it stays in the # always-loaded config alongside the other repo-wide linters. "pipx:semgrep" = "latest" @@ -65,7 +83,11 @@ gh = "latest" "npm:pnpm" = { version = "latest", os = ["macos/x64"] } pnpm = { version = "latest", os = ["linux", "macos/arm64", "windows"] } protoc = "latest" -python = "3.13" +# Full version triple, not a minor pin: mise installs python under a per-tool dir +# named after the request and only symlinks the `3.x` minor alias -- and that +# symlink isn't created on the Windows runner, so the py3_* interpreter paths must +# resolve to the exact `3.x.y` dir. Bump in lockstep with py3_unix/py3_win/pylib. 
+python = "3.13.14" rclone = "latest" rust = [ { version = "latest", components = "clippy,rust-analyzer", targets = "wasm32-unknown-unknown,wasm32-wasip2" }, @@ -81,18 +103,18 @@ zizmor = "latest" # clang-tidy's resource-dir arg (points clang at its builtin headers, e.g. # stddef.h). Empty default; config.linux.toml sets it from conda:clangxx. clang_resource_arg = "" -conda_openssl = "{{ env.HOME }}/.local/share/mise/installs/conda-openssl/latest" -# mise-managed CPython 3.13 (Linux/macOS); the PYO3_PYTHON default below. Windows -# overrides PYO3_PYTHON with its own py313_win in config.windows.toml. -py313_unix = "{{ env.HOME }}/.local/share/mise/installs/python/3.13/bin/python3" +conda_openssl = "{{ env.HOME }}/.local/share/mise/installs/conda-openssl/3" +# mise-managed CPython (Linux/macOS); the PYO3_PYTHON default below. Windows +# overrides PYO3_PYTHON with its own py3_win in config.windows.toml. The version +# segment must equal the `python` [tools] pin (conftest enforces both). +py3_unix = "{{ env.HOME }}/.local/share/mise/installs/python/3.13.14/bin/python3" # RUSTFLAGS fragments shared by the CARGO_TARGET_* env in config..toml rpath_flag = "-C link-arg=-Wl,-rpath,{{ vars.conda_openssl }}/lib" -# rpath to the mise CPython 3.13 lib dir so the et-ws-pyo3-runner binary finds +# rpath to the mise CPython lib dir so the et-ws-pyo3-runner binary finds # libpython at runtime (libpython3.13.so on Linux, libpython3.13.dylib on macOS -# via its @rpath install name). pyo3 bakes an rpath to the resolved patch-version -# dir, which doesn't exist when mise installs under the "3.13" alias; point at -# the stable install path instead. -pylib_flag = "-C link-arg=-Wl,-rpath,{{ env.HOME }}/.local/share/mise/installs/python/3.13/lib" +# via its @rpath install name). Points at the exact `3.x.y` install dir (the +# `python` pin), since the minor-version alias dir isn't created everywhere. 
+pylib_flag = "-C link-arg=-Wl,-rpath,{{ env.HOME }}/.local/share/mise/installs/python/3.13.14/lib" wasm_rustflags = "-C target-cpu=mvp -C target-feature=+mutable-globals,+sign-ext,+nontrapping-fptoint" # Extra flag for the wasm-pack module builds; empty so wasm-opt runs. # config.windows.toml overrides it to --no-opt where wasm-opt can't execute. @@ -119,18 +141,18 @@ retry = "recur --attempts 8 --delay 2s --backoff 2s --max-delay 2m --jitter 0,5s # no MISE_ENV=all). Hardcoded rather than shell-discovered so it works on Windows # too — keep in sync when adding/removing a config..toml. ALL_LANGS = "dart,dotnet,java,python,rust,zig" -TAPLO_CONFIG = "{{ config_root }}/config/taplo.toml" CLIPPY_CONF_DIR = "{{ config_root }}/config" +TAPLO_CONFIG = "{{ config_root }}/config/taplo.toml" # Use the conda:openssl install for Rust's OPENSSL_DIR so openssl-sys crate builds. OPENSSL_DIR = "{{ vars.conda_openssl }}" # pyo3-ffi (et-ws-pyo3-runner) links its embedded interpreter at build time from # PYO3_PYTHON, else the first python on PATH -- under mise that's the 32-bit # Pyodide shim ("target architecture (64-bit) does not match ... (32-bit)"). Pin -# it to the mise-managed CPython 3.13. Always-loaded so `cargo check --workspace` +# it to the mise-managed CPython. Always-loaded so `cargo check --workspace` # builds the pyo3 runner without MISE_ENV=python. This is the Linux/macOS path; -# config.windows.toml overrides it with py313_win. -PYO3_PYTHON = "{{ vars.py313_unix }}" +# config.windows.toml overrides it with py3_win. 
+PYO3_PYTHON = "{{ vars.py3_unix }}" RCLONE_CONFIG = "{{ config_root }}/config/rclone.conf" RCLONE_RETRIES = "1" @@ -171,6 +193,7 @@ depends = [ "cargo-clippy", "cargo-doc-check", "cargo-fmt-check", + "conftest-check", "docker-check", "dprint-check", "editorconfig-check", @@ -180,8 +203,8 @@ depends = [ "ls-lint-check", "ryl-check", "semgrep-check", - "conftest-check", "taplo-check", + "taplo-fmt-check", "typos", "verification-check", "zizmor-check", @@ -248,6 +271,9 @@ run = "cargo clippy --fix --allow-dirty --allow-staged --keep-going --workspace [tasks.taplo-fmt] run = "taplo format" +[tasks.taplo-fmt-check] +run = "taplo format --check" + [tasks.conftest-check] depends = ["conftest-check-*"] description = "Run all conftest policy checks (one per parsed file format)" @@ -270,6 +296,12 @@ shell = "bash -euo pipefail -c" description = "Run conftest OPA/Rego policies over the GitHub Actions workflow YAML" run = "conftest test --parser yaml --namespace gha -p config/conftest/policy .github/workflows" +[tasks.conftest-check-dockerfile] +description = "Cross-check Dockerfile.nanoserver's hard-coded mise install paths against [tools] version pins" +# Combine the Dockerfile with the mise configs (the .mise dir is all TOML), so +# conftest auto-detects each file's parser and the rule sees every [tools] pin. +run = "conftest test --combine --namespace dockerfile -p config/conftest/policy Dockerfile.nanoserver .mise" + [tasks.taplo-check] # `taplo lint` reads config/taplo.toml's `[[rule]] schema` entries for editor / # LSP-style validation, but silently ignores their nested constraints in @@ -317,7 +349,7 @@ xargs taplo lint --schema "file://$PWD/config/taplo/no-lib-name.schema.json" <"$ # on the generated source. The exemption is about the lint table, not # about the deps — its `[dependencies]` still inherits from # `[workspace.dependencies]` like every other crate. 
-grep -vxF './generated/rust-rest/Cargo.toml' "$members" | +rg -vxF './generated/rust-rest/Cargo.toml' "$members" | xargs taplo lint --schema "file://$PWD/config/taplo/require-lints-section.schema.json" # mise task `run` must be a string, not an array: taplo's reorder_arrays would @@ -379,6 +411,8 @@ mise settings set cargo.binstall true mise install cargo-binstall mise install node mise install conda:openssl +mise install coreutils +mise install ripgrep """ shell = "bash -euo pipefail -c" @@ -390,13 +424,13 @@ run = "osv-scanner --lockfile Cargo.lock --config config/osv-scanner.toml" [tasks."gen:osv-scanner"] description = "Regenerate config/osv-scanner.toml from config/deny.toml's [advisories].ignore list" # osv-scanner and cargo-deny must ignore the same advisory IDs; config/deny.toml -# is the source of truth (it carries the per-ID rationale). grep/sort/sed only, so +# is the source of truth (it carries the per-ID rationale). rg + coreutils only, so # the `dependencies` workflow can run it (via mise) next to the audit binaries. run = """ { echo '# AUTO-GENERATED from config/deny.toml by `mise run gen:osv-scanner`.' echo 'IgnoredVulns = [' - grep -oE '"RUSTSEC-[0-9]{4}-[0-9]{4}"' config/deny.toml | sort -u | sed 's/.*/ { id = & },/' + rg -oN -r ' { id = $0 },' '"RUSTSEC-[0-9]{4}-[0-9]{4}"' config/deny.toml | coreutils sort -u echo ']' } > config/osv-scanner.toml """ @@ -431,7 +465,7 @@ run = ''' { cat .gitignore printf '%s\n' '.git/' 'Dockerfile*' '/README.md' '.dockerignore' - } | awk ' + } | goawk ' /^[[:space:]]*#/ { print; next } /^[[:space:]]*$/ { print; next } { diff --git a/.mise/config.windows.toml b/.mise/config.windows.toml index 5dd411e..c05db8c 100644 --- a/.mise/config.windows.toml +++ b/.mise/config.windows.toml @@ -1,7 +1,7 @@ # Windows-only mise config. 
Everything here is Windows-scoped, so it needs no # per-entry `os = ["windows"]` guards or `{% if os() == 'windows' %}` # conditionals; values here override the cross-platform defaults in config.toml. -# Shared vars (rpath_flag, py313_*, …) come from config.toml, which loads first. +# Shared vars (rpath_flag, py3_*, …) come from config.toml, which loads first. [tools] # msys2 mirror packages: git (cargo git deps), gpg (so mise verifies tool @@ -9,7 +9,8 @@ # Windows SDK can't come from mise, so the build uses the LLVM mingw toolchain # (the gnullvm target below); llvm-mingw bundles clang + lld + the mingw-w64 # runtime/headers/libs + the libclang.dll bindgen loads. Pinned so its install -# dir is deterministic for the PATH entry in Dockerfile.nanoserver -- bump both. +# dir is deterministic: the version segment is repeated in the win_llvm_bin var +# below and the PATH entry in Dockerfile.nanoserver -- bump all three together. "conda:m2-git" = "latest" "conda:m2-gnupg" = "latest" "conda:m2-make" = "latest" @@ -32,11 +33,22 @@ url = "https://frippery.org/files/busybox/busybox64u.exe" version = "1.37.0" [vars] +# Root of mise's per-tool install dirs (the mise data dir is LOCALAPPDATA\mise); +# the install-path vars below all hang off it. +mise_installs = '{{ get_env(name="LOCALAPPDATA", default="") }}\mise\installs' # busybox `ash` install path (for MISE_BASH_PATH), keyed by the pinned # http:busybox version above -- keep in sync. -winsh = '{{ get_env(name="LOCALAPPDATA", default="") }}\mise\installs\http-busybox\1.37.0\ash.exe' -# mise-managed CPython 3.13; the mise data dir is LOCALAPPDATA\mise. -py313_win = "{{ get_env(name='LOCALAPPDATA', default='') }}\\mise\\installs\\python\\3.13\\python.exe" +winsh = '{{ vars.mise_installs }}\http-busybox\1.37.0\ash.exe' +# mise-managed CPython; the exact `3.x.y` dir (the python [tools] pin), since +# mise's minor-version alias symlink isn't created on the Windows runner. 
+py3_win = '{{ vars.mise_installs }}\python\3.13.14\python.exe' +# llvm-mingw's bin dir (the github:mstorsjo/llvm-mingw install above). +# The linker/dlltool [env] vars below point at absolute exes in here rather than +# bare names because mise's cargo backend doesn't activate llvm-mingw onto the +# cargo: source-build subprocess's PATH (same problem config.macos.toml's +# absolute ld64.lld path solves): a bare `clang` would resolve to a runner's +# system LLVM, which defaults to the MSVC target and can't link gnullvm objects. +win_llvm_bin = '{{ vars.mise_installs }}\github-mstorsjo-llvm-mingw\20260602\bin' # clang-sys finds libclang on PATH (Docker's gnu build has llvm-mingw there); a # workstation with system LLVM also works. win_libclang = "C:\\Program Files\\LLVM\\bin" @@ -51,15 +63,17 @@ no_opt = "--no-opt" MISE_BASH_PATH = "{{ vars.winsh }}" # Build for the LLVM mingw target. gnullvm, not gnu: llvm-mingw ships # compiler-rt + libunwind, not the GCC runtime (libgcc/libgcc_eh) the gnu -# target's link line demands. The linker is clang and the raw-dylib import tool -# is llvm-dlltool, both from llvm-mingw on PATH. These can live in [env] here -# (unlike config.toml) because this file only loads on Windows, so there's no -# empty off-Windows value to break Linux cargo. +# target's link line demands. The linker is llvm-mingw's clang and the raw-dylib +# import tool is its llvm-dlltool, both given as absolute paths (see win_llvm_bin +# above) so the cargo: source-build subprocess resolves them without a PATH +# lookup. These can live in [env] here (unlike config.toml) because this file +# only loads on Windows, so there's no empty off-Windows value to break Linux +# cargo. 
CARGO_BUILD_TARGET = "x86_64-pc-windows-gnullvm" -CARGO_TARGET_X86_64_PC_WINDOWS_GNULLVM_LINKER = "clang" -CARGO_TARGET_X86_64_PC_WINDOWS_GNULLVM_RUSTFLAGS = "-Cdlltool=llvm-dlltool" +CARGO_TARGET_X86_64_PC_WINDOWS_GNULLVM_LINKER = "{{ vars.win_llvm_bin }}\\clang.exe" +CARGO_TARGET_X86_64_PC_WINDOWS_GNULLVM_RUSTFLAGS = "-Cdlltool={{ vars.win_llvm_bin }}\\llvm-dlltool.exe" LIBCLANG_PATH = "{{ vars.win_libclang }}" -PYO3_PYTHON = "{{ vars.py313_win }}" +PYO3_PYTHON = "{{ vars.py3_win }}" [tasks.preinstall] # Preinstall, shared by Dockerfile.nanoserver and a real workstation. The busybox @@ -88,7 +102,9 @@ python -m pip install pipx # `rust` tool lists. Pre-adding the wasm target means wasm-pack finds it present # and never downloads rust-std mid-build. mise sets RUSTUP_TOOLCHAIN to the bare # stable version, which now resolves to -gnullvm. -ver="$(mise exec -- rustc --version | awk '{print $2}')" +# "rustc ( )" -> field 2 via shell parameter expansion; mise's +# shims aren't on PATH in this bootstrap, so avoid an external cut/coreutils. +ver="$(mise exec -- rustc --version)"; ver="${ver#* }"; ver="${ver%% *}" mise exec -- rustup set default-host x86_64-pc-windows-gnullvm wasm="-t wasm32-unknown-unknown -t wasm32-wasip2" mise exec -- rustup toolchain install "${ver}-x86_64-pc-windows-gnullvm" -c clippy -c rust-analyzer $wasm @@ -96,3 +112,12 @@ mise exec -- rustup toolchain install nightly-x86_64-pc-windows-gnullvm -c rust- mise exec -- rustup default "${ver}-x86_64-pc-windows-gnullvm" """ shell = "bash -euo pipefail -c" + +[tasks.test-ws-web-runner] +# Override: skip et-ws-web-runner on Windows. It pulls deno -> v8, whose build +# script downloads a prebuilt rusty_v8 static lib; denoland/rusty_v8 ships +# Windows prebuilts only for x86_64-pc-windows-msvc, not the gnullvm target this +# build uses, so the fetch 404s and the from-source fallback can't build under +# llvm-mingw. The suite still runs on the Linux/macOS jobs. 
+run = "echo 'skipping et-ws-web-runner on windows: rusty_v8 has no gnullvm prebuilt'" +shell = "bash -euo pipefail -c" diff --git a/.mise/config.zig.toml b/.mise/config.zig.toml index 0404a45..02348c6 100644 --- a/.mise/config.zig.toml +++ b/.mise/config.zig.toml @@ -52,11 +52,11 @@ shell = "bash -euo pipefail -c" # Namespaced aggregators picked up by the default config's globbed `check`/`fmt`. [tasks."check:zig"] -depends = ["zig-check", "clang-format-check", "clang-tidy-check", "cpplint-check"] +depends = ["clang-format-check", "clang-tidy-check", "cpplint-check", "zig-check"] description = "Run Zig + C checks (zig fmt-check, clang-format, clang-tidy, cpplint)" [tasks."fmt:zig"] -depends = ["zig-fmt", "clang-format"] +depends = ["clang-format", "zig-fmt"] description = "Format Zig + C sources" [tasks.build-ws-zig-data1-module] diff --git a/CLAUDE.md b/CLAUDE.md index c7a793d..ba57620 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -332,6 +332,42 @@ repo. Every script belongs in one of two places: - **More involved** → its own tool directory under `utilities/` with its own `README.md` documenting what it does and how to run it. +## Don't depend on host tools in mise tasks + +A mise task must never assume a command-line utility happens to exist on the host +(or in a base image). Use the mise-managed, version-pinned, cross-platform tool +instead, so a task behaves identically on CI, in the Docker images, on a +workstation, and on every OS. Reach for a host binary only if there is genuinely +no mise tool for it — and then add one rather than depending on the host. 
+ +What to use instead of the common host utilities (a list, not a table — dprint +pads table columns, which blows the 120-char limit): + +- `cut`, `ls`, `sort`, `mktemp`, `cat`, … → `coreutils ` (uutils multicall; + always invoke with the explicit `coreutils` prefix) +- `grep` → `rg` (ripgrep) +- `find`, `xargs` → bare `find` / `xargs` (uutils `findutils` mise tool; its shims + shadow the host's) +- `awk` → `goawk` +- `sed` → no tool; rewrite the step with `coreutils`, `rg -r`, or `goawk` + +`coreutils`, `ripgrep`, `findutils` and `goawk` are mise `[tools]`. `coreutils` +and `ripgrep` are additionally force-installed by `_setup_all`, because the +`preinstall` task itself uses them before the main `mise install` runs. + +What the Dockerfiles `apt-get install` is therefore only genuine build +prerequisites the toolchain needs (compilers, libraries, the archive tools mise +unpacks downloads with) — never POSIX utilities, which now all come from tools. + +One Nano Server exception: `Dockerfile.nanoserver` does not put mise's shims on +`PATH` (native busybox-w32 can't use the msys-form paths mise injects for POSIX +shells — see the `http:busybox` note in `config.windows.toml`), so the Windows +`preinstall` can't call these tools bare — it goes through `mise exec --` or a +shell builtin instead. **TODO (next time we improve `Dockerfile.nanoserver`):** +work out a busybox-compatible way to get the shims (or tool bins) onto `PATH` so +Windows tasks can call `coreutils`/`rg`/`goawk` directly like every other OS, and +drop the `mise exec --` / shell-builtin workarounds. + ## Rust Workspace Single Cargo workspace (`Cargo.toml`). 
diff --git a/Cargo.lock b/Cargo.lock index 8e1a00e..2653fae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1155,6 +1155,15 @@ dependencies = [ "serde", ] +[[package]] +name = "bytesize" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e78e506b9d7633710dab98996f22f95f3d0f488e8f1aa162830556ed9fc14d" +dependencies = [ + "serde_core", +] + [[package]] name = "bytestring" version = "1.5.1" @@ -4000,6 +4009,7 @@ dependencies = [ "base64 0.22.1", "et-path", "fs-err", + "humantime-serde", "log", "rstest", "schemars 1.2.1", @@ -4211,6 +4221,7 @@ dependencies = [ "actix-rt", "actix-web", "edge-toolkit", + "fs-err", "serde", "serde-inline-default", "serde_default", @@ -4249,6 +4260,7 @@ dependencies = [ name = "et-path" version = "0.1.0" dependencies = [ + "fs-err", "tempfile", ] @@ -4262,8 +4274,10 @@ dependencies = [ "opentelemetry-http 0.31.0", "progenitor-client", "reqwest 0.13.4", + "retry-policies", "serde", "serde_urlencoded", + "tokio", "tracing", "tracing-opentelemetry", "web-sys", @@ -4467,6 +4481,28 @@ dependencies = [ "web-sys", ] +[[package]] +name = "et-ws-pyo3-runner" +version = "0.1.0" +dependencies = [ + "base64 0.22.1", + "edge-toolkit", + "et-rest-client", + "et-ws-runner-common", + "et-ws-test-server", + "futures-util", + "pyo3", + "pyo3-build-config", + "serde", + "serde-env", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tokio-tungstenite", + "tracing", + "tracing-subscriber", +] + [[package]] name = "et-ws-runner-common" version = "0.1.0" @@ -4476,13 +4512,17 @@ dependencies = [ "futures-util", "humantime-serde", "reqwest 0.13.4", + "retry-policies", "serde", "serde-env", "serde-inline-default", "serde_default", "serde_json", "serde_path_to_error", + "temp-env", "thiserror 2.0.18", + "tokio", + "tokio-tungstenite", "tracing", ] @@ -4547,14 +4587,19 @@ dependencies = [ "actix-web", "actix-ws", "bytes", + "bytesize", "chrono", "edge-toolkit", "fs-err", "futures-util", "opentelemetry 0.31.0", "serde", 
+ "serde-env", + "serde-inline-default", + "serde_default", "serde_json", "serde_yaml", + "temp-env", "tokio", "tracing", "uuid", @@ -4698,6 +4743,7 @@ dependencies = [ "et-ws-runner-common", "et-ws-test-server", "futures-util", + "reqwest 0.13.4", "rstest", "serde", "serde-env", @@ -8686,6 +8732,64 @@ version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" +[[package]] +name = "pyo3" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91fd8e38a3b50ed1167fb981cd6fd60147e091784c427b8f7183a7ee32c31c12" +dependencies = [ + "libc", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", +] + +[[package]] +name = "pyo3-build-config" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e368e7ddfdeb98c9bca7f8383be1648fd84ab466bf2bc015e94008db6d35611e" +dependencies = [ + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f29e10af80b1f7ccaf7f69eace800a03ecd13e883acfacc1e5d0988605f651e" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df6e520eff47c45997d2fc7dd8214b25dd1310918bbb2642156ef66a67f29813" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cdc218d835738f81c2338f822078af45b4afdf8b2e33cbb5916f108b813acb" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 2.0.117", +] + [[package]] name = "qr2term" version = "0.3.3" @@ -9186,6 +9290,15 @@ version = "0.7.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "1e061d1b48cb8d38042de4ae0a7a6401009d6143dc80d2e2d6f31f0bdd6470c7" +[[package]] +name = "retry-policies" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc05fbf560421a0357a750cbe78c7ca19d4923918490daabba313d5dbc871e47" +dependencies = [ + "rand 0.10.1", +] + [[package]] name = "rfc6979" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index 9fe0539..3e16bbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ members = [ "services/modules", "services/storage", "services/ws", + "services/ws-pyo3-runner", "services/ws-server", "services/ws-wasm-agent", "services/ws-wasi-runner", @@ -53,6 +54,7 @@ asyncapi-rust = "0.2" base64 = "0.22.1" bytemuck = { version = "1.16", features = ["derive"] } bytes = "1" +bytesize = { version = "2", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] } clap = { version = "4.4", features = ["derive"] } deno_core = "0.402.0" @@ -81,6 +83,7 @@ futures-util = "0.3" heck = "0.5" hostname = "0.4" humantime-serde = "1" +int-otlp-mock = { path = "libs/otlp-mock", version = "0.1.0" } js-sys = "0.3" kdl = { version = "6", features = ["v1"] } local-ip-address = "0.6" @@ -99,17 +102,23 @@ opentelemetry-otlp = { version = "0.31", default-features = false, features = [ ] } opentelemetry_sdk = "0.31" ort = { version = "=2.0.0-rc.10", default-features = false, features = ["copy-dylibs", "download-binaries"] } -int-otlp-mock = { path = "libs/otlp-mock", version = "0.1.0" } pollster = "0.4" pretty_yaml = "0.6" prettyplease = "0.2" progenitor = "0.14" progenitor-client = "0.14" +# `auto-initialize` starts the embedded CPython interpreter on first use; the +# pyo3 runner never calls `Py_Initialize` itself. +pyo3 = { version = "0.28", features = ["auto-initialize"] } +# Build-script half of pyo3; kept on the same release so both resolve the +# interpreter identically. 
+pyo3-build-config = "0.28" qr2term = "0.3" quote = "1" rcgen = "0.14" regex = { version = "1.12", default-features = false } reqwest = { version = "0.13", default-features = false } +retry-policies = "0.5" rstest = "0.26" rustls = "0.23" schemars = { version = "1.1", features = ["derive"] } diff --git a/Dockerfile b/Dockerfile index c45132a..b51df95 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,18 +50,19 @@ FROM ubuntu:24.04 AS build-minimal # to it) -- leaner than build-essential, which also pulls dpkg-dev + perl. # curl + ca-certificates fetch the mise installer and tool downloads; git is for # cargo + repo operations; gnupg (gpg + gpg-agent + dirmngr) lets mise verify -# downloads (bare `gpg` lacks the agent/dirmngr it needs); xz-utils, unzip and -# bzip2 unpack mise's tool archives (e.g. the pyodide .tar.bz2). libicu74 is .NET +# downloads (bare `gpg` lacks the agent/dirmngr it needs); xz-utils, unzip, +# bzip2, gzip and tar unpack mise's tool archives (.tar.bz2 / .tar.gz / .zip). +# gzip + tar are already in the base image, listed so a minimal FROM keeps them. +# libicu74 is .NET # runtime ICU for the dotnet-data1 module -- without it the dotnet CLI # FailFast-aborts at startup ("Couldn't find a valid ICU package installed on the # system"; minimal Ubuntu ships no ICU). The "74" tracks the Ubuntu base # (74 = 24.04) -- bump it alongside the FROM line; .NET needs libicu on minimal # systems (else set System.Globalization.Invariant=true). -# Vulkan for the wgpu test (libvulkan1 + mesa-vulkan-drivers) is installed in the -# test stage, not here. 
+ARG APT_PACKAGES="bzip2 ca-certificates curl g++ gcc git gnupg gzip libc6-dev libicu74 make tar unzip xz-utils" RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - bzip2 ca-certificates curl g++ gcc git gnupg libc6-dev libicu74 make unzip xz-utils \ + $APT_PACKAGES \ && rm -rf /var/lib/apt/lists/* # Install mise and put it + its shims on PATH; in a non-interactive build that's diff --git a/Dockerfile.nanoserver b/Dockerfile.nanoserver index b4ec877..7ca742b 100644 --- a/Dockerfile.nanoserver +++ b/Dockerfile.nanoserver @@ -121,12 +121,14 @@ RUN tar -xf C:\mise.zip -C C:\ && del C:\mise.zip # Prepend mise's bin, and re-list System32 + Windows explicitly: the base image # exposes its search path as `Path`, which Docker's case-sensitive `${PATH}` # doesn't match, so it expands empty -- dropping System32 (findstr, net, sc, ...) -# off PATH for every step below. Also put llvm-mingw's bin on PATH: the cargo: -# tool builds (windows-sys etc.) invoke `x86_64-w64-mingw32-dlltool` + the gnu -# linker by name, and mise's cargo backend doesn't activate llvm-mingw onto the -# build subprocess's PATH. The dir is the pinned-version install (config.toml); -# it doesn't exist until preinstall installs llvm-mingw, which is fine -- the -# cargo: builds that need it run afterwards. +# off PATH for every step below. Also put llvm-mingw's bin on PATH so clang-sys +# can find its libclang.dll: config.windows.toml's LIBCLANG_PATH names a system +# LLVM dir absent on Nano Server, so clang-sys falls back to a PATH search and +# llvm-mingw's bin is where libclang.dll lives here. (The gnullvm linker and +# llvm-dlltool are absolute paths in config.windows.toml, so PATH isn't needed +# for them.) The version segment must match the github:mstorsjo/llvm-mingw pin +# in config.windows.toml; the dir doesn't exist until preinstall installs +# llvm-mingw, which is fine -- the builds that need it run afterwards. 
ENV LLVMBIN=C:\Users\ContainerAdministrator\AppData\Local\mise\installs\github-mstorsjo-llvm-mingw\20260602\bin ENV PATH="C:\mise\bin;C:\Windows\System32;C:\Windows;${LLVMBIN};${PATH}" @@ -195,21 +197,22 @@ ENV CARGO_HOME=C:\.cargo ` # builds no python guest, and pipx can't run on Nano anyway), so that's fine. RUN (if exist C:\token\gh_token set /p GITHUB_TOKEN=` / `Python<'py>` + "Py", # pyo3 `Py` smart pointer; `PythonError::Py` variant + "py", # pyo3 GIL token: `Python<'py>` parameter / `|py|` attach closures ] min-ident-chars-threshold = 2 diff --git a/config/conftest/policy/dockerfile.rego b/config/conftest/policy/dockerfile.rego new file mode 100644 index 0000000..225aff5 --- /dev/null +++ b/config/conftest/policy/dockerfile.rego @@ -0,0 +1,59 @@ +# Cross-checks for Dockerfile.nanoserver, evaluated over the Dockerfile plus the +# .mise/config*.toml files combined (--combine, auto-detected parsers). Two rules: +# 1. version drift -- the Dockerfile hard-codes mise install-dir paths (LLVMBIN, +# the busybox shell, the python dir on PATH) that embed a tool's pinned +# version; those must match the [tools] pins (reuses mise.rego's matcher). +# 2. MISE_DISABLE_TOOLS -- every pipx: tool in the always-loaded config.toml +# must be disabled here (pipx can't run on Nano Server), so a newly added +# pipx tool can't silently break the Windows build. +# Run with `--namespace dockerfile`. +package dockerfile + +import data.mise + +# Every string argument of an ENV/RUN instruction in the Dockerfile (its parsed +# contents is the array of instruction objects; the TOMLs parse to objects). 
+docker_strings contains entry if { + some file in input + is_array(file.contents) + some instr in file.contents + instr.Cmd in {"env", "run"} + some value in instr.Value + is_string(value) + entry := {"path": file.path, "value": value} +} + +deny contains msg if { + some entry in docker_strings + contains(entry.value, "installs") + some d in mise.version_drift(entry.value) + msg := sprintf( + "%s: hard-codes %q version %q, but [tools] pins it to %q -- keep them in sync", + [entry.path, d.dir, d.seg, d.pinned], + ) +} + +# The comma-separated tools in the Dockerfile's `ENV MISE_DISABLE_TOOLS=...`. +disabled_tools contains tool if { + some file in input + is_array(file.contents) + some instr in file.contents + instr.Cmd == "env" + instr.Value[0] == "MISE_DISABLE_TOOLS" + some tool in split(instr.Value[1], ",") +} + +# pipx:* tools can't run on Nano Server (pipx/platformdirs can't import there), so +# every pipx: tool in the always-loaded config.toml must be in MISE_DISABLE_TOOLS +# -- otherwise the nano build tries to install it and fails. +deny contains msg if { + some file in input + endswith(file.path, ".mise/config.toml") + some name, _ in file.contents.tools + startswith(name, "pipx:") + not disabled_tools[name] + msg := sprintf( + "%s: pipx tool %q must be in Dockerfile.nanoserver MISE_DISABLE_TOOLS (pipx fails on Nano Server)", + [file.path, name], + ) +} diff --git a/config/conftest/policy/mise.rego b/config/conftest/policy/mise.rego index 0cb09c3..36a32bb 100644 --- a/config/conftest/policy/mise.rego +++ b/config/conftest/policy/mise.rego @@ -36,9 +36,10 @@ deny contains msg if { msg := sprintf("%s: task %q description must be a single line", [file.path, name]) } -# `cargo:` tools build from source; prefer a prebuilt backend. Allowlist the two -# that have no prebuilt binary. -allowed_cargo_tool := {"cargo:cargo-expand", "cargo:dart-typegen"} +# `cargo:` tools build from source; prefer a prebuilt backend. 
Allowlist the ones +# with no prebuilt binary (cargo-expand, dart-typegen), plus cargo:findutils which +# is os-scoped to aarch64-linux only (its prebuilt release lacks that one arch). +allowed_cargo_tool := {"cargo:cargo-expand", "cargo:dart-typegen", "cargo:findutils"} deny contains msg if { some file in input @@ -59,13 +60,18 @@ deny contains msg if { } # Tools should work on every OS (CLAUDE.md "Tools must work on every OS"). Any -# os-scoped [tools] entry must be a genuinely platform-specific one in this list. +# os-scoped [tools] entry must be in this list -- either a genuinely +# platform-specific tool, or one whose os-scoping just picks a per-platform +# backend while still covering every OS (findutils: prebuilt everywhere except +# aarch64-linux, which builds from source). allowed_os_scoped_tool := { "chromedriver", "pipx", "npm:pnpm", "pnpm", "github:christianhelle/openapi2zig", + "github:uutils/findutils", + "cargo:findutils", } deny contains msg if { @@ -77,3 +83,82 @@ deny contains msg if { not allowed_os_scoped_tool[name] msg := sprintf("%s: tool %q is os-scoped; tools must work on every OS (or allowlist it)", [file.path, name]) } + +# A [vars]/[env] value that hard-codes a tool's install path (e.g. the absolute +# linker in config.windows.toml, or the libpython rpath in config.toml) embeds +# the tool's version as a path segment. mise installs a tool to +# `installs//`, where is the tool name with `:` and `/` +# turned into `-`. Those embedded versions must track the `[tools]` pin -- a +# bump that updates the tool but not the var silently points at a missing dir. +# Collect every (install-dir, version) the [tools] tables pin, across all files +# (--combine), since a var in config..toml can reference a tool pinned in +# config.toml. 
+tool_versions contains [dir, version] if { + some file in input + is_mise(file) + some name, spec in file.contents.tools + is_string(spec) + dir := replace(replace(name, ":", "-"), "/", "-") + version := spec +} + +tool_versions contains [dir, version] if { + some file in input + is_mise(file) + some name, spec in file.contents.tools + is_object(spec) + dir := replace(replace(name, ":", "-"), "/", "-") + version := spec.version +} + +# Every [vars]/[env] string, tagged with where it came from for the message. +config_strings contains entry if { + some file in input + is_mise(file) + some kind in ["vars", "env"] + some key, value in object.get(file.contents, kind, {}) + is_string(value) + entry := {"path": file.path, "kind": kind, "key": key, "value": value} +} + +# An install path embeds a tool's version as the segment right after the tool's +# install dir. Yield every embedded version that isn't a pinned version of that +# tool. The captured segment is restricted to version chars so it stops at the +# next path separator OR a trailing delimiter (a quote, `;`, …) when the path is +# spliced into a larger string (as in Dockerfile ENV/RUN lines). Shared by the +# [vars]/[env] check here and the Dockerfile check (data.mise.version_drift). 
+version_drift(value) := {drift | + some [dir, _] in tool_versions + pattern := sprintf(`(?:^|[\\/}])%s[\\/]([A-Za-z0-9._-]+)`, [dir]) + some m in regex.find_all_string_submatch_n(pattern, value, -1) + seg := m[1] + not [dir, seg] in tool_versions + pinned := concat(", ", sort({v | some [d, v] in tool_versions; d == dir})) + drift := {"dir": dir, "seg": seg, "pinned": pinned} +} + +deny contains msg if { + some entry in config_strings + contains(entry.value, "installs") + some d in version_drift(entry.value) + msg := sprintf( + "%s: %s %q embeds %q version %q, but [tools] pins it to %q -- keep them in sync", + [entry.path, entry.kind, entry.key, d.dir, d.seg, d.pinned], + ) +} + +tool_version_str(spec) := spec if is_string(spec) + +tool_version_str(spec) := spec.version if is_object(spec) + +# python must be pinned to a full version triple (X.Y.Z), not a minor alias: +# mise installs it under a dir named after the request and only symlinks the X.Y +# alias, and that symlink isn't created on the Windows runner -- so the py3_* +# interpreter paths (and the version_drift check above) need the exact patch dir. +deny contains msg if { + some file in input + is_mise(file) + version := tool_version_str(file.contents.tools.python) + not regex.match(`^[0-9]+\.[0-9]+\.[0-9]+$`, version) + msg := sprintf("%s: python must be pinned to a full version triple, got %q", [file.path, version]) +} diff --git a/config/deny.toml b/config/deny.toml index 1f31006..0f19be6 100644 --- a/config/deny.toml +++ b/config/deny.toml @@ -65,6 +65,19 @@ ignore = [ "RUSTSEC-2026-0098", "RUSTSEC-2026-0099", "RUSTSEC-2026-0104", + # pyo3 0.28.3 -- two advisories, both unreachable in the embedded runner. + # Pulled only by et-ws-pyo3-runner (our sole pyo3 consumer). Fixed in + # pyo3 0.29; we stay on 0.28 for now and drop both on upgrade. 
+ # * 2026-0176 (OOB read in PyList/PyTuple iterator nth/nth_back) -- the + # runner never calls nth/nth_back on a Python sequence iterator; it + # only inserts into sys.path and reads module return values. + # * 2026-0177 (missing Sync bound on PyCFunction::new_closure) -- the + # runner exposes its host API as #[pyclass]/#[pymethods] (WsSender, + # WsStorage), never PyCFunction::new_closure, and drives a single + # GIL-bound interpreter (not free-threaded), so no closure runs + # concurrently. + "RUSTSEC-2026-0176", + "RUSTSEC-2026-0177", ] [licenses] @@ -123,7 +136,7 @@ deny = [ # * `quinn-proto` -- pulled by deno_net via quinn (default features # include `ring`). Drop once Deno's quinn config switches to the # `rustls-aws-lc-rs` feature. - { crate = "ring", wrappers = ["rcgen", "rustls-webpki", "quinn-proto"] }, + { crate = "ring", wrappers = ["quinn-proto", "rcgen", "rustls-webpki"] }, # `ureq` is forbidden except via the wrappers listed below. Direct # use is blocked by the taplo no-banned-deps schema; this entry # catches transitive paths the same way as `ring` above. diff --git a/config/lychee.toml b/config/lychee.toml index e3bd895..f87f506 100644 --- a/config/lychee.toml +++ b/config/lychee.toml @@ -25,7 +25,4 @@ exclude_path = ["data", "generated", "target"] # URL patterns (regex) to skip: doc/test placeholder hosts (e.g. http://host:8080/) # and `{...}` format-string templates (.../{repo}/{git_ref}/...), not real links. 
-exclude = [ - '^https?://host[:/]', - '\{|%7B', -] +exclude = ['\{|%7B', '^https?://host[:/]'] diff --git a/config/osv-scanner.toml b/config/osv-scanner.toml index 25a1e54..a548f71 100644 --- a/config/osv-scanner.toml +++ b/config/osv-scanner.toml @@ -12,4 +12,6 @@ IgnoredVulns = [ { id = "RUSTSEC-2026-0118" }, { id = "RUSTSEC-2026-0119" }, { id = "RUSTSEC-2026-0173" }, + { id = "RUSTSEC-2026-0176" }, + { id = "RUSTSEC-2026-0177" }, ] diff --git a/config/semgrep/no-trailing-backslash.yaml b/config/semgrep/no-trailing-backslash.yaml index b035e9d..14dec6d 100644 --- a/config/semgrep/no-trailing-backslash.yaml +++ b/config/semgrep/no-trailing-backslash.yaml @@ -6,7 +6,6 @@ rules: # continuations and are allowlisted; everything else must avoid them # (keep the value/statement on one line, or use concat!/[vars]/arrays). exclude: - - "/.mise/config.python.toml" - "/README.md" - "/utilities/cli/README.md" - "**/Dockerfile*" diff --git a/config/taplo.toml b/config/taplo.toml index 4ba02e3..a451d17 100644 --- a/config/taplo.toml +++ b/config/taplo.toml @@ -6,7 +6,7 @@ # scratch area, so scratch TOML is skipped here too -- to lint or test a scratch # TOML, pipe it through `taplo format -` / `taplo lint -` (stdin has no path to # exclude). 
-exclude = ["target/**", "data/**"] +exclude = ["data/**", "target/**"] [formatting] column_width = 120 diff --git a/config/taplo/mise-cargo-backend-allowlist.schema.json b/config/taplo/mise-cargo-backend-allowlist.schema.json index 18340f8..8b27aa7 100644 --- a/config/taplo/mise-cargo-backend-allowlist.schema.json +++ b/config/taplo/mise-cargo-backend-allowlist.schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "mise [tools] -- cargo backend allowlist", - "description": "`cargo:` builds from source (slow installs); allow only the two with no prebuilt binary.", + "description": "`cargo:` builds from source; only tools without a prebuilt + cargo:findutils (aarch64-linux).", "type": "object", "properties": { "tools": { @@ -9,7 +9,7 @@ "propertyNames": { "anyOf": [ { "not": { "pattern": "^cargo:" } }, - { "enum": ["cargo:cargo-expand", "cargo:dart-typegen"] } + { "enum": ["cargo:cargo-expand", "cargo:dart-typegen", "cargo:findutils"] } ] } } diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..ca74d7a --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,6 @@ +# Local-only working data: external upstream repos cloned here for development +# carry their own .git history and must never be committed into this repo. +# Ignore every top-level entry except this file and the tracked model-modules/. +/* +!/.gitignore +!/model-modules/ diff --git a/generated/python-ws/pyproject.toml b/generated/python-ws/pyproject.toml index 1837cb2..a0416fc 100644 --- a/generated/python-ws/pyproject.toml +++ b/generated/python-ws/pyproject.toml @@ -13,3 +13,27 @@ requires = ["uv_build==0.11.8"] [tool.uv.build-backend] module-name = "et_ws" module-root = "" + +# Model-shape + output-style settings for `mise run gen:python-ws`. datamodel- +# code-generator searches cwd upward for a pyproject.toml, so the task runs from +# this dir and passes only the input/output paths on the CLI; everything stable +# lives here. 
Two non-obvious ones: +# custom-file-header = "#": the only way to suppress the default +# "generated by ... / filename: ..." banner -- an empty string is treated as +# "no override" and the banner returns. The task strips the residual lone +# `#` line afterwards. +# formatters: opt into the ruff formatter stack (silences the black/isort +# deprecation FutureWarning). The `[ruff]` extra on the pipx install (see +# .mise/config.python.toml [tools]) puts ruff in datamodel-codegen's venv. +[tool.datamodel-codegen] +custom-file-header = "#" +disable-timestamp = true +field-constraints = true +formatters = ["ruff-check", "ruff-format"] +input-file-type = "jsonschema" +output-model-type = "pydantic_v2.BaseModel" +target-python-version = "3.10" +use-double-quotes = true +use-schema-description = true +use-title-as-name = true +use-union-operator = true diff --git a/generated/rust-rest/Cargo.toml b/generated/rust-rest/Cargo.toml index 46153be..b98f1d5 100644 --- a/generated/rust-rest/Cargo.toml +++ b/generated/rust-rest/Cargo.toml @@ -33,8 +33,15 @@ tracing-opentelemetry = { workspace = true, optional = true } # `rustls` and `stream` are native-only — WASM reqwest dispatches via the # browser's `fetch()` and has no notion of a TLS stack. +# +# `retry-policies` + `tokio` back the generator-injected `ClientHooks::exec` +# backoff retry (see `utilities/int-gen/src/openapi.rs::inject_retry_exec`). +# Native-only: that hook is `#[cfg(not(wasm32))]`, and `tokio::time::sleep` / +# `SystemTime` don't work under `wasm32-unknown-unknown`. 
[target.'cfg(not(target_arch = "wasm32"))'.dependencies] reqwest = { workspace = true, features = ["json", "query", "rustls", "stream"] } +retry-policies.workspace = true +tokio = { workspace = true, features = ["time"] } [target.'cfg(target_arch = "wasm32")'.dependencies] reqwest = { workspace = true, features = ["json"] } @@ -42,4 +49,4 @@ reqwest = { workspace = true, features = ["json"] } # resolves to `window.location.origin` so browser modules can write # `Client::new("")` and have it match the page origin (and our embedded # Deno runtime's stubbed `globalThis.location`). -web-sys = { workspace = true, features = ["Window", "Location"] } +web-sys = { workspace = true, features = ["Location", "Window"] } diff --git a/generated/rust-rest/src/lib.rs b/generated/rust-rest/src/lib.rs index f5388b1..67a730c 100644 --- a/generated/rust-rest/src/lib.rs +++ b/generated/rust-rest/src/lib.rs @@ -126,7 +126,43 @@ impl ClientInfo<()> for Client { &() } } -impl ClientHooks<()> for &Client {} +impl ClientHooks<()> for &Client { + // Injected by `utilities/int-gen` (inject_retry_exec): retry request + // execution with exponential backoff. reqwest's native retry has no + // backoff yet -- remove this and use `ClientBuilder::retries` once it does. + #[cfg(not(target_arch = "wasm32"))] + async fn exec(&self, request: ::reqwest::Request, _info: &OperationInfo) -> ::reqwest::Result<::reqwest::Response> { + use ::retry_policies::policies::ExponentialBackoff; + use ::retry_policies::{RetryDecision, RetryPolicy as _}; + let policy = ExponentialBackoff::builder() + .retry_bounds( + ::core::time::Duration::from_millis(250), + ::core::time::Duration::from_secs(5), + ) + .build_with_total_retry_duration(::core::time::Duration::from_secs(30)); + let started = ::std::time::SystemTime::now(); + let mut n_past_retries: u32 = 0; + loop { + // Retry only when the request can be replayed (no streaming body). 
+ let Some(attempt) = request.try_clone() else { + return self.client().execute(request).await; + }; + match self.client().execute(attempt).await { + Ok(response) => return Ok(response), + Err(err) => match policy.should_retry(started, n_past_retries) { + RetryDecision::Retry { execute_after } => { + let wait = execute_after + .duration_since(::std::time::SystemTime::now()) + .unwrap_or_default(); + ::tokio::time::sleep(wait).await; + n_past_retries = n_past_retries.saturating_add(1); + } + RetryDecision::DoNotRetry => return Err(err), + }, + } + } + } +} #[allow(clippy::all)] impl Client { /**Liveness probe diff --git a/libs/edge-toolkit/Cargo.toml b/libs/edge-toolkit/Cargo.toml index 4bc2cc0..25547c6 100644 --- a/libs/edge-toolkit/Cargo.toml +++ b/libs/edge-toolkit/Cargo.toml @@ -14,6 +14,7 @@ asyncapi-rust = { workspace = true, optional = true } base64.workspace = true et-path.workspace = true fs-err.workspace = true +humantime-serde.workspace = true log.workspace = true schemars = { workspace = true, optional = true } secrecy.workspace = true diff --git a/libs/edge-toolkit/src/config.rs b/libs/edge-toolkit/src/config.rs index abcafd4..ec3937b 100644 --- a/libs/edge-toolkit/src/config.rs +++ b/libs/edge-toolkit/src/config.rs @@ -1,5 +1,7 @@ use std::path::{Path, PathBuf}; +use std::time::Duration; +use fs_err as fs; use serde::Deserialize; use serde_default::DefaultFromSerde; use serde_inline_default::serde_inline_default; @@ -11,6 +13,71 @@ use crate::ports::Services; /// Localhost address 127.0.0.1 . pub const LOCALHOST: &str = "127.0.0.1"; +/// Whether a config value names the "disabled" state: `none`, `off`, or +/// `disabled` (case-insensitive, surrounding whitespace ignored). +/// +/// These sentinels let an `Option<_>` env-var field be set to `None`. A blank +/// value can't serve that role -- `serde-env` drops empty-valued vars, so a +/// blank var is indistinguishable from unset (both fall back to the default). 
+#[must_use] +pub fn is_disabled_sentinel(value: &str) -> bool { + let trimmed = value.trim(); + trimmed.eq_ignore_ascii_case("none") + || trimmed.eq_ignore_ascii_case("off") + || trimmed.eq_ignore_ascii_case("disabled") +} + +/// Deserialize `Option` where a disable sentinel ([`is_disabled_sentinel`]) +/// maps to `None` and any other value to `Some(T)` via `T`'s own `Deserialize`. +/// +/// Generic over the inner type, for fields read from env vars via `serde-env`: +/// the value arrives as a string, so this works for any `T` whose `Deserialize` +/// accepts a string scalar (e.g. `bytesize::ByteSize`, `String`). `Duration` +/// is the exception -- its `Deserialize` isn't humantime -- so use +/// [`deserialize_optional_humantime`] for duration fields. Pair either with +/// `#[serde(default = "...")]` for the unset case. +/// +/// # Errors +/// Returns the deserializer's error if the value is neither a sentinel nor a +/// valid `T`. +pub fn deserialize_optional<'de, D, T>(deserializer: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, + T: Deserialize<'de>, +{ + use serde::de::IntoDeserializer as _; + + let raw = ::deserialize(deserializer)?; + if is_disabled_sentinel(&raw) { + return Ok(None); + } + let inner: serde::de::value::StrDeserializer<'_, D::Error> = raw.trim().into_deserializer(); + T::deserialize(inner).map(Some) +} + +/// [`deserialize_optional`] for `Option` fields, parsing the value as +/// a humantime duration (e.g. `15s`, `1m30s`). +/// +/// Separate from the generic [`deserialize_optional`] because `Duration`'s own +/// `Deserialize` expects a `{secs, nanos}` struct, not a humantime string. +/// +/// # Errors +/// Returns the deserializer's error if the value is neither a sentinel nor a +/// valid humantime duration. 
+pub fn deserialize_optional_humantime<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + use serde::de::IntoDeserializer as _; + + let raw = ::deserialize(deserializer)?; + if is_disabled_sentinel(&raw) { + return Ok(None); + } + let inner: serde::de::value::StrDeserializer<'_, D::Error> = raw.trim().into_deserializer(); + humantime_serde::deserialize(inner).map(Some) +} + /// Helper to find repository root. /// /// This is the one sanctioned `current_dir()`. @@ -140,7 +207,7 @@ pub fn find_npm_modules_path_in(install: &Path, package: &str) -> Option Option` tool, locates its venv `site-packages` via +/// [`find_site_packages_in`]. Best-effort: returns an empty vec if `mise` is +/// unavailable, exits non-zero, or emits output we can't parse. +#[must_use] +pub fn mise_python_site_packages() -> Vec { + if !mise_is_available() { + return Vec::new(); + } + let output = std::process::Command::new("mise") + .args(["ls", "--current", "--json"]) + .output() + .ok(); + let Some(output) = output.filter(|out| out.status.success()) else { + return Vec::new(); + }; + let Ok(tools) = serde_json::from_slice::>(&output.stdout) else { + return Vec::new(); + }; + tools + .iter() + .filter(|(name, _)| name.starts_with("pipx:")) + .filter_map(|(_, installs)| { + // Each tool maps to an array of installs; take the active one (or + // the first, if none is flagged) and read its `install_path`. + let array = installs.as_array()?; + let entry = array + .iter() + .find(|entry| entry.get("active").and_then(serde_json::Value::as_bool) == Some(true)) + .or_else(|| array.first())?; + let path = entry.get("install_path").and_then(serde_json::Value::as_str)?; + Some(PathBuf::from(path)) + }) + .filter_map(|install| find_site_packages_in(&install)) + .collect() +} + +/// Pure-filesystem helper: given a mise `pipx:` `` root, return the +/// venv `site-packages` directory. 
+/// +/// pipx lays each tool out as `//lib/python/site-packages`; +/// both the `` directory name and the python version vary, so the two +/// variable segments are scanned rather than assumed. Returns the first match, +/// or `None` if nothing under `` has that shape. +#[must_use] +pub fn find_site_packages_in(install: &Path) -> Option { + for pkg in fs::read_dir(install).ok()?.flatten() { + let Ok(lib_entries) = fs::read_dir(pkg.path().join("lib")) else { + continue; + }; + for py in lib_entries.flatten() { + if !py.file_name().to_string_lossy().starts_with("python") { + continue; + } + let site_packages = py.path().join("site-packages"); + if site_packages.is_dir() { + return Some(site_packages); + } + } + } + None +} + /// Default service label name for use in OpenTelemetry. /// /// Removes "-server" suffix from the invoked executable name if present, diff --git a/libs/edge-toolkit/tests/config.rs b/libs/edge-toolkit/tests/config.rs new file mode 100644 index 0000000..bbc40a3 --- /dev/null +++ b/libs/edge-toolkit/tests/config.rs @@ -0,0 +1,47 @@ +//! The generic optional/sentinel deserialize helpers in `edge_toolkit::config`. +//! +//! Exercised directly over a `StrDeserializer` (the string form every value +//! arrives as under `serde-env`), so the genericity over the inner type is +//! visible: `deserialize_optional::` and the `Duration` humantime +//! variant share one sentinel (`none` / `off` / `disabled`). 
+#![cfg(test)] +#![expect(clippy::expect_used, reason = "test code: expect panics surface the failure")] + +use std::time::Duration; + +use edge_toolkit::config::{deserialize_optional, deserialize_optional_humantime, is_disabled_sentinel}; +use serde::de::IntoDeserializer as _; +use serde::de::value::{Error as ValueError, StrDeserializer}; + +fn optional_string(value: &str) -> Option { + let deser: StrDeserializer<'_, ValueError> = value.into_deserializer(); + deserialize_optional::<_, String>(deser).expect("deserialize Option") +} + +fn optional_duration(value: &str) -> Option { + let deser: StrDeserializer<'_, ValueError> = value.into_deserializer(); + deserialize_optional_humantime(deser).expect("deserialize Option") +} + +#[test] +fn recognises_disable_sentinels() { + for sentinel in ["none", "off", "disabled", "NONE", "Off", " disabled "] { + assert!(is_disabled_sentinel(sentinel), "{sentinel:?} should disable"); + } + for value in ["", "30s", "64MiB", "never-mind"] { + assert!(!is_disabled_sentinel(value), "{value:?} should not disable"); + } +} + +#[test] +fn generic_optional_works_for_a_non_duration_inner() { + assert_eq!(optional_string("none"), None); + assert_eq!(optional_string("disabled"), None); + assert_eq!(optional_string("hello"), Some("hello".to_owned())); +} + +#[test] +fn humantime_optional_parses_or_disables() { + assert_eq!(optional_duration("off"), None); + assert_eq!(optional_duration("1m30s"), Some(Duration::from_secs(90))); +} diff --git a/libs/edge-toolkit/tests/npm_mod.rs b/libs/edge-toolkit/tests/npm_mod.rs index 567b1d7..4355299 100644 --- a/libs/edge-toolkit/tests/npm_mod.rs +++ b/libs/edge-toolkit/tests/npm_mod.rs @@ -6,9 +6,8 @@ #![cfg(test)] #![expect(clippy::unwrap_used, reason = "test code: failed tempdir setup should fail the test")] -use std::fs; - use edge_toolkit::config::find_npm_modules_path_in; +use fs_err as fs; use tempfile::TempDir; #[test] diff --git a/libs/edge-toolkit/tests/pipx_site_packages.rs 
b/libs/edge-toolkit/tests/pipx_site_packages.rs new file mode 100644 index 0000000..b8a570d --- /dev/null +++ b/libs/edge-toolkit/tests/pipx_site_packages.rs @@ -0,0 +1,42 @@ +//! Layout test for `find_site_packages_in` -- the pure-filesystem core of +//! `mise_python_site_packages`. Builds a tempdir mimicking the pipx venv layout +//! (`//lib/python/site-packages`) and verifies the resolver +//! finds it regardless of the `` name and python version. + +#![cfg(test)] +#![expect(clippy::unwrap_used, reason = "test code: failed tempdir setup should fail the test")] + +use edge_toolkit::config::find_site_packages_in; +use fs_err as fs; +use tempfile::TempDir; + +#[test] +fn resolves_pipx_venv_layout() { + // /cowsay/lib/python3.13/site-packages -- the shape mise's pipx + // backend lays down; both `` and the python version are scanned, not + // assumed. + let install = TempDir::new().unwrap(); + let site_packages = install.path().join("cowsay/lib/python3.13/site-packages"); + fs::create_dir_all(&site_packages).unwrap(); + + let found = find_site_packages_in(install.path()); + assert_eq!(found.as_deref(), Some(site_packages.as_path())); +} + +#[test] +fn ignores_non_python_lib_dirs() { + // A `lib/` whose only child isn't a `python*` dir must not match. + let install = TempDir::new().unwrap(); + fs::create_dir_all(install.path().join("tool/lib/node")).unwrap(); + + assert!(find_site_packages_in(install.path()).is_none()); +} + +#[test] +fn returns_none_without_site_packages() { + // A `python*` dir exists but has no `site-packages` under it. 
+ let install = TempDir::new().unwrap(); + fs::create_dir_all(install.path().join("tool/lib/python3.13")).unwrap(); + + assert!(find_site_packages_in(install.path()).is_none()); +} diff --git a/libs/path/Cargo.toml b/libs/path/Cargo.toml index e44633e..fd4182c 100644 --- a/libs/path/Cargo.toml +++ b/libs/path/Cargo.toml @@ -10,6 +10,7 @@ repository.workspace = true doctest = false [dev-dependencies] +fs-err.workspace = true tempfile.workspace = true [lints] diff --git a/libs/path/tests/find.rs b/libs/path/tests/find.rs index 02d3232..8543cd4 100644 --- a/libs/path/tests/find.rs +++ b/libs/path/tests/find.rs @@ -4,9 +4,8 @@ reason = "test code: failed tempdir/fs setup should fail the test" )] -use std::fs; - use et_path::find_project_root; +use fs_err as fs; use tempfile::tempdir; #[test] diff --git a/libs/ws-runner-common/Cargo.toml b/libs/ws-runner-common/Cargo.toml index adb2e0b..d8e6247 100644 --- a/libs/ws-runner-common/Cargo.toml +++ b/libs/ws-runner-common/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "et-ws-runner-common" -description = "Shared helpers for the ws-module runners (et-ws-wasi-runner, et-ws-web-runner)" +description = "Shared helpers + constants for the native ws-server agent runners" version = "0.1.0" edition.workspace = true license.workspace = true @@ -16,16 +16,20 @@ et-rest-client.workspace = true futures-util.workspace = true humantime-serde.workspace = true reqwest.workspace = true +retry-policies.workspace = true serde.workspace = true serde-inline-default.workspace = true serde_default.workspace = true serde_json.workspace = true serde_path_to_error.workspace = true thiserror.workspace = true +tokio = { workspace = true, features = ["time"] } +tokio-tungstenite = { workspace = true, features = ["connect"] } tracing.workspace = true [dev-dependencies] serde-env.workspace = true +temp-env.workspace = true [lints] workspace = true diff --git a/libs/ws-runner-common/src/config.rs b/libs/ws-runner-common/src/config.rs index 7b0ac9d..17afb9d 
100644 --- a/libs/ws-runner-common/src/config.rs +++ b/libs/ws-runner-common/src/config.rs @@ -13,7 +13,7 @@ use serde::Deserialize; use serde_default::DefaultFromSerde; use serde_inline_default::serde_inline_default; -/// `RUNNER_*` settings shared by both native runners. +/// Shared `RUNNER_*` settings for both native runners. #[derive(Clone, Debug, Deserialize)] #[non_exhaustive] pub struct RunnerConfig { @@ -25,7 +25,10 @@ pub struct RunnerConfig { pub timeout: Option, } -/// `WS_*` settings shared by both native runners. +/// Default time [`crate::connect_and_register`] waits for `et-connect-ack`. +pub const DEFAULT_CONNECT_ACK_TIMEOUT: Duration = Duration::from_secs(5); + +/// Shared `WS_*` settings for both native runners. #[serde_inline_default] #[derive(Clone, Debug, DefaultFromSerde, Deserialize)] #[non_exhaustive] @@ -33,4 +36,22 @@ pub struct WsConfig { /// ws-server URL, from `WS_SERVER_URL`; defaults to the local insecure port. #[serde_inline_default(format!("ws://localhost:{}/ws", Services::InsecureWebSocketServer.port()))] pub server_url: String, + + /// How long [`crate::connect_and_register`] waits for the server's + /// `et-connect-ack`, from `WS_CONNECT_ACK_TIMEOUT` as a humantime duration + /// (e.g. `5s`, `500ms`). Unset defaults to 5s; `none`/`off`/`disabled` waits + /// forever (retry until the server answers). + #[serde( + default = "default_connect_ack_timeout", + deserialize_with = "edge_toolkit::config::deserialize_optional_humantime" + )] + pub connect_ack_timeout: Option, +} + +#[expect( + clippy::unnecessary_wraps, + reason = "serde default fn must return the field type Option; the default is always Some" +)] +const fn default_connect_ack_timeout() -> Option { + Some(DEFAULT_CONNECT_ACK_TIMEOUT) } diff --git a/libs/ws-runner-common/src/lib.rs b/libs/ws-runner-common/src/lib.rs index 19e1ed3..420fc80 100644 --- a/libs/ws-runner-common/src/lib.rs +++ b/libs/ws-runner-common/src/lib.rs @@ -1,11 +1,13 @@ -//! 
Helpers shared by the two ws-module runners. +//! Helpers and constants shared by the native ws-server agent runners. //! -//! `et-ws-wasi-runner` (WASI components under wasmtime) and `et-ws-web-runner` -//! (browser-targeted JS under Deno) both talk to the same ws-server REST -//! surface to bootstrap a module: derive the HTTP base from the WebSocket URL, -//! drain streamed responses, and read the `main` entry from `package.json`. -//! Those steps were duplicated in each crate; they live here so there is one -//! implementation to keep in sync with the server. +//! Used by `et-ws-wasi-runner`, `et-ws-web-runner`, and `et-ws-pyo3-runner`. +//! Bootstrap helpers talk to the ws-server REST surface to set up a module: +//! derive the HTTP base from the WebSocket URL, drain streamed responses, and +//! read the `main` entry from `package.json`. Connection helpers cover the +//! shared agent-loop timing: the connect-ack timeout and the keepalive +//! heartbeat (the server times out idle connections and never pings clients). +//! These were duplicated across the runner crates; one implementation here +//! keeps them in sync with the server. // `BootstrapError` is large because `et_rest_client::Error<()>` carries an // inline `reqwest::Response` (~136 B). Boxing would cost a `From` impl per @@ -16,11 +18,195 @@ reason = "et_rest_client::Error<()> dominates the footprint; boxing would force per-variant From impls" )] -use futures_util::StreamExt as _; +use std::time::{Duration, SystemTime}; + +use edge_toolkit::ws::{ClientMessage, ConnectStatus, ServerMessage}; +use futures_util::{SinkExt as _, StreamExt as _}; +use retry_policies::policies::ExponentialBackoff; +use retry_policies::{RetryDecision, RetryPolicy}; use thiserror::Error; +use tokio::net::TcpStream; +use tokio_tungstenite::{MaybeTlsStream, WebSocketStream, connect_async, tungstenite}; pub mod config; +/// A live websocket to the ws-server that has completed the et-connect handshake. 
+pub type RegisteredSocket = WebSocketStream>; + +/// Lower bound on the connect-ack retry interval. +/// +/// Stops a small configured timeout from collapsing the backoff to near-zero +/// (which would hammer the server); also the interval used when the timeout is +/// disabled (retry forever). +const MIN_RETRY_INTERVAL: Duration = Duration::from_millis(250); + +/// Upper bound on the connect-ack retry interval. +/// +/// Keeps a long or disabled timeout from letting the backoff crawl out to +/// retry-policies' multi-minute default. +const MAX_RETRY_INTERVAL: Duration = Duration::from_secs(5); + +/// How often a runner pings the ws-server to stay connected. +/// +/// The server closes connections idle longer than its connection timeout +/// (`WS_CONNECTION_TIMEOUT`, default 15s; see `services/ws/src/lib.rs`) and +/// never pings clients itself, so an agent that only waits for inbound frames +/// must ping well inside that window. +pub const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); + +/// Build a heartbeat ticker firing every [`HEARTBEAT_INTERVAL`]. +/// +/// The immediate first tick is consumed so the first heartbeat fires one +/// interval after connect (not instantly), and missed ticks are delayed rather +/// than bursting to catch up. Drive it with `interval.tick().await` and send a +/// WebSocket ping each tick. +pub async fn heartbeat_interval() -> tokio::time::Interval { + let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + let _first: tokio::time::Instant = interval.tick().await; + interval +} + +/// Errors from [`connect_and_register`]. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum ConnectError { + /// Opening, sending on, or reading the websocket failed. + #[error("websocket error during connect/register: {0}")] + WebSocket(#[from] tungstenite::Error), + + /// The `et-connect` frame could not be serialised. 
+ #[error("failed to serialize the et-connect frame: {0}")] + Serialize(#[from] serde_json::Error), + + /// No `et-connect-ack` arrived within a single attempt's budget. + #[error("no et-connect-ack within {0:?}")] + AckTimeout(Duration), + + /// The connection closed before any `et-connect-ack` arrived. + #[error("connection closed before et-connect-ack")] + ConnectionClosed, +} + +/// Human label for a [`ConnectStatus`], for log lines. +#[must_use] +pub const fn connect_status_label(status: &ConnectStatus) -> &'static str { + match *status { + ConnectStatus::Assigned => "assigned", + ConnectStatus::Reconnected => "reconnected", + } +} + +/// Connect to the ws-server and complete the `et-connect` handshake. +/// +/// Opens the websocket, sends `et-connect` (requesting `requested_agent_id` if +/// given), and waits for `et-connect-ack`, returning the live socket plus the +/// assigned id and status. +/// +/// The wait is a retry loop: the whole attempt (connect + register) is retried +/// with exponential backoff until it succeeds or `ack_timeout` elapses, so a +/// runner started before the ws-server simply waits for it to come up. +/// `ack_timeout = None` retries forever. The backoff interval is floored so a +/// small timeout never degrades into a busy-loop. 
+pub async fn connect_and_register( + ws_url: &str, + requested_agent_id: Option, + ack_timeout: Option, +) -> Result<(RegisteredSocket, String, ConnectStatus), ConnectError> { + let policy = backoff_for_timeout(ack_timeout); + let started_at = SystemTime::now(); + let mut n_past_retries = 0_u32; + loop { + let budget = attempt_budget(ack_timeout, started_at); + match register_once(ws_url, requested_agent_id.clone(), budget).await { + Ok(registered) => return Ok(registered), + Err(err) => match policy.should_retry(started_at, n_past_retries) { + RetryDecision::Retry { execute_after } => { + let wait = execute_after.duration_since(SystemTime::now()).unwrap_or_default(); + tracing::warn!(attempt = n_past_retries.saturating_add(1), error = %err, retry_in = ?wait, + "connect/register attempt failed; retrying"); + tokio::time::sleep(wait).await; + n_past_retries = n_past_retries.saturating_add(1); + } + RetryDecision::DoNotRetry => return Err(err), + }, + } + } +} + +/// Map a timeout to a retry policy. +/// +/// `Some(total)` bounds total retry time to `total`; `None` retries forever. +/// The backoff floor is a fraction of the total but never below +/// [`MIN_RETRY_INTERVAL`] (so a small timeout doesn't collapse the interval to +/// near-zero and hammer the server) nor above the cap. +#[expect( + clippy::single_call_fn, + reason = "distinct step of connect_and_register; kept separate for readability and future reuse" +)] +fn backoff_for_timeout(timeout: Option) -> Box { + let retry_min = timeout.map_or(MIN_RETRY_INTERVAL, |total| { + total + .checked_div(8) + .unwrap_or(MIN_RETRY_INTERVAL) + .clamp(MIN_RETRY_INTERVAL, MAX_RETRY_INTERVAL) + }); + let builder = ExponentialBackoff::builder().retry_bounds(retry_min, MAX_RETRY_INTERVAL); + match timeout { + Some(total) => Box::new(builder.build_with_total_retry_duration(total)), + None => Box::new(builder.build_with_max_retries(u32::MAX)), + } +} + +/// Compute this attempt's connect budget. 
+/// +/// The time left before the total deadline, clamped so one attempt can't +/// overrun the cap and always gets a minimum window to connect. +#[expect( + clippy::single_call_fn, + reason = "distinct step of connect_and_register; kept separate for readability and future reuse" +)] +fn attempt_budget(ack_timeout: Option, started_at: SystemTime) -> Duration { + ack_timeout.map_or(MAX_RETRY_INTERVAL, |total| { + total + .saturating_sub(started_at.elapsed().unwrap_or_default()) + .clamp(MIN_RETRY_INTERVAL, MAX_RETRY_INTERVAL) + }) +} + +/// One connect + send-`et-connect` + await-`et-connect-ack`, bounded by `budget`. +#[expect( + clippy::single_call_fn, + reason = "one attempt of the connect_and_register retry loop; separated for readability" +)] +async fn register_once( + ws_url: &str, + requested_agent_id: Option, + budget: Duration, +) -> Result<(RegisteredSocket, String, ConnectStatus), ConnectError> { + let attempt = async { + let (mut socket, _response) = connect_async(ws_url).await?; + let connect = serde_json::to_string(&ClientMessage::Connect { + agent_id: requested_agent_id, + })?; + socket.send(tungstenite::Message::Text(connect)).await?; + while let Some(frame) = socket.next().await { + let tungstenite::Message::Text(text) = frame? else { + continue; + }; + match ServerMessage::from_text_frame(&text) { + Ok(ServerMessage::ConnectAck { agent_id, status }) => return Ok((socket, agent_id, status)), + Ok(_) => {} + Err(err) => tracing::warn!(error = %err, "ignoring undecodable et-* frame during handshake"), + } + } + Err(ConnectError::ConnectionClosed) + }; + tokio::time::timeout(budget, attempt) + .await + .unwrap_or_else(|_elapsed| Err(ConnectError::AckTimeout(budget))) +} + /// Errors produced while bootstrapping a module from the ws-server. 
#[derive(Debug, Error)] #[non_exhaustive] diff --git a/libs/ws-runner-common/tests/config.rs b/libs/ws-runner-common/tests/config.rs index 177367c..b3b13ce 100644 --- a/libs/ws-runner-common/tests/config.rs +++ b/libs/ws-runner-common/tests/config.rs @@ -57,3 +57,59 @@ fn missing_required_module_errors() { assert!(result.is_err(), "RUNNER_MODULE is required"); } + +// --- WS_CONNECT_ACK_TIMEOUT, via the real `from_env` path (temp-env) -------- +// +// Unset -> 5s default; a humantime value -> that duration. A *blank* value is +// dropped by serde-env (it filters empty-valued vars), so it is +// indistinguishable from unset and falls back to the default -- which is why +// the timeout can't be disabled with an empty env var, and why the field's +// `deserialize_optional_humantime` deserializer disables on the `none` / +// `off` / `disabled` sentinels rather than on a blank value. + +#[derive(Debug, Deserialize)] +struct WsOnly { + #[serde(default)] + ws: WsConfig, +} + +fn load_ws() -> WsConfig { + serde_env::from_env::().expect("parse WsConfig from env").ws +} + +#[test] +fn connect_ack_timeout_absent_defaults_to_5s() { + temp_env::with_var_unset("WS_CONNECT_ACK_TIMEOUT", || { + assert_eq!(load_ws().connect_ack_timeout, Some(Duration::from_secs(5))); + }); +} + +#[test] +fn connect_ack_timeout_parses_humantime() { + temp_env::with_var("WS_CONNECT_ACK_TIMEOUT", Some("1m30s"), || { + assert_eq!(load_ws().connect_ack_timeout, Some(Duration::from_secs(90))); + }); +} + +#[test] +fn connect_ack_timeout_none_sentinel_disables() { + for sentinel in ["none", "off", "disabled", "NONE", "Off"] { + temp_env::with_var("WS_CONNECT_ACK_TIMEOUT", Some(sentinel), || { + assert_eq!( + load_ws().connect_ack_timeout, + None, + "sentinel {sentinel:?} should disable" + ); + }); + } +} + +#[test] +fn blank_env_var_is_filtered_by_serde_env_so_default_applies() { + // serde-env drops empty-valued vars, so a blank value behaves exactly like + // an unset one (the 5s default), NOT as a way to disable the timeout -- + // 
which is why disabling uses the `none` / `off` sentinel above. + temp_env::with_var("WS_CONNECT_ACK_TIMEOUT", Some(""), || { + assert_eq!(load_ws().connect_ack_timeout, Some(Duration::from_secs(5))); + }); +} diff --git a/ruff.toml b/ruff.toml index 32defa1..78b89fb 100644 --- a/ruff.toml +++ b/ruff.toml @@ -16,15 +16,26 @@ line-length = 120 # don't own and don't ship. Exclude them here (on top of ruff's defaults) so # the ruff tasks don't need to repeat `--exclude` flags, and so they're skipped # even when ruff runs with `--no-respect-gitignore`. -extend-exclude = ["wit_world", "componentize_py_async_support"] +extend-exclude = ["componentize_py_async_support", "wit_world"] [lint] -# Add isort-style import sorting on top of the default rule set (E, F). -# `ruff format` only handles whitespace; the "I" rules give us deterministic -# import grouping. datamodel-codegen's ruff formatter doesn't sort imports -# on its own, so without this the generated messages.py drifts between -# regens. Use `extend-select`, not `select`, so we keep the default F401 -# (unused-import) rule that openapi-python-client relies on internally — it -# generates speculative imports and trims them by re-running ruff with the -# default rules. -extend-select = ["I"] +# On top of the default rule set (E, F): +# "I" -- isort import sorting. `ruff format` only handles whitespace; the +# "I" rules give deterministic grouping. datamodel-codegen's ruff formatter +# doesn't sort imports, so without this the generated messages.py drifts +# between regens. +# "D" -- pydocstyle (PEP 257) docstring checks; see [lint.pydocstyle]. +# Use `extend-select`, not `select`, so we keep the default F401 (unused-import) +# rule that openapi-python-client relies on internally — it generates +# speculative imports and trims them by re-running ruff with the default rules. 
+extend-select = ["D", "I"] + +[lint.pydocstyle] +convention = "pep257" + +[lint.per-file-ignores] +# Generated clients (openapi-python-client / datamodel-codegen output): their +# docstrings come verbatim from the OpenAPI/JSON-schema spec, not hand-written. +"generated/**" = ["D"] +# Test modules: docstrings on test classes/methods are noise, not API surface. +"**/tests/**" = ["D"] diff --git a/services/modules/Cargo.toml b/services/modules/Cargo.toml index 6106638..1994aa0 100644 --- a/services/modules/Cargo.toml +++ b/services/modules/Cargo.toml @@ -16,6 +16,7 @@ openapi-spec = ["dep:utoipa"] actix-files.workspace = true actix-web.workspace = true edge-toolkit.workspace = true +fs-err.workspace = true serde.workspace = true serde-inline-default.workspace = true serde_default.workspace = true diff --git a/services/modules/src/lib.rs b/services/modules/src/lib.rs index 6127aa1..bf754b0 100644 --- a/services/modules/src/lib.rs +++ b/services/modules/src/lib.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use actix_files::Files; use actix_web::web; use edge_toolkit::config::default_modules_folders; +use fs_err as fs; use serde::Deserialize; use serde_default::DefaultFromSerde; use serde_inline_default::serde_inline_default; @@ -30,7 +31,7 @@ impl ModulesConfig { } fn read_package_name(package_json: &std::path::Path) -> Option { - let content = std::fs::read_to_string(package_json).ok()?; + let content = fs::read_to_string(package_json).ok()?; let value: serde_json::Value = serde_json::from_str(&content).ok()?; value.get("name")?.as_str().map(str::to_string) } @@ -53,7 +54,7 @@ pub fn list_modules(config: &ModulesConfig) -> Vec<(String, PathBuf)> { if let Some(name) = name { modules.push((name, path.clone())); } - } else if let Ok(entries) = std::fs::read_dir(path) { + } else if let Ok(entries) = fs::read_dir(path) { for entry in entries.flatten() { // `Path::is_dir` follows symlinks; `entry.file_type().is_dir()` // would skip them. 
mise's aube npm backend lays out diff --git a/services/modules/tests/symlinks.rs b/services/modules/tests/symlinks.rs index 0bdc9ba..8d17517 100644 --- a/services/modules/tests/symlinks.rs +++ b/services/modules/tests/symlinks.rs @@ -18,7 +18,6 @@ reason = "test code: fixture setup failures should fail the test" )] -use std::fs; use std::os::unix::fs::symlink; use std::path::PathBuf; @@ -26,6 +25,7 @@ use actix_web::http::StatusCode; use actix_web::{App, test, web}; use edge_toolkit::ws_server::AgentRegistry; use et_modules_service::{ModulesConfig, configure, list_modules}; +use fs_err as fs; use tempfile::TempDir; const ORT_BUNDLE: &[u8] = b"// pretend ort.min.js bundle"; diff --git a/services/storage/tests/put.rs b/services/storage/tests/put.rs index 0d99fbd..4e75f73 100644 --- a/services/storage/tests/put.rs +++ b/services/storage/tests/put.rs @@ -108,7 +108,7 @@ async fn writes_file_for_registered_agent() { let resp = test::call_service(&app, req).await; assert_eq!(resp.status(), StatusCode::OK); - let written = std::fs::read(tmp.path().join("agent-1").join("payload.txt")).unwrap(); + let written = fs_err::read(tmp.path().join("agent-1").join("payload.txt")).unwrap(); assert_eq!(written, body); } @@ -120,7 +120,7 @@ async fn surfaces_io_failure_as_500() { // `StorageError::Io` and the derived `ResponseError` impl returns 500. 
let tmp = tempfile::tempdir().unwrap(); let blocker = tmp.path().join("blocker"); - std::fs::write(&blocker, b"i am a file, not a directory").unwrap(); + fs_err::write(&blocker, b"i am a file, not a directory").unwrap(); let config = StorageConfig::new(blocker); let registry = registry_with_agent("agent-1"); let app = test::init_service( diff --git a/services/ws-modules/pyface1/pyface1/face_detection.py b/services/ws-modules/pyface1/pyface1/face_detection.py index 3f35b91..46f237e 100644 --- a/services/ws-modules/pyface1/pyface1/face_detection.py +++ b/services/ws-modules/pyface1/pyface1/face_detection.py @@ -31,6 +31,8 @@ class Detection(TypedDict): + """One detected face: label, class index, score, and bounding box.""" + label: str class_index: int score: float @@ -38,6 +40,8 @@ class Detection(TypedDict): class DetectionSummary(TypedDict): + """Result of one inference: the detections, best confidence, and timestamp.""" + detections: list[Detection] confidence: float processed_at: str @@ -128,18 +132,22 @@ def config() -> dict[str, object]: def starting_status() -> str: + """Return the status line shown while the workflow starts up.""" return "pyface1 face detection: starting" def stopped_status() -> str: + """Return the status line shown once the workflow has stopped.""" return "pyface1 face detection demo stopped." 
def model_log_message() -> str: + """Return the log line emitted when loading the RetinaFace model.""" return f"loading RetinaFace model from {FACE_MODEL_PATH}" def validate_output_names(output_names: Iterable[object]) -> list[str]: + """Coerce the session output names to strings, requiring at least three.""" output_names = [str(name) for name in output_names] if len(output_names) < 3: raise ValueError("RetinaFace session did not expose the expected outputs") @@ -147,6 +155,7 @@ def validate_output_names(output_names: Iterable[object]) -> list[str]: def initial_summary() -> DetectionSummary: + """Return the placeholder summary shown before the first inference.""" return { "detections": [], "confidence": 0.0, @@ -155,6 +164,7 @@ def initial_summary() -> DetectionSummary: def preprocess_geometry(source_width: float, source_height: float) -> dict[str, float]: + """Compute the resize ratio and resized dimensions for the source frame.""" source_width = require_positive_finite(source_width, "source_width") source_height = require_positive_finite(source_height, "source_height") target_ratio = FACE_INPUT_HEIGHT / FACE_INPUT_WIDTH @@ -171,10 +181,12 @@ def preprocess_geometry(source_width: float, source_height: float) -> dict[str, def detections_json(detections: list[Detection]) -> str: + """Serialise the detections list to JSON.""" return json.dumps(detections) def client_event_json(details: dict[str, object]) -> str: + """Build the et-client-event JSON envelope for a face-detection inference.""" return WsClientEvent( type="et-client-event", capability="face_detection", @@ -305,6 +317,7 @@ def event_payload( def build_priors(image_height: float, image_width: float) -> list[Prior]: + """Build the RetinaFace prior boxes for the given image dimensions.""" image_height = require_positive_finite(image_height, "image_height") image_width = require_positive_finite(image_width, "image_width") @@ -328,10 +341,12 @@ def build_priors(image_height: float, image_width: float) -> 
list[Prior]: @lru_cache(maxsize=1) def model_priors() -> tuple[Prior, ...]: + """Return the cached priors for the model's fixed input size.""" return tuple(build_priors(float(FACE_INPUT_HEIGHT), float(FACE_INPUT_WIDTH))) def decode_box(loc: Sequence[float], prior: Sequence[float]) -> DecodedBox: + """Decode one RetinaFace box from its location offsets and prior.""" if len(loc) != 4: raise ValueError("loc must contain exactly 4 values") if len(prior) != 4: @@ -350,6 +365,7 @@ def decode_box(loc: Sequence[float], prior: Sequence[float]) -> DecodedBox: def apply_nms(detections: list[Detection], threshold: float) -> list[Detection]: + """Apply non-maximum suppression, keeping the highest-scoring boxes.""" threshold = require_non_negative_finite(threshold, "threshold") kept: list[Detection] = [] for candidate in sorted(detections, key=lambda item: item["score"], reverse=True): @@ -359,6 +375,7 @@ def apply_nms(detections: list[Detection], threshold: float) -> list[Detection]: def compute_iou(left: Detection, right: Detection) -> float: + """Return the intersection-over-union of two detections' boxes.""" left_box = left["box"] right_box = right["box"] x1 = max(left_box[0], right_box[0]) @@ -380,6 +397,7 @@ def compute_iou(left: Detection, right: Detection) -> float: def softmax(values: Iterable[object]) -> list[float]: + """Return the softmax of the values (empty list for empty input).""" values = [float(value) for value in values] if not values: return [] @@ -390,10 +408,12 @@ def softmax(values: Iterable[object]) -> list[float]: def clamp(value: float, minimum: float, maximum: float) -> float: + """Clamp `value` to the inclusive range [minimum, maximum].""" return max(minimum, min(value, maximum)) def output_values(values: Iterable[object], name: str, stride: int) -> list[float]: + """Coerce a model output to floats, requiring a length multiple of `stride`.""" if stride <= 0: raise ValueError("stride must be positive") @@ -408,6 +428,7 @@ def output_values(values: 
Iterable[object], name: str, stride: int) -> list[floa def require_positive_finite(value: float, name: str) -> float: + """Return `value` as a float, raising if it isn't positive and finite.""" value = float(value) if not math.isfinite(value) or value <= 0.0: raise ValueError(f"{name} must be a positive finite number") @@ -415,6 +436,7 @@ def require_positive_finite(value: float, name: str) -> float: def require_non_negative_finite(value: float, name: str) -> float: + """Return `value` as a float, raising if it's negative or non-finite.""" value = float(value) if not math.isfinite(value) or value < 0.0: raise ValueError(f"{name} must be a non-negative finite number") diff --git a/services/ws-modules/wasi-graphics-info/wasi_graphics_info/__init__.py b/services/ws-modules/wasi-graphics-info/wasi_graphics_info/__init__.py index be939cb..db50218 100644 --- a/services/ws-modules/wasi-graphics-info/wasi_graphics_info/__init__.py +++ b/services/ws-modules/wasi-graphics-info/wasi_graphics_info/__init__.py @@ -210,10 +210,13 @@ def _matrix_bytes(values: list) -> bytes: def _entry(binding: int, read_only: bool) -> GpuBindGroupLayoutEntry: - """One COMPUTE-visible storage-buffer bind-group-layout entry. Bindings 0 - and 1 are read-only (matA / matB), binding 2 is read-write (matC). This - has to match the WGSL `var<storage, read>` / `var<storage, read_write>` - qualifiers exactly or wgpu's create-compute-pipeline validation rejects.""" + """Build one COMPUTE-visible storage-buffer bind-group-layout entry. + + Bindings 0 and 1 are read-only (matA / matB), binding 2 is read-write + (matC). This has to match the WGSL `var<storage, read>` / + `var<storage, read_write>` qualifiers exactly or wgpu's + create-compute-pipeline validation rejects.
+ """ return GpuBindGroupLayoutEntry( binding=binding, visibility=GpuShaderStage.compute(), @@ -479,6 +482,7 @@ class Entry: """ def run(self) -> None: + """Run the workflow, mapping any WASI `Err` to a typed `EntryError_*`.""" try: _run_workflow() except Err as exc: diff --git a/services/ws-pyo3-runner/Cargo.toml b/services/ws-pyo3-runner/Cargo.toml new file mode 100644 index 0000000..2a47f44 --- /dev/null +++ b/services/ws-pyo3-runner/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "et-ws-pyo3-runner" +description = "Generic edge-toolkit agent runtime that hosts a user-supplied Python module via PyO3" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true + +[lib] +doctest = false +path = "src/lib.rs" + +[[bin]] +name = "et-ws-pyo3-runner" +path = "src/main.rs" + +[dependencies] +edge-toolkit.workspace = true +# Typed ws-server REST client (storage GET/PUT). `tracing` injects the +# W3C traceparent on each request, matching the other native runners. +et-rest-client = { workspace = true, features = ["tracing"] } +# Shared runner bootstrap: env config (RUNNER_*/WS_*), `derive_http_base`, +# `collect_byte_stream`. 
+et-ws-runner-common.workspace = true +futures-util.workspace = true +pyo3.workspace = true +serde.workspace = true +serde-env.workspace = true +serde_json.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["macros", "net", "rt-multi-thread", "signal", "sync", "time"] } +tokio-tungstenite = { workspace = true, features = ["connect"] } +tracing.workspace = true +tracing-subscriber.workspace = true + +[build-dependencies] +pyo3-build-config.workspace = true + +[dev-dependencies] +base64.workspace = true +et-ws-test-server.workspace = true + +[lints] +workspace = true diff --git a/services/ws-pyo3-runner/README.md b/services/ws-pyo3-runner/README.md new file mode 100644 index 0000000..27188b3 --- /dev/null +++ b/services/ws-pyo3-runner/README.md @@ -0,0 +1,48 @@ +# et-ws-pyo3-runner + +A small, **generic** program that lets you write an edge-toolkit agent in **plain Python** and have it talk to +the WebSocket server — without writing any Rust. + +## Is it generic? + +Yes. The runner contains no application logic of its own. It is a _host_: it connects to the ws-server, then +hands every incoming message to a Python module **you** choose, and sends back whatever that module produces. +The exact same binary runs the toy `echo` example and the `fanout` and `storage` test modules — only the +chosen Python module differs. + +## How it loads your module + +You point it at a Python module by name and tell it where to find that module on disk: + +```sh +RUNNER_MODULE=echo # which Python module to load (required) +PYO3_PYTHONPATH=.../python # folders to import it (and its dependencies) from +WS_SERVER_URL=ws://127.0.0.1:8080/ws # where the ws-server is (optional) +cargo run -p et-ws-pyo3-runner +``` + +On startup the runner embeds a Python interpreter, `import`s the module named by `RUNNER_MODULE`, and from then +on just calls functions on it as things happen. 
Your module keeps its own state in ordinary Python globals; the +runner never looks inside. + +## The contract (every function is optional) + +Your module may define any of these top-level functions (at least one — a module that defines none is rejected at load time); the runner calls them at the right moments: + +| Function | Called when | +| ------------------------ | ------------------------------------------------------------------ | +| `init(send, storage)` | once, at startup | +| `on_connect(agent_id)` | once, after the server assigns this agent an id | +| `on_text_frame(text)` | a text message arrived; return a reply (`str`/`bytes`) or `None` | +| `on_binary_frame(frame)` | a binary message arrived; return a reply (`bytes`/`str`) or `None` | +| `on_shutdown()` | once, as the connection closes | + +`init` receives two helpers: + +- **`send`** — call `send.text(...)` / `send.binary(...)` to push messages out at any time (not only as a reply + to an incoming one, and even from a background thread). +- **`storage`** — call `storage.get(agent_id, key)` / `storage.put(key, data)` to read and write files the + ws-server keeps for each agent. + +That is the whole interface: a module is "just a Python file with some of those functions." The smallest example +is [`python/echo.py`](python/echo.py). diff --git a/services/ws-pyo3-runner/build.rs b/services/ws-pyo3-runner/build.rs new file mode 100644 index 0000000..a7bda72 --- /dev/null +++ b/services/ws-pyo3-runner/build.rs @@ -0,0 +1,24 @@ +//! Build script: bake the active Python's lib directory in as an rpath so +//! the `et-ws-pyo3-runner` binary finds `libpython*.so` at runtime without +//! the operator having to set `LD_LIBRARY_PATH`. +//! +//! pyo3-build-config (a direct build-dependency of this crate) inspects whatever +//! interpreter the build resolves (`PYO3_PYTHON`, then `python3` on +//! PATH) and exposes its `lib_dir` to us. We forward that as a linker +//! arg targeted only at the runner binary so the rlib half of the +//! crate is unaffected.
+ +fn main() { + let config = pyo3_build_config::get(); + let Some(lib_dir) = &config.lib_dir else { + // pyo3 will already have emitted its own diagnostics about + // which Python it picked up; nothing useful for us to add. + return; + }; + + // -rpath is the runtime search path on ELF; macOS uses + // @loader_path/ via -rpath too in modern linkers, so the + // same arg covers both targets we care about. + println!("cargo:rustc-link-arg-bin=et-ws-pyo3-runner=-Wl,-rpath,{lib_dir}"); + println!("cargo:rerun-if-env-changed=PYO3_PYTHON"); +} diff --git a/services/ws-pyo3-runner/python/cowsay_probe.py b/services/ws-pyo3-runner/python/cowsay_probe.py new file mode 100644 index 0000000..9440ad7 --- /dev/null +++ b/services/ws-pyo3-runner/python/cowsay_probe.py @@ -0,0 +1,21 @@ +"""Prove a mise-preinstalled pipx package imports under the embedded interpreter. + +`cowsay` is declared as `pipx:cowsay` in the always-loaded mise config, and the +runner puts every mise `pipx:` package's site-packages on `sys.path` via +`edge_toolkit::config::mise_python_site_packages`. So the top-level `import +cowsay` below succeeds WITHOUT the operator adding it to PYO3_PYTHONPATH -- if +the runner didn't wire that path in, this import would fail the whole module +load. Exercised by `tests/cowsay.rs`. +""" + +from __future__ import annotations + +import cowsay + + +def on_text_frame(text: str) -> str: + """Render the inbound text through cowsay and return it. + + The round-trip proves cowsay both imported and actually runs. + """ + return cowsay.get_output_string("cow", text) diff --git a/services/ws-pyo3-runner/python/echo.py b/services/ws-pyo3-runner/python/echo.py new file mode 100644 index 0000000..9f4a8b0 --- /dev/null +++ b/services/ws-pyo3-runner/python/echo.py @@ -0,0 +1,90 @@ +"""Example Python module for `et-ws-pyo3-runner`. + +This file demonstrates the contract the runner expects. Every function is +optional — if your module doesn't define it, the runner skips that hook. 
+ +Lifecycle, in order: + + init(send, storage) # once, at startup; `send`/`storage` are host handles + on_connect(agent_id) # once, after et-connect-ack + on_text_frame(text) # per inbound text frame the et hub didn't recognise as a typed et-* message + on_binary_frame(frame) # per inbound binary frame + on_shutdown() # once, after the websocket closes + +Two ways to emit outbound frames: + +* **Simple case (this file uses it):** return `str` from `on_text_frame` + or `bytes` from `on_binary_frame`. The runner sends that single frame + back. `return None` for silence. + +* **Fan-out case:** call `send.text(...)` / `send.binary(...)` any + number of times during a handler — or later from a background thread. + Both styles compose: anything you `send.*()` during a handler goes + out *before* the value you `return`, in submission order. + +State lives in module-level globals. The runner instantiates one copy +per process, so this is the same as a singleton — no classes, no state +threading across the FFI boundary. + +To use this module, set these env vars and run the runner: + + WS_SERVER_URL=ws://127.0.0.1:8080/ws + RUNNER_MODULE=echo + PYO3_PYTHONPATH=services/ws-pyo3-runner/python + cargo run -p et-ws-pyo3-runner +""" + +from __future__ import annotations + +import logging + +_logger = logging.getLogger(__name__) + +# --- module state ---------------------------------------------------------- + +_agent_id: str | None = None +_send = None # type: WsSender | None — stashed for fan-out, unused here +_storage = None # type: WsStorage | None — stashed for completeness +_echoed: int = 0 + + +# --- runner hooks ---------------------------------------------------------- + + +def init(send, storage) -> None: + """Stash the WsSender and WsStorage handles for later use. + + Even modules that only use reply-by-return should accept and keep `send` + — it's how you'd push frames later (e.g. from a background thread). 
+ `storage` is the ws-server's `/storage` API; this example doesn't use it. + """ + global _send, _storage + _send = send + _storage = storage + _logger.info("echo agent initialised") + + +def on_connect(agent_id: str) -> None: + """Record the agent id the server assigned on connect.""" + global _agent_id + _agent_id = agent_id + _logger.info("echo agent registered as %s", agent_id) + + +def on_text_frame(text: str) -> str | None: + """Echo the incoming text frame back verbatim (return-style).""" + global _echoed + _echoed += 1 + return text + + +def on_binary_frame(frame: bytes) -> bytes | None: + """Echo the incoming binary frame back verbatim (return-style).""" + global _echoed + _echoed += 1 + return frame + + +def on_shutdown() -> None: + """Log the running echo count as the connection closes.""" + _logger.info("echo agent shutting down after %d frames", _echoed) diff --git a/services/ws-pyo3-runner/python/fanout.py b/services/ws-pyo3-runner/python/fanout.py new file mode 100644 index 0000000..8caa334 --- /dev/null +++ b/services/ws-pyo3-runner/python/fanout.py @@ -0,0 +1,34 @@ +"""Emit multiple outbound frames per inbound frame via the `WsSender` push API. + +Used by `tests/fanout.rs` to verify the multi-send path works end to end. + +For each inbound binary frame containing `n` (a single byte 0-255), we +push `n` distinct binary frames back through `send.binary(...)`. We +return `None` so reply-by-return doesn't add an extra frame. +""" + +from __future__ import annotations + +import logging + +_logger = logging.getLogger(__name__) + +_send = None # WsSender, set in init() + + +def init(send, storage) -> None: + """Stash the WsSender for the fan-out path.""" + global _send + _send = send + # `storage` ignored — fanout doesn't persist anything. 
+ _logger.info("fanout agent initialised") + + +def on_binary_frame(frame: bytes) -> None: + """Push one binary frame per unit of the count in the first byte.""" + if not frame: + return None + count = frame[0] + for i in range(count): + _send.binary(bytes([i])) + return None diff --git a/services/ws-pyo3-runner/python/no_hooks.py b/services/ws-pyo3-runner/python/no_hooks.py new file mode 100644 index 0000000..0c7acf3 --- /dev/null +++ b/services/ws-pyo3-runner/python/no_hooks.py @@ -0,0 +1,7 @@ +"""Test fixture for et-ws-pyo3-runner's load-time hook sanity check. + +Intentionally defines none of the runner hooks (init / on_connect / +on_text_frame / on_binary_frame / on_shutdown), so importing it must fail fast +rather than register an agent that could never be driven. Used by +tests/no_hooks.rs. +""" diff --git a/services/ws-pyo3-runner/python/storage_pingpong.py b/services/ws-pyo3-runner/python/storage_pingpong.py new file mode 100644 index 0000000..e9152cf --- /dev/null +++ b/services/ws-pyo3-runner/python/storage_pingpong.py @@ -0,0 +1,53 @@ +"""Module that exercises `WsStorage.get` / `WsStorage.put`. + +On the first inbound binary frame, the module reads `key` (the first +bytes of the payload up to a NUL byte) and the rest of the payload as +the value, then calls `storage.put(key, value)`. On the second inbound +binary frame containing just `key`, it calls `storage.get(my_agent_id, +key)` and pushes the resulting bytes back via `send.binary(...)`. + +Used by `tests/storage.rs` to verify the storage path lands bytes on +the ws-server and reads them back. 
+""" + +from __future__ import annotations + +import logging + +_logger = logging.getLogger(__name__) + +_send = None +_storage = None +_agent_id: str | None = None + + +def init(send, storage) -> None: + """Stash the WsSender and WsStorage handles.""" + global _send, _storage + _send = send + _storage = storage + + +def on_connect(agent_id: str) -> None: + """Record the assigned agent id for later get/put calls.""" + global _agent_id + _agent_id = agent_id + + +def on_binary_frame(frame: bytes) -> None: + r"""Put on `key\x00value` frames, get (and reply) on bare `key` frames.""" + if b"\x00" in frame: + key_bytes, value = frame.split(b"\x00", 1) + key = key_bytes.decode("utf-8") + _storage.put(key, value) + _logger.info("stored %d bytes at key=%s", len(value), key) + return None + + key = frame.decode("utf-8") + value = _storage.get(_agent_id, key) + if value is None: + _send.binary(b"") + else: + _send.binary(value) + _logger.info("fetched key=%s (%d bytes)", key, 0 if value is None else len(value)) + return None diff --git a/services/ws-pyo3-runner/python/torch_inference.py b/services/ws-pyo3-runner/python/torch_inference.py new file mode 100644 index 0000000..d2df837 --- /dev/null +++ b/services/ws-pyo3-runner/python/torch_inference.py @@ -0,0 +1,82 @@ +"""PyTorch analogue of the wasi-graphics-info module, for `et-ws-pyo3-runner`. + +Where wasi-graphics-info runs a deterministic 4x4 matmul (verifying C[0][0]) and +a single MNIST forward pass (verifying the predicted class) through standardised +WASI interfaces, this runs the same two shapes through PyTorch on the embedded +CPython interpreter: + + 1. compute: C = A @ B with A = I(4), B = 2*I(4); verify C[0][0] == 2.0 + 2. 
inference: a fixed tiny linear classifier over a constant input; argmax -> + class; verify it matches the deterministic expected class + +`torch` is declared as `pipx:torch` in the python-only mise config and reaches +`sys.path` via `edge_toolkit::config::mise_python_site_packages`, exactly like +cowsay. The top-level `import torch` fails the whole module load if torch isn't +wired in -- so `tests/torch_inference.rs` checks for torch up front and SKIPS, +rather than letting the runner fail, when torch isn't installed. + +On any inbound text frame the module runs the workflow and returns a JSON +summary, so the Rust test can round-trip a trigger and assert on the result. +""" + +from __future__ import annotations + +import json + +import torch + +# Identity * (2*I); mirrors MAT_A / MAT_B / EXPECTED_C00 in wasi-graphics-info. +EXPECTED_C00 = 2.0 +# Fixed weights make the classifier's argmax deterministic across builds. +EXPECTED_CLASS = 3 + + +def _matmul() -> float: + """C = I(4) @ (2 * I(4)); the (0,0) cell is 2.0.""" + a = torch.eye(4) + b = torch.eye(4) * 2.0 + c = a @ b + return float(c[0, 0].item()) + + +def _inference() -> int: + """Run a fixed 1x4 input through a fixed 4x4 weight matrix. + + The last row dominates, so argmax is deterministically EXPECTED_CLASS. No + randomness, no model file -- the point is to exercise a real torch forward + pass. + """ + x = torch.tensor([[1.0, 2.0, 3.0, 4.0]]) + weights = torch.tensor( + [ + [0.0, 0.0, 0.0, 0.0], + [0.1, 0.0, 0.0, 0.0], + [0.0, 0.1, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0], + ] + ) + logits = x @ weights.T + return int(torch.argmax(logits, dim=1).item()) + + +def on_text_frame(text: str) -> str: + """Run both checks and return a JSON summary. + + Raise on any mismatch so a regression surfaces as a failed module rather + than a wrong-but-quiet reply. 
+ """ + c00 = _matmul() + if abs(c00 - EXPECTED_C00) > 1e-4: + raise RuntimeError(f"matmul C[0][0]={c00}, expected {EXPECTED_C00}") + predicted = _inference() + if predicted != EXPECTED_CLASS: + raise RuntimeError(f"inference predicted {predicted}, expected {EXPECTED_CLASS}") + return json.dumps( + { + "framework": "torch", + "torch_version": torch.__version__, + "matmul_c00": c00, + "predicted_class": predicted, + "expected_class": EXPECTED_CLASS, + } + ) diff --git a/services/ws-pyo3-runner/src/agent.rs b/services/ws-pyo3-runner/src/agent.rs new file mode 100644 index 0000000..d188c82 --- /dev/null +++ b/services/ws-pyo3-runner/src/agent.rs @@ -0,0 +1,305 @@ +//! WebSocket loop for the generic pyo3 runner. +//! +//! Same handshake as `et-ws-wasi-runner`: send `et-connect`, wait for +//! `et-connect-ack`, capture the assigned `agent_id`, and forward every +//! inbound frame to the user's Python module. Frames the module wants +//! to send go through a `WsSender` it received at `init()` time; the +//! WS loop drains the channel in parallel with the inbound stream via +//! `tokio::select!`, so Python can push frames whenever (during a +//! handler, after, or from a background thread). + +use std::sync::{Arc, Mutex, PoisonError}; +use std::time::Duration; + +use futures_util::{SinkExt as _, StreamExt as _}; +use tokio::net::TcpStream; +use tokio::sync::mpsc; +use tokio_tungstenite::{MaybeTlsStream, WebSocketStream, tungstenite}; +use tracing::{info, warn}; + +use crate::error::RunnerError; +use crate::python::{AgentIdSlot, Dispatcher, OutboundFrame, StorageError, StorageOp, WsSender, WsStorage}; + +/// One unit of work for the Python dispatch thread. +/// +/// The WS loop forwards inbound frames plus the connect/shutdown lifecycle as +/// these; the worker drains them in submission order, off the WS task, so a +/// slow handler never stalls the heartbeat or the outbound drain. 
+#[derive(Debug)] +enum InboundEvent { + Connect(String), + Text(String), + Binary(Vec), + Shutdown, +} + +/// Connection inputs the binary entrypoint hands to [`initialize`]. +#[expect( + clippy::exhaustive_structs, + reason = "input config built by the binary entrypoint via a struct literal; new fields are additive there" +)] +pub struct AgentConfig { + pub ws_url: String, + /// Optional `agent_id` to request on connect. + /// + /// `None` lets the server assign a fresh one. + pub requested_agent_id: Option, + /// How long to wait for `et-connect-ack`; `None` waits forever. + pub connect_ack_timeout: Option, +} + +/// A built-but-not-yet-connected agent: channels, dispatcher, and config. +/// +/// Produced by [`initialize`] and consumed by [`run`], which connects and +/// drives it. +#[non_exhaustive] +pub struct InitializedAgent { + pub config: AgentConfig, + pub dispatcher: Dispatcher, + /// Reply-by-return path for handler return values. + /// + /// The Python dispatch worker pushes a handler's returned `bytes` / `str` + /// onto the same outbound queue Python's `WsSender` writes to. + pub outbound_tx: mpsc::UnboundedSender, + pub outbound_rx: mpsc::UnboundedReceiver, + /// Shared cell `WsStorage.put()` reads to learn our `agent_id`. + /// + /// The runner populates it after `et-connect-ack`. + pub agent_id_slot: AgentIdSlot, + /// Receiver half of the storage op channel, drained by the worker in `run()`. + pub storage_rx: mpsc::UnboundedReceiver, + /// Base URL for storage requests, e.g. `http://127.0.0.1:8080`. + pub http_base: String, +} + +/// Build the channels, `WsSender`, and `WsStorage`, then import the module. +/// +/// The Sender and Storage are built first so they can be handed to the +/// module's `init(send, storage)` hook. The Storage's `agent_id` is +/// initially `None`; the runner fills it in after the server replies +/// with `et-connect-ack`. 
Storage ops are dispatched through an mpsc +/// channel into a worker task that owns the typed REST client. +pub fn initialize( + module_name: &str, + python_path_extras: &[std::path::PathBuf], + config: AgentConfig, +) -> Result { + let (tx, rx) = mpsc::unbounded_channel::(); + let sender = WsSender::new(tx.clone()); + + let http_base = et_ws_runner_common::derive_http_base(&config.ws_url)?; + let agent_id_slot: AgentIdSlot = Arc::new(Mutex::new(None)); + let (storage_tx, storage_rx) = mpsc::unbounded_channel::(); + let storage = WsStorage::new(Arc::clone(&agent_id_slot), storage_tx); + + // Prepend mise-managed pipx `site-packages` so the module can `import` + // packages preinstalled via mise (e.g. cowsay) without the operator setting + // PYTHONPATH by hand. The explicit PYO3_PYTHONPATH entries come last so they + // keep priority -- `Dispatcher::import` inserts each at `sys.path[0]`, so the + // last entry wins. + let mut python_path = edge_toolkit::config::mise_python_site_packages(); + python_path.extend_from_slice(python_path_extras); + let dispatcher = Dispatcher::import(module_name, &python_path, sender, storage)?; + Ok(InitializedAgent { + config, + dispatcher, + outbound_tx: tx, + outbound_rx: rx, + agent_id_slot, + storage_rx, + http_base, + }) +} + +/// Connect, register, and drive the agent until the connection closes. +/// +/// Spawns the storage worker and the Python dispatch thread, completes the +/// `et-connect` handshake, then runs the WS loop. Returns once the socket +/// closes or `drive` errors. +pub async fn run(agent: InitializedAgent) -> Result<(), RunnerError> { + let InitializedAgent { + config, + dispatcher, + outbound_tx, + mut outbound_rx, + agent_id_slot, + storage_rx, + http_base, + } = agent; + + // Spawn the storage worker first so it's ready by the time Python's + // `init(send, storage)` returns. The worker outlives `run()` until + // the channel is dropped -- i.e. 
when WsStorage (held by Python) is + // dropped at process exit. + let storage_task = tokio::spawn(storage_worker(http_base, storage_rx)); + + // Run Python on its own OS thread: every hook executes here, off the async + // WS task, so even a long-running handler can't stall the heartbeat or the + // outbound drain in `drive`. The worker owns the Dispatcher and processes + // inbound events in submission order. + let (inbound_tx, inbound_rx) = mpsc::unbounded_channel::(); + let worker = std::thread::Builder::new() + .name("pyo3-dispatch".to_owned()) + .spawn(move || python_worker(dispatcher, inbound_rx, outbound_tx))?; + + info!("connecting to {}", config.ws_url); + let (mut socket, agent_id, status) = et_ws_runner_common::connect_and_register( + &config.ws_url, + config.requested_agent_id, + config.connect_ack_timeout, + ) + .await?; + info!( + "registered as agent_id={agent_id} ({})", + et_ws_runner_common::connect_status_label(&status) + ); + // Populate the slot before `on_connect` so Python sees a valid + // `storage.agent_id` from the first instant it can act. + *agent_id_slot.lock().unwrap_or_else(PoisonError::into_inner) = Some(agent_id.clone()); + drop(inbound_tx.send(InboundEvent::Connect(agent_id))); + + let result = drive(&mut socket, &inbound_tx, &mut outbound_rx).await; + + // Queue `on_shutdown` (the worker drains any frames ahead of it first), + // then drop our sender so the worker's recv loop ends. Join before aborting + // the storage task so an `on_shutdown` that persists state can still reach + // it; only then close the socket and stop storage. + drop(inbound_tx.send(InboundEvent::Shutdown)); + drop(inbound_tx); + drop(worker.join()); + drop(socket.send(tungstenite::Message::Close(None)).await); + storage_task.abort(); + result +} + +/// Run every Python hook on a dedicated OS thread, owning the `Dispatcher`. +/// +/// Fully decouples Python execution from the async WS task. 
Handler return +/// values are pushed onto the same outbound queue Python's `WsSender` writes +/// to. Runs until the inbound channel closes (after `Shutdown`). +#[expect( + clippy::cognitive_complexity, + clippy::needless_pass_by_value, + reason = "owns its args for the thread's lifetime; one linear match over the inbound event taxonomy" +)] +fn python_worker( + dispatcher: Dispatcher, + mut inbound_rx: mpsc::UnboundedReceiver, + outbound_tx: mpsc::UnboundedSender, +) { + while let Some(event) = inbound_rx.blocking_recv() { + match event { + InboundEvent::Connect(agent_id) => { + if let Err(err) = dispatcher.on_connect(&agent_id) { + warn!("on_connect hook failed: {err}"); + } + } + InboundEvent::Text(text) => match dispatcher.on_text_frame(&text) { + Ok(Some(reply)) => drop(outbound_tx.send(OutboundFrame::Text(reply))), + Ok(None) => {} + Err(err) => warn!("on_text_frame raised: {err}"), + }, + InboundEvent::Binary(bytes) => match dispatcher.on_binary_frame(&bytes) { + Ok(Some(reply)) => drop(outbound_tx.send(OutboundFrame::Binary(reply))), + Ok(None) => {} + Err(err) => warn!("on_binary_frame raised: {err}"), + }, + InboundEvent::Shutdown => { + if let Err(err) = dispatcher.on_shutdown() { + warn!("on_shutdown hook failed: {err}"); + } + break; + } + } + } +} + +/// Drain `StorageOp`s from the channel, resolving each via `et-rest-client`. +/// +/// One worker handles all storage I/O for the agent -- the operations are +/// infrequent (load on connect, save on shutdown for the typical model-weights +/// case) so serial execution is fine. A missing key surfaces as the client's +/// `ErrorResponse` (the 404 arm), which we map to `Ok(None)`. 
+async fn storage_worker(http_base: String, mut rx: mpsc::UnboundedReceiver) { + let client = et_rest_client::Client::new(&http_base); + while let Some(op) = rx.recv().await { + match op { + StorageOp::Get { agent_id, key, reply } => { + let outcome = match client.get_file(&agent_id, &key).await { + Ok(response) => match et_ws_runner_common::collect_byte_stream(response.into_inner()).await { + Ok(bytes) => Ok(Some(bytes)), + Err(source) => Err(StorageError::get(&agent_id, &key, format!("reading body: {source}"))), + }, + Err(et_rest_client::Error::ErrorResponse(_)) => Ok(None), + Err(source) => Err(StorageError::get(&agent_id, &key, source.to_string())), + }; + drop(reply.send(outcome)); + } + StorageOp::Put { + agent_id, + key, + data, + reply, + } => { + let outcome = match client.put_file(&agent_id, &key, data).await { + Ok(_) => Ok(()), + Err(source) => Err(StorageError::put(&agent_id, &key, source.to_string())), + }; + drop(reply.send(outcome)); + } + } + } +} + +/// Drive the socket in both directions. +/// +/// Inbound frames are forwarded to the Python dispatch worker via `inbound_tx` +/// (a non-blocking send, so a slow handler never holds up this loop); outbound +/// frames the worker or Python's `WsSender` produced come back through +/// `outbound_rx` and out to the socket. +async fn drive( + socket: &mut WebSocketStream>, + inbound_tx: &mpsc::UnboundedSender, + outbound_rx: &mut mpsc::UnboundedReceiver, +) -> Result<(), RunnerError> { + // Keepalive: the server closes idle connections and never pings us, so a + // module that only waits for inbound frames would be timed out. Ping on a + // cadence well inside the server's timeout to stay registered. + let mut heartbeat = et_ws_runner_common::heartbeat_interval().await; + loop { + tokio::select! { + // Inbound: hand the frame to the dispatch worker and keep looping. + // The worker emits any reply onto the same outbound queue Python + // pushes to via WsSender, so multi-send + reply compose in order. 
+ frame = socket.next() => match frame { + Some(Ok(tungstenite::Message::Binary(bytes))) => { + drop(inbound_tx.send(InboundEvent::Binary(bytes))); + } + Some(Ok(tungstenite::Message::Text(text))) => { + drop(inbound_tx.send(InboundEvent::Text(text))); + } + Some(Ok(tungstenite::Message::Close(_))) => { + info!("server closed connection"); + return Ok(()); + } + // Ping / Pong / Frame and any future variant: nothing to do. + Some(Ok(_)) => {} + Some(Err(e)) => return Err(RunnerError::WebSocket(e)), + None => return Ok(()), + }, + // Outbound: drain anything the worker pushed (Python's WsSender + // sends or return-value replies). + Some(out) = outbound_rx.recv() => { + let msg = match out { + OutboundFrame::Text(text) => tungstenite::Message::Text(text), + OutboundFrame::Binary(bytes) => tungstenite::Message::Binary(bytes), + }; + socket.send(msg).await?; + } + // Keepalive ping; the server treats it as activity and pongs back. + _ = heartbeat.tick() => { + socket.send(tungstenite::Message::Ping(Vec::new())).await?; + } + } + } +} diff --git a/services/ws-pyo3-runner/src/config.rs b/services/ws-pyo3-runner/src/config.rs new file mode 100644 index 0000000..f0c0486 --- /dev/null +++ b/services/ws-pyo3-runner/src/config.rs @@ -0,0 +1,57 @@ +//! Environment-driven configuration for the pyo3 runner. +//! +//! Deserialised from the process environment via `serde-env`. The `RUNNER_*` +//! and `WS_*` vars are parsed by the shared [`RunnerConfig`] / [`WsConfig`] +//! structs from `et-ws-runner-common` (so this runner reads the same vars as +//! the WASI / web runners); `PYO3_*` populates the runner-specific bits. +//! +//! `RUNNER_MODULE` (required) -- Python module name to import. +//! `RUNNER_TIMEOUT` (optional) -- wall-clock run limit, e.g. `120s`, `3m`. +//! `WS_SERVER_URL` (optional) -- defaults to the local insecure ws port. +//! `PYO3_PYTHONPATH` (optional) -- colon-separated paths prepended to `sys.path`. +//! 
`PYO3_AGENT_ID` (optional) -- request this `agent_id` on connect. + +use std::path::PathBuf; + +use et_ws_runner_common::config::{RunnerConfig, WsConfig}; +use serde::Deserialize; + +/// Configuration for the pyo3 runner, sourced from the environment. +#[derive(Clone, Debug, Deserialize)] +#[non_exhaustive] +pub struct Config { + /// `RUNNER_*` settings (`RUNNER_MODULE`, `RUNNER_TIMEOUT`). + pub runner: RunnerConfig, + /// `WS_*` settings (`WS_SERVER_URL`). + #[serde(default)] + pub ws: WsConfig, + /// `PYO3_*` settings unique to this runner. + #[serde(default)] + pub pyo3: Pyo3Config, +} + +/// Runner-specific `PYO3_*` settings with no shared equivalent. +#[derive(Clone, Debug, Default, Deserialize)] +#[non_exhaustive] +pub struct Pyo3Config { + /// `PYO3_PYTHONPATH` -- colon-separated paths prepended to `sys.path`. + /// + /// Prepended before importing the module; empty by default. + #[serde(default)] + pub pythonpath: String, + /// `PYO3_AGENT_ID` -- request this `agent_id` on connect; unset gets a fresh one. + #[serde(default)] + pub agent_id: Option, +} + +impl Pyo3Config { + /// Split `PYO3_PYTHONPATH` into path entries, dropping empty segments. + #[must_use] + pub fn python_path(&self) -> Vec { + self.pythonpath + .split(':') + .filter(|segment| !segment.is_empty()) + .map(PathBuf::from) + .collect() + } +} diff --git a/services/ws-pyo3-runner/src/error.rs b/services/ws-pyo3-runner/src/error.rs new file mode 100644 index 0000000..b1bc842 --- /dev/null +++ b/services/ws-pyo3-runner/src/error.rs @@ -0,0 +1,30 @@ +//! Error type for the runner's connect/register/drive path. + +use thiserror::Error; + +use crate::python::PythonError; + +/// Failure modes of `agent::{initialize, run}`. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum RunnerError { + /// Could not derive the storage HTTP base from the ws-server URL. 
+ #[error(transparent)] + Bootstrap(#[from] et_ws_runner_common::BootstrapError), + + /// Importing or initialising the user's Python module failed. + #[error(transparent)] + Python(#[from] PythonError), + + /// Connecting to and registering with the ws-server failed. + #[error(transparent)] + Connect(#[from] et_ws_runner_common::ConnectError), + + /// A WebSocket send / receive failed while driving the connection. + #[error("websocket: {0}")] + WebSocket(#[from] tokio_tungstenite::tungstenite::Error), + + /// The dedicated Python dispatch thread could not be spawned. + #[error("failed to spawn the Python dispatch thread: {0}")] + WorkerSpawn(#[from] std::io::Error), +} diff --git a/services/ws-pyo3-runner/src/lib.rs b/services/ws-pyo3-runner/src/lib.rs new file mode 100644 index 0000000..a2089b7 --- /dev/null +++ b/services/ws-pyo3-runner/src/lib.rs @@ -0,0 +1,24 @@ +//! Generic edge-toolkit agent runtime that hosts a user-supplied Python module. +//! +//! Uses `PyO3` to embed `CPython`. This crate is the Python sibling of +//! `et-ws-wasi-runner`: one binary, swappable user code, et-ws-server is the +//! always-on hub on the wire. +//! +//! Everything that matters lives in Python -- Rust just handles the +//! WebSocket transport, the et-* registration handshake, and dispatch +//! into `init` / `on_connect` / `on_text_frame` / `on_binary_frame` / +//! `on_shutdown`. The user module owns its state via module-level +//! globals; the runner never marshals state across the FFI boundary. +//! See `python/echo.py` for the contract. + +#![expect( + clippy::single_call_fn, + clippy::integer_division_remainder_used, + clippy::result_large_err, + reason = "register/drive/storage_worker/python_worker single-use; select! 
uses %; RunnerError: tungstenite::Error" +)] + +pub mod agent; +pub mod config; +pub mod error; +pub mod python; diff --git a/services/ws-pyo3-runner/src/main.rs b/services/ws-pyo3-runner/src/main.rs new file mode 100644 index 0000000..7cbcd1a --- /dev/null +++ b/services/ws-pyo3-runner/src/main.rs @@ -0,0 +1,66 @@ +//! Binary entrypoint; pure env-var config, no CLI flags. +//! +//! Configuration is deserialised from the environment by +//! [`et_ws_pyo3_runner::config::Config`]; see that module for the full variable +//! list (`RUNNER_MODULE`, `RUNNER_TIMEOUT`, `WS_SERVER_URL`, `PYO3_PYTHONPATH`, +//! `PYO3_AGENT_ID`). + +#![expect( + clippy::integer_division_remainder_used, + reason = "tokio::select! expands to % internally" +)] + +use et_ws_pyo3_runner::agent::{AgentConfig, initialize, run as run_agent}; +use et_ws_pyo3_runner::config::Config; +use tracing::info; + +// Multi-threaded runtime. Python runs on its own OS thread (see +// `agent::run`'s dispatch worker), and its sync `WsStorage.get/put` park that +// thread on a oneshot reply; the runtime's worker threads keep driving the +// storage task and the WS loop so those replies resolve while Python waits. +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| "info".into())) + .init(); + + let config = serde_env::from_env::()?; + let module = config.runner.module.clone(); + let python_path = config.pyo3.python_path(); + let ws_url = config.ws.server_url.clone(); + let requested_agent_id = config.pyo3.agent_id.clone(); + let connect_ack_timeout = config.ws.connect_ack_timeout; + + info!("module={module} python_path={python_path:?} ws_url={ws_url}"); + + let agent = initialize( + &module, + &python_path, + AgentConfig { + ws_url, + requested_agent_id, + connect_ack_timeout, + }, + )?; + + let driven = async { + tokio::select! 
{ + result = run_agent(agent) => result, + _ = tokio::signal::ctrl_c() => { + info!("interrupted; shutting down"); + Ok(()) + } + } + }; + + let Some(limit) = config.runner.timeout else { + driven.await?; + return Ok(()); + }; + let Ok(result) = tokio::time::timeout(limit, driven).await else { + info!("run timeout {limit:?} elapsed; shutting down"); + return Ok(()); + }; + result?; + Ok(()) +} diff --git a/services/ws-pyo3-runner/src/python.rs b/services/ws-pyo3-runner/src/python.rs new file mode 100644 index 0000000..40785d5 --- /dev/null +++ b/services/ws-pyo3-runner/src/python.rs @@ -0,0 +1,474 @@ +//! Python module loader + dispatcher. +//! +//! The runner imports one user module (`RUNNER_MODULE`) and invokes +//! module-level functions on it. The user module owns its state via +//! module-level globals; the runner never sees that state. +//! +//! On startup the runner hands the module two pyclass handles: +//! * `WsSender` -- push outbound frames to the ws-server +//! * `WsStorage` -- get/put files via the ws-server's `/storage` HTTP API +//! +//! Both are useful from anywhere in Python: during a handler, from a +//! later handler, or from a background thread the module spawns. +//! Handlers return either an optional reply (sent as one outbound +//! frame) or `None`. +//! +//! Contract (all hooks optional), as Python: +//! +//! _send = None +//! _storage = None +//! +//! def init(send, storage): +//! global _send, _storage +//! _send, _storage = send, storage +//! def on_connect(agent_id: str) -> None: ... +//! def on_text_frame(text: str) -> str | bytes | None: ... +//! def on_binary_frame(frame: bytes) -> bytes | str | None: ... +//! def on_shutdown() -> None: ... +//! +//! `on_text_frame` / `on_binary_frame` may return a single reply for the +//! simple case (echo, request/response). A handler that wants to emit +//! multiple frames, or emit nothing now and a frame later from a thread, +//! ignores the return value and uses the `WsSender` it stashed at +//! 
`init()`. The two styles compose: any sends made during a handler go +//! out *before* the returned reply, because both push onto the same +//! outbound queue and the queue is drained in order. + +use std::path::PathBuf; +use std::sync::{Arc, Mutex, PoisonError}; + +use pyo3::exceptions::PyRuntimeError; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyList, PyString}; +use tokio::sync::mpsc; + +// Names of the optional module-level hooks the runner calls. Each is referenced +// twice -- a `hasattr` guard and the call -- so it lives here as one source of +// truth: a typo in either spot would silently skip the hook (hasattr returns +// false, the hook never fires, no error). See the module-level contract above. +const HOOK_INIT: &str = "init"; +const HOOK_ON_CONNECT: &str = "on_connect"; +const HOOK_ON_TEXT_FRAME: &str = "on_text_frame"; +const HOOK_ON_BINARY_FRAME: &str = "on_binary_frame"; +const HOOK_ON_SHUTDOWN: &str = "on_shutdown"; + +/// Every hook, for the load-time sanity check. +/// +/// A module that defines none of them can never be invoked, so importing it is +/// almost certainly a mistake. +const HOOKS: [&str; 5] = [ + HOOK_INIT, + HOOK_ON_CONNECT, + HOOK_ON_TEXT_FRAME, + HOOK_ON_BINARY_FRAME, + HOOK_ON_SHUTDOWN, +]; + +/// An error from importing or invoking the user's Python module. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum PythonError { + #[error("python: {0}")] + Py(String), +} + +impl From for PythonError { + fn from(err: PyErr) -> Self { + Self::Py(format!("{err}")) + } +} + +impl<'cast, 'py> From> for PythonError { + fn from(err: pyo3::CastError<'cast, 'py>) -> Self { + Self::Py(format!("cast: {err}")) + } +} + +impl<'py> From> for PythonError { + fn from(err: pyo3::CastIntoError<'py>) -> Self { + Self::Py(format!("cast: {err}")) + } +} + +/// A storage GET/PUT that the worker in `agent.rs` failed to complete. 
+/// +/// Typed so the failing operation (and the agent/key it targeted) survive +/// back to the Python caller, where `From` turns it into a +/// `RuntimeError`. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum StorageError { + #[error("GET {agent_id}/{key}: {message}")] + Get { + agent_id: String, + key: String, + message: String, + }, + #[error("PUT {agent_id}/{key}: {message}")] + Put { + agent_id: String, + key: String, + message: String, + }, +} + +impl StorageError { + /// Build a `Get` failure. + /// + /// Borrows the identifiers so the worker's error arms don't fight the borrow + /// checker over the moved op fields. + #[must_use] + pub fn get(agent_id: &str, key: &str, message: String) -> Self { + Self::Get { + agent_id: agent_id.to_owned(), + key: key.to_owned(), + message, + } + } + + /// Build a `Put` failure; see [`StorageError::get`]. + #[must_use] + pub fn put(agent_id: &str, key: &str, message: String) -> Self { + Self::Put { + agent_id: agent_id.to_owned(), + key: key.to_owned(), + message, + } + } +} + +impl From for PyErr { + fn from(err: StorageError) -> Self { + PyRuntimeError::new_err(err.to_string()) + } +} + +/// One frame queued on the agent's outbound channel. +/// +/// The WS loop in `agent.rs` drains this and writes to the socket. +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum OutboundFrame { + Text(String), + Binary(Vec), +} + +/// Python-facing handle for queuing outbound frames. +/// +/// Bound to a `tokio::sync::mpsc::UnboundedSender` so Python's `.text()` / +/// `.binary()` calls fire and forget -- no GIL hand-back across an await, no +/// head-of-line blocking on the socket. The handle is `Clone` so Python can +/// stash multiple references if it wants (e.g. across background threads). +#[pyclass(name = "WsSender")] +pub struct WsSender { + tx: mpsc::UnboundedSender, +} + +#[pymethods] +impl WsSender { + /// Queue a text frame for the agent's outbound socket. 
+ fn text(&self, text: String) -> PyResult<()> { + match self.tx.send(OutboundFrame::Text(text)) { + Ok(()) => Ok(()), + Err(err) => Err(PyRuntimeError::new_err(format!("ws send: {err}"))), + } + } + + /// Queue a binary frame for the agent's outbound socket. + /// + /// `frame` accepts any Python buffer protocol object (bytes / bytearray / + /// memoryview) -- `PyO3`'s `Vec` extraction handles the conversion. + fn binary(&self, frame: Vec) -> PyResult<()> { + match self.tx.send(OutboundFrame::Binary(frame)) { + Ok(()) => Ok(()), + Err(err) => Err(PyRuntimeError::new_err(format!("ws send: {err}"))), + } + } + + #[expect( + clippy::unused_self, + reason = "pyo3 #[pymethods] __repr__ takes &self by Python convention" + )] + fn __repr__(&self) -> String { + "".to_string() + } +} + +#[expect( + clippy::multiple_inherent_impl, + reason = "pyo3 #[pymethods] expands to its own inherent impl; the Rust-only constructor lives here" +)] +impl WsSender { + /// Construct a [`WsSender`] over the outbound-frame channel. + #[must_use] + pub const fn new(tx: mpsc::UnboundedSender) -> Self { + Self { tx } + } +} + +/// Shared cell holding the `agent_id` assigned by et-connect-ack. +/// +/// The runner writes it from `agent.rs::register`; Python reads it via +/// `WsStorage.agent_id`. Pre-connect it's `None`, so writes that target +/// the agent's own namespace fail with a clear error. +pub type AgentIdSlot = Arc>>; + +/// One unit of work the storage worker task knows how to do. +/// +/// Python's sync `WsStorage.get/put` build one of these, hand it off +/// via `op_tx`, and `blocking_recv()` on the embedded oneshot. The +/// worker (spawned by `agent.rs`) runs async `et-rest-client` calls +/// and sends results back. 
+#[derive(Debug)] +#[non_exhaustive] +pub enum StorageOp { + Get { + agent_id: String, + key: String, + reply: tokio::sync::oneshot::Sender>, StorageError>>, + }, + Put { + agent_id: String, + key: String, + data: Vec, + reply: tokio::sync::oneshot::Sender>, + }, +} + +/// Python-facing handle to et-ws-server's `/storage` HTTP API. +/// +/// `PUT /storage//` to persist, `GET /storage//` +/// to read (any agent's namespace is readable since the server static-serves +/// `/storage/`; writes only succeed for our own scope). Methods look +/// synchronous to Python -- internally they dispatch to a worker task on +/// the runtime and block on a oneshot reply. +#[pyclass(name = "WsStorage")] +pub struct WsStorage { + agent_id: AgentIdSlot, + op_tx: mpsc::UnboundedSender, +} + +#[pymethods] +impl WsStorage { + /// Return our currently assigned `agent_id`, or `None` before `on_connect`. + #[getter] + fn agent_id(&self) -> Option { + self.agent_id.lock().unwrap_or_else(PoisonError::into_inner).clone() + } + + /// GET `/storage/{agent_id}/{key}`. + /// + /// Returns `None` for 404, raises on other HTTP failures. Reads work for any + /// agent's namespace (et-storage-service static-serves the storage directory). + fn get(&self, py: Python<'_>, agent_id: String, key: String) -> PyResult>> { + let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); + if self + .op_tx + .send(StorageOp::Get { + agent_id, + key, + reply: reply_tx, + }) + .is_err() + { + return Err(PyRuntimeError::new_err("storage worker gone")); + } + // `detach` drops the GIL so other Python threads run while we park + // here. We're always called from a non-runtime thread -- the dedicated + // dispatch thread, or one the module spawned -- so a plain + // `blocking_recv` is correct: the storage worker task resolves the + // reply on the runtime's own threads while this thread waits. 
+ match py.detach(|| reply_rx.blocking_recv()) { + Ok(result) => Ok(result?), + Err(_) => Err(PyRuntimeError::new_err("storage reply dropped")), + } + } + + /// PUT to `/storage//{key}`. + /// + /// Errors if `on_connect` hasn't fired yet (we don't know our `agent_id`) -- + /// call this from `on_connect` or later. + fn put(&self, py: Python<'_>, key: String, data: Vec) -> PyResult<()> { + let agent_id = self + .agent_id + .lock() + .unwrap_or_else(PoisonError::into_inner) + .clone() + .ok_or_else(|| { + PyRuntimeError::new_err("WsStorage.put() called before on_connect -- agent_id not yet assigned") + })?; + let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); + if self + .op_tx + .send(StorageOp::Put { + agent_id, + key, + data, + reply: reply_tx, + }) + .is_err() + { + return Err(PyRuntimeError::new_err("storage worker gone")); + } + // See `get` for why a plain `blocking_recv` (not `block_in_place`) is + // correct here: we never run on a tokio worker thread. + match py.detach(|| reply_rx.blocking_recv()) { + Ok(result) => { + result?; + Ok(()) + } + Err(_) => Err(PyRuntimeError::new_err("storage reply dropped")), + } + } + + fn __repr__(&self) -> String { + format!("", self.agent_id()) + } +} + +#[expect( + clippy::multiple_inherent_impl, + reason = "pyo3 #[pymethods] expands to its own inherent impl; the Rust-only constructor lives here" +)] +impl WsStorage { + /// Construct a [`WsStorage`] over the agent-id slot and op channel. + #[must_use] + pub const fn new(agent_id: AgentIdSlot, op_tx: mpsc::UnboundedSender) -> Self { + Self { agent_id, op_tx } + } +} + +/// Holds the imported user module across the lifetime of the agent loop. +pub struct Dispatcher { + module: Py, +} + +impl Dispatcher { + /// Import the user module and run its optional `init(send, storage)` hook. + /// + /// `python_path_extras` are prepended to `sys.path` before the import so the + /// module (and its dependencies) resolve. 
+ pub fn import( + module_name: &str, + python_path_extras: &[PathBuf], + sender: WsSender, + storage: WsStorage, + ) -> Result { + Python::attach(|py| -> Result { + if !python_path_extras.is_empty() { + let sys = py.import("sys")?; + let sys_path = sys.getattr("path")?.cast_into::()?; + for extra in python_path_extras { + let entry = PyString::new(py, &extra.to_string_lossy()); + sys_path.insert(0, entry)?; + } + } + + let module = py.import(module_name)?; + + // Sanity check: a module that defines none of the hooks can never + // be driven, so importing it is almost certainly a misconfiguration + // (wrong RUNNER_MODULE, or a misspelt hook). Fail loudly at load + // rather than connect and sit idle. + let mut has_hook = false; + for hook in HOOKS { + if module.hasattr(hook)? { + has_hook = true; + break; + } + } + if !has_hook { + return Err(PythonError::Py(format!( + "module `{module_name}` defines none of the runner hooks ({})", + HOOKS.join(", ") + ))); + } + + if module.hasattr(HOOK_INIT)? { + let py_sender = Py::new(py, sender)?; + let py_storage = Py::new(py, storage)?; + drop(module.call_method1(HOOK_INIT, (py_sender, py_storage))?); + } + Ok(Self { + module: module.unbind(), + }) + }) + } + + /// Forward the assigned `agent_id` to the optional `on_connect` hook. + pub fn on_connect(&self, agent_id: &str) -> Result<(), PythonError> { + Python::attach(|py| -> Result<(), PythonError> { + let module = self.module.bind(py); + if !module.hasattr(HOOK_ON_CONNECT)? { + return Ok(()); + } + drop(module.call_method1(HOOK_ON_CONNECT, (agent_id,))?); + Ok(()) + }) + } + + /// Dispatch a text frame to `on_text_frame`. + /// + /// Returns the handler's optional reply (a `str`, or `bytes` decoded as + /// utf-8). Outbound frames the handler queued via `WsSender` go out + /// independently. 
+ pub fn on_text_frame(&self, text: &str) -> Result, PythonError> { + Python::attach(|py| -> Result, PythonError> { + let module = self.module.bind(py); + if !module.hasattr(HOOK_ON_TEXT_FRAME)? { + return Ok(None); + } + let result = module.call_method1(HOOK_ON_TEXT_FRAME, (text,))?; + if result.is_none() { + return Ok(None); + } + if let Ok(reply) = result.extract::() { + return Ok(Some(reply)); + } + if let Ok(raw) = result.extract::>() { + return Ok(Some(String::from_utf8_lossy(&raw).into_owned())); + } + Err(PythonError::Py( + "on_text_frame must return str, bytes, or None".to_string(), + )) + }) + } + + /// Dispatch a binary frame to `on_binary_frame`. + /// + /// Returns the handler's optional reply (`bytes`, or `str` encoded as utf-8). + pub fn on_binary_frame(&self, frame: &[u8]) -> Result>, PythonError> { + Python::attach(|py| -> Result>, PythonError> { + let module = self.module.bind(py); + if !module.hasattr(HOOK_ON_BINARY_FRAME)? { + return Ok(None); + } + let frame_obj = PyBytes::new(py, frame); + let result = module.call_method1(HOOK_ON_BINARY_FRAME, (frame_obj,))?; + if result.is_none() { + return Ok(None); + } + if let Ok(raw) = result.extract::>() { + return Ok(Some(raw)); + } + if let Ok(reply) = result.extract::() { + return Ok(Some(reply.into_bytes())); + } + Err(PythonError::Py( + "on_binary_frame must return bytes, str, or None".to_string(), + )) + }) + } + + /// Call the optional `on_shutdown` hook, best-effort. + pub fn on_shutdown(&self) -> Result<(), PythonError> { + Python::attach(|py| -> Result<(), PythonError> { + let module = self.module.bind(py); + if !module.hasattr(HOOK_ON_SHUTDOWN)? { + return Ok(()); + } + drop(module.call_method0(HOOK_ON_SHUTDOWN)?); + Ok(()) + }) + } +} diff --git a/services/ws-pyo3-runner/tests/cowsay.rs b/services/ws-pyo3-runner/tests/cowsay.rs new file mode 100644 index 0000000..56d7f3b --- /dev/null +++ b/services/ws-pyo3-runner/tests/cowsay.rs @@ -0,0 +1,143 @@ +//! 
Proves the runner pre-populates `sys.path` with mise-installed pipx +//! packages: launch et-ws-pyo3-runner with `cowsay_probe.py`, which does a +//! top-level `import cowsay`, and verify a frame round-trips through cowsay. +//! +//! `cowsay` is declared as `pipx:cowsay` in the always-loaded mise config but +//! is NOT on `PYO3_PYTHONPATH` (which only points at the module dir). So the only +//! way the module imports is `edge_toolkit::config::mise_python_site_packages` +//! adding cowsay's venv `site-packages` to `sys.path`. The control client +//! broadcasts a plain string; the module returns the cowsay-rendered output; +//! we assert it came back transformed (contains the payload AND the cow art). + +#![cfg(test)] +#![expect( + clippy::arithmetic_side_effects, + clippy::single_call_fn, + reason = "integration test: Instant/Duration poll-loop math and single-use helpers" +)] + +use std::error::Error; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use edge_toolkit::ws::{ClientMessage, ServerMessage}; +use futures_util::{SinkExt as _, StreamExt as _}; +use tokio_tungstenite::{connect_async, tungstenite}; + +type ControlSocket = tokio_tungstenite::WebSocketStream>; + +/// A payload free of JSON punctuation (so the server treats it as an +/// unrecognised frame and broadcasts it) and free of `^__^` (so finding that +/// marker in the reply can only come from cowsay). +const PAYLOAD: &str = "split-learning-rocks"; + +/// Open a control client and drive et-connect until we have an `agent_id`. +async fn control_client(ws_url: &str) -> Result<(ControlSocket, String), Box> { + let (mut socket, _) = connect_async(ws_url).await?; + let connect = serde_json::to_string(&ClientMessage::Connect { agent_id: None })?; + socket.send(tungstenite::Message::Text(connect)).await?; + + loop { + let Some(frame) = socket.next().await else { + return Err("control socket closed before connect-ack".into()); + }; + let tungstenite::Message::Text(text) = frame? 
else { + continue; + }; + if let Ok(ServerMessage::ConnectAck { agent_id, .. }) = serde_json::from_str::(&text) { + return Ok((socket, agent_id)); + } + } +} + +#[tokio::test(flavor = "current_thread")] +async fn cowsay_module_imports_mise_package() -> Result<(), Box> { + let server = et_ws_test_server::start(); + + // Control client registers first so the runner has a peer to broadcast to. + let (mut control, control_id) = control_client(&server.ws_url).await?; + + // Spawn the runner. PYO3_PYTHONPATH points only at the module dir (for + // cowsay_probe.py) -- cowsay itself must come from the mise site-packages + // the runner wires in, which is the whole point of the test. + let module_path = format!("{}/python", env!("CARGO_MANIFEST_DIR")); + let bin = env!("CARGO_BIN_EXE_et-ws-pyo3-runner"); + let mut runner = Command::new(bin) + .env("RUNNER_MODULE", "cowsay_probe") + .env("PYO3_PYTHONPATH", &module_path) + .env("WS_SERVER_URL", &server.ws_url) + .env("RUST_LOG", std::env::var("RUST_LOG").unwrap_or_else(|_| "warn".into())) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn()?; + + let result = tokio::time::timeout(Duration::from_secs(20), cowsay_round_trip(&mut control, &control_id)).await; + + drop(runner.kill()); + drop(runner.wait()); + + let reply = result??; + // cowsay wraps the payload in a speech bubble drawn above an ASCII cow; the + // `^__^` is part of the cow and never appears in PAYLOAD, so its presence + // proves the module both imported cowsay and ran it on our input. + if !reply.contains(PAYLOAD) || !reply.contains("^__^") { + return Err(format!("reply {reply:?} is not cowsay output for {PAYLOAD:?}").into()); + } + Ok(()) +} + +/// Poll `list_agents` until the runner registers, then broadcast `PAYLOAD` and +/// return the first non-protocol text frame that comes back (the cowsay output). 
+async fn cowsay_round_trip(control: &mut ControlSocket, self_id: &str) -> Result> { + let deadline = std::time::Instant::now() + Duration::from_secs(15); + let mut have_peer = false; + while std::time::Instant::now() < deadline { + let req = serde_json::to_string(&ClientMessage::ListAgents)?; + control.send(tungstenite::Message::Text(req)).await?; + let poll_deadline = std::time::Instant::now() + Duration::from_millis(250); + while std::time::Instant::now() < poll_deadline { + let remaining = poll_deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if let Ok(ServerMessage::ListAgentsResponse { agents }) = + serde_json::from_str::(&text) + && agents.iter().any(|summary| summary.agent_id != self_id) + { + have_peer = true; + break; + } + } + Ok(Some(Ok(_))) => {} + _ => break, + } + } + if have_peer { + break; + } + tokio::time::sleep(Duration::from_millis(200)).await; + } + if !have_peer { + return Err("runner never registered".into()); + } + + control.send(tungstenite::Message::Text(PAYLOAD.to_string())).await?; + + let deadline = std::time::Instant::now() + Duration::from_secs(10); + while std::time::Instant::now() < deadline { + let remaining = deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if serde_json::from_str::(&text).is_ok() { + // typed et-* envelope (status / list / ack), keep draining + continue; + } + return Ok(text); + } + Ok(Some(Ok(_))) => {} + Ok(Some(Err(e))) => return Err(format!("recv error: {e}").into()), + Ok(None) => return Err("control socket closed".into()), + Err(_) => return Err("timed out waiting for cowsay reply".into()), + } + } + Err("deadline exceeded".into()) +} diff --git a/services/ws-pyo3-runner/tests/echo.rs b/services/ws-pyo3-runner/tests/echo.rs new file mode 100644 index 0000000..e81dfc4 --- /dev/null +++ 
b/services/ws-pyo3-runner/tests/echo.rs @@ -0,0 +1,159 @@ +//! Smoke test: spin up an in-process et-ws-server, launch et-ws-pyo3-runner +//! with the bundled `echo.py`, and verify it (a) successfully registers as +//! an agent and (b) echoes a frame we broadcast back to us. +//! +//! Two clients connect: a control client (this test, talking +//! `tokio-tungstenite` directly) and the pyo3 runner (subprocess). The +//! control client broadcasts an unrecognised text frame; et-ws-server +//! default-broadcasts it to the runner; the echo module returns it; the +//! server default-broadcasts the runner's reply back to the control +//! client. Round-trip proves both the protocol alignment and the Python +//! dispatch. + +#![cfg(test)] +#![expect( + clippy::arithmetic_side_effects, + clippy::single_call_fn, + reason = "integration test: Instant/Duration poll-loop math and single-use helpers" +)] + +use std::error::Error; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use edge_toolkit::ws::{ClientMessage, ServerMessage}; +use futures_util::{SinkExt as _, StreamExt as _}; +use tokio_tungstenite::{connect_async, tungstenite}; + +type ControlSocket = tokio_tungstenite::WebSocketStream>; + +/// Open a control client and drive et-connect until we have an `agent_id`. +async fn control_client(ws_url: &str) -> Result<(ControlSocket, String), Box> { + let (mut socket, _) = connect_async(ws_url).await?; + let connect = serde_json::to_string(&ClientMessage::Connect { agent_id: None })?; + socket.send(tungstenite::Message::Text(connect)).await?; + + loop { + let Some(frame) = socket.next().await else { + return Err("control socket closed before connect-ack".into()); + }; + let tungstenite::Message::Text(text) = frame? else { + continue; + }; + if let Ok(ServerMessage::ConnectAck { agent_id, .. 
}) = serde_json::from_str::(&text) { + return Ok((socket, agent_id)); + } + } +} + +#[tokio::test(flavor = "current_thread")] +async fn echo_module_round_trips() -> Result<(), Box> { + let server = et_ws_test_server::start(); + + // Stage 1: control client registers first so the runner has a peer to + // broadcast back to. + let (mut control, control_id) = control_client(&server.ws_url).await?; + + // Stage 2: spawn the runner subprocess. `manifest_dir/python` holds + // the echo module. + let echo_path = format!("{}/python", env!("CARGO_MANIFEST_DIR")); + let bin = env!("CARGO_BIN_EXE_et-ws-pyo3-runner"); + let mut runner = Command::new(bin) + .env("RUNNER_MODULE", "echo") + .env("PYO3_PYTHONPATH", &echo_path) + .env("WS_SERVER_URL", &server.ws_url) + // Silence the runner's logs unless the test is invoked with --nocapture + // and the operator opted in via RUST_LOG. + .env("RUST_LOG", std::env::var("RUST_LOG").unwrap_or_else(|_| "warn".into())) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn()?; + + // Stage 3: wait until the runner shows up in the registry, then send + // an arbitrary text frame and assert we get the same string back via + // the default broadcast path. + let payload = r#"{"hello":"world","from":"control"}"#; + let echo_result = tokio::time::timeout( + Duration::from_secs(20), + echo_round_trip(&mut control, payload, &control_id), + ) + .await; + + drop(runner.kill()); + drop(runner.wait()); + + let observed = echo_result??; + if observed != payload { + return Err(format!("echo response {observed:?} did not match {payload:?}").into()); + } + Ok(()) +} + +/// Poll `list_agents` until we see at least one peer (the runner), then +/// broadcast a frame and wait for it to land back on the control socket. +async fn echo_round_trip(control: &mut ControlSocket, payload: &str, self_id: &str) -> Result> { + // Wait for the runner to register. 
The test server has no shared + // handle into the registry, so we poll `et-list-agents` until a peer + // shows up. The runner needs ~1s to spawn + init Python + connect, + // so give it a generous deadline. + let deadline = std::time::Instant::now() + Duration::from_secs(15); + let mut have_peer = false; + while std::time::Instant::now() < deadline { + let req = serde_json::to_string(&ClientMessage::ListAgents)?; + control.send(tungstenite::Message::Text(req)).await?; + // Drain everything available for 250ms -- there may be multiple + // queued responses from earlier polls. + let poll_deadline = std::time::Instant::now() + Duration::from_millis(250); + while std::time::Instant::now() < poll_deadline { + let remaining = poll_deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if let Ok(ServerMessage::ListAgentsResponse { agents }) = + serde_json::from_str::(&text) + && agents.iter().any(|summary| summary.agent_id != self_id) + { + have_peer = true; + break; + } + } + Ok(Some(Ok(_))) => {} + _ => break, + } + } + if have_peer { + break; + } + tokio::time::sleep(Duration::from_millis(200)).await; + } + if !have_peer { + return Err("runner never registered".into()); + } + + // Default-broadcast (frame the server doesn't recognise as a typed + // et-* message gets fanned out to other agents as-is). + control.send(tungstenite::Message::Text(payload.to_string())).await?; + + // The runner echoes back a Text frame containing the same payload. + // Drain frames until we see it; ignore et-* protocol noise and our + // own list_agents_response loops still in flight. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(10); + while std::time::Instant::now() < deadline { + let remaining = deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if serde_json::from_str::(&text).is_ok() { + // typed et-* envelope (status / list / ack), keep draining + continue; + } + if text == payload { + return Ok(text); + } + } + Ok(Some(Ok(_))) => {} + Ok(Some(Err(e))) => return Err(format!("recv error: {e}").into()), + Ok(None) => return Err("control socket closed".into()), + Err(_) => return Err("timed out waiting for echo".into()), + } + } + Err("deadline exceeded".into()) +} diff --git a/services/ws-pyo3-runner/tests/fanout.rs b/services/ws-pyo3-runner/tests/fanout.rs new file mode 100644 index 0000000..c89e075 --- /dev/null +++ b/services/ws-pyo3-runner/tests/fanout.rs @@ -0,0 +1,128 @@ +//! Verify the multi-send path: one inbound binary frame results in N +//! outbound binary frames pushed via `WsSender.binary(...)`, with no +//! reply-by-return. Test client sends a single byte `count` and asserts +//! it receives exactly `count` distinct one-byte echoes. 
+ +#![cfg(test)] +#![expect( + clippy::arithmetic_side_effects, + clippy::as_conversions, + clippy::indexing_slicing, + clippy::single_call_fn, + reason = "integration test: Instant/Duration poll-loop math, small counts/indexes, single-use helpers" +)] + +use std::error::Error; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use edge_toolkit::ws::{ClientMessage, ServerMessage}; +use futures_util::{SinkExt as _, StreamExt as _}; +use tokio_tungstenite::{connect_async, tungstenite}; + +type ControlSocket = tokio_tungstenite::WebSocketStream>; + +async fn control_client(ws_url: &str) -> Result<(ControlSocket, String), Box> { + let (mut socket, _) = connect_async(ws_url).await?; + let connect = serde_json::to_string(&ClientMessage::Connect { agent_id: None })?; + socket.send(tungstenite::Message::Text(connect)).await?; + loop { + let Some(frame) = socket.next().await else { + return Err("control socket closed before connect-ack".into()); + }; + if let tungstenite::Message::Text(text) = frame? + && let Ok(ServerMessage::ConnectAck { agent_id, .. 
}) = serde_json::from_str::(&text) + { + return Ok((socket, agent_id)); + } + } +} + +#[tokio::test(flavor = "current_thread")] +async fn fanout_module_emits_multiple_frames() -> Result<(), Box> { + let server = et_ws_test_server::start(); + let (mut control, control_id) = control_client(&server.ws_url).await?; + + let python_path = format!("{}/python", env!("CARGO_MANIFEST_DIR")); + let bin = env!("CARGO_BIN_EXE_et-ws-pyo3-runner"); + let mut runner = Command::new(bin) + .env("RUNNER_MODULE", "fanout") + .env("PYO3_PYTHONPATH", &python_path) + .env("WS_SERVER_URL", &server.ws_url) + .env("RUST_LOG", std::env::var("RUST_LOG").unwrap_or_else(|_| "warn".into())) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn()?; + + let outcome = tokio::time::timeout(Duration::from_secs(30), exercise(&mut control, &control_id)).await; + drop(runner.kill()); + drop(runner.wait()); + + let observed = outcome??; + let expected: Vec = (0u8..5).collect(); + if observed != expected { + return Err(format!("received {observed:?}, expected {expected:?}").into()); + } + Ok(()) +} + +async fn exercise(control: &mut ControlSocket, self_id: &str) -> Result, Box> { + // Wait for the runner to register so our broadcast has a peer. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(15); + let mut have_peer = false; + while std::time::Instant::now() < deadline { + let req = serde_json::to_string(&ClientMessage::ListAgents)?; + control.send(tungstenite::Message::Text(req)).await?; + let poll_until = std::time::Instant::now() + Duration::from_millis(250); + while std::time::Instant::now() < poll_until { + let remaining = poll_until - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if let Ok(ServerMessage::ListAgentsResponse { agents }) = + serde_json::from_str::(&text) + && agents.iter().any(|summary| summary.agent_id != self_id) + { + have_peer = true; + break; + } + } + Ok(Some(Ok(_))) => {} + _ => break, + } + } + if have_peer { + break; + } + tokio::time::sleep(Duration::from_millis(200)).await; + } + if !have_peer { + return Err("runner never registered".into()); + } + + // Ask for 5 frames back. + let count: u8 = 5; + control.send(tungstenite::Message::Binary(vec![count])).await?; + + // Collect exactly `count` binary frames. Ignore typed et-* envelopes. 
+ let mut received = Vec::with_capacity(count as usize); + let deadline = std::time::Instant::now() + Duration::from_secs(10); + while received.len() < count as usize && std::time::Instant::now() < deadline { + let remaining = deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Binary(bytes)))) => { + if bytes.len() != 1 { + return Err(format!("fanout produced {}-byte frame, expected 1", bytes.len()).into()); + } + received.push(bytes[0]); + } + Ok(Some(Ok(_))) => {} + Ok(Some(Err(e))) => return Err(format!("recv: {e}").into()), + Ok(None) => return Err("control socket closed".into()), + Err(_) => return Err("timed out waiting for fan-out frames".into()), + } + } + if received.len() != count as usize { + return Err(format!("got {} frames, expected {count}", received.len()).into()); + } + Ok(received) +} diff --git a/services/ws-pyo3-runner/tests/no_hooks.rs b/services/ws-pyo3-runner/tests/no_hooks.rs new file mode 100644 index 0000000..c3ff5d2 --- /dev/null +++ b/services/ws-pyo3-runner/tests/no_hooks.rs @@ -0,0 +1,33 @@ +//! Verify the load-time sanity check: a module defining none of the runner +//! hooks must fail to load rather than connect and sit idle. The import +//! happens in `initialize`, before any connection, so the runner exits +//! non-zero without needing a server. + +#![cfg(test)] + +use std::error::Error; +use std::process::Command; + +#[test] +fn module_without_hooks_fails_to_load() -> Result<(), Box> { + let py_path = format!("{}/python", env!("CARGO_MANIFEST_DIR")); + let bin = env!("CARGO_BIN_EXE_et-ws-pyo3-runner"); + let output = Command::new(bin) + .env("RUNNER_MODULE", "no_hooks") + .env("PYO3_PYTHONPATH", &py_path) + // Safety net: if the check ever regressed and import succeeded, this + // bounds the otherwise-forever connect retry so the test fails (on the + // assertion below) instead of hanging. 
+ .env("RUNNER_TIMEOUT", "10s") + .env("RUST_LOG", "error") + .output()?; + + if output.status.success() { + return Err(format!("a hookless module must fail to load; got {:?}", output.status).into()); + } + let stderr = String::from_utf8_lossy(&output.stderr); + if !stderr.contains("none of the runner hooks") { + return Err(format!("stderr should explain the missing hooks; got: {stderr}").into()); + } + Ok(()) +} diff --git a/services/ws-pyo3-runner/tests/storage.rs b/services/ws-pyo3-runner/tests/storage.rs new file mode 100644 index 0000000..bc729c9 --- /dev/null +++ b/services/ws-pyo3-runner/tests/storage.rs @@ -0,0 +1,139 @@ +//! Verify the `WsStorage` round-trip end to end: Python writes a blob +//! via `storage.put` and reads it back via `storage.get`, both going +//! through the runner's `/storage` HTTP client and the test server's +//! storage service. Confirms the full path (4-space indent = code block): +//! +//! Python -> WsStorage::put -> mpsc::StorageOp::Put -> storage_worker +//! -> et_rest_client put_file -> et-storage-service::agent_put_file +//! -> disk (under TempDir) +//! 
-> et_rest_client get_file -> bytes -> oneshot -> Python + +#![cfg(test)] +#![expect( + clippy::arithmetic_side_effects, + clippy::single_call_fn, + reason = "integration test: Instant/Duration poll-loop math and single-use helpers" +)] + +use std::error::Error; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use edge_toolkit::ws::{ClientMessage, ServerMessage}; +use futures_util::{SinkExt as _, StreamExt as _}; +use tokio_tungstenite::{connect_async, tungstenite}; + +type ControlSocket = tokio_tungstenite::WebSocketStream>; + +async fn control_client(ws_url: &str) -> Result<(ControlSocket, String), Box> { + let (mut socket, _) = connect_async(ws_url).await?; + let connect = serde_json::to_string(&ClientMessage::Connect { agent_id: None })?; + socket.send(tungstenite::Message::Text(connect)).await?; + loop { + let Some(frame) = socket.next().await else { + return Err("control socket closed before connect-ack".into()); + }; + if let tungstenite::Message::Text(text) = frame? + && let Ok(ServerMessage::ConnectAck { agent_id, .. 
}) = serde_json::from_str::(&text) + { + return Ok((socket, agent_id)); + } + } +} + +#[tokio::test(flavor = "current_thread")] +async fn storage_put_then_get_round_trip() -> Result<(), Box> { + let server = et_ws_test_server::start(); + let (mut control, control_id) = control_client(&server.ws_url).await?; + + let python_path = format!("{}/python", env!("CARGO_MANIFEST_DIR")); + let bin = env!("CARGO_BIN_EXE_et-ws-pyo3-runner"); + let mut runner = Command::new(bin) + .env("RUNNER_MODULE", "storage_pingpong") + .env("PYO3_PYTHONPATH", &python_path) + .env("WS_SERVER_URL", &server.ws_url) + .env("RUST_LOG", std::env::var("RUST_LOG").unwrap_or_else(|_| "warn".into())) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn()?; + + let outcome = tokio::time::timeout(Duration::from_secs(30), exercise(&mut control, &control_id)).await; + drop(runner.kill()); + drop(runner.wait()); + + let observed = outcome??; + let expected: &[u8] = b"a quick brown fox jumps over the lazy dog"; + if observed.as_slice() != expected { + return Err(format!("stored bytes do not match: {observed:?}").into()); + } + Ok(()) +} + +async fn exercise(control: &mut ControlSocket, self_id: &str) -> Result, Box> { + // Wait for the runner to register so our broadcast has a peer. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(15); + let mut have_peer = false; + while std::time::Instant::now() < deadline { + let req = serde_json::to_string(&ClientMessage::ListAgents)?; + control.send(tungstenite::Message::Text(req)).await?; + let poll_until = std::time::Instant::now() + Duration::from_millis(250); + while std::time::Instant::now() < poll_until { + let remaining = poll_until - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if let Ok(ServerMessage::ListAgentsResponse { agents }) = + serde_json::from_str::(&text) + && agents.iter().any(|summary| summary.agent_id != self_id) + { + have_peer = true; + break; + } + } + Ok(Some(Ok(_))) => {} + _ => break, + } + } + if have_peer { + break; + } + tokio::time::sleep(Duration::from_millis(200)).await; + } + if !have_peer { + return Err("runner never registered".into()); + } + + let key = "hello.txt"; + let value: &[u8] = b"a quick brown fox jumps over the lazy dog"; + + // PUT: send `key\x00value` -- the module's `on_binary_frame` splits + // on the NUL and calls storage.put. + let mut put_frame = Vec::with_capacity(key.len() + 1 + value.len()); + put_frame.extend_from_slice(key.as_bytes()); + put_frame.push(0); + put_frame.extend_from_slice(value); + control.send(tungstenite::Message::Binary(put_frame)).await?; + + // Give the storage worker a moment to PUT to disk before we GET. + tokio::time::sleep(Duration::from_millis(200)).await; + + // GET: send `key` only -- the module's `on_binary_frame` treats + // a NUL-free frame as a get and pushes the result back via + // `send.binary(...)`. + control + .send(tungstenite::Message::Binary(key.as_bytes().to_vec())) + .await?; + + // Drain until we see the runner's binary reply. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(10); + while std::time::Instant::now() < deadline { + let remaining = deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Binary(bytes)))) => return Ok(bytes), + Ok(Some(Ok(_))) => {} + Ok(Some(Err(e))) => return Err(format!("recv: {e}").into()), + Ok(None) => return Err("control socket closed".into()), + Err(_) => return Err("timed out waiting for storage reply".into()), + } + } + Err("deadline exceeded".into()) +} diff --git a/services/ws-pyo3-runner/tests/torch_inference.rs b/services/ws-pyo3-runner/tests/torch_inference.rs new file mode 100644 index 0000000..cb0584f --- /dev/null +++ b/services/ws-pyo3-runner/tests/torch_inference.rs @@ -0,0 +1,166 @@ +//! `PyTorch` counterpart to the wasi-graphics-info ML test: launch +//! et-ws-pyo3-runner with `torch_inference.py`, trigger it, and verify the +//! torch matmul + tiny-classifier round-trip. +//! +//! `torch` is declared `pipx:torch` in the python-only mise config, so it's +//! absent from a default `mise install`. The runner only puts it on `sys.path` +//! when it's among the current mise packages (via +//! `edge_toolkit::config::mise_python_site_packages`), so this test SKIPS unless +//! torch is reachable there -- run it under `MISE_ENV=python` after +//! `mise install pipx:torch`. When present, the control client broadcasts a +//! trigger; the module runs the workflow and returns a JSON summary we assert on. 
+ +#![cfg(test)] +#![expect( + clippy::arithmetic_side_effects, + clippy::single_call_fn, + clippy::print_stderr, + reason = "integration test: poll-loop math, single-use helpers, eprintln skip notice" +)] + +use std::error::Error; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use edge_toolkit::ws::{ClientMessage, ServerMessage}; +use futures_util::{SinkExt as _, StreamExt as _}; +use tokio_tungstenite::{connect_async, tungstenite}; + +type ControlSocket = tokio_tungstenite::WebSocketStream>; + +/// Any non-protocol text frame triggers the module's workflow. +const TRIGGER: &str = "run-torch"; + +/// True when `pipx:torch` is reachable on a mise package `site-packages` -- the +/// exact condition under which the runner can `import torch`. Checking it here +/// keeps the skip decision identical to the runner's own capability. +fn torch_reachable() -> bool { + edge_toolkit::config::mise_python_site_packages() + .iter() + .any(|site_packages| site_packages.join("torch").is_dir()) +} + +/// Open a control client and drive et-connect until we have an `agent_id`. +async fn control_client(ws_url: &str) -> Result<(ControlSocket, String), Box> { + let (mut socket, _) = connect_async(ws_url).await?; + let connect = serde_json::to_string(&ClientMessage::Connect { agent_id: None })?; + socket.send(tungstenite::Message::Text(connect)).await?; + + loop { + let Some(frame) = socket.next().await else { + return Err("control socket closed before connect-ack".into()); + }; + let tungstenite::Message::Text(text) = frame? else { + continue; + }; + if let Ok(ServerMessage::ConnectAck { agent_id, .. 
}) = serde_json::from_str::(&text) { + return Ok((socket, agent_id)); + } + } +} + +#[tokio::test(flavor = "current_thread")] +async fn torch_module_runs_inference() -> Result<(), Box> { + if !torch_reachable() { + eprintln!("skipping torch_inference: pipx:torch not on any mise site-packages"); + eprintln!(" install with `MISE_ENV=python mise install pipx:torch` and re-run under MISE_ENV=python"); + return Ok(()); + } + + let server = et_ws_test_server::start(); + let (mut control, control_id) = control_client(&server.ws_url).await?; + + let module_path = format!("{}/python", env!("CARGO_MANIFEST_DIR")); + let bin = env!("CARGO_BIN_EXE_et-ws-pyo3-runner"); + let mut runner = Command::new(bin) + .env("RUNNER_MODULE", "torch_inference") + .env("PYO3_PYTHONPATH", &module_path) + .env("WS_SERVER_URL", &server.ws_url) + .env("RUST_LOG", std::env::var("RUST_LOG").unwrap_or_else(|_| "warn".into())) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn()?; + + // torch import + first op can be slow on a cold interpreter, so give the + // round-trip a generous budget. 
+ let result = tokio::time::timeout(Duration::from_mins(1), torch_round_trip(&mut control, &control_id)).await; + + drop(runner.kill()); + drop(runner.wait()); + + let reply = result??; + let parsed: serde_json::Value = match serde_json::from_str(&reply) { + Ok(value) => value, + Err(e) => return Err(format!("reply not JSON: {e}: {reply}").into()), + }; + if parsed.get("framework").and_then(serde_json::Value::as_str) != Some("torch") { + return Err(format!("unexpected framework in {reply}").into()); + } + let c00 = parsed + .get("matmul_c00") + .and_then(serde_json::Value::as_f64) + .ok_or("matmul_c00 missing")?; + if (c00 - 2.0_f64).abs() > 1e-4_f64 { + return Err(format!("matmul_c00 {c00} != 2.0").into()); + } + if parsed.get("predicted_class").and_then(serde_json::Value::as_i64) != Some(3_i64) { + return Err(format!("predicted_class != 3 in {reply}").into()); + } + Ok(()) +} + +/// Poll `list_agents` until the runner registers, then broadcast `TRIGGER` and +/// return the first non-protocol text frame (the module's JSON summary). 
+async fn torch_round_trip(control: &mut ControlSocket, self_id: &str) -> Result> { + let deadline = std::time::Instant::now() + Duration::from_secs(30); + let mut have_peer = false; + while std::time::Instant::now() < deadline { + let req = serde_json::to_string(&ClientMessage::ListAgents)?; + control.send(tungstenite::Message::Text(req)).await?; + let poll_deadline = std::time::Instant::now() + Duration::from_millis(250); + while std::time::Instant::now() < poll_deadline { + let remaining = poll_deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if let Ok(ServerMessage::ListAgentsResponse { agents }) = + serde_json::from_str::(&text) + && agents.iter().any(|summary| summary.agent_id != self_id) + { + have_peer = true; + break; + } + } + Ok(Some(Ok(_))) => {} + _ => break, + } + } + if have_peer { + break; + } + tokio::time::sleep(Duration::from_millis(200)).await; + } + if !have_peer { + return Err("runner never registered".into()); + } + + control.send(tungstenite::Message::Text(TRIGGER.to_string())).await?; + + let deadline = std::time::Instant::now() + Duration::from_secs(30); + while std::time::Instant::now() < deadline { + let remaining = deadline - std::time::Instant::now(); + match tokio::time::timeout(remaining, control.next()).await { + Ok(Some(Ok(tungstenite::Message::Text(text)))) => { + if serde_json::from_str::(&text).is_ok() { + // typed et-* envelope (status / list / ack), keep draining + continue; + } + return Ok(text); + } + Ok(Some(Ok(_))) => {} + Ok(Some(Err(e))) => return Err(format!("recv error: {e}").into()), + Ok(None) => return Err("control socket closed".into()), + Err(_) => return Err("timed out waiting for torch reply".into()), + } + } + Err("deadline exceeded".into()) +} diff --git a/services/ws-server/src/config.rs b/services/ws-server/src/config.rs index 7850e38..5994912 100644 --- a/services/ws-server/src/config.rs +++ 
b/services/ws-server/src/config.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use edge_toolkit::config::OtlpConfig; pub use et_modules_service::ModulesConfig; pub use et_storage_service::StorageConfig; +pub use et_ws_service::WsConfig; use serde::Deserialize; use serde_default::DefaultFromSerde; use serde_inline_default::serde_inline_default; @@ -34,4 +35,8 @@ pub struct Config { /// TLS config. #[serde(default)] pub tls: TlsConfig, + /// WebSocket hub config (frame limits, etc.). + /// `serde-env` maps the inner fields as `WS_*`, e.g. `WS_MAX_FRAME_SIZE`. + #[serde(default)] + pub ws: WsConfig, } diff --git a/services/ws-server/src/lib.rs b/services/ws-server/src/lib.rs index 1bf7421..0b1de91 100644 --- a/services/ws-server/src/lib.rs +++ b/services/ws-server/src/lib.rs @@ -20,7 +20,7 @@ pub fn configure_app(cfg: &mut web::ServiceConfig, agent_registry: web::Data(cfg, &config.storage); // Must be last: registers a catch-all Files::new("/", ...) for the root module. et_modules_service::configure(cfg, &config.modules); diff --git a/services/ws-test-server/src/lib.rs b/services/ws-test-server/src/lib.rs index 517057c..ce2d02d 100644 --- a/services/ws-test-server/src/lib.rs +++ b/services/ws-test-server/src/lib.rs @@ -10,7 +10,7 @@ use std::net::TcpListener; use actix_web::{App, HttpServer, web}; use et_modules_service::{ModulesConfig, configure as configure_modules}; use et_storage_service::{StorageConfig, configure as configure_storage}; -use et_ws_service::{AgentSession, WsAgentRegistry, configure as configure_ws}; +use et_ws_service::{AgentSession, WsAgentRegistry, WsConfig, configure as configure_ws}; use tempfile::TempDir; use tracing_actix_web::TracingLogger; @@ -42,6 +42,7 @@ pub fn start() -> TestServer { let registry = web::Data::new(WsAgentRegistry::default()); let storage = web::Data::new(storage_config); let modules = modules_config; + let ws_config = WsConfig::default(); HttpServer::new(move || { // `TracingLogger` mirrors the real ws-server's pipeline: // 
extracts `traceparent` from incoming requests so server @@ -50,7 +51,7 @@ pub fn start() -> TestServer { .wrap(TracingLogger::default()) .app_data(registry.clone()) .app_data(storage.clone()) - .configure(configure_ws) + .configure(|cfg| configure_ws(cfg, &ws_config)) .configure(|cfg| configure_storage::(cfg, &storage)) .configure(|cfg| configure_modules(cfg, &modules)) }) diff --git a/services/ws-test-server/tests/hub_forwarding.rs b/services/ws-test-server/tests/hub_forwarding.rs index 0dc1304..20bad11 100644 --- a/services/ws-test-server/tests/hub_forwarding.rs +++ b/services/ws-test-server/tests/hub_forwarding.rs @@ -133,3 +133,48 @@ async fn unrecognised_binary_is_broadcast_verbatim() { "binary payload must be forwarded byte-for-byte" ); } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn unconnected_client_is_auto_registered_and_relays_both_ways() { + let server = et_ws_test_server::start(); + + // A peer that speaks the et-protocol, standing in for the agent runner. + let (mut peer, _peer_id) = connect_agent(&server.ws_url).await; + // A "dumb" client that never sends et-connect -- e.g. a demo frontend on + // a raw `new WebSocket(url)`. + let (mut dumb, _) = connect_async(&server.ws_url).await.expect("ws connect"); + + // The dumb client's first binary frame must be broadcast to the peer: + // sending it auto-registers the dumb client as an agent. + let activations: Vec = vec![0x10, 0x20, 0x30, 0x40]; + dumb.send(Message::binary(activations.clone())) + .await + .expect("dumb send binary"); + + let received = next_payload(&mut peer).await; + let Message::Binary(received_bytes) = received else { + panic!("expected binary frame at peer, got {received:?}"); + }; + assert_eq!( + &*received_bytes, + activations.as_slice(), + "frame from unconnected client must be broadcast to peers" + ); + + // Reverse direction: the peer's reply must reach the now auto-registered + // dumb client -- it became a broadcast recipient on its first frame. 
+ let grads: Vec = vec![0xaa, 0xbb, 0xcc]; + peer.send(Message::binary(grads.clone())) + .await + .expect("peer send binary"); + + let reply = next_payload(&mut dumb).await; + let Message::Binary(reply_bytes) = reply else { + panic!("expected binary reply at dumb client, got {reply:?}"); + }; + assert_eq!( + &*reply_bytes, + grads.as_slice(), + "auto-registered client must receive peer broadcasts" + ); +} diff --git a/services/ws-wasi-runner/src/host/error.rs b/services/ws-wasi-runner/src/host/error.rs index 72edf8a..a38b4ca 100644 --- a/services/ws-wasi-runner/src/host/error.rs +++ b/services/ws-wasi-runner/src/host/error.rs @@ -100,3 +100,13 @@ impl WsDecodeErrExt for Result { self.map_err(|err| WsError::Decode(format!("{context}: {err}"))) } } + +/// Transparent conversion so `et_ws_runner_common::connect_and_register`'s error +/// cascades through `?` in `WsBackend::connect` -- no `.map_err` closure at the +/// call site. `WsError` is WIT-generated, so this hand `From` impl stands in for +/// a thiserror `#[from]`. +impl From for WsError { + fn from(err: et_ws_runner_common::ConnectError) -> Self { + Self::Transport(format!("ws connect/register: {err}")) + } +} diff --git a/services/ws-wasi-runner/src/host/mod.rs b/services/ws-wasi-runner/src/host/mod.rs index 0f0a4c8..8dc87a4 100644 --- a/services/ws-wasi-runner/src/host/mod.rs +++ b/services/ws-wasi-runner/src/host/mod.rs @@ -31,6 +31,9 @@ pub struct HostState { /// WebSocket URL of the ws-server (e.g. `ws://localhost:8080/ws`). pub ws_url: String, + /// How long the host waits for `et-connect-ack`; `None` waits forever. + pub connect_ack_timeout: Option, + /// Typed REST client for the ws-server (modules + per-agent storage). /// Generated by `cargo-progenitor` from `generated/specs/rest.yaml`. 
pub rest: et_rest_client::Client, @@ -46,13 +49,14 @@ impl HostState { clippy::same_name_method, reason = "convention: HostState::new mirrors WasiCtxBuilder/ResourceTable/Client constructors used here" )] - pub fn new(http_base: &str, ws_url: String) -> Self { + pub fn new(http_base: &str, ws_url: String, connect_ack_timeout: Option) -> Self { let wasi_ctx = WasiCtxBuilder::new().inherit_stdio().inherit_env().build(); Self { wasi_ctx, resource_table: ResourceTable::new(), ws_url, + connect_ack_timeout, rest: et_rest_client::Client::new(http_base), ws: Arc::new(Mutex::new(None)), wasi_nn_ctx: wasi_nn::new_ctx(), diff --git a/services/ws-wasi-runner/src/host/ws.rs b/services/ws-wasi-runner/src/host/ws.rs index 88bdd15..a13bbbc 100644 --- a/services/ws-wasi-runner/src/host/ws.rs +++ b/services/ws-wasi-runner/src/host/ws.rs @@ -39,13 +39,6 @@ impl crate::bindings::et::ws_messages::messages::Host for HostState {} type WsSink = SplitSink>, tungstenite::Message>; -/// How often the heartbeat task pings the server. Server-side -/// `CONNECTION_TIMEOUT` (services/ws/src/lib.rs:18) is 15 s; pinging at 5 s -/// gives 3x headroom so a slow runner (CI ARM, debug build, large model) -/// still keeps the connection alive across long compute gaps between -/// `connect()` and the first `ClientEvent` the guest sends. -const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); - use crate::bindings::et::ws_wasi::ws::WsError; use crate::host::error::WsDecodeErrExt as _; use crate::host::error::WsTransportErrExt as _; @@ -66,31 +59,30 @@ impl WsBackend { clippy::single_call_fn, reason = "inherent constructor; used once by ::connect" )] - async fn connect(ws_url: &str) -> Result { - let (stream, _) = tokio_tungstenite::connect_async(ws_url) - .await - .map_tungstenite_err(&format!("ws connect {ws_url}"))?; - let (mut sink, mut stream) = stream.split(); - - // Drive the registration handshake immediately so the agent_id is - // known by the time `connect()` returns. 
- let connect_msg = - serde_json::to_string(&ClientMessage::Connect { agent_id: None }).map_decode_err("serialize connect")?; - sink.send(tungstenite::Message::text(connect_msg)) - .await - .map_tungstenite_err("send connect")?; + async fn connect(ws_url: &str, ack_timeout: Option) -> Result { + // The shared helper opens the socket and completes the et-connect + // handshake (with bounded retries), so the agent_id is known the moment + // it returns -- no polling for ConnectAck afterwards. + // `ConnectError` cascades to `WsError` via `From` (see host/error.rs). + let (socket, assigned_id, status) = + et_ws_runner_common::connect_and_register(ws_url, None, ack_timeout).await?; + let (sink, mut stream) = socket.split(); let (tx, rx) = mpsc::unbounded_channel::(); - let agent_id = Arc::new(Mutex::new(None)); - let connection_state = Arc::new(Mutex::new(State::Connecting)); + // The handshake consumed the et-connect-ack frame; re-surface it to the + // guest's `recv()` so guests that read it still see it as the first message. + drop(tx.send(ServerMessage::ConnectAck { + agent_id: assigned_id.clone(), + status, + })); - // Reader pump: convert every Text/Binary data frame into a + let agent_id = Arc::new(Mutex::new(Some(assigned_id))); + let connection_state = Arc::new(Mutex::new(State::Connected)); + + // Reader pump: convert every subsequent Text/Binary data frame into a // `ServerMessage` via `ServerMessage::from_*_frame` (foreign frames land - // in `RelayText`/`RelayBinary`); route `ConnectAck` into - // `agent_id` + `connection_state`; drop control frames and - // et-prefixed-but-malformed text with a warn -- they can't be - // surfaced through the typed catalog. - let agent_id_clone = Arc::clone(&agent_id); + // in `RelayText`/`RelayBinary`) and forward it to the guest inbox; drop + // control frames and et-prefixed-but-malformed text with a warn. 
let state_clone = Arc::clone(&connection_state); let reader = tokio::spawn(async move { while let Some(msg) = stream.next().await { @@ -112,10 +104,6 @@ impl WsBackend { tungstenite::Message::Binary(bytes) => ServerMessage::from_binary_frame(bytes.clone()), _ => continue, }; - if let ServerMessage::ConnectAck { agent_id, .. } = &parsed { - *agent_id_clone.lock().await = Some(agent_id.clone()); - *state_clone.lock().await = State::Connected; - } if tx.send(parsed).is_err() { break; } @@ -132,11 +120,9 @@ impl WsBackend { let pinger_sink = Arc::clone(&sink_arc); let pinger_state = Arc::clone(&connection_state); let pinger = tokio::spawn(async move { - let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); - // First tick fires immediately; skip it so we don't ping before - // the connect handshake even sees a Connected state. - let _: tokio::time::Instant = interval.tick().await; + // 5s heartbeat (vs the server's 15s idle timeout); first immediate + // tick already consumed so we don't ping before the handshake. + let mut interval = et_ws_runner_common::heartbeat_interval().await; loop { let _: tokio::time::Instant = interval.tick().await; if !matches!(*pinger_state.lock().await, State::Connecting | State::Connected) { @@ -176,15 +162,10 @@ impl Host for HostState { return Err(WsError::AlreadyConnected); } } - let backend = WsBackend::connect(&self.ws_url).await?; - // Wait briefly for ConnectAck before returning, so guests can call - // agent_id() right after connect() and get a value. - for _ in 0_u32..50 { - if matches!(backend.current_state().await, State::Connected) { - break; - } - tokio::time::sleep(Duration::from_millis(20)).await; - } + // `connect_and_register` already completed the handshake, so the + // backend comes back `Connected` with its agent_id set -- guests can + // call `agent_id()` immediately, no poll-wait needed. 
+ let backend = WsBackend::connect(&self.ws_url, self.connect_ack_timeout).await?; { let mut slot = self.ws.lock().await; *slot = Some(backend); diff --git a/services/ws-wasi-runner/src/lib.rs b/services/ws-wasi-runner/src/lib.rs index 4f05ac2..027d62d 100644 --- a/services/ws-wasi-runner/src/lib.rs +++ b/services/ws-wasi-runner/src/lib.rs @@ -30,16 +30,26 @@ pub use self::host::HostState; /// The whole call is wrapped in a `run_module` span -- every outgoing /// request inherits its trace context, and ws-server's request span ends /// up as a child of it. -pub async fn run_module(module_name: &str, ws_url: &str) -> Result<(), RunnerError> { +pub async fn run_module( + module_name: &str, + ws_url: &str, + connect_ack_timeout: Option, +) -> Result<(), RunnerError> { let span = tracing::info_span!("run_module", module = module_name); - run_module_inner(module_name, ws_url).instrument(span).await + run_module_inner(module_name, ws_url, connect_ack_timeout) + .instrument(span) + .await } #[expect( clippy::single_call_fn, reason = "span-instrumented body of run_module; the split is mandatory to scope the tracing span" )] -async fn run_module_inner(module_name: &str, ws_url: &str) -> Result<(), RunnerError> { +async fn run_module_inner( + module_name: &str, + ws_url: &str, + connect_ack_timeout: Option, +) -> Result<(), RunnerError> { let http_base = derive_http_base(ws_url)?; let rest = et_rest_client::Client::new(&http_base); @@ -68,7 +78,7 @@ async fn run_module_inner(module_name: &str, ws_url: &str) -> Result<(), RunnerE bindings::Runner::add_to_linker::>(&mut linker, |state| state)?; wasmtime_wasi_nn::wit::add_to_linker(&mut linker, host::wasi_nn::view)?; - let host_state = HostState::new(&http_base, ws_url.to_string()); + let host_state = HostState::new(&http_base, ws_url.to_string(), connect_ack_timeout); let mut store = Store::new(&engine, host_state); let module = bindings::Runner::instantiate_async(&mut store, &component, &linker).await?; diff --git 
a/services/ws-wasi-runner/src/main.rs b/services/ws-wasi-runner/src/main.rs index dbfe8e2..7aef972 100644 --- a/services/ws-wasi-runner/src/main.rs +++ b/services/ws-wasi-runner/src/main.rs @@ -22,7 +22,7 @@ async fn main() -> Result<(), Box> { let module = &config.runner.module; let ws_url = &config.ws.server_url; let timeout = config.runner.timeout; - let run = run_module(module, ws_url); + let run = run_module(module, ws_url, config.ws.connect_ack_timeout); // `None` outcome == timed out; `Some(_)` carries the module's own result. let outcome = if let Some(limit) = timeout { info!("et-ws-wasi-runner: module={module} server={ws_url} timeout={limit:?}"); diff --git a/services/ws-web-runner/Cargo.toml b/services/ws-web-runner/Cargo.toml index fd807f0..ef99d9b 100644 --- a/services/ws-web-runner/Cargo.toml +++ b/services/ws-web-runner/Cargo.toml @@ -25,6 +25,10 @@ edge-toolkit.workspace = true et-rest-client = { workspace = true, features = ["tracing"] } et-ws-runner-common.workspace = true futures-util.workspace = true +# Direct dep so we can configure the REST client's reqwest retry policy: the +# default `ProtocolNacks` policy doesn't cover the HTTP/1 keep-alive race (see +# lib.rs `build_rest_client`). Features match et-rest-client's native build. +reqwest = { workspace = true, features = ["json", "query", "rustls", "stream"] } serde.workspace = true serde-env.workspace = true sys_traits.workspace = true diff --git a/services/ws-web-runner/src/error.rs b/services/ws-web-runner/src/error.rs index e1d61cb..088404d 100644 --- a/services/ws-web-runner/src/error.rs +++ b/services/ws-web-runner/src/error.rs @@ -20,6 +20,9 @@ pub enum RunnerError { #[error("deno runtime error: {0}")] DenoGeneric(#[from] deno_core::error::AnyError), + + #[error("http client build error: {0}")] + HttpClient(#[from] reqwest::Error), } /// Maps any `Display` error into a generic `JsErrorBox`. 
diff --git a/services/ws-web-runner/src/lib.rs b/services/ws-web-runner/src/lib.rs index db402b9..8938fce 100644 --- a/services/ws-web-runner/src/lib.rs +++ b/services/ws-web-runner/src/lib.rs @@ -9,6 +9,8 @@ //! standard web platform extensions (fetch, `WebSocket`, `WebStorage`, timers, //! crypto, WebGPU). +use std::time::Duration; + use et_ws_runner_common::{derive_http_base, fetch_main_field}; pub mod config; @@ -32,7 +34,7 @@ pub async fn run_module(module_name: &str, ws_url: &str) -> Result<(), RunnerErr let _ignore = deno_runtime::deno_tls::rustls::crypto::aws_lc_rs::default_provider().install_default(); let http_base = derive_http_base(ws_url)?; - let rest = et_rest_client::Client::new(&http_base); + let rest = build_rest_client(&http_base)?; let main = fetch_main_field(&rest, module_name).await?; let module_base_url = format!("{http_base}/modules/{module_name}"); @@ -43,3 +45,40 @@ pub async fn run_module(module_name: &str, ws_url: &str) -> Result<(), RunnerErr runtime::run_js_module(&entry_url, &http_base, ws_url, rest).await?; Ok(()) } + +/// Build the REST client with a reqwest retry policy that replays transport- +/// level send failures. +/// +/// The pooled keep-alive race: the ws-server can close an idle connection while +/// the slow `MainWorker` bootstrap runs, so the next `send()` fails with "error +/// sending request". reqwest's default `ProtocolNacks` policy does NOT cover +/// this -- it only retries h2 `REFUSED_STREAM` / h3 timeouts, and we build +/// reqwest without the `http2` feature, so it's a no-op for these h1 fetches. +/// So classify any transport error (a send that produced no response) as +/// retryable, scoped to the ws-server host, with no budget so the idempotent, +/// low-volume module GETs always get their retry. 
+#[expect( + clippy::single_call_fn, + clippy::result_large_err, + reason = "split out of run_module for readability; RunnerError::Common wraps a ~136 B BootstrapError" +)] +fn build_rest_client(http_base: &str) -> Result { + let host = reqwest::Url::parse(http_base) + .ok() + .and_then(|url| url.host_str().map(str::to_owned)) + .unwrap_or_default(); + let retry = reqwest::retry::for_host(host).no_budget().classify_fn(|req_rep| { + if req_rep.error().is_some() { + req_rep.retryable() + } else { + req_rep.success() + } + }); + let dur = Duration::from_secs(15); + let client = reqwest::Client::builder() + .connect_timeout(dur) + .timeout(dur) + .retry(retry) + .build()?; + Ok(et_rest_client::Client::new_with_client(http_base, client)) +} diff --git a/services/ws-web-runner/src/runtime.rs b/services/ws-web-runner/src/runtime.rs index f715a14..2df65e3 100644 --- a/services/ws-web-runner/src/runtime.rs +++ b/services/ws-web-runner/src/runtime.rs @@ -84,22 +84,13 @@ impl deno_core::ModuleLoader for ServerModuleLoader { let client = self.rest.client().clone(); ModuleLoadResponse::Async( async move { - // Retry transport-level send failures. reqwest reuses pooled - // keep-alive connections; the server can close an idle one in - // the gap since the previous request (MainWorker bootstrap can - // outlast the server's keep-alive), so the first send() fails - // with "error sending request". A fresh attempt dials anew. 
- let mut attempt = 0u8; - let response = loop { - match client.get(url.as_str()).send().await { - Ok(response) => break response, - Err(e) if (e.is_request() || e.is_connect()) && attempt < 2 => { - attempt = attempt.saturating_add(1); - tracing::warn!(url = %url, attempt, error = %e, "module fetch send error; retrying"); - } - Err(e) => return Err(e).map_js_err(), - } - }; + // The keep-alive race (server closes a pooled idle connection + // while the slow MainWorker bootstrap runs, so `send()` fails + // with "error sending request") is handled by the reqwest retry + // policy configured on this client in `lib.rs::build_rest_client` + // -- a transport-error classifier, since reqwest's default + // `ProtocolNacks` policy doesn't cover the HTTP/1 case. + let response = client.get(url.as_str()).send().await.map_js_err()?; let body = response.error_for_status().map_js_err()?.text().await.map_js_err()?; let specifier = ModuleSpecifier::parse(url.as_str()).map_js_err()?; diff --git a/services/ws/Cargo.toml b/services/ws/Cargo.toml index f0192e0..1362e00 100644 --- a/services/ws/Cargo.toml +++ b/services/ws/Cargo.toml @@ -12,17 +12,24 @@ doctest = false actix-web.workspace = true actix-ws.workspace = true bytes.workspace = true +bytesize.workspace = true chrono.workspace = true edge-toolkit.workspace = true fs-err.workspace = true futures-util.workspace = true opentelemetry.workspace = true serde.workspace = true +serde-inline-default.workspace = true +serde_default.workspace = true serde_json.workspace = true serde_yaml.workspace = true tokio = { workspace = true, features = ["macros", "rt", "sync", "time"] } tracing.workspace = true uuid.workspace = true +[dev-dependencies] +serde-env.workspace = true +temp-env.workspace = true + [lints] workspace = true diff --git a/services/ws/src/lib.rs b/services/ws/src/lib.rs index b5758ff..af14ee4 100644 --- a/services/ws/src/lib.rs +++ b/services/ws/src/lib.rs @@ -12,13 +12,76 @@ use opentelemetry::{ global, trace::{Span, 
Tracer as _}, }; +use serde::Deserialize; +use serde_default::DefaultFromSerde; +use serde_inline_default::serde_inline_default; use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; use tracing::{error, info, warn}; use uuid::Uuid; -pub const CONNECTION_TIMEOUT: Duration = Duration::from_secs(15); +/// Default idle timeout before the hub closes a quiet connection. +pub const DEFAULT_CONNECTION_TIMEOUT: Duration = Duration::from_secs(15); pub const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(1); +/// Default max WebSocket frame size (64 MiB). +/// +/// Large binary payloads fanned out via default broadcast (e.g. tensors) +/// easily blow past actix-ws's 64 KiB default. Override via the +/// `WS_MAX_FRAME_SIZE` env var, as a human byte size (`serde-env` translates +/// `[ws] max_frame_size` to `WS_MAX_FRAME_SIZE`). +pub const DEFAULT_MAX_FRAME_SIZE: usize = 64 * 1024 * 1024; + +/// Runtime knobs for the WebSocket hub. Populated by `serde-env` in +/// `et-ws-server::main`, then handed to `configure`. +#[serde_inline_default] +#[derive(Clone, Debug, DefaultFromSerde, Deserialize)] +#[non_exhaustive] +pub struct WsConfig { + /// Largest single WebSocket frame the hub will accept. Frames above this + /// are dropped by actix-ws before they reach the handler, so callers + /// shipping big tensors / blobs need to raise it above their payload size. + /// `WS_MAX_FRAME_SIZE` takes a human byte size (e.g. `64MiB`, `64MB`, + /// `512KiB`) or a plain byte count; unset defaults to 64 MiB. + #[serde(default = "default_max_frame_size", deserialize_with = "deserialize_byte_size")] + pub max_frame_size: usize, + + /// Idle period before the hub closes a connection, as a humantime + /// duration (e.g. `15s`, `1m30s`). Unset defaults to 15s; + /// `none`/`off`/`disabled` turns the idle timeout off (the hub never closes + /// a connection for inactivity), which suits a frontend that sits idle. 
+ #[serde( + default = "default_connection_timeout", + deserialize_with = "edge_toolkit::config::deserialize_optional_humantime" + )] + pub connection_timeout: Option, +} + +const fn default_max_frame_size() -> usize { + DEFAULT_MAX_FRAME_SIZE +} + +/// Parse `WS_MAX_FRAME_SIZE` as a human byte size (e.g. `64MiB`, `64MB`, +/// `512KiB`) or a plain byte count, via `bytesize`. +fn deserialize_byte_size<'de, D>(deserializer: D) -> Result +where + D: serde::Deserializer<'de>, +{ + // `bytesize`'s own `Deserialize` parses the human size ("64MiB", "512KiB", + // a bare byte count); its `D::Error` cascades through `?`, no `.map_err`. + // `usize::try_from` only narrows on 32-bit hosts, where clamping a frame + // cap to `usize::MAX` is harmless. + let size = ::deserialize(deserializer)?; + Ok(usize::try_from(size.as_u64()).unwrap_or(usize::MAX)) +} + +#[expect( + clippy::unnecessary_wraps, + reason = "serde default fn must return the field type Option; the default is always Some" +)] +const fn default_connection_timeout() -> Option { + Some(DEFAULT_CONNECTION_TIMEOUT) +} + /// Outbound envelope written to an agent's websocket session. /// /// `Json` is the normal path for protocol messages. `Text` and `Binary` carry @@ -82,11 +145,19 @@ struct Connection { registry: WsAgentRegistry, session: Session, outbox: AgentSession, + /// Idle timeout for this connection, or `None` to never time out. 
+ idle_timeout: Option, } impl Connection { #[expect(clippy::single_call_fn, reason = "inherent constructor; used once by ws_handler")] - fn new(registry: WsAgentRegistry, client_ip: String, session: Session, outbox: AgentSession) -> Self { + fn new( + registry: WsAgentRegistry, + client_ip: String, + session: Session, + outbox: AgentSession, + idle_timeout: Option, + ) -> Self { info!("New WebSocket connection for client IP {}", client_ip); Self { agent_id: None, @@ -95,6 +166,7 @@ impl Connection { registry, session, outbox, + idle_timeout, } } @@ -119,6 +191,23 @@ impl Connection { (assigned_id, status) } + /// This connection's agent id, auto-registering one on first use. + /// + /// A client that never sends `et-connect` -- e.g. a frontend that speaks + /// only its own protocol -- still joins the hub relay: its first + /// unrecognised frame implicitly registers the session, so the frame is + /// broadcast to the other agents and this client then receives the + /// relayed replies. et-protocol agents send `et-connect` first, so they + /// are already assigned by the time they relay and this is a no-op. 
+ fn ensure_assigned_agent(&mut self) -> String { + if let Some(id) = self.assigned_agent_id() { + return id.to_string(); + } + let (id, _status) = self.assign_or_reconnect_agent(None); + info!("Auto-registered relay client {} as agent {id}", self.client_ip); + id + } + async fn send_json(&mut self, response: &ServerMessage) { match serde_json::to_string(response) { Ok(json) => { @@ -286,14 +375,8 @@ impl Connection { } AggregatedMessage::Binary(bytes) => { self.mark_activity(); - if let Some(from_agent_id) = self.assigned_agent_id().map(str::to_string) { - self.broadcast_raw_binary(&from_agent_id, &bytes); - } else { - warn!( - "Dropping binary frame from unassigned client {}: agent must connect first", - self.client_ip - ); - } + let from_agent_id = self.ensure_assigned_agent(); + self.broadcast_raw_binary(&from_agent_id, &bytes); } AggregatedMessage::Close(reason) => { self.mark_activity(); @@ -491,14 +574,8 @@ impl Connection { ); } ClientMessage::RelayText { content } => { - if let Some(from_agent_id) = self.assigned_agent_id().map(str::to_string) { - self.broadcast_raw_text(&from_agent_id, &content); - } else { - warn!( - "Dropping relay-text from unassigned client {}: agent must connect first", - self.client_ip - ); - } + let from_agent_id = self.ensure_assigned_agent(); + self.broadcast_raw_text(&from_agent_id, &content); } ClientMessage::RelayBinary { content } => { // A binary tungstenite frame is dispatched @@ -507,14 +584,8 @@ impl Connection { // `{"type":"et-relay-binary",...}` as a // text frame, honour it by relaying the // payload as a binary frame. 
- if let Some(from_agent_id) = self.assigned_agent_id().map(str::to_string) { - self.broadcast_raw_binary(&from_agent_id, &Bytes::from(content)); - } else { - warn!( - "Dropping relay-binary from unassigned client {}: agent must connect first", - self.client_ip - ); - } + let from_agent_id = self.ensure_assigned_agent(); + self.broadcast_raw_binary(&from_agent_id, &Bytes::from(content)); } }, } @@ -569,20 +640,22 @@ impl Connection { } } _ = heartbeat.tick() => { - let idle_for = Instant::now().saturating_duration_since(self.last_activity); - if idle_for > CONNECTION_TIMEOUT { - warn!( - "WebSocket connection timed out for client {} after {:?} of inactivity", - self.current_agent_id(), - idle_for - ); - let _closed: Result<(), actix_ws::Closed> = self.session.clone().close(Some(CloseReason { - code: CloseCode::Policy, - description: Some(format!( - "connection timed out after {CONNECTION_TIMEOUT:?} of inactivity" - )), - })).await; - break; + if let Some(timeout) = self.idle_timeout { + let idle_for = Instant::now().saturating_duration_since(self.last_activity); + if idle_for > timeout { + warn!( + "WebSocket connection timed out for client {} after {:?} of inactivity", + self.current_agent_id(), + idle_for + ); + let _closed: Result<(), actix_ws::Closed> = self.session.clone().close(Some(CloseReason { + code: CloseCode::Policy, + description: Some(format!( + "connection timed out after {timeout:?} of inactivity" + )), + })).await; + break; + } } } } @@ -608,6 +681,7 @@ pub async fn ws_handler( req: HttpRequest, body: web::Payload, registry: web::Data, + config: web::Data, ) -> Result { let tracer = global::tracer("ws-server"); let mut span = tracer.start("ws.connect"); @@ -623,10 +697,18 @@ pub async fn ws_handler( .unwrap_or_else(|| "unknown".to_string()); let (response, session, msg_stream) = actix_ws::handle(&req, body)?; - let stream = msg_stream.max_frame_size(64 * 1024).aggregate_continuations(); + let stream = msg_stream + 
.max_frame_size(config.max_frame_size) + .aggregate_continuations(); let (tx, rx) = mpsc::unbounded_channel::(); - let conn = Connection::new(registry.get_ref().clone(), client_ip, session, tx); + let conn = Connection::new( + registry.get_ref().clone(), + client_ip, + session, + tx, + config.connection_timeout, + ); let _join = actix_web::rt::spawn(async move { conn.run(stream, rx).await; @@ -636,6 +718,8 @@ pub async fn ws_handler( Ok(response) } -pub fn configure(cfg: &mut web::ServiceConfig) { - let _routed = cfg.route("/ws", web::get().to(ws_handler)); +pub fn configure(cfg: &mut web::ServiceConfig, config: &WsConfig) { + let _routed = cfg + .app_data(web::Data::new(config.clone())) + .route("/ws", web::get().to(ws_handler)); } diff --git a/services/ws/tests/config.rs b/services/ws/tests/config.rs new file mode 100644 index 0000000..30142d9 --- /dev/null +++ b/services/ws/tests/config.rs @@ -0,0 +1,66 @@ +//! `WS_MAX_FRAME_SIZE` deserialization, through the real `serde_env::from_env` +//! path the ws-server uses, with the process env set via `temp-env`. Verifies +//! the human-byte-size parsing (`64MiB`, plain byte counts) and the 64 MiB +//! default when the variable is absent. +#![cfg(test)] +#![expect( + clippy::expect_used, + clippy::decimal_literal_representation, + reason = "test code: expect panics carry context; byte sizes read clearer as decimal MiB math than hex" +)] + +use et_ws_service::WsConfig; +use serde::Deserialize; + +// Mirror the ws-server's nesting so the env key is `WS_MAX_FRAME_SIZE`. 
+#[derive(Debug, Deserialize)] +struct Wrapper { + #[serde(default)] + ws: WsConfig, +} + +fn load() -> WsConfig { + serde_env::from_env::().expect("parse WsConfig from env").ws +} + +#[test] +fn max_frame_size_absent_defaults_to_64_mib() { + temp_env::with_var_unset("WS_MAX_FRAME_SIZE", || { + assert_eq!(load().max_frame_size, 64 * 1024 * 1024); + }); +} + +#[test] +fn max_frame_size_parses_human_size() { + temp_env::with_var("WS_MAX_FRAME_SIZE", Some("32MiB"), || { + assert_eq!(load().max_frame_size, 32 * 1024 * 1024); + }); +} + +#[test] +fn max_frame_size_parses_plain_byte_count() { + temp_env::with_var("WS_MAX_FRAME_SIZE", Some("1048576"), || { + assert_eq!(load().max_frame_size, 1_048_576); + }); +} + +#[test] +fn connection_timeout_absent_defaults_to_15s() { + temp_env::with_var_unset("WS_CONNECTION_TIMEOUT", || { + assert_eq!(load().connection_timeout, Some(std::time::Duration::from_secs(15))); + }); +} + +#[test] +fn connection_timeout_parses_humantime() { + temp_env::with_var("WS_CONNECTION_TIMEOUT", Some("1m30s"), || { + assert_eq!(load().connection_timeout, Some(std::time::Duration::from_secs(90))); + }); +} + +#[test] +fn connection_timeout_none_sentinel_disables() { + temp_env::with_var("WS_CONNECTION_TIMEOUT", Some("none"), || { + assert_eq!(load().connection_timeout, None); + }); +} diff --git a/utilities/cli/tests/module_package_json.rs b/utilities/cli/tests/module_package_json.rs index 5fbb16d..721f55a 100644 --- a/utilities/cli/tests/module_package_json.rs +++ b/utilities/cli/tests/module_package_json.rs @@ -5,9 +5,8 @@ reason = "test code: setup failures and missing JSON fields should fail the test" )] -use std::fs; - use et_cli::generate_module_package_json; +use fs_err as fs; use serde_json::Value; use tempfile::tempdir; diff --git a/utilities/cli/tests/scenario_generation.rs b/utilities/cli/tests/scenario_generation.rs index 3fc4161..21e1229 100644 --- a/utilities/cli/tests/scenario_generation.rs +++ 
b/utilities/cli/tests/scenario_generation.rs @@ -5,11 +5,10 @@ reason = "test code: setup failures and missing JSON fields should fail the test" )] -use std::fs; - use et_cli::{ docker_image_module_paths, generate_deployment, module_package_json, regenerate_verification, scenario_module_paths, }; +use fs_err as fs; use tempfile::tempdir; #[test] diff --git a/utilities/int-gen/src/openapi.rs b/utilities/int-gen/src/openapi.rs index 6bfb650..048d01d 100644 --- a/utilities/int-gen/src/openapi.rs +++ b/utilities/int-gen/src/openapi.rs @@ -134,6 +134,7 @@ pub fn render_rust_client() -> Result { let ast = syn::parse2(tokens).expect("progenitor always emits valid Rust"); let body = prettyplease::unparse(&ast); let body = inject_wasm_baseurl_fallback(&body); + let body = inject_retry_exec(&body); // progenitor wraps each handler doc as a `/**...*/` block, immediately // appending its own `Sends a METHOD request to /path` line at +4 spaces; // any non-empty description from the `OpenAPI` spec then leaves that @@ -186,3 +187,77 @@ fn inject_wasm_baseurl_fallback(body: &str) -> String { ); body.replacen(needle, replacement, 1) } + +/// Give the generated client's `ClientHooks::exec` an exponential-backoff +/// retry loop (native targets only). +/// +/// progenitor's per-call flow is `pre()` -> `exec()` -> `post()`, and `exec`'s +/// default just calls `self.client().execute(request)` once. We override it so +/// every REST call (module discovery, asset fetch, per-agent storage) tolerates +/// a transient failure -- in particular a ws-server that isn't up yet at +/// startup -- by retrying with backoff. +/// +/// NOTE: we hand-inject this only because reqwest's own retry support +/// (`ClientBuilder::retries`, shipped in 0.12.23) has **no backoff yet** -- its +/// `tower::retry::Policy::retry` returns `std::future::Ready<()>` (retries fire +/// immediately) and the upstream `backoff` field is commented out behind a +/// `// TODO? backoff futures...`. 
When reqwest ships backoff, delete this hook +/// and configure `.retries()` on the `reqwest::Client` instead (see +/// ). The hook is +/// `#[cfg(not(wasm32))]` because `tokio::time::sleep` + `SystemTime` don't work +/// under `wasm32-unknown-unknown`; WASM consumers keep the default `exec`. +#[expect( + clippy::single_call_fn, + reason = "post-process step kept separate for readability; the named function documents intent" +)] +fn inject_retry_exec(body: &str) -> String { + // progenitor emits an empty `impl ClientHooks` that takes the trait + // defaults; we swap it for one carrying a custom `exec`. + let needle = "impl ClientHooks<()> for &Client {}"; + let replacement = r#"impl ClientHooks<()> for &Client { + // Injected by `utilities/int-gen` (inject_retry_exec): retry request + // execution with exponential backoff. reqwest's native retry has no + // backoff yet -- remove this and use `ClientBuilder::retries` once it does. + #[cfg(not(target_arch = "wasm32"))] + async fn exec( + &self, + request: ::reqwest::Request, + _info: &OperationInfo, + ) -> ::reqwest::Result<::reqwest::Response> { + use ::retry_policies::policies::ExponentialBackoff; + use ::retry_policies::{RetryDecision, RetryPolicy as _}; + let policy = ExponentialBackoff::builder() + .retry_bounds( + ::core::time::Duration::from_millis(250), + ::core::time::Duration::from_secs(5), + ) + .build_with_total_retry_duration(::core::time::Duration::from_secs(30)); + let started = ::std::time::SystemTime::now(); + let mut n_past_retries: u32 = 0; + loop { + // Retry only when the request can be replayed (no streaming body). 
+ let Some(attempt) = request.try_clone() else { + return self.client().execute(request).await; + }; + match self.client().execute(attempt).await { + Ok(response) => return Ok(response), + Err(err) => match policy.should_retry(started, n_past_retries) { + RetryDecision::Retry { execute_after } => { + let wait = execute_after + .duration_since(::std::time::SystemTime::now()) + .unwrap_or_default(); + ::tokio::time::sleep(wait).await; + n_past_retries = n_past_retries.saturating_add(1); + } + RetryDecision::DoNotRetry => return Err(err), + }, + } + } + } +}"#; + assert!( + body.contains(needle), + "progenitor's empty `impl ClientHooks` moved; update inject_retry_exec's needle" + ); + body.replacen(needle, replacement, 1) +}