From 57c3c482e3db9bac28d4012a523600193081b9d1 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 9 Jun 2026 05:57:17 +0200 Subject: [PATCH 1/3] Add freqresp! optimization benchmark Standalone benchmark comparing strategies for freqresp! on a 48-state SISO state-space system over 150 frequencies: - baseline / serial Hessenberg ldiv2! - threaded outer loop (Threads.@spawn, Polyester.@batch, OhMyThreads) - @turbo / @tturbo ldiv2! (real/imag split, since @turbo lacks complex support) - modal / pole-residue reformulation (the Reactant/XLA/GPU-friendly form) Findings: the per-frequency loop dominates over the factorization; @turbo gives ~1.5x, threading ~6-7x, and the modal reformulation 13-54x (at the cost of robustness, depending on cond(V)). @tturbo and Reactant are not worth it at this problem size. See benchmark_freqresp/README.md for full results. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmark_freqresp/Project.toml | 6 + benchmark_freqresp/README.md | 127 +++++++++ benchmark_freqresp/benchmark.jl | 109 ++++++++ benchmark_freqresp/variants.jl | 439 ++++++++++++++++++++++++++++++++ 4 files changed, 681 insertions(+) create mode 100644 benchmark_freqresp/Project.toml create mode 100644 benchmark_freqresp/README.md create mode 100644 benchmark_freqresp/benchmark.jl create mode 100644 benchmark_freqresp/variants.jl diff --git a/benchmark_freqresp/Project.toml b/benchmark_freqresp/Project.toml new file mode 100644 index 000000000..cb18dbe2f --- /dev/null +++ b/benchmark_freqresp/Project.toml @@ -0,0 +1,6 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +ControlSystemsBase = "aaaaaaaa-a6ca-5380-bf3e-84a91bcd477e" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" +Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" diff --git a/benchmark_freqresp/README.md b/benchmark_freqresp/README.md new file mode 100644 index 000000000..fe905bbd3 --- /dev/null +++ b/benchmark_freqresp/README.md @@ -0,0 +1,127 @@ +# `freqresp!` optimization benchmark + +Standalone benchmark of strategies for `freqresp!` on the target use case: +a **48-state SISO continuous state-space system over 150 frequencies**. + +Nothing in the package source is modified — all candidate implementations live in +`variants.jl`. The package's own `freqresp!` is the correctness oracle. + +## Run + +```bash +julia --project=. -e 'using Pkg; Pkg.develop(path="../lib/ControlSystemsBase"); \ + Pkg.add(["BenchmarkTools","LoopVectorization","Polyester","OhMyThreads"]); \ + Pkg.instantiate(); Pkg.precompile()' + +JULIA_EXCLUSIVE=1 julia -t auto --project=. benchmark.jl # threads fixed at startup +``` + +## Variants + +| ID | Strategy | +|-----|----------------------------------------------------------------------| +| V0 | `ControlSystemsBase.freqresp!` (oracle, full call incl. factorization) | +| V1 | serial hand-rolled loop, original `ldiv2!` (loop-only) | +| V2 | `Threads.@spawn` over contiguous frequency chunks | +| V2b | `Polyester.@batch` over chunks | +| V2c | `OhMyThreads.@tasks` with `TaskLocalValue` buffers | +| V3 | serial loop, `@turbo` `ldiv2!` (real/imag split) | +| V4 | `@spawn` + `@turbo` `ldiv2!` | +| V4b | `Polyester.@batch` + `@turbo` `ldiv2!` | + +All variants operate on pre-factored matrices (`A=F.H`, `C=complex.(sys.C*Q)`, +`B=Q\sys.B`, `D`) so the "loop-only" timing isolates the per-frequency strategy from the +one-time Hessenberg factorization. All pass the `isapprox(·, ·; rtol=1e-10)` correctness +gate against the package baseline. + +## The `@turbo` / complex caveat + +LoopVectorization's `@turbo` does **not** support `Complex`. A bare `@turbo` on the +complex `ldiv2!` loops will not compile, and a naive `reinterpret`-to-real is *silently +wrong* (complex multiply has re/im cross terms). `ldiv2_turbo!` therefore carries the +state as separate real/imag `Float64` arrays and writes out the complex arithmetic by +hand, so the two hot `j = 1:k-2` loops (X update, u update) become valid real `@turbo` +loops. Result matches the baseline to ~1e-14. + +## Results (this machine: 24 logical cores) + +Loop-only time, µs (lower is better): + +| Variant | -t 4 | -t 8 | -t 24 | +|--------------------|-------|-------|-------| +| V1 serial | 545 | 566 | 565 | +| V3 turbo | 360 | 372 | 372 | +| V2 threaded | 147 | 90 | 106 | +| V2b polyester | 203 | 108 | 95 | +| V2c omt | 154 | 96 | 127 | +| V4 threaded+turbo | **104** | **78** | 101 | +| V4b polyester+turbo| 142 | 80 | **93** | + +Full-call baseline (V0, factorization included): ~625–655 µs. + +## Findings + +1. **The per-frequency loop, not the factorization, dominates.** The serial loop is + ~565 µs while the one-time Hessenberg factorization is only ~85 µs (full call ≈ loop + + factorization ≈ 650 µs). So optimizing the loop is worthwhile. + +2. **`@turbo` is a solid, free win: ~1.5×** (565 → ~370 µs serial), more than expected for + such short loops, with no threads and negligible accuracy loss. It does require the + manual real/imag split. + +3. **Threading gives ~6–7×** but scales sub-linearly: at this size **8 threads beats 24** + (150 freqs / 24 threads ≈ 6 freqs each — task overhead and oversubscription dominate). + The sweet spot here is ~8 threads. + +4. **Best overall is threading + `@turbo`** (~78 µs at 8 threads, a **~7.2×** speedup over + serial). `@spawn`+`@turbo` (V4) is consistently near-best across thread counts; + `Polyester`+`@turbo` (V4b) edges ahead only at 24 threads. + +5. **Scheduler choice is secondary.** `@spawn`, `Polyester`, and `OhMyThreads` are within + ~1.4× of each other; `@spawn` is the most robust across thread counts and adds no + dependency. + +## Reactant.jl and the modal reformulation + +Reactant.jl traces Julia into MLIR/XLA and wants vectorized array programs. The custom +Givens-based `ldiv2!` (scalar indexing, data-dependent control flow, `givensAlgorithm`) +does **not** trace into efficient XLA. To use Reactant you must first rewrite the +computation as batched array ops — and the natural rewrite is the **modal / pole-residue** +form: eigendecompose `A = V Λ V⁻¹` once, then + +``` +R(ω) = D + Σⱼ resⱼ / (iω − λⱼ), resⱼ = (C·V)ⱼ · (V⁻¹·B)ⱼ +``` + +Every frequency is then a pure broadcast + reduction over an `nx × nfreq` grid — no +per-frequency linear solve. Benchmarked in plain Julia (variants M1/M2/M3): + +| Variant | µs (8 threads) | +|--------------------|----------------| +| M1 modal serial | 42 | +| M2 modal broadcast | 59 | +| M3 modal threaded | **10** | + +So the modal form alone is **13×** (serial) to **54×** (threaded) faster than the current +serial Hessenberg loop, and beats the best Hessenberg micro-optimization (threaded+`@turbo`, +~73 µs) by 7×. Accuracy matched the baseline to ~1e-14 here (`cond(V)=147`). + +**Verdict on Reactant for this problem:** not worth it. The data is tiny (48×150 ≈ 7200 +complex numbers) and the modal form already runs in ~10 µs on CPU — below the per-call XLA +dispatch / GPU kernel-launch latency (tens of µs). Reactant would add a heavy dependency +and long compile times for no gain at this size. It would only pay off at much larger scale +(thousands of states, very many frequencies, or batches of systems) or on GPU — and there +the same modal array program is what you would feed it. + +**Caveat:** the modal form trades robustness for speed. It relies on the eigendecomposition, +so accuracy degrades with `cond(V)` for non-normal / near-defective `A`. That conditioning +risk is exactly why the package defaults to the Hessenberg solve. Modal is an excellent +fast path for well-conditioned systems, not a safe drop-in default. + +## Recommendation + +For the package, the lowest-risk improvement is dropping in the real/imag-split `@turbo` +`ldiv2!` (≈1.5× for free, no new threading semantics). If the loop is a known hotspot, +add an opt-in threaded path (`@spawn` over chunks with per-task buffers) — combined with +`@turbo` it reaches ~7× — but keep the default serial, since threading helps only for +many frequencies and benefits from a bounded thread count. diff --git a/benchmark_freqresp/benchmark.jl b/benchmark_freqresp/benchmark.jl new file mode 100644 index 000000000..a1c7926bd --- /dev/null +++ b/benchmark_freqresp/benchmark.jl @@ -0,0 +1,109 @@ +# Benchmark freqresp! strategies for a 48-state SISO state-space system over 150 frequencies. +# +# Launch with multiple threads to exercise the threaded variants: +# julia -t auto --project=. benchmark.jl +# +# Variants (see variants.jl): +# V0 baseline ControlSystemsBase.freqresp! (correctness oracle, full call) +# V1 serial hand-rolled transcription, loop-only +# V2 threaded Threads.@spawn over chunks +# V2b polyester Polyester.@batch over chunks +# V2c omt OhMyThreads task-local buffers +# V3 turbo @turbo (real/imag split) ldiv2, serial +# V4 threaded+turbo @spawn + @turbo ldiv2 + +using ControlSystemsBase, BenchmarkTools, LinearAlgebra, Printf +import LoopVectorization + +include("variants.jl") + +const NTH = Threads.nthreads() +@info "Benchmark configuration" threads=NTH +NTH == 1 && @warn "Running with a single thread — threaded variant timings are meaningless. Relaunch with `julia -t auto`." + +# --- Target system & frequencies --- +G = ssrand(1, 1, 48) # 48-state SISO, continuous +w = exp10.(LinRange(-2, 4, 150)) # 150 log-spaced frequencies +ny, nu = size(G) +const T = ComplexF64 +make_R() = Array{T,3}(undef, ny, nu, length(w)) + +# --- Pre-factored matrices for the loop-only variants --- +F = hessenberg(G.A) +Q = Matrix(F.Q) +A = F.H +C = complex.(G.C * Q) +B = Q \ G.B +D = G.D + +# Modal / pole-residue precompute (one-time eigendecomposition) — the Reactant-friendly form +E = eigen(G.A) +λ = ComplexF64.(E.values) # nx eigenvalues +V = ComplexF64.(E.vectors) +ctil = vec(complex.(G.C) * V) # C·V (nx,) +btil = V \ ComplexF64.(G.B[:, 1]) # V⁻¹·B (nx,) +res = ctil .* btil # residues (nx,) +condV = cond(V) # conditioning of the modal transform + +# A wrapped "full call" variant that does the factorization inside (apples-to-apples w/ V0) +function freqresp_serial_full!(R, sys, w) + F = hessenberg(sys.A); Q = Matrix(F.Q) + freqresp_serial!(R, F.H, complex.(sys.C*Q), Q\sys.B, sys.D, w) +end + +# ---------------------------------------------------------------------------- +# Correctness: every variant must match the package baseline. +# ---------------------------------------------------------------------------- +Rref = make_R(); ControlSystemsBase.freqresp!(Rref, G, w) + +loop_variants = [ + ("V1 serial", (R)->freqresp_serial!(R, A, C, B, D, w)), + ("V2 threaded", (R)->freqresp_threaded!(R, A, C, B, D, w)), + ("V2b polyester", (R)->freqresp_polyester!(R, A, C, B, D, w)), + ("V2c omt", (R)->freqresp_omt!(R, A, C, B, D, w)), + ("V3 turbo", (R)->freqresp_turbo!(R, A, C, B, D, w)), + ("V3t tturbo", (R)->freqresp_tturbo!(R, A, C, B, D, w)), + ("V4 threaded+turbo", (R)->freqresp_threaded_turbo!(R, A, C, B, D, w)), + ("V4b polyester+turbo",(R)->freqresp_polyester_turbo!(R, A, C, B, D, w)), + ("M1 modal", (R)->freqresp_modal!(R, λ, res, D, w)), + ("M2 modal broadcast", (R)->freqresp_modal_broadcast!(R, λ, res, D, w)), + ("M3 modal threaded", (R)->freqresp_modal_threaded!(R, λ, res, D, w)), +] +@info "Modal transform conditioning" condV + +println("\n=== Correctness (rtol=1e-10 vs ControlSystemsBase.freqresp!) ===") +for (name, f) in loop_variants + R = make_R(); f(R) + relerr = maximum(abs, R .- Rref) / maximum(abs, Rref) + ok = relerr < 1e-8 + @printf(" %-20s %s relerr=%.3e\n", name, ok ? "OK " : "LOOSE", relerr) +end + +# ---------------------------------------------------------------------------- +# Timing +# ---------------------------------------------------------------------------- +Rb = make_R() + +println("\n=== Loop-only timing (pre-factored matrices passed in) ===") +results = Tuple{String,Float64}[] +for (name, f) in loop_variants + t = @belapsed $f($Rb) + push!(results, (name, t)) + @printf(" %-20s %8.2f µs\n", name, t*1e6) +end + +println("\n=== Full-call timing (factorization included, as real freqresp usage) ===") +tb = @belapsed ControlSystemsBase.freqresp!($Rb, $G, $w) +@printf(" %-20s %8.2f µs\n", "V0 baseline", tb*1e6) +tf = @belapsed freqresp_serial_full!($Rb, $G, $w) +@printf(" %-20s %8.2f µs\n", "V1 serial (full)", tf*1e6) + +# --- Summary table sorted by loop-only time --- +println("\n=== Summary (loop-only, fastest first) ===") +sort!(results, by = x -> x[2]) +fastest = results[1][2] +for (name, t) in results + @printf(" %-20s %8.2f µs (%.2fx vs fastest)\n", name, t*1e6, t/fastest) +end +@printf("\n Full call baseline (V0): %.2f µs — factorization dominates at this size.\n", tb*1e6) +println(" Threads used: $NTH") diff --git a/benchmark_freqresp/variants.jl b/benchmark_freqresp/variants.jl new file mode 100644 index 000000000..b63b3ad29 --- /dev/null +++ b/benchmark_freqresp/variants.jl @@ -0,0 +1,439 @@ +# Candidate freqresp! implementations for benchmarking. +# +# Internals `_freq` and `ldiv2!` are unexported in ControlSystemsBase, so the modified +# variants copy them here (verbatim for the serial baseline, real/imag-split for @turbo) +# so they can be mutated freely without touching the package source. +# +# The benchmark target system is a CONTINUOUS state-space system, so the frequency map is +# `_freqc(w) = complex(0, w)` throughout. (Discrete would use `cis(w*Ts)`.) + +using LinearAlgebra +using LinearAlgebra: UpperHessenberg, givensAlgorithm +import LoopVectorization # provides @turbo +using LoopVectorization: @turbo, @tturbo +import Polyester # provides @batch +import OhMyThreads + +_freqc(w) = complex(0.0, w) + +# --------------------------------------------------------------------------- +# Copied verbatim from lib/ControlSystemsBase/src/freqresp.jl:168-206 +# (renamed to keep it independent of the package's own definition) +# --------------------------------------------------------------------------- +function ldiv2_orig!(u, cs, F::UpperHessenberg, B::AbstractVecOrMat; shift::Number=false) + LinearAlgebra.checksquare(F) + m = size(F,1) + m != size(B,1) && throw(DimensionMismatch("wrong right-hand-side # rows != $m")) + LinearAlgebra.require_one_based_indexing(B) + n = size(B,2) + H = F.data + μ = shift + copyto!(u, 1, H, m*(m-1)+1, m) # u .= H[:,m] + u[m] += μ + X = B + @inbounds for k = m:-1:2 + c, s, ρ = givensAlgorithm(u[k], H[k,k-1]) + cs[k] = (c, s) + for i = 1:n + X[k,i] /= ρ + t₁ = s * X[k,i]; t₂ = c * X[k,i] + @simd for j = 1:k-2 + X[j,i] -= u[j]*t₂ + H[j,k-1]*t₁ + end + X[k-1,i] -= u[k-1]*t₂ + (H[k-1,k-1] + μ) * t₁ + end + @simd for j = 1:k-2 + u[j] = H[j,k-1]*c - u[j]*s' + end + u[k-1] = (H[k-1,k-1] + μ) * c - u[k-1]*s' + end + for i = 1:n + τ₁ = X[1,i] / u[1] + @inbounds for j = 2:m + τ₂ = X[j,i] + c, s = cs[j] + X[j-1,i] = c*τ₁ + s*τ₂ + τ₁ = c*τ₂ - s'τ₁ + end + X[m,i] = τ₁ + end + return X +end + +# --------------------------------------------------------------------------- +# Real/imag-split rewrite enabling @turbo on the two hot loops. +# @turbo does NOT support Complex, so all complex arithmetic is written out by hand +# over separate real/imag Float64 arrays. H is real already; givens `c` is real, `s`/`ρ` +# complex. The two loops over `j = 1:k-2` (X update and u update) carry no cross-iteration +# dependency, so they are valid @turbo targets. +# u, X stored split as (ur,ui) and (Xr,Xi); cs stores (c::Real, s::Complex). +# --------------------------------------------------------------------------- +function ldiv2_turbo!(ur, ui, cs, H::AbstractMatrix{<:Real}, Xr, Xi, μ::Complex) + m = size(H, 1) + n = size(Xr, 2) + μr = real(μ); μi = imag(μ) + @inbounds for j in 1:m # u .= H[:,m]; then u[m] += μ + ur[j] = H[j,m]; ui[j] = 0.0 + end + ur[m] += μr; ui[m] += μi + @inbounds for k = m:-1:2 + c, s, ρ = givensAlgorithm(complex(ur[k], ui[k]), H[k,k-1]) + cs[k] = (c, s) + sr = real(s); si = imag(s) + ρr = real(ρ); ρi = imag(ρ) + ρd = ρr*ρr + ρi*ρi + hr = H[k-1,k-1] + μr; hi = μi # (H[k-1,k-1] + μ) + for i = 1:n + xr = Xr[k,i]; xi = Xi[k,i] # X[k,i] /= ρ + nxr = (xr*ρr + xi*ρi)/ρd + nxi = (xi*ρr - xr*ρi)/ρd + Xr[k,i] = nxr; Xi[k,i] = nxi + t1r = sr*nxr - si*nxi; t1i = sr*nxi + si*nxr # t₁ = s*X[k,i] + t2r = c*nxr; t2i = c*nxi # t₂ = c*X[k,i] + @turbo for j = 1:k-2 # X[j,i] -= u[j]*t₂ + H[j,k-1]*t₁ + Xr[j,i] -= ur[j]*t2r - ui[j]*t2i + H[j,k-1]*t1r + Xi[j,i] -= ur[j]*t2i + ui[j]*t2r + H[j,k-1]*t1i + end + # X[k-1,i] -= u[k-1]*t₂ + (H[k-1,k-1]+μ)*t₁ + Xr[k-1,i] -= ur[k-1]*t2r - ui[k-1]*t2i + (hr*t1r - hi*t1i) + Xi[k-1,i] -= ur[k-1]*t2i + ui[k-1]*t2r + (hr*t1i + hi*t1r) + end + @turbo for j = 1:k-2 # u[j] = H[j,k-1]*c - u[j]*s' (s' = conj(s)) + urj = ur[j]; uij = ui[j] + ur[j] = H[j,k-1]*c - (urj*sr + uij*si) + ui[j] = -(uij*sr - urj*si) + end + urk = ur[k-1]; uik = ui[k-1] # u[k-1] = (H[k-1,k-1]+μ)*c - u[k-1]*s' + ur[k-1] = hr*c - (urk*sr + uik*si) + ui[k-1] = hi*c - (uik*sr - urk*si) + end + @inbounds for i = 1:n # back substitution + u1r = ur[1]; u1i = ui[1] + d = u1r*u1r + u1i*u1i + x1r = Xr[1,i]; x1i = Xi[1,i] + τ1r = (x1r*u1r + x1i*u1i)/d # τ₁ = X[1,i]/u[1] + τ1i = (x1i*u1r - x1r*u1i)/d + for j = 2:m + τ2r = Xr[j,i]; τ2i = Xi[j,i] + c, s = cs[j]; sr = real(s); si = imag(s) + Xr[j-1,i] = c*τ1r + (sr*τ2r - si*τ2i) # X[j-1,i] = c*τ₁ + s*τ₂ + Xi[j-1,i] = c*τ1i + (sr*τ2i + si*τ2r) + nτ1r = c*τ2r - (sr*τ1r + si*τ1i) # τ₁ = c*τ₂ - s'τ₁ + nτ1i = c*τ2i - (sr*τ1i - si*τ1r) + τ1r = nτ1r; τ1i = nτ1i + end + Xr[m,i] = τ1r; Xi[m,i] = τ1i + end + return +end + +# --------------------------------------------------------------------------- +# Same as ldiv2_turbo! but with @tturbo (LoopVectorization's threaded @turbo) on the +# two hot j-loops. Expected to be slower here: the loops are only k-2 <= 46 long and +# sit inside a sequentially-dependent outer k loop, so per-loop thread dispatch overhead +# dominates. Measured for completeness. +# --------------------------------------------------------------------------- +function ldiv2_tturbo!(ur, ui, cs, H::AbstractMatrix{<:Real}, Xr, Xi, μ::Complex) + m = size(H, 1) + n = size(Xr, 2) + μr = real(μ); μi = imag(μ) + @inbounds for j in 1:m + ur[j] = H[j,m]; ui[j] = 0.0 + end + ur[m] += μr; ui[m] += μi + @inbounds for k = m:-1:2 + c, s, ρ = givensAlgorithm(complex(ur[k], ui[k]), H[k,k-1]) + cs[k] = (c, s) + sr = real(s); si = imag(s) + ρr = real(ρ); ρi = imag(ρ) + ρd = ρr*ρr + ρi*ρi + hr = H[k-1,k-1] + μr; hi = μi + for i = 1:n + xr = Xr[k,i]; xi = Xi[k,i] + nxr = (xr*ρr + xi*ρi)/ρd + nxi = (xi*ρr - xr*ρi)/ρd + Xr[k,i] = nxr; Xi[k,i] = nxi + t1r = sr*nxr - si*nxi; t1i = sr*nxi + si*nxr + t2r = c*nxr; t2i = c*nxi + @tturbo for j = 1:k-2 + Xr[j,i] -= ur[j]*t2r - ui[j]*t2i + H[j,k-1]*t1r + Xi[j,i] -= ur[j]*t2i + ui[j]*t2r + H[j,k-1]*t1i + end + Xr[k-1,i] -= ur[k-1]*t2r - ui[k-1]*t2i + (hr*t1r - hi*t1i) + Xi[k-1,i] -= ur[k-1]*t2i + ui[k-1]*t2r + (hr*t1i + hi*t1r) + end + @tturbo for j = 1:k-2 + urj = ur[j]; uij = ui[j] + ur[j] = H[j,k-1]*c - (urj*sr + uij*si) + ui[j] = -(uij*sr - urj*si) + end + urk = ur[k-1]; uik = ui[k-1] + ur[k-1] = hr*c - (urk*sr + uik*si) + ui[k-1] = hi*c - (uik*sr - urk*si) + end + @inbounds for i = 1:n + u1r = ur[1]; u1i = ui[1] + d = u1r*u1r + u1i*u1i + x1r = Xr[1,i]; x1i = Xi[1,i] + τ1r = (x1r*u1r + x1i*u1i)/d + τ1i = (x1i*u1r - x1r*u1i)/d + for j = 2:m + τ2r = Xr[j,i]; τ2i = Xi[j,i] + c, s = cs[j]; sr = real(s); si = imag(s) + Xr[j-1,i] = c*τ1r + (sr*τ2r - si*τ2i) + Xi[j-1,i] = c*τ1i + (sr*τ2i + si*τ2r) + nτ1r = c*τ2r - (sr*τ1r + si*τ1i) + nτ1i = c*τ2i - (sr*τ1i - si*τ1r) + τ1r = nτ1r; τ1i = nτ1i + end + Xr[m,i] = τ1r; Xi[m,i] = τ1i + end + return +end + +@inline function _solve_tturbo!(Bc, Xr, Xi, ur, ui, cs, H, Br, Bi, μ) + copyto!(Xr, Br); copyto!(Xi, Bi) + ldiv2_tturbo!(ur, ui, cs, H, Xr, Xi, μ) + @inbounds @simd for j in eachindex(Bc) + Bc[j] = complex(Xr[j], Xi[j]) + end + return Bc +end + +# =========================================================================== +# Variants. All operate on PRE-FACTORED matrices (A=F.H, C=complex(sys.C*Q), +# B=Q\sys.B, D) so the timed region isolates the per-frequency loop strategy. +# `R` is (ny, nu, nw) ComplexF64. Continuous-time frequency map assumed. +# =========================================================================== + +# --------------------------------------------------------------------------- +# Modal / pole-residue formulation. This is the algorithm a Reactant.jl / XLA / GPU +# approach would actually compile: eigendecompose A = V Λ V⁻¹ ONCE, then every frequency +# is a pure elementwise broadcast + reduction (no per-frequency linear solve): +# R(ω) = D - Σⱼ resⱼ/(λⱼ + iω), resⱼ = (C·V)ⱼ · (V⁻¹·B)ⱼ +# Faster but less numerically robust than the Hessenberg solve for non-normal/defective A +# (accuracy depends on cond(V)). Inputs: λ (nx complex eigenvalues), res (nx residues), D. +# --------------------------------------------------------------------------- +function freqresp_modal!(R, λ, res, D, w) + nx = length(λ) + d = D[1, 1] + @inbounds for i in eachindex(w) + s = complex(0.0, w[i]) + acc = zero(eltype(R)) + @simd for j in 1:nx + acc += res[j] / (s - λ[j]) + end + R[1, 1, i] = d + acc + end + R +end + +# Fully-broadcast modal form (allocates the nx×nfreq grid) — closest to what a Reactant +# trace produces; representative of the XLA-friendly array program. +function freqresp_modal_broadcast!(R, λ, res, D, w) + s = reshape(complex.(0.0, w), 1, :) # 1 × nfreq + grid = res ./ (s .- λ) # nx × nfreq + R[1, 1, :] .= D[1, 1] .+ vec(sum(grid; dims = 1)) + R +end + +# Threaded modal loop +function freqresp_modal_threaded!(R, λ, res, D, w) + nx = length(λ) + d = D[1, 1] + Threads.@threads for i in eachindex(w) + s = complex(0.0, w[i]) + acc = zero(eltype(R)) + @inbounds @simd for j in 1:nx + acc += res[j] / (s - λ[j]) + end + @inbounds R[1, 1, i] = d + acc + end + R +end + +# V1: serial, hand-rolled transcription of the package loop (sanity vs V0 baseline) +function freqresp_serial!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + Bc = Vector{T}(undef, nx) + u = Vector{T}(undef, nx) + cs = Vector{Tuple{real(T),T}}(undef, nx) + @inbounds for i in eachindex(w) + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + copyto!(Bc, B) + ldiv2_orig!(u, cs, A, Bc; shift = -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + R +end + +# V2: threaded outer loop with Threads.@spawn over contiguous chunks; per-chunk buffers +function freqresp_threaded!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + nch = max(1, Threads.nthreads()) + chunks = Iterators.partition(eachindex(w), cld(length(w), nch)) + @sync for ch in chunks + Threads.@spawn begin + Bc = Vector{T}(undef, nx) + u = Vector{T}(undef, nx) + cs = Vector{Tuple{real(T),T}}(undef, nx) + @inbounds for i in ch + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + copyto!(Bc, B) + ldiv2_orig!(u, cs, A, Bc; shift = -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + end + end + R +end + +# V2b: Polyester.@batch threading (low spawn overhead) over contiguous chunks +function freqresp_polyester!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + nch = max(1, Threads.nthreads()) + chunks = collect(Iterators.partition(eachindex(w), cld(length(w), nch))) + Polyester.@batch for ci in eachindex(chunks) + ch = chunks[ci] + Bc = Vector{T}(undef, nx) + u = Vector{T}(undef, nx) + cs = Vector{Tuple{real(T),T}}(undef, nx) + @inbounds for i in ch + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + copyto!(Bc, B) + ldiv2_orig!(u, cs, A, Bc; shift = -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + end + R +end + +# V2c: OhMyThreads with task-local buffers +function freqresp_omt!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + tls_Bc = OhMyThreads.TaskLocalValue{Vector{T}}(() -> Vector{T}(undef, nx)) + tls_u = OhMyThreads.TaskLocalValue{Vector{T}}(() -> Vector{T}(undef, nx)) + tls_cs = OhMyThreads.TaskLocalValue{Vector{Tuple{real(T),T}}}(() -> Vector{Tuple{real(T),T}}(undef, nx)) + OhMyThreads.@tasks for i in eachindex(w) + Bc = tls_Bc[]; u = tls_u[]; cs = tls_cs[] + Ri = @view R[:, :, i] + copyto!(Ri, D) + if !isinf(w[i]) + copyto!(Bc, B) + ldiv2_orig!(u, cs, A, Bc; shift = -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + end + R +end + +# Helper: solve one frequency with the split @turbo ldiv2, writing complex result into Bc +@inline function _solve_turbo!(Bc, Xr, Xi, ur, ui, cs, H, Br, Bi, μ) + copyto!(Xr, Br); copyto!(Xi, Bi) + ldiv2_turbo!(ur, ui, cs, H, Xr, Xi, μ) + @inbounds @simd for j in eachindex(Bc) + Bc[j] = complex(Xr[j], Xi[j]) + end + return Bc +end + +# V3: serial loop using the @turbo (real/imag-split) ldiv2 +function freqresp_turbo!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + H = A.data + Br = real.(B); Bi = imag.(B) + Bc = Vector{T}(undef, nx) + Xr = Matrix{Float64}(undef, nx, 1); Xi = Matrix{Float64}(undef, nx, 1) + ur = Vector{Float64}(undef, nx); ui = Vector{Float64}(undef, nx) + cs = Vector{Tuple{Float64,ComplexF64}}(undef, nx) + @inbounds for i in eachindex(w) + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + _solve_turbo!(Bc, Xr, Xi, ur, ui, cs, H, Br, Bi, -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + R +end + +# V3t: serial outer loop, @tturbo (intra-loop threaded) ldiv2 +function freqresp_tturbo!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + H = A.data + Br = real.(B); Bi = imag.(B) + Bc = Vector{T}(undef, nx) + Xr = Matrix{Float64}(undef, nx, 1); Xi = Matrix{Float64}(undef, nx, 1) + ur = Vector{Float64}(undef, nx); ui = Vector{Float64}(undef, nx) + cs = Vector{Tuple{Float64,ComplexF64}}(undef, nx) + @inbounds for i in eachindex(w) + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + _solve_tturbo!(Bc, Xr, Xi, ur, ui, cs, H, Br, Bi, -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + R +end + +# V4b: Polyester.@batch + @turbo ldiv2 (combination of the two best individual strategies) +function freqresp_polyester_turbo!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + H = A.data + Br = real.(B); Bi = imag.(B) + nch = max(1, Threads.nthreads()) + chunks = collect(Iterators.partition(eachindex(w), cld(length(w), nch))) + Polyester.@batch for ci in eachindex(chunks) + ch = chunks[ci] + Bc = Vector{T}(undef, nx) + Xr = Matrix{Float64}(undef, nx, 1); Xi = Matrix{Float64}(undef, nx, 1) + ur = Vector{Float64}(undef, nx); ui = Vector{Float64}(undef, nx) + cs = Vector{Tuple{Float64,ComplexF64}}(undef, nx) + @inbounds for i in ch + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + _solve_turbo!(Bc, Xr, Xi, ur, ui, cs, H, Br, Bi, -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + end + R +end + +# V4: threaded (@spawn) + @turbo ldiv2 +function freqresp_threaded_turbo!(R, A, C, B, D, w) + nx = size(A, 1) + T = eltype(R) + H = A.data + Br = real.(B); Bi = imag.(B) + nch = max(1, Threads.nthreads()) + chunks = Iterators.partition(eachindex(w), cld(length(w), nch)) + @sync for ch in chunks + Threads.@spawn begin + Bc = Vector{T}(undef, nx) + Xr = Matrix{Float64}(undef, nx, 1); Xi = Matrix{Float64}(undef, nx, 1) + ur = Vector{Float64}(undef, nx); ui = Vector{Float64}(undef, nx) + cs = Vector{Tuple{Float64,ComplexF64}}(undef, nx) + @inbounds for i in ch + Ri = @view R[:, :, i] + copyto!(Ri, D) + isinf(w[i]) && continue + _solve_turbo!(Bc, Xr, Xi, ur, ui, cs, H, Br, Bi, -_freqc(w[i])) + mul!(Ri, C, Bc, -1, 1) + end + end + end + R +end From 8d9f0fdc450a6ab2c7215d8c0c6e5db9a964f57c Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 9 Jun 2026 06:01:22 +0200 Subject: [PATCH 2/3] Document why the ldiv2 back-substitution loop is not @turbo-able Verified empirically: @turbo on the back-substitution loop trips LoopVectorization check_args and silently falls back to a scalar loop. The inner j-loop is a sequential recurrence (scan) and the independent column axis is length 1 for SISO, so no vectorization is possible. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmark_freqresp/variants.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmark_freqresp/variants.jl b/benchmark_freqresp/variants.jl index b63b3ad29..af7819987 100644 --- a/benchmark_freqresp/variants.jl +++ b/benchmark_freqresp/variants.jl @@ -107,6 +107,11 @@ function ldiv2_turbo!(ur, ui, cs, H::AbstractMatrix{<:Real}, Xr, Xi, μ::Complex ur[k-1] = hr*c - (urk*sr + uik*si) ui[k-1] = hi*c - (uik*sr - urk*si) end + # NOTE: this back-substitution loop is NOT @turbo-able. The inner j-loop is a + # sequential recurrence (τ₁ carried across iterations — a scan), which @turbo cannot + # vectorize, and the only independent axis (columns i) is length 1 for SISO. Annotating + # it with @turbo merely trips check_args and falls back to a plain @inbounds @fastmath + # loop (no actual vectorization). @inbounds for i = 1:n # back substitution u1r = ur[1]; u1i = ui[1] d = u1r*u1r + u1i*u1i From 76e983202cad4b618ddd14af4a1a59189d6ee597 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 9 Jun 2026 06:05:20 +0200 Subject: [PATCH 3/3] Document modal-form accuracy limits with empirical evidence Add modal_accuracy.jl demonstrating, against a BigFloat reference, that the modal/pole-residue form's accuracy is governed by cond(V): it fails for defective A (repeated poles -> relerr 1.0) and near-defective/non-normal A (clustered eigenvalues -> relerr ~1e3), while the Hessenberg solve stays at machine precision. Evaluating near a pole is not a modal-specific weakness. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmark_freqresp/README.md | 42 +++++++++++++++++++++++--- benchmark_freqresp/modal_accuracy.jl | 45 ++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 benchmark_freqresp/modal_accuracy.jl diff --git a/benchmark_freqresp/README.md b/benchmark_freqresp/README.md index fe905bbd3..cfb91b863 100644 --- a/benchmark_freqresp/README.md +++ b/benchmark_freqresp/README.md @@ -113,10 +113,44 @@ and long compile times for no gain at this size. It would only pay off at much l (thousands of states, very many frequencies, or batches of systems) or on GPU — and there the same modal array program is what you would feed it. -**Caveat:** the modal form trades robustness for speed. It relies on the eigendecomposition, -so accuracy degrades with `cond(V)` for non-normal / near-defective `A`. That conditioning -risk is exactly why the package defaults to the Hessenberg solve. Modal is an excellent -fast path for well-conditioned systems, not a safe drop-in default. +### When the modal form degrades or fails + +The modal form's accuracy is governed by `cond(V)` (the conditioning of the eigenvector +matrix). Checked against a 256-bit `BigFloat` resolvent reference: + +| System | cond(V) | Hessenberg relerr | modal relerr | +|-------------------------------------|---------|-------------------|--------------| +| random (near-normal), nx=20 | 2e1 | 2e-15 | 8e-15 ✓ | +| 8 repeated poles at −1 | 1.5e110 | 1e-15 | **1.0** ✗ | +| 12 repeated poles at −1 | 7.6e172 | 2e-15 | **1.0** ✗ | +| clustered eigenvalues (non-normal) | 1e20 | 8e-16 | **8.7e2** ✗ | +| lightly damped, eval across pole | 1e0 | 6e-15 | 2e-14 ✓ | + +Failure conditions, in order of severity: + +1. **Defective (non-diagonalizable) `A`** — a repeated eigenvalue with a non-trivial Jordan + block. `A = VΛV⁻¹` does not exist (`V` singular), and the simple-pole form cannot + represent the response (needs `rⱼ/(s−λ)ᵏ` terms). Total failure (relerr ≈ 1). +2. **Near-defective / strongly non-normal `A`** — diagonalizable but `cond(V)` huge. + Forming `V⁻¹B` and the residue sum loses ≈ `log₁₀(cond(V))` digits with catastrophic + cancellation; error scales with `cond(V)`. +3. **Clustered eigenvalues** — nearly-parallel eigenvectors → ill-conditioned `V` (a milder + case 2). + +These are common in control (repeated integrators, cascaded identical lags, +Butterworth/Bessel clustered poles). Note that evaluating *near a pole* (`iω ≈ λⱼ`) is **not** +a modal-specific weakness — the response genuinely blows up there for every method; with a +well-conditioned `V` the modal form is as accurate as Hessenberg across the resonance. + +**Why the Hessenberg solve is immune:** it computes `(iωI − A)⁻¹B` via a backward-stable +linear solve whose accuracy depends on `cond(iωI − A)` (benign except genuinely at a pole), +**not** on `cond(V)`, and it never diagonalizes so it handles defective `A` fine. That +robustness is why the package defaults to it. + +**Caveat / rule of thumb:** the modal fast path is safe only when `A` is near-normal with +simple, well-separated poles. Guard it by checking `cond(V)` — beyond ~`1/√eps ≈ 1e8` half +the digits are gone, and it should fall back to the Hessenberg solve. It is an excellent +opt-in fast path for well-conditioned systems, not a safe drop-in default. ## Recommendation diff --git a/benchmark_freqresp/modal_accuracy.jl b/benchmark_freqresp/modal_accuracy.jl new file mode 100644 index 000000000..38f78a6ea --- /dev/null +++ b/benchmark_freqresp/modal_accuracy.jl @@ -0,0 +1,45 @@ +using ControlSystemsBase, LinearAlgebra, Printf + +# Modal / pole-residue evaluation (Float64), returns response + cond(V) +function modal_eval(A,B,C,D,w) + E = eigen(A) + λ = ComplexF64.(E.values); V = ComplexF64.(E.vectors) + ctil = vec(complex.(C)*V); btil = V \ ComplexF64.(B[:,1]) + res = ctil .* btil + R = ComplexF64[ D[1,1] + sum(res ./ (complex(0.0,ω) .- λ)) for ω in w ] + R, cond(V) +end + +# High-precision reference: resolvent in Complex{BigFloat} +function ref_eval(A,B,C,D,w) + setprecision(256) do + Ab=Complex{BigFloat}.(A); Bb=Complex{BigFloat}.(B[:,1]) + Cb=Complex{BigFloat}.(C); d=Complex{BigFloat}(D[1,1]) + [ (Cb*((complex(BigFloat(0),BigFloat(ω))*I - Ab)\Bb))[1] + d for ω in w ] + end +end + +relerr(R, Rref) = maximum(abs.(ComplexF64.(R) .- ComplexF64.(Rref))) / maximum(abs, ComplexF64.(Rref)) + +function report(name, sys, w) + A,B,C,D = sys.A, sys.B, sys.C, sys.D + Rref = ref_eval(A,B,C,D,w) + Rh = vec(freqresp(sys, w)) # package Hessenberg path + Rm, κ = modal_eval(A,B,C,D,w) + @printf("%-32s nx=%2d cond(V)=%.2e Hessenberg relerr=%.2e modal relerr=%.2e\n", + name, sys.nx, κ, relerr(Rh,Rref), relerr(Rm,Rref)) +end + +w = exp10.(LinRange(-2, 2, 60)) + +report("random (normal-ish)", ssrand(1,1,20), w) +report("8 repeated poles at -1", ss(tf(1.0,[1.0,1.0]))^8, w) +report("12 repeated poles at -1", ss(tf(1.0,[1.0,1.0]))^12, w) +# clustered (non-defective but near-defective) eigenvalues via upper-triangular A +let n=20 + A = triu(0.1 .* randn(n,n)); A[diagind(A)] .= -1.0 .- 0.001 .* (1:n) # tightly clustered evals + B = randn(n,1); C = randn(1,n); D = zeros(1,1) + report("clustered eigenvalues (triu)", ss(A,B,C,D), w) +end +# lightly damped pole near imaginary axis, evaluate across it +report("lightly damped (zeta=1e-4)", ss(tf(1.0,[1.0, 2e-4, 1.0])), exp10.(LinRange(-1,1,200)))