From fcdc69106945cfd898ed7c30fb1772d525e84889 Mon Sep 17 00:00:00 2001 From: davidshukhin Date: Fri, 12 Jun 2026 15:04:20 -0400 Subject: [PATCH] feat(fpga+arm): expose per-event flow to ARM via AXI4-Lite FLOW bundle (ENTR-31) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the last open item of SYSTEM_ANALYSIS.md: the ARM read flow vectors from a register space the FPGA never wrote (and would have indexed past the 128-word sim register file). - fpga/top_level.cpp: FLOW_EXPORT loop in COMPUTE_EVASION packs the first 1024 (pos, flow) pairs — x/y UQ4.12, vx/vy Q8.8 — into flow_out[2048] on a new s_axilite FLOW bundle, plus flow_count and a seqlock flow_seq written last for torn-read detection - arm/fpga_interface.h (new): FpgaInterface moved out of drone_main.cpp; read_flow_vectors() decodes the FLOW window with seqlock retry; stale REG_FLOW_PRED_BASE float-memcpy scheme deleted; sim register file sized to the full window; pack helpers shared with tests - fpga/hls_compat.h: Vitis-compatible range(hi, lo) on ap_fixed/ap_ufixed - fpga/build.tcl: FLOW bundle directives - tests: testbench test_flow_export (count/seq/bit-exact-vs-DEBUG/decode ranges/generation advance); ARM register round-trip + end-to-end TTC from register-decoded flow + clamping edge cases - docs: fpga/README.md FLOW register map + read protocol; SYSTEM_ANALYSIS item 10 and summary row marked FIXED make test-fpga 22/22, make test-arm 29/29, clang-format clean. 
--- SYSTEM_ANALYSIS.md | 8 +- arm/drone_main.cpp | 132 +------------------ arm/fpga_interface.h | 175 ++++++++++++++++++++++++++ fpga/README.md | 21 +++- fpga/build.tcl | 3 + fpga/hls_compat.h | 12 ++ fpga/testbench.cpp | 74 ++++++++++- fpga/top_level.cpp | 53 +++++++- test/test_arm_collision_predictor.cpp | 69 ++++++++++ 9 files changed, 411 insertions(+), 136 deletions(-) create mode 100644 arm/fpga_interface.h diff --git a/SYSTEM_ANALYSIS.md b/SYSTEM_ANALYSIS.md index 79ac939..66575f9 100644 --- a/SYSTEM_ANALYSIS.md +++ b/SYSTEM_ANALYSIS.md @@ -102,6 +102,12 @@ The brace initialization has wrong types and the layout doesn't match the struct The `#ifdef MAVLINK_AVAILABLE` guard means the real implementation is gated. The manual stub (`pack_set_position_target_local_ned_manual`) sends malformed packets (CRC always 0). On real hardware, the flight controller would reject these packets. ### 10. `drone_main.cpp` simulation path won't work +**FIXED** — flow is now exported via the s_axilite `FLOW` bundle (first 1024 +events as packed UQ4.12 positions + Q8.8 flows, `flow_count` + seqlock +`flow_seq`); `read_flow_vectors()` in `arm/fpga_interface.h` decodes it with +torn-read retry, and the sim register file covers the full window. See +`fpga/README.md` § FLOW bundle. 
Original finding: + The `FpgaInterface` class uses `new volatile uint32_t[128]()` for simulated registers, but: - The register offsets (e.g., `REG_FLOW_PRED_BASE`) assume specific memory layout - The `read_flow_vectors()` method expects flow data at specific register offsets @@ -225,7 +231,7 @@ Most `.h` and `.cpp` files lack copyright/license headers despite the repo havin | 🔴 CRITICAL | Encoder weights never initialized | `fpga/top_level.cpp:91` | Zero flow output on FPGA | | 🔴 CRITICAL | Testbench uses wrong struct types | `fpga/testbench.cpp:424` | Compilation or silent data corruption | | 🟠 MODERATE | MAVLink stub sends bad CRC | `arm/mavlink_bridge.h:455` | PX4 rejects packets | -| 🟠 MODERATE | ARM reads flow from non-existent AXI regs | `arm/drone_main.cpp:111-128` | ARM gets zero flow vectors | +| 🟠 MODERATE | ~~ARM reads flow from non-existent AXI regs~~ **FIXED**: s_axilite `FLOW` bundle + seqlock decode | `fpga/top_level.cpp` / `arm/fpga_interface.h` | ARM reads real per-event flow | | 🟠 MODERATE | `ap_axiu<48,0,0,0>` invalid template | `fpga/top_level.cpp:27` | AXI interface may be malformed | | 🟠 MODERATE | Stream is LIFO in simulation | `fpga/hls_compat.h:213-214` | C-sim differs from RTL sim | | 🟠 MODERATE | `>> i` shift on ap_fixed not implemented | `fpga/encoder_systolic.h:79-80` | Won't compile in sim | diff --git a/arm/drone_main.cpp b/arm/drone_main.cpp index 1565abf..b4fae41 100644 --- a/arm/drone_main.cpp +++ b/arm/drone_main.cpp @@ -6,7 +6,7 @@ // // Architecture: // while(1): -// 1. Read flow vectors from FPGA encoder output (AXI4-Stream DMA) +// 1. Read flow vectors from FPGA encoder output (AXI4-Lite FLOW bundle) // 2. Convert to EventFlow structs // 3. Run CollisionPredictor::assess() → ThreatAssessment // 4. 
Run EvasionController::compute_command() โ†’ EvasionCommand @@ -25,6 +25,7 @@ #include "collision_predictor.h" #include "evasion_controller.h" +#include "fpga_interface.h" #include "kalman_tracker.h" // FreeRTOS or bare-metal alternatives @@ -39,26 +40,6 @@ #define SLEEP_MS(ms) std::this_thread::sleep_for(std::chrono::milliseconds(ms)) #endif -// --------------------------------------------------------------------------- -// FPGA AXI Register Map (AXI4-Lite base addresses) -// These map to the control_regs_t struct in fpga/top_level.cpp -// --------------------------------------------------------------------------- -#define FPGA_BASE_ADDR 0x43C00000 // Example AXI address -#define REG_ENABLE (FPGA_BASE_ADDR + 0x00) -#define REG_ENABLE_MOTORS (FPGA_BASE_ADDR + 0x04) -#define REG_INFERENCE_PERIOD (FPGA_BASE_ADDR + 0x08) -#define REG_MANUAL_VX (FPGA_BASE_ADDR + 0x0C) -#define REG_MANUAL_VY (FPGA_BASE_ADDR + 0x10) -#define REG_MANUAL_VZ (FPGA_BASE_ADDR + 0x14) -#define REG_MANUAL_YAW (FPGA_BASE_ADDR + 0x18) -#define REG_MANUAL_MODE (FPGA_BASE_ADDR + 0x1C) -#define REG_EVENT_COUNT (FPGA_BASE_ADDR + 0x20) -#define REG_FLOW_PRED_BASE (FPGA_BASE_ADDR + 0x100) // Flow data from encoder - -// DMA configuration for AXI4-Stream from encoder -#define DMA_RX_BASE 0x40400000 -#define DMA_MAX_PACKET_SIZE (4096 * 8) // 4096 events × 8 bytes per flow vec - using namespace drone; // --------------------------------------------------------------------------- @@ -67,115 +48,6 @@ using namespace drone; static std::atomic g_running{true}; static std::atomic g_motors_armed{false}; -// --------------------------------------------------------------------------- -// Simulated FPGA register access (replace with actual MMIO for hardware) -// On actual Zynq: mmap /dev/mem → volatile pointer to FPGA AXI region -// --------------------------------------------------------------------------- -class FpgaInterface { - public: - FpgaInterface() { - // On real hardware: mmap FPGA AXI region - // 
void* ptr = mmap(NULL, 0x10000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, FPGA_BASE_ADDR); - // registers_ = reinterpret_cast(ptr); - - // Simulation: allocate local memory for testing - registers_ = new volatile uint32_t[128](); - } - - ~FpgaInterface() { delete[] registers_; } - - void write_register(uint32_t offset, uint32_t value) { - registers_[(offset - FPGA_BASE_ADDR) / 4] = value; - } - - uint32_t read_register(uint32_t offset) { return registers_[(offset - FPGA_BASE_ADDR) / 4]; } - - // ----------------------------------------------------------------- - // Read flow vectors from FPGA encoder output (AXI4-Stream) - // Returns event_count flow vectors - // - // NOTE: In the current FPGA top_level.cpp, flow_pred data is stored - // in internal static BRAM arrays and only debug_flow[2] is exposed - // via AXI4-Lite. A future enhancement should map flow_pred[] to an - // AXI-readable address range (e.g., s_axilite bundle=MEM_FLOW) so - // the ARM can read per-event flow vectors for collision prediction. - // Until then, this function returns dummy data. 
- // ----------------------------------------------------------------- - int read_flow_vectors(std::vector& events, int max_events) { - events.clear(); - - uint32_t event_count = read_register(REG_EVENT_COUNT); - if (event_count == 0 || event_count > static_cast(max_events)) { - return 0; - } - - events.reserve(event_count); - - // Read packed flow + position data from FPGA BRAM - for (uint32_t i = 0; i < event_count; ++i) { - // Each event: 2 × 32bit for flow (vx, vy as float) + position data - uint32_t base = (REG_FLOW_PRED_BASE - FPGA_BASE_ADDR) / 4 + i * 4; - - EventFlow ev; - // Reconstruct float from fixed-point (INT16.Q8 → float) - uint32_t vx_raw = registers_[base + 0]; - uint32_t vy_raw = registers_[base + 1]; - uint32_t x_raw = registers_[base + 2]; - uint32_t y_raw = registers_[base + 3]; - std::memcpy(&ev.vx, &vx_raw, sizeof(float)); - std::memcpy(&ev.vy, &vy_raw, sizeof(float)); - std::memcpy(&ev.x, &x_raw, sizeof(float)); - std::memcpy(&ev.y, &y_raw, sizeof(float)); - ev.t = i; // Sequential within this batch - - events.push_back(ev); - } - - return event_count; - } - - // ----------------------------------------------------------------- - // Write velocity command to FPGA PWM module - // ----------------------------------------------------------------- - void write_velocity_command(const EvasionCommand& cmd) { - // Convert float velocities to fixed-point for FPGA - // ap_fixed<16,4>: [-8.0, 8.0) range, Q4.12 - int32_t vx_fp = static_cast(cmd.velocity_x * 4096.0f); // 2^12 - int32_t vy_fp = static_cast(cmd.velocity_y * 4096.0f); - int32_t vz_fp = static_cast(cmd.velocity_z * 4096.0f); - int32_t yaw_fp = static_cast(cmd.yaw_rate * 4096.0f); - - // Clamp to INT16 range - auto clamp_int16 = [](int32_t v) -> uint32_t { - if (v > 32767) v = 32767; - if (v < -32768) v = -32768; - return static_cast(v & 0xFFFF); - }; - - write_register(REG_MANUAL_VX, clamp_int16(vx_fp)); - write_register(REG_MANUAL_VY, clamp_int16(vy_fp)); - write_register(REG_MANUAL_VZ, 
clamp_int16(vz_fp)); - write_register(REG_MANUAL_YAW, clamp_int16(yaw_fp)); - - // Set manual mode to feed computed commands to PWM - write_register(REG_MANUAL_MODE, 1); - } - - void enable(bool motors) { - write_register(REG_ENABLE, 1); - write_register(REG_ENABLE_MOTORS, motors ? 1 : 0); - write_register(REG_INFERENCE_PERIOD, 100000); // 1kHz inference trigger - } - - void disable() { - write_register(REG_ENABLE, 0); - write_register(REG_ENABLE_MOTORS, 0); - } - - private: - volatile uint32_t* registers_; -}; - // --------------------------------------------------------------------------- // Telemetry / logging // --------------------------------------------------------------------------- diff --git a/arm/fpga_interface.h b/arm/fpga_interface.h new file mode 100644 index 0000000..ca9ec2c --- /dev/null +++ b/arm/fpga_interface.h @@ -0,0 +1,175 @@ +// arm/fpga_interface.h — ARM-side AXI4-Lite interface to the FPGA pipeline +// +// Register map mirrors fpga/top_level.cpp: +// CTRL bundle @ FPGA_BASE_ADDR : control_regs_t fields +// FLOW bundle @ FPGA_FLOW_BASE : per-event (pos, flow) export +// +// NOTE: base addresses and FLOW offsets are design-time placeholders. The +// authoritative offsets come from the Vitis-generated +// xcollision_avoidance_top_hw.h after synthesis — confirm before flight. 
+#pragma once + +#include +#include + +#include "collision_predictor.h" +#include "evasion_controller.h" + +// --------------------------------------------------------------------------- +// FPGA AXI Register Map (AXI4-Lite base addresses) +// These map to the control_regs_t struct in fpga/top_level.cpp +// --------------------------------------------------------------------------- +#define FPGA_BASE_ADDR 0x43C00000 // Example AXI address +#define REG_ENABLE (FPGA_BASE_ADDR + 0x00) +#define REG_ENABLE_MOTORS (FPGA_BASE_ADDR + 0x04) +#define REG_INFERENCE_PERIOD (FPGA_BASE_ADDR + 0x08) +#define REG_MANUAL_VX (FPGA_BASE_ADDR + 0x0C) +#define REG_MANUAL_VY (FPGA_BASE_ADDR + 0x10) +#define REG_MANUAL_VZ (FPGA_BASE_ADDR + 0x14) +#define REG_MANUAL_YAW (FPGA_BASE_ADDR + 0x18) +#define REG_MANUAL_MODE (FPGA_BASE_ADDR + 0x1C) +#define REG_EVENT_COUNT (FPGA_BASE_ADDR + 0x20) + +// FLOW bundle: per-event flow export (see fpga/top_level.cpp FLOW_EXPORT). +// 2 words per entry i: +// REG_FLOW_DATA + 8*i bits[15:0]=x (UQ4.12) bits[31:16]=y (UQ4.12) +// REG_FLOW_DATA + 8*i + 4 bits[15:0]=vx (Q8.8) bits[31:16]=vy (Q8.8) +#define FPGA_FLOW_BASE 0x43C10000 +#define REG_FLOW_COUNT (FPGA_FLOW_BASE + 0x10) +#define REG_FLOW_SEQ (FPGA_FLOW_BASE + 0x18) +#define REG_FLOW_DATA (FPGA_FLOW_BASE + 0x2000) +#define FLOW_MAX_OUT 1024 // mirrors fpga/top_level.cpp — keep in sync + +// Simulated register file spans CTRL base through the end of the FLOW window +#define REG_SPACE_WORDS ((FPGA_FLOW_BASE + 0x2000 + 8 * FLOW_MAX_OUT - FPGA_BASE_ADDR) / 4) + +// --------------------------------------------------------------------------- +// Simulated FPGA register access (replace with actual MMIO for hardware) +// On actual Zynq: mmap /dev/mem → volatile pointer to FPGA AXI region +// --------------------------------------------------------------------------- +class FpgaInterface { + public: + FpgaInterface() { + // On real hardware: mmap FPGA AXI region + // void* ptr = mmap(NULL, 0x10000, 
PROT_READ|PROT_WRITE, MAP_SHARED, fd, FPGA_BASE_ADDR); + // registers_ = reinterpret_cast(ptr); + + // Simulation: allocate local memory for testing. NOTE(review): registers_ is a raw owning pointer released in the destructor and the class declares no copy/move operations, so copying an FpgaInterface would delete[] the same buffer twice — confirm instances are never copied + registers_ = new volatile uint32_t[REG_SPACE_WORDS](); + } + + ~FpgaInterface() { delete[] registers_; } + + void write_register(uint32_t offset, uint32_t value) { + registers_[(offset - FPGA_BASE_ADDR) / 4] = value; + } + + uint32_t read_register(uint32_t offset) { return registers_[(offset - FPGA_BASE_ADDR) / 4]; } + + // ----------------------------------------------------------------- + // Fixed-point packing — single source of truth for the FLOW word + // format, shared by decode below and the register round-trip tests. + // ----------------------------------------------------------------- + static uint32_t pack_position(float x, float y) { + return (static_cast(to_uq4_12(y)) << 16) | to_uq4_12(x); + } + + static uint32_t pack_flow(float vx, float vy) { + return (static_cast(to_q8_8(vy)) << 16) | to_q8_8(vx); + } + + // ----------------------------------------------------------------- + // Read per-event flow vectors from the FPGA FLOW bundle (AXI4-Lite). + // Seqlock protocol: snapshot flow_seq, read count + data, reread + // flow_seq — a mismatch means the FPGA exported mid-read; retry. NOTE(review): the FLOW_EXPORT code in fpga/top_level.cpp writes flow_seq once, after the data and count, so a read that finishes while an export is still in flight sees the same flow_seq twice and is accepted — confirm this window is acceptable. 
+ // ----------------------------------------------------------------- + int read_flow_vectors(std::vector& events, int max_events) { + events.clear(); + + for (int attempt = 0; attempt < 3; ++attempt) { + uint32_t seq_before = read_register(REG_FLOW_SEQ); + + uint32_t count = read_register(REG_FLOW_COUNT); + if (count == 0) return 0; + if (count > FLOW_MAX_OUT) count = FLOW_MAX_OUT; + if (count > static_cast(max_events)) count = max_events; + + events.reserve(count); + for (uint32_t i = 0; i < count; ++i) { + uint32_t pos = read_register(REG_FLOW_DATA + 8 * i); + uint32_t flw = read_register(REG_FLOW_DATA + 8 * i + 4); + + drone::EventFlow ev; + ev.x = static_cast(pos & 0xFFFF) / 4096.0f; // UQ4.12 + ev.y = static_cast(pos >> 16) / 4096.0f; + ev.vx = static_cast(flw & 0xFFFF) / 256.0f; // Q8.8 + ev.vy = static_cast(flw >> 16) / 256.0f; + ev.t = i; // Sequential within this batch + events.push_back(ev); + } + + if (read_register(REG_FLOW_SEQ) == seq_before) { + return static_cast(count); + } + events.clear(); // torn read — retry + } + return 0; + } + + // ----------------------------------------------------------------- + // Write velocity command to FPGA PWM module + // ----------------------------------------------------------------- + void write_velocity_command(const drone::EvasionCommand& cmd) { + // Convert float velocities to fixed-point for FPGA + // ap_fixed<16,4>: [-8.0, 8.0) range, Q4.12 + int32_t vx_fp = static_cast(cmd.velocity_x * 4096.0f); // 2^12 + int32_t vy_fp = static_cast(cmd.velocity_y * 4096.0f); + int32_t vz_fp = static_cast(cmd.velocity_z * 4096.0f); + int32_t yaw_fp = static_cast(cmd.yaw_rate * 4096.0f); + + // Clamp to INT16 range (NOTE(review): the float-to-integer conversions above run before this clamp, so a product outside the int32_t range is undefined behavior rather than saturated — confirm cmd values are bounded upstream) + auto clamp_int16 = [](int32_t v) -> uint32_t { + if (v > 32767) v = 32767; + if (v < -32768) v = -32768; + return 
static_cast(v & 0xFFFF); + }; + + write_register(REG_MANUAL_VX, clamp_int16(vx_fp)); + write_register(REG_MANUAL_VY, clamp_int16(vy_fp)); + write_register(REG_MANUAL_VZ, clamp_int16(vz_fp)); + 
write_register(REG_MANUAL_YAW, clamp_int16(yaw_fp)); + + // Set manual mode to feed computed commands to PWM + write_register(REG_MANUAL_MODE, 1); + } + + void enable(bool motors) { + write_register(REG_ENABLE, 1); + write_register(REG_ENABLE_MOTORS, motors ? 1 : 0); + write_register(REG_INFERENCE_PERIOD, 100000); // 1kHz inference trigger + } + + void disable() { + write_register(REG_ENABLE, 0); + write_register(REG_ENABLE_MOTORS, 0); + } + + private: + // UQ4.12 with round-to-nearest, saturating at the 16-bit rails + static uint16_t to_uq4_12(float v) { + float s = v * 4096.0f + 0.5f; + if (s < 0.0f) s = 0.0f; + if (s > 65535.0f) s = 65535.0f; + return static_cast(s); + } + + // Q8.8 two's-complement with round-to-nearest, saturating + static uint16_t to_q8_8(float v) { + int32_t s = static_cast(v * 256.0f + (v >= 0 ? 0.5f : -0.5f)); + if (s > 32767) s = 32767; + if (s < -32768) s = -32768; + return static_cast(s & 0xFFFF); + } + + volatile uint32_t* registers_; +}; diff --git a/fpga/README.md b/fpga/README.md index 307b1f1..6b46325 100644 --- a/fpga/README.md +++ b/fpga/README.md @@ -89,7 +89,26 @@ Base address: `0x43C00000` (AXI4-Lite slave) | 0x18 | `manual_yaw` | R/W | Manual yaw rate | | 0x1C | `manual_mode` | R/W | 1=manual, 0=auto | | 0x20 | `event_count` | RO | Events in ring buffer | -| 0x100+ | `flow_pred` | RO | Flow vector data (4096×8B) | + +### FLOW bundle — per-event flow export + +Base address: `0x43C10000` (second AXI4-Lite slave, bundle `FLOW`). Offsets +below follow standard Vitis s_axilite allocation but are **design-time +placeholders** — the generated `xcollision_avoidance_top_hw.h` is +authoritative after synthesis. 
+ +| Offset | Register | Access | Description | +|--------|----------|--------|-------------| +| 0x10 | `flow_count` | RO | Valid entries (0..1024) | +| 0x18 | `flow_seq` | RO | Export generation counter (seqlock) | +| 0x2000 + 8i | `flow_out[2i]` | RO | bits[15:0]=x (UQ4.12), bits[31:16]=y (UQ4.12) | +| 0x2004 + 8i | `flow_out[2i+1]` | RO | bits[15:0]=vx (Q8.8), bits[31:16]=vy (Q8.8) | + +Read protocol (torn-read safe): read `flow_seq`, then `flow_count` and the +data words, then `flow_seq` again — if it changed, the FPGA exported a new +batch mid-read; retry. Export is bounded to the first 1024 events of a batch +(8KB window); the ARM clusterer needs only 30–50 events per object. See +`arm/fpga_interface.h` for the matching decode. ## Hardware Integration diff --git a/fpga/build.tcl b/fpga/build.tcl index 3c8d4f2..11fcf3a 100644 --- a/fpga/build.tcl +++ b/fpga/build.tcl @@ -45,6 +45,9 @@ create_clock -period 10 -name clk set_directive_interface -mode s_axilite -bundle CTRL "collision_avoidance_top" set_directive_interface -mode s_axilite -bundle CTRL "collision_avoidance_top" ctrl_regs set_directive_interface -mode s_axilite -bundle DEBUG "collision_avoidance_top" debug_flow +set_directive_interface -mode s_axilite -bundle FLOW "collision_avoidance_top" flow_out +set_directive_interface -mode s_axilite -bundle FLOW "collision_avoidance_top" flow_count +set_directive_interface -mode s_axilite -bundle FLOW "collision_avoidance_top" flow_seq # Data interfaces (BRAM/stream for high-throughput paths) set_directive_interface -mode bram "collision_avoidance_top" events diff --git a/fpga/hls_compat.h b/fpga/hls_compat.h index 79ab2da..072d0c0 100644 --- a/fpga/hls_compat.h +++ b/fpga/hls_compat.h @@ -220,6 +220,12 @@ class ap_fixed { r.val_ = sat(val_ << n); return r; } + // raw fixed-point bit-slice — mirrors Vitis ap_fixed::range(hi, lo) + ap_uint range(int hi = W - 1, int lo = 0) const { + const int w = hi - lo + 1; + const uint64_t mask = (w >= 64) ? 
~0ULL : ((1ULL << w) - 1); + return ap_uint((static_cast(val_) >> lo) & mask); + } // Comparisons intentionally omitted: they resolve through the implicit // float conversion (member overloads would be ambiguous against it // when comparing with int/float literals). @@ -281,6 +287,12 @@ class ap_ufixed { bool operator<=(ap_ufixed o) const { return val_ <= o.val_; } bool operator==(ap_ufixed o) const { return val_ == o.val_; } bool operator!=(ap_ufixed o) const { return val_ != o.val_; } + // raw fixed-point bit-slice — mirrors Vitis ap_ufixed::range(hi, lo). NOTE(review): assumes hi >= lo; with hi < lo - 1 the width w goes negative and the shift below is undefined — confirm no caller passes a reversed range + ap_uint range(int hi = W - 1, int lo = 0) const { + const int w = hi - lo + 1; + const uint64_t mask = (w >= 64) ? ~0ULL : ((1ULL << w) - 1); + return ap_uint((val_ >> lo) & mask); + } }; // =========================================================================== diff --git a/fpga/testbench.cpp b/fpga/testbench.cpp index 0f1ca71..f590b68 100644 --- a/fpga/testbench.cpp +++ b/fpga/testbench.cpp @@ -78,7 +78,15 @@ SimulatedEvents generate_static_noise_event_stream(int n_events) { void feed_events_to_pipeline(control_regs_t& ctrl, aer_bus_t& aer_bus, ap_uint<64>& timestamp, const SimulatedEvents& events, motor_outputs_t& motor_out, - enc_out_t debug_flow[2], int steps = 100) { + enc_out_t debug_flow[2], int steps = 100, + ap_uint<32>* flow_out = nullptr, ap_uint<32>* flow_count = nullptr, + ap_uint<32>* flow_seq = nullptr) { + // Tests that don't inspect the FLOW bundle share throwaway buffers + static ap_uint<32> default_flow_out[FLOW_OUT_WORDS]; + static ap_uint<32> default_flow_count, default_flow_seq; + if (!flow_out) flow_out = default_flow_out; + if (!flow_count) flow_count = &default_flow_count; + if (!flow_seq) flow_seq = &default_flow_seq; size_t event_idx = 0; ctrl.enable = 1; ctrl.enable_motors = 1; @@ -100,7 +108,8 @@ void 
feed_events_to_pipeline(control_regs_t& ctrl, aer_bus_t& aer_bus, ap_uint<6 aer_bus.req = 1; event_idx++; } - collision_avoidance_top(ctrl, aer_bus, timestamp, motor_out, 
debug_flow); + collision_avoidance_top(ctrl, aer_bus, timestamp, motor_out, debug_flow, flow_out, + *flow_count, *flow_seq); } } @@ -343,6 +352,66 @@ void test_top_level_evasion_response() { } } +void test_flow_export() { + printf("\n=== Test: per-event flow export (AXI FLOW bundle) ===\n"); + control_regs_t ctrl; + aer_bus_t aer_bus = {0}; + ap_uint<64> timestamp(0); + motor_outputs_t motor_out = {0}; + enc_out_t debug_flow[2] = {coord_enc_t(0), coord_enc_t(0)}; + static ap_uint<32> flow_out[FLOW_OUT_WORDS]; + ap_uint<32> flow_count(0), flow_seq(0); + ctrl.enable = 1; + ctrl.enable_motors = 1; + ctrl.manual_mode = 0; + ctrl.inference_period = 5000; + auto looming_events = generate_looming_object_event_stream(2048); + feed_events_to_pipeline(ctrl, aer_bus, timestamp, looming_events, motor_out, debug_flow, 8400, + flow_out, &flow_count, &flow_seq); + + TEST("Flow export produces a bounded, non-zero count"); + ASSERT(flow_count.val > 0 && flow_count.val <= FLOW_MAX_OUT, "flow_count=%llu out of (0, %d]", + static_cast(flow_count.val), FLOW_MAX_OUT); + + TEST("Flow export completed at least one generation"); + ASSERT(flow_seq.val >= 1, "flow_seq never advanced"); + + TEST("Entry 0 flow word matches the DEBUG bundle bit-for-bit"); + ASSERT((flow_out[1].val & 0xFFFF) == debug_flow[0].range(15, 0).val && + ((flow_out[1].val >> 16) & 0xFFFF) == debug_flow[1].range(15, 0).val, + "FLOW/DEBUG mismatch: flow_out[1]=0x%08llx debug=(0x%04llx, 0x%04llx)", + static_cast(flow_out[1].val), + static_cast(debug_flow[0].range(15, 0).val), + static_cast(debug_flow[1].range(15, 0).val)); + + TEST("Decoded positions and flows are in range"); + bool in_range = true; + int n_check = static_cast(flow_count.val); + if (n_check > 10) n_check = 10; + for (int i = 0; i < n_check; i++) { + // UQ4.12 positions, Q8.8 flows — same decode the ARM uses + float x = static_cast(flow_out[2 * i].val & 0xFFFF) / 4096.0f; + float y = static_cast((flow_out[2 * i].val >> 16) & 0xFFFF) / 4096.0f; + float vx 
= static_cast(flow_out[2 * i + 1].val & 0xFFFF) / 256.0f; + float vy = static_cast((flow_out[2 * i + 1].val >> 16) & 0xFFFF) / 256.0f; + if (!(x >= 0.0f && x < 1.0f && y >= 0.0f && y < 1.0f) || std::fabs(vx) >= 128.0f || + std::fabs(vy) >= 128.0f) { + in_range = false; + break; + } + } + ASSERT(in_range, "decoded entry escaped UQ4.12/Q8.8 range"); + + TEST("Generation counter advances on a new batch"); + uint64_t seq_first = flow_seq.val; + auto more_events = generate_looming_object_event_stream(2048); + feed_events_to_pipeline(ctrl, aer_bus, timestamp, more_events, motor_out, debug_flow, 8400, + flow_out, &flow_count, &flow_seq); + ASSERT(flow_seq.val > seq_first, "flow_seq did not advance: %llu -> %llu", + static_cast(seq_first), + static_cast(flow_seq.val)); +} + void test_ring_buffer() { printf("\n=== Test: ring buffer ===\n"); event_unpacked_t ring_buf[RING_BUFFER_SIZE]; @@ -397,6 +466,7 @@ int main() { test_encoder_systolic(); test_ring_buffer(); test_top_level_evasion_response(); + test_flow_export(); printf("\n============================================\n"); printf(" Results: %d/%d assertions passed", PASS_count, PASS_count + errors); diff --git a/fpga/top_level.cpp b/fpga/top_level.cpp index 0f2815e..c0ed7ae 100644 --- a/fpga/top_level.cpp +++ b/fpga/top_level.cpp @@ -28,6 +28,17 @@ typedef ap_axiu<48, 4, 0, 0> axis_event_t; #endif +// --------------------------------------------------------------------------- +// Per-event flow export (s_axilite bundle FLOW) +// Bounded to the first FLOW_MAX_OUT events of a batch: a full 4096-entry +// window would need 32KB of AXI4-Lite space and ~8k single-beat PS reads +// per 100Hz frame; 1024 entries (8KB) read in ~0.6ms and exceed what the +// ARM clusterer consumes (min_events_per_cluster is 30-50). +// Mirrored by FLOW_MAX_OUT in arm/fpga_interface.h — keep in sync. 
+// --------------------------------------------------------------------------- +#define FLOW_MAX_OUT 1024 +#define FLOW_OUT_WORDS (2 * FLOW_MAX_OUT) + // --------------------------------------------------------------------------- // Top-Level Control Registers (AXI4-Lite addressable) // --------------------------------------------------------------------------- @@ -51,16 +62,26 @@ struct control_regs_t { // aer_timestamp : System timestamp for event tagging // motor_out : 4-channel PWM output struct // debug_flow : Debug output: first event's (vx, vy) for monitoring +// flow_out : Per-event (pos, flow) export, 2 words per entry i: +// word 2i bits[15:0]=x (UQ4.12) bits[31:16]=y (UQ4.12) +// word 2i+1 bits[15:0]=vx (Q8.8) bits[31:16]=vy (Q8.8) +// flow_count : Valid entries in flow_out (0..FLOW_MAX_OUT) +// flow_seq : Export generation counter (seqlock) — written last; +// ARM rereads it to detect a torn read and retries // --------------------------------------------------------------------------- void collision_avoidance_top(control_regs_t& ctrl_regs, aer_bus_t& aer_bus, ap_uint<64> aer_timestamp, motor_outputs_t& motor_out, - enc_out_t debug_flow[2] // (vx, vy) of first event for debug -) { + enc_out_t debug_flow[2], // (vx, vy) of first event for debug + ap_uint<32> flow_out[FLOW_OUT_WORDS], ap_uint<32>& flow_count, + ap_uint<32>& flow_seq) { #pragma HLS INTERFACE s_axilite port = return bundle = CTRL #pragma HLS INTERFACE s_axilite port = ctrl_regs bundle = CTRL #pragma HLS INTERFACE ap_none port = aer_timestamp #pragma HLS INTERFACE ap_none port = motor_out #pragma HLS INTERFACE s_axilite port = debug_flow bundle = DEBUG +#pragma HLS INTERFACE s_axilite port = flow_out bundle = FLOW +#pragma HLS INTERFACE s_axilite port = flow_count bundle = FLOW +#pragma HLS INTERFACE s_axilite port = flow_seq bundle = FLOW // ------------------------------------------------------------------- // Internal state @@ -235,6 +256,34 @@ void 
collision_avoidance_top(control_regs_t& ctrl_regs, aer_bus_t& aer_bus, debug_flow[0] = flow_pred[0][0]; debug_flow[1] = flow_pred[0][1]; + // Export per-event (pos, flow) to the AXI4-Lite FLOW bundle. + // ring_buffer_events is untouched between NORMALIZE and here + // (no ingest outside COLLECT_EVENTS), so flow_pred[i] maps to + // ring_buffer_events[(read_ptr + i) & RB_ADDR_MASK]. + // Seqlock: data, then count, then seq — seq lands last so the + // ARM can detect a read that straddled an export. NOTE(review): flow_seq changes only once, after the loop below, so an ARM read that completes while this loop is still rewriting flow_out sees an unchanged flow_seq and is not flagged — confirm whether a begin-of-export bump is needed. + static ap_uint<32> flow_generation = 0; + event_cnt_t export_count = rb_count; + if (export_count > FLOW_MAX_OUT) export_count = FLOW_MAX_OUT; + ap_uint<12> exp_ptr; + if (rb_write_ptr >= rb_count) { + exp_ptr = rb_write_ptr - rb_count; + } else { + exp_ptr = rb_write_ptr + RING_BUFFER_SIZE - rb_count; + } + FLOW_EXPORT: + for (event_cnt_t i = 0; i < export_count; i++) { +#pragma HLS PIPELINE II = 2 + ap_uint<12> addr = (exp_ptr + i) & RB_ADDR_MASK; + flow_out[2 * i] = (ap_uint<32>(ring_buffer_events[addr].y.range(15, 0)) << 16) | + ap_uint<32>(ring_buffer_events[addr].x.range(15, 0)); + flow_out[2 * i + 1] = (ap_uint<32>(flow_pred[i][1].range(15, 0)) << 16) | + ap_uint<32>(flow_pred[i][0].range(15, 0)); + } + flow_count = ap_uint<32>(export_count); + flow_generation++; + flow_seq = flow_generation; + // Generate motor outputs // Evasion: move away from mean flow direction // Vertical: slight altitude increase when threat detected diff --git a/test/test_arm_collision_predictor.cpp b/test/test_arm_collision_predictor.cpp index 0e83307..d50d5a3 100644 --- a/test/test_arm_collision_predictor.cpp +++ b/test/test_arm_collision_predictor.cpp @@ -17,6 +17,7 @@ #include "arm/collision_predictor.h" #include "arm/evasion_controller.h" +#include 
"arm/fpga_interface.h" using namespace drone; @@ -282,6 +283,71 @@ void test_safe_bearing() { result.safe_bearing_confidence); } +// =========================================================================== +// FPGA INTERFACE TESTS 
โ€” FLOW bundle register decode +// =========================================================================== + +void test_fpga_flow_register_roundtrip() { + printf("\n=== FPGA flow register round-trip ===\n"); + + FpgaInterface fpga; + auto truth = make_looming_flow_vectors(500, 1.5f); + for (size_t i = 0; i < truth.size(); i++) { + fpga.write_register(REG_FLOW_DATA + 8 * i, + FpgaInterface::pack_position(truth[i].x, truth[i].y)); + fpga.write_register(REG_FLOW_DATA + 8 * i + 4, + FpgaInterface::pack_flow(truth[i].vx, truth[i].vy)); + } + fpga.write_register(REG_FLOW_COUNT, static_cast(truth.size())); + fpga.write_register(REG_FLOW_SEQ, 1); + + std::vector events; + int n = fpga.read_flow_vectors(events, 4096); + + TEST("Round-trip returns all packed entries"); + CHECK(n == static_cast(truth.size()) && events.size() == truth.size(), + "expected %zu, got %d", truth.size(), n); + + TEST("Decoded values match within quantization error"); + bool within_eps = true; + const float pos_eps = 1.0f / 4096.0f + 1e-5f; // UQ4.12 LSB plus float slack; the pack helpers round to nearest, so one LSB is a loose bound + const float flow_eps = 1.0f / 256.0f + 1e-5f; // Q8.8 LSB plus float slack; same loose one-LSB bound as pos_eps + for (int i = 0; i < n; i++) { + if (std::fabs(events[i].x - truth[i].x) > pos_eps || + std::fabs(events[i].y - truth[i].y) > pos_eps || + std::fabs(events[i].vx - truth[i].vx) > flow_eps || + std::fabs(events[i].vy - truth[i].vy) > flow_eps) { + within_eps = false; + break; + } + } + CHECK(within_eps, "quantization error exceeded UQ4.12/Q8.8 bounds"); + + TEST("Decoded flow still drives collision prediction end-to-end"); + CollisionPredictor::Config cfg; + cfg.cluster_radius = 0.15f; + cfg.flow_similarity_threshold = 0.5f; + cfg.min_events_per_cluster = 10; + cfg.safety_time_threshold = 2.0f; + CollisionPredictor predictor(cfg); + ThreatAssessment result = predictor.assess(events); + CHECK(result.threat_detected && !result.objects.empty() && 
result.objects[0].ttc < 100.0f, + "register-decoded flow failed to produce a finite-TTC threat"); + + TEST("Zero count returns no events"); + 
fpga.write_register(REG_FLOW_COUNT, 0); + CHECK(fpga.read_flow_vectors(events, 4096) == 0, "expected 0 events for zero count"); + + TEST("Oversized count clamps to FLOW_MAX_OUT"); + fpga.write_register(REG_FLOW_COUNT, 5000); + CHECK(fpga.read_flow_vectors(events, 4096) == FLOW_MAX_OUT, "expected clamp to %d", + FLOW_MAX_OUT); + + TEST("max_events caps the returned batch"); + fpga.write_register(REG_FLOW_COUNT, static_cast(truth.size())); + CHECK(fpga.read_flow_vectors(events, 50) == 50, "expected 50 events"); +} + // =========================================================================== // EVASION CONTROLLER TESTS // =========================================================================== @@ -474,6 +540,9 @@ int main() { test_multi_object_clustering(); test_safe_bearing(); + printf("\n--- FPGA Interface Tests ---\n"); + test_fpga_flow_register_roundtrip(); + printf("\n--- Evasion Controller Tests ---\n"); test_evasion_controller_levels(); test_evasion_hysteresis();