From 5385302180374f7a772cf8d35625a5450d46bfd7 Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Tue, 3 Mar 2026 15:42:32 -0500 Subject: [PATCH] feat: MSI interrupt-driven VirtIO GPU + VirGL command encoding fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace busy-polling with MSI interrupt + WFI completion for VirtIO GPU commands. Each VirGL frame previously burned ~50ms spinning in a tight poll loop with SeqCst fences. Now the guest halts with WFI, Parallels processes the command immediately, and delivers an MSI interrupt to wake the vCPU. Result: 10 FPS → 2000-3000 FPS. Key changes: - SPI allocation infrastructure (allocate_msi_spi) so GPU and xHCI don't collide on the same GICv2m SPI - Cached queue notify addresses — single write_volatile instead of 3 MMIO ops per notification - MSI setup + interrupt handler with WFI completion loop - GPU MSI dispatch in aarch64 exception handler VirGL command encoding fixes: - Rasterizer fill mode was POINT (2) instead of FILL (0) — all triangles rendered as invisible 1-pixel dots - SET_VERTEX_BUFFERS had spurious start_slot DWORD making length%3!=0, causing silent EINVAL from host (vertex buffers never bound) - SHADER_FRAGMENT constant was 4 (TESS_EVAL) instead of 1 - Shader compilation never triggered (bit 31 not set in offset field) Also adds VirGL command encoder module (virgl.rs), Linux reference implementations for Parallels VirGL benchmarking, and minor cleanups. 
Co-Authored-By: Ryan Breen Co-Authored-By: Claude Opus 4.6 --- kernel/src/arch_impl/aarch64/exception.rs | 6 + .../src/arch_impl/aarch64/timer_interrupt.rs | 5 - kernel/src/drivers/mod.rs | 9 +- kernel/src/drivers/usb/xhci.rs | 10 +- kernel/src/drivers/virtio/gpu_pci.rs | 1354 ++++++++++++++++- kernel/src/drivers/virtio/mod.rs | 2 + kernel/src/drivers/virtio/pci_transport.rs | 41 + kernel/src/drivers/virtio/virgl.rs | 396 +++++ kernel/src/graphics/arm64_fb.rs | 19 + kernel/src/platform_config.rs | 40 +- kernel/src/signal/delivery.rs | 10 +- kernel/src/syscall/graphics.rs | 221 ++- libs/libbreenix/src/graphics.rs | 79 + scripts/parallels/gl_bench.c | 423 +++++ scripts/parallels/gl_display.c | 470 ++++++ scripts/parallels/vm-type.sh | 97 ++ userspace/programs/src/bounce.rs | 205 ++- 17 files changed, 3282 insertions(+), 105 deletions(-) create mode 100644 kernel/src/drivers/virtio/virgl.rs create mode 100644 scripts/parallels/gl_bench.c create mode 100644 scripts/parallels/gl_display.c create mode 100755 scripts/parallels/vm-type.sh diff --git a/kernel/src/arch_impl/aarch64/exception.rs b/kernel/src/arch_impl/aarch64/exception.rs index e0026f20..0e6192df 100644 --- a/kernel/src/arch_impl/aarch64/exception.rs +++ b/kernel/src/arch_impl/aarch64/exception.rs @@ -1028,6 +1028,12 @@ pub extern "C" fn handle_irq() { crate::drivers::usb::xhci::handle_interrupt(); } } + // VirtIO GPU PCI interrupt dispatch (MSI completion) + if let Some(gpu_irq) = crate::drivers::virtio::gpu_pci::get_irq() { + if irq_id == gpu_irq { + crate::drivers::virtio::gpu_pci::handle_interrupt(); + } + } } // Should not happen - GIC filters invalid IDs (1020+) diff --git a/kernel/src/arch_impl/aarch64/timer_interrupt.rs b/kernel/src/arch_impl/aarch64/timer_interrupt.rs index 38784d41..45de82c0 100644 --- a/kernel/src/arch_impl/aarch64/timer_interrupt.rs +++ b/kernel/src/arch_impl/aarch64/timer_interrupt.rs @@ -168,11 +168,6 @@ pub extern "C" fn timer_interrupt_handler() { // Increment timer interrupt 
counter (used for debugging when needed) let _count = TIMER_INTERRUPT_COUNT.fetch_add(1, Ordering::Relaxed) + 1; - // Debug breadcrumb: print '.' every 200 ticks (~1 second) to verify timer is alive - if cpu_id == 0 && _count % 200 == 0 { - raw_serial_char(b'.'); - } - // CPU 0 only: poll input devices (single-device, not safe from multiple CPUs) if cpu_id == 0 { poll_keyboard_to_stdin(); diff --git a/kernel/src/drivers/mod.rs b/kernel/src/drivers/mod.rs index e80783fd..adabb5a8 100644 --- a/kernel/src/drivers/mod.rs +++ b/kernel/src/drivers/mod.rs @@ -97,7 +97,14 @@ pub fn init() -> usize { serial_println!("[drivers] Found {} VirtIO PCI devices", virtio_devices.len()); match virtio::gpu_pci::init() { - Ok(()) => serial_println!("[drivers] VirtIO GPU (PCI) initialized"), + Ok(()) => { + serial_println!("[drivers] VirtIO GPU (PCI) initialized"); + // Attempt to initialize VirGL 3D acceleration if the device supports it + match virtio::gpu_pci::virgl_init() { + Ok(()) => serial_println!("[drivers] VirGL 3D acceleration active"), + Err(e) => serial_println!("[drivers] VirGL init skipped: {}", e), + } + } Err(e) => serial_println!("[drivers] VirtIO GPU (PCI) init failed: {}", e), } diff --git a/kernel/src/drivers/usb/xhci.rs b/kernel/src/drivers/usb/xhci.rs index c1cf7665..7c1efbec 100644 --- a/kernel/src/drivers/usb/xhci.rs +++ b/kernel/src/drivers/usb/xhci.rs @@ -4122,7 +4122,7 @@ fn setup_xhci_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { // On Parallels ARM64, GICv2m is at 0x02250000 (discovered from MADT). 
const PARALLELS_GICV2M_BASE: u64 = 0x0225_0000; let gicv2m_base = crate::platform_config::gicv2m_base_phys(); - let (base, spi_base, spi_count) = if gicv2m_base != 0 { + let (base, _spi_base, spi_count) = if gicv2m_base != 0 { // Already probed ( gicv2m_base, @@ -4146,8 +4146,12 @@ fn setup_xhci_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { return 0; } - // Step 3: Allocate first available SPI for XHCI - let spi = spi_base; + // Step 3: Allocate next available SPI for XHCI + let spi = crate::platform_config::allocate_msi_spi(); + if spi == 0 { + xhci_trace_note(0, "err:alloc_spi"); + return 0; + } let intid = spi; // GIC INTID = SPI number for GICv2m // Step 4: Program PCI MSI registers diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs index 2aea0374..ad23db37 100644 --- a/kernel/src/drivers/virtio/gpu_pci.rs +++ b/kernel/src/drivers/virtio/gpu_pci.rs @@ -9,7 +9,7 @@ use super::pci_transport::VirtioPciDevice; use core::ptr::read_volatile; -use core::sync::atomic::{fence, AtomicBool, Ordering}; +use core::sync::atomic::{fence, AtomicBool, AtomicU32, Ordering}; use spin::Mutex; /// Lock protecting the GPU PCI command path (PCI_CMD_BUF, PCI_RESP_BUF, @@ -35,9 +35,23 @@ mod cmd { pub const RESOURCE_ATTACH_BACKING: u32 = 0x0106; pub const RESOURCE_DETACH_BACKING: u32 = 0x0107; + // 3D commands (VirGL) + pub const CTX_CREATE: u32 = 0x0200; + pub const CTX_DESTROY: u32 = 0x0201; + pub const CTX_ATTACH_RESOURCE: u32 = 0x0202; + pub const CTX_DETACH_RESOURCE: u32 = 0x0203; + pub const RESOURCE_CREATE_3D: u32 = 0x0204; + pub const TRANSFER_TO_HOST_3D: u32 = 0x0205; + pub const TRANSFER_FROM_HOST_3D: u32 = 0x0206; + pub const SUBMIT_3D: u32 = 0x0207; + + // Capability commands + pub const GET_CAPSET_INFO: u32 = 0x0110; + // Response types pub const RESP_OK_NODATA: u32 = 0x1100; pub const RESP_OK_DISPLAY_INFO: u32 = 0x1101; + pub const RESP_OK_CAPSET_INFO: u32 = 0x1102; pub const RESP_ERR_UNSPEC: u32 = 0x1200; } @@ -158,6 +172,106 
@@ struct VirtioGpuResourceFlush { padding: u32, } +/// GET_CAPSET_INFO request +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuGetCapsetInfo { + hdr: VirtioGpuCtrlHdr, + capset_index: u32, + padding: u32, +} + +/// GET_CAPSET_INFO response +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuRespCapsetInfo { + hdr: VirtioGpuCtrlHdr, + capset_id: u32, + capset_max_version: u32, + capset_max_size: u32, + padding: u32, +} + +// VirtIO GPU config space offsets +const GPU_CFG_EVENTS_READ: usize = 0; +const GPU_CFG_EVENTS_CLEAR: usize = 4; +const GPU_CFG_NUM_SCANOUTS: usize = 8; +const GPU_CFG_NUM_CAPSETS: usize = 12; +const VIRTIO_GPU_EVENT_DISPLAY: u32 = 1 << 0; + +// ============================================================================= +// VirtIO GPU 3D (VirGL) Protocol Structures +// ============================================================================= + +/// Create a 3D rendering context +#[repr(C)] +#[derive(Clone, Copy)] +struct VirtioGpuCtxCreate { + hdr: VirtioGpuCtrlHdr, + nlen: u32, + context_init: u32, // 0 for VirGL + debug_name: [u8; 64], +} + +/// Attach/detach a resource to/from a 3D context +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuCtxResource { + hdr: VirtioGpuCtrlHdr, + resource_id: u32, + padding: u32, +} + +/// Create a 3D resource (texture, render target, buffer) +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuResourceCreate3d { + hdr: VirtioGpuCtrlHdr, + resource_id: u32, + target: u32, + format: u32, + bind: u32, + width: u32, + height: u32, + depth: u32, + array_size: u32, + last_level: u32, + nr_samples: u32, + flags: u32, + padding: u32, +} + +/// Transfer host 3D command — used for both TRANSFER_TO_HOST_3D (0x0205) +/// and TRANSFER_FROM_HOST_3D (0x0206). Copies between guest backing and +/// host-side texture. 
Linux's DRM driver calls TRANSFER_TO_HOST_3D before +/// RESOURCE_FLUSH even for VirGL-rendered content — it serves as a +/// synchronization point that tells the host the resource is display-ready. +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuTransferHost3d { + hdr: VirtioGpuCtrlHdr, + box_x: u32, + box_y: u32, + box_z: u32, + box_w: u32, + box_h: u32, + box_d: u32, + offset: u64, + resource_id: u32, + level: u32, + stride: u32, + layer_stride: u32, +} + +/// Submit 3D command buffer header (followed immediately by VirGL command data) +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuCmdSubmit { + hdr: VirtioGpuCtrlHdr, + size: u32, // size in bytes of the VirGL command buffer + // NO padding — VirGL data follows immediately at offset 28 +} + // ============================================================================= // Virtqueue Structures // ============================================================================= @@ -232,6 +346,15 @@ struct PciCmdBuffer { static mut PCI_CMD_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] }; static mut PCI_RESP_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] }; +/// Larger command buffer for 3D submissions (Submit3D header + VirGL payload). +/// 16KB accommodates shader text + inline vertex data for 12 circle draws. +#[repr(C, align(64))] +struct Pci3dCmdBuffer { + data: [u8; 16384], +} +static mut PCI_3D_CMD_BUF: Pci3dCmdBuffer = Pci3dCmdBuffer { data: [0; 16384] }; + + // Default framebuffer dimensions (Parallels: set_scanout configures display mode) // 1728x1080 matches the QEMU resolution for consistent performance comparison. 
const DEFAULT_FB_WIDTH: u32 = 1728; @@ -242,12 +365,38 @@ const FB_MAX_HEIGHT: u32 = 1600; const FB_SIZE: usize = (FB_MAX_WIDTH * FB_MAX_HEIGHT * 4) as usize; const BYTES_PER_PIXEL: usize = 4; const RESOURCE_ID: u32 = 1; +/// Resource ID for the VirGL 3D render target (with BIND_SCANOUT) +const RESOURCE_3D_ID: u32 = 2; +/// Resource ID for the VirGL vertex buffer +const RESOURCE_VB_ID: u32 = 3; +/// VirGL 3D context ID +const VIRGL_CTX_ID: u32 = 1; +/// Maximum circles we can render per frame +const MAX_CIRCLES: usize = 16; +/// Vertices per circle (triangle fan: center + N perimeter + closing vertex) +const CIRCLE_SEGMENTS: usize = 16; +/// Vertices per circle = center + segments + 1 (close fan) +const VERTS_PER_CIRCLE: usize = CIRCLE_SEGMENTS + 2; +/// Bytes per vertex: position (4×f32) + color (4×f32) = 32 bytes +const BYTES_PER_VERTEX: usize = 32; +/// Vertex buffer size: enough for MAX_CIRCLES circles +const VB_SIZE: usize = MAX_CIRCLES * VERTS_PER_CIRCLE * BYTES_PER_VERTEX; // VirtIO standard feature bits const VIRTIO_F_VERSION_1: u64 = 1 << 32; // VirtIO GPU feature bits (requested but not required) #[allow(dead_code)] const VIRTIO_GPU_F_EDID: u64 = 1 << 1; +// VirtIO GPU 3D (VirGL) acceleration +const VIRTIO_GPU_F_VIRGL: u64 = 1 << 0; + +/// Whether VirGL 3D acceleration was successfully negotiated with the device. +static VIRGL_ENABLED: AtomicBool = AtomicBool::new(false); +/// Whether SET_SCANOUT has been issued for the 3D resource. +/// Deferred until the first virgl_render_frame call so that the mmap/GOP +/// path keeps working until userspace explicitly opts into VirGL. +static VIRGL_SCANOUT_ACTIVE: AtomicBool = AtomicBool::new(false); + #[repr(C, align(4096))] struct PciFramebuffer { @@ -256,10 +405,25 @@ struct PciFramebuffer { static mut PCI_FRAMEBUFFER: PciFramebuffer = PciFramebuffer { pixels: [0; FB_SIZE] }; +/// Separate backing for 3D resource — NOT shared with the 2D resource. 
+/// Linux's Mesa/virgl creates independent GEM buffers for each resource. +/// Sharing backing between 2D and 3D resources may cause the hypervisor +/// to handle SET_SCANOUT incorrectly. +const FB_3D_SIZE: usize = (DEFAULT_FB_WIDTH * DEFAULT_FB_HEIGHT * 4) as usize; + +#[repr(C, align(4096))] +struct Pci3dFramebuffer { + pixels: [u8; FB_3D_SIZE], +} +static mut PCI_3D_FRAMEBUFFER: Pci3dFramebuffer = Pci3dFramebuffer { pixels: [0; FB_3D_SIZE] }; + // ============================================================================= // GPU PCI Device State // ============================================================================= +/// VirtIO GPU fence flag — tells the host to signal completion via fence_id. +const VIRTIO_GPU_FLAG_FENCE: u32 = 1; + /// Combined GPU PCI device state (transport + GPU state) struct GpuPciDeviceState { device: VirtioPciDevice, @@ -267,11 +431,22 @@ struct GpuPciDeviceState { height: u32, resource_id: u32, last_used_idx: u16, + /// Monotonically increasing fence counter for GPU synchronization. + /// Each fenced command gets a unique fence_id; the host signals completion + /// by echoing this ID in the response. Required for TRANSFER_FROM_HOST_3D + /// to ensure DMA writes complete before reading backing memory. + next_fence_id: u64, } static mut GPU_PCI_STATE: Option = None; static GPU_PCI_INITIALIZED: AtomicBool = AtomicBool::new(false); +/// GIC INTID (SPI number) allocated for GPU MSI. 0 = polling mode. +static GPU_IRQ: AtomicU32 = AtomicU32::new(0); + +/// Set by the interrupt handler to wake the WFI loop in send_command(). +static GPU_CMD_COMPLETE: AtomicBool = AtomicBool::new(false); + // ============================================================================= // Helpers // ============================================================================= @@ -297,6 +472,109 @@ pub fn is_initialized() -> bool { GPU_PCI_INITIALIZED.load(Ordering::Acquire) } +/// Check if VirGL 3D acceleration was negotiated with the device. 
+pub fn is_virgl_enabled() -> bool { + VIRGL_ENABLED.load(Ordering::Acquire) +} + +/// Disable VirGL for display purposes (e.g., Parallels can't route VirGL output to screen). +pub fn disable_virgl() { + VIRGL_ENABLED.store(false, Ordering::Release); +} + +// ============================================================================= +// MSI Interrupt Support +// ============================================================================= + +/// Set up PCI MSI for the VirtIO GPU through GICv2m. +/// +/// Follows the same pattern as xHCI MSI setup: find MSI cap, probe GICv2m, +/// allocate SPI, program MSI registers, configure GIC. +/// +/// Returns the allocated SPI number, or 0 if MSI is unavailable. +#[cfg(target_arch = "aarch64")] +fn setup_gpu_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { + use crate::arch_impl::aarch64::gic; + + // Step 1: Find MSI capability + let msi_cap = match pci_dev.find_msi_capability() { + Some(offset) => offset, + None => { + crate::serial_println!("[virtio-gpu-pci] No MSI capability found, using polling"); + return 0; + } + }; + + // Step 2: Ensure GICv2m is probed + const PARALLELS_GICV2M_BASE: u64 = 0x0225_0000; + let gicv2m_base = crate::platform_config::gicv2m_base_phys(); + let base = if gicv2m_base != 0 { + gicv2m_base + } else if crate::platform_config::probe_gicv2m(PARALLELS_GICV2M_BASE) { + PARALLELS_GICV2M_BASE + } else { + crate::serial_println!("[virtio-gpu-pci] GICv2m not available, using polling"); + return 0; + }; + + // Step 3: Allocate SPI + let spi = crate::platform_config::allocate_msi_spi(); + if spi == 0 { + crate::serial_println!("[virtio-gpu-pci] No SPIs available, using polling"); + return 0; + } + + // Step 4: Program PCI MSI registers + let msi_address = (base + 0x40) as u32; + let msi_data = spi as u16; + pci_dev.configure_msi(msi_cap, msi_address, msi_data); + pci_dev.disable_intx(); + + // Step 5: Configure GIC for this SPI (edge-triggered) + gic::configure_spi_edge_triggered(spi); + + 
crate::serial_println!("[virtio-gpu-pci] MSI configured: SPI={}", spi); + spi +} + +/// Handle GPU MSI interrupt — called from exception.rs IRQ dispatch. +/// +/// Wakes the WFI loop in send_command() by setting GPU_CMD_COMPLETE. +/// Follows the xHCI pattern: disable SPI, clear pending, ack, re-enable. +#[cfg(target_arch = "aarch64")] +pub fn handle_interrupt() { + use crate::arch_impl::aarch64::gic; + + let irq = GPU_IRQ.load(Ordering::Relaxed); + if irq == 0 { + return; + } + + gic::disable_spi(irq); + gic::clear_spi_pending(irq); + + // Read ISR to auto-acknowledge the VirtIO interrupt condition + if GPU_PCI_INITIALIZED.load(Ordering::Acquire) { + unsafe { + let ptr = &raw const GPU_PCI_STATE; + if let Some(ref state) = *ptr { + state.device.read_interrupt_status(); + } + } + } + + GPU_CMD_COMPLETE.store(true, Ordering::Release); + + gic::clear_spi_pending(irq); + gic::enable_spi(irq); +} + +/// Get the GIC INTID for the GPU interrupt (for exception dispatch). +pub fn get_irq() -> Option { + let irq = GPU_IRQ.load(Ordering::Relaxed); + if irq != 0 { Some(irq) } else { None } +} + // ============================================================================= // Initialization // ============================================================================= @@ -324,9 +602,23 @@ pub fn init() -> Result<(), &'static str> { // VIRTIO_F_VERSION_1 is mandatory for PCI modern transport — without it, // Parallels's GPU device accepts the feature set but ignores subsequent // state-modifying commands (create_resource, attach_backing, etc.). 
- let requested = VIRTIO_F_VERSION_1 | VIRTIO_GPU_F_EDID; + let requested = VIRTIO_F_VERSION_1 | VIRTIO_GPU_F_EDID | VIRTIO_GPU_F_VIRGL; + + // Log raw device-offered features before negotiation + let device_feats = virtio.read_device_features(); + crate::serial_println!("[virtio-gpu-pci] Device features: {:#018x}", device_feats); + crate::serial_println!("[virtio-gpu-pci] VIRGL offered: {}", device_feats & VIRTIO_GPU_F_VIRGL != 0); + virtio.init(requested)?; - let _negotiated = virtio.device_features() & requested; + + // Check what was actually negotiated + let negotiated = virtio.device_features() & requested; + let virgl_on = negotiated & VIRTIO_GPU_F_VIRGL != 0; + crate::serial_println!("[virtio-gpu-pci] Negotiated: {:#018x} (VIRGL={})", negotiated, virgl_on); + VIRGL_ENABLED.store(virgl_on, Ordering::Release); + crate::serial_println!("[virtio-gpu-pci] VIRGL_ENABLED stored={}, readback={}, addr={:#x}", + virgl_on, VIRGL_ENABLED.load(Ordering::Acquire), + &VIRGL_ENABLED as *const _ as usize); // Set up control queue (queue 0) virtio.select_queue(0); @@ -361,9 +653,38 @@ pub fn init() -> Result<(), &'static str> { virtio.set_queue_used(queue_phys + 4096); virtio.set_queue_ready(true); - // Mark device ready + // Cache queue 0 notify address to avoid 2 MMIO reads per notification + virtio.cache_queue_notify_addr(0); + + // Set up MSI interrupt before driver_ok so the device can signal completions + #[cfg(target_arch = "aarch64")] + let msi_spi = setup_gpu_msi(virtio.pci_device()); + #[cfg(not(target_arch = "aarch64"))] + let msi_spi = 0u32; + + // Mark device ready — MUST happen before sending any commands (Linux: virtio_device_ready()) virtio.driver_ok(); + // Enable the MSI SPI after driver_ok so the device can actually fire interrupts + #[cfg(target_arch = "aarch64")] + if msi_spi != 0 { + GPU_IRQ.store(msi_spi, Ordering::Release); + crate::arch_impl::aarch64::gic::enable_spi(msi_spi); + crate::serial_println!("[virtio-gpu-pci] MSI SPI {} enabled", msi_spi); 
+ } + + // Read device-specific config (Linux reads num_scanouts + num_capsets here) + let num_scanouts = virtio.read_config_u32(GPU_CFG_NUM_SCANOUTS); + let num_capsets = virtio.read_config_u32(GPU_CFG_NUM_CAPSETS); + crate::serial_println!("[virtio-gpu-pci] Config: num_scanouts={}, num_capsets={}", num_scanouts, num_capsets); + + // Check and clear pending display events (Linux: virtio_gpu_config_changed_work_func) + let events = virtio.read_config_u32(GPU_CFG_EVENTS_READ); + if events & VIRTIO_GPU_EVENT_DISPLAY != 0 { + crate::serial_println!("[virtio-gpu-pci] Clearing pending DISPLAY event (events_read={:#x})", events); + virtio.write_config_u32(GPU_CFG_EVENTS_CLEAR, events & VIRTIO_GPU_EVENT_DISPLAY); + } + // Store initial state with default dimensions (will be updated after display query) unsafe { let ptr = &raw mut GPU_PCI_STATE; @@ -373,11 +694,23 @@ pub fn init() -> Result<(), &'static str> { height: DEFAULT_FB_HEIGHT, resource_id: RESOURCE_ID, last_used_idx: 0, + next_fence_id: 1, }); } // Don't set GPU_PCI_INITIALIZED yet — the GPU commands below can fail. - // If create_resource/attach_backing/set_scanout/flush time out, leaving - // the flag true would mislead other code into thinking the device is usable. + + // GET_CAPSET_INFO for each capset (Linux does this before GET_DISPLAY_INFO) + for idx in 0..num_capsets { + match get_capset_info(idx) { + Ok((id, max_ver, max_size)) => { + crate::serial_println!("[virtio-gpu-pci] Capset {}: id={}, max_ver={}, max_size={}", + idx, id, max_ver, max_size); + } + Err(e) => { + crate::serial_println!("[virtio-gpu-pci] GET_CAPSET_INFO[{}] failed: {}", idx, e); + } + } + } // Query display info to see what Parallels reports as native resolution. 
let display_dims = get_display_info(); @@ -484,16 +817,15 @@ fn send_command( fence(Ordering::SeqCst); } - // Notify device via PCI transport - state.device.notify_queue(0); + // Signal that we're waiting for a completion, then notify device + GPU_CMD_COMPLETE.store(false, Ordering::Release); + state.device.notify_queue_fast(0); - // Spin-wait for used ring. - // The timeout must be generous: TRANSFER_TO_HOST_2D transfers up to 4MB - // (full framebuffer) and QEMU processes this in its event loop. - // 10M iterations is safe. + // Wait for used ring update — WFI if MSI is available, spin_loop otherwise. + let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; let mut timeout = 10_000_000u32; loop { - fence(Ordering::SeqCst); + fence(Ordering::Acquire); let used_idx = unsafe { let q = &raw const PCI_CTRL_QUEUE; read_volatile(&(*q).used.idx) @@ -506,7 +838,17 @@ fn send_command( if timeout == 0 { return Err("GPU PCI command timeout"); } - core::hint::spin_loop(); + if use_msi { + // WFI halts the vCPU until an interrupt arrives. The hypervisor + // processes the VirtIO command while the guest is halted, then + // delivers the MSI interrupt to wake us. 
+ #[cfg(target_arch = "aarch64")] + unsafe { core::arch::asm!("wfi", options(nomem, nostack)); } + #[cfg(not(target_arch = "aarch64"))] + core::hint::spin_loop(); + } else { + core::hint::spin_loop(); + } } Ok(()) @@ -528,12 +870,18 @@ fn send_command_expect_ok( )?; // Read response — use read_volatile to defeat caching (DMA coherency) - let resp_type = unsafe { + let (resp_type, resp_flags, resp_fence) = unsafe { let resp_ptr = &raw const PCI_RESP_BUF; - core::ptr::read_volatile(&(*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr)).type_) + let hdr = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr); + ( + core::ptr::read_volatile(&hdr.type_), + core::ptr::read_volatile(&hdr.flags), + core::ptr::read_volatile(&hdr.fence_id), + ) }; if resp_type != cmd::RESP_OK_NODATA { - crate::serial_println!("[virtio-gpu-pci] Command failed: resp_type={:#x}", resp_type); + crate::serial_println!("[virtio-gpu-pci] Command failed: resp_type={:#x} flags={:#x} fence={}", + resp_type, resp_flags, resp_fence); return Err("GPU PCI command failed"); } Ok(()) @@ -591,6 +939,57 @@ fn get_display_info() -> Result<(u32, u32), &'static str> { }) } +/// Query capability set info (Linux: virtio_gpu_get_capsets). +/// Returns (capset_id, max_version, max_size). 
+fn get_capset_info(capset_index: u32) -> Result<(u32, u32, u32), &'static str> { + with_device_state(|state| { + let cmd_phys = virt_to_phys(&raw const PCI_CMD_BUF as u64); + let resp_phys = virt_to_phys(&raw const PCI_RESP_BUF as u64); + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuGetCapsetInfo); + *cmd = VirtioGpuGetCapsetInfo { + hdr: VirtioGpuCtrlHdr { + type_: cmd::GET_CAPSET_INFO, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + capset_index, + padding: 0, + }; + } + + send_command( + state, + cmd_phys, + core::mem::size_of::() as u32, + resp_phys, + core::mem::size_of::() as u32, + )?; + + unsafe { + let resp_ptr = &raw const PCI_RESP_BUF; + let resp = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuRespCapsetInfo); + let resp_type = core::ptr::read_volatile(&resp.hdr.type_); + + if resp_type != cmd::RESP_OK_CAPSET_INFO { + crate::serial_println!("[virtio-gpu-pci] GET_CAPSET_INFO: unexpected resp_type={:#x} (expected {:#x})", + resp_type, cmd::RESP_OK_CAPSET_INFO); + return Err("GET_CAPSET_INFO failed"); + } + + Ok(( + core::ptr::read_volatile(&resp.capset_id), + core::ptr::read_volatile(&resp.capset_max_version), + core::ptr::read_volatile(&resp.capset_max_size), + )) + } + }) +} + fn create_resource() -> Result<(), &'static str> { with_device_state(|state| { framebuffer_len(state)?; @@ -755,7 +1154,503 @@ fn resource_flush_cmd( } // ============================================================================= -// Public API +// 3D (VirGL) Command Helpers +// ============================================================================= + +/// Create a VirGL 3D rendering context. 
+fn virgl_ctx_create_cmd(state: &mut GpuPciDeviceState, ctx_id: u32, name: &[u8]) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCtxCreate); + let mut debug_name = [0u8; 64]; + let copy_len = name.len().min(63); + debug_name[..copy_len].copy_from_slice(&name[..copy_len]); + *cmd = VirtioGpuCtxCreate { + hdr: VirtioGpuCtrlHdr { + type_: cmd::CTX_CREATE, + flags: 0, + fence_id: 0, + ctx_id, + padding: 0, + }, + nlen: copy_len as u32, + context_init: 0, // VirGL context + debug_name, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Attach a resource to a VirGL context. +fn virgl_ctx_attach_resource_cmd(state: &mut GpuPciDeviceState, ctx_id: u32, resource_id: u32) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCtxResource); + *cmd = VirtioGpuCtxResource { + hdr: VirtioGpuCtrlHdr { + type_: cmd::CTX_ATTACH_RESOURCE, + flags: 0, + fence_id: 0, + ctx_id, + padding: 0, + }, + resource_id, + padding: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Create a 3D resource (texture / render target / buffer). 
+fn virgl_resource_create_3d_cmd( + state: &mut GpuPciDeviceState, + resource_id: u32, + target: u32, + fmt: u32, + bind: u32, + width: u32, + height: u32, + depth: u32, + array_size: u32, +) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuResourceCreate3d); + *cmd = VirtioGpuResourceCreate3d { + hdr: VirtioGpuCtrlHdr { + type_: cmd::RESOURCE_CREATE_3D, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + resource_id, + target, + format: fmt, + bind, + width, + height, + depth, + array_size, + last_level: 0, + nr_samples: 0, + flags: 0, + padding: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Attach backing memory to a 3D resource. +/// +/// Uses PCI_3D_FRAMEBUFFER (separate BSS RAM) as the backing store. +/// CRITICAL: Must NOT share backing with the 2D resource (PCI_FRAMEBUFFER). +/// Linux's Mesa/virgl creates independent GEM buffers for each resource. +/// Sharing backing may cause the hypervisor to mishandle SET_SCANOUT. 
+fn virgl_attach_backing_cmd(state: &mut GpuPciDeviceState, resource_id: u32) -> Result<(), &'static str> { + let fb_addr = virt_to_phys(&raw const PCI_3D_FRAMEBUFFER as u64); + let actual_len = (state.width * state.height * 4).min(FB_3D_SIZE as u32); + crate::serial_println!("[virgl] attach_backing: 3D RAM phys=0x{:x}, len={} (SEPARATE from 2D)", fb_addr, actual_len); + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut PciAttachBackingCmd); + *cmd = PciAttachBackingCmd { + cmd: VirtioGpuResourceAttachBacking { + hdr: VirtioGpuCtrlHdr { + type_: cmd::RESOURCE_ATTACH_BACKING, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + resource_id, + nr_entries: 1, + }, + entry: VirtioGpuMemEntry { + addr: fb_addr, + length: actual_len, + padding: 0, + }, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Flush a specific resource to the display (SET_SCANOUT must point at it first). +fn resource_flush_3d(state: &mut GpuPciDeviceState, resource_id: u32) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuResourceFlush); + *cmd = VirtioGpuResourceFlush { + hdr: VirtioGpuCtrlHdr { + type_: cmd::RESOURCE_FLUSH, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + r_x: 0, + r_y: 0, + r_width: state.width, + r_height: state.height, + resource_id, + padding: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Transfer a 3D resource from guest backing to host texture (upload). +/// +/// NOTE: Not used for VirGL-rendered 3D resources — the host GPU already has +/// the rendered data. Kept for potential future use with CPU-written resources. 
+#[allow(dead_code)] +fn transfer_to_host_3d( + state: &mut GpuPciDeviceState, + resource_id: u32, + x: u32, + y: u32, + w: u32, + h: u32, +) -> Result<(), &'static str> { + let stride = state.width * 4; + let offset = (y as u64) * (stride as u64) + (x as u64) * 4; + + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuTransferHost3d); + *cmd = VirtioGpuTransferHost3d { + hdr: VirtioGpuCtrlHdr { + type_: cmd::TRANSFER_TO_HOST_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id: VIRGL_CTX_ID, + padding: 0, + }, + box_x: x, + box_y: y, + box_z: 0, + box_w: w, + box_h: h, + box_d: 1, + offset, + resource_id, + level: 0, + stride, + layer_stride: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Transfer a 3D resource from host texture to guest backing (readback/download). +/// +/// After VirGL renders to the host GPU texture, this copies the rendered pixels +/// back into the resource's guest-side backing memory. If the backing is BAR0, +/// this is a host-side DMA that writes directly to the display framebuffer — +/// bypassing the 6 MB/s guest CPU MMIO bottleneck entirely. 
+#[allow(dead_code)] +fn transfer_from_host_3d( + state: &mut GpuPciDeviceState, + resource_id: u32, + x: u32, + y: u32, + w: u32, + h: u32, +) -> Result<(), &'static str> { + let stride = state.width * 4; + let offset = (y as u64) * (stride as u64) + (x as u64) * 4; + + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuTransferHost3d); + *cmd = VirtioGpuTransferHost3d { + hdr: VirtioGpuCtrlHdr { + type_: cmd::TRANSFER_FROM_HOST_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id: VIRGL_CTX_ID, + padding: 0, + }, + box_x: x, + box_y: y, + box_z: 0, + box_w: w, + box_h: h, + box_d: 1, + offset, + resource_id, + level: 0, + stride, + layer_stride: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Submit a VirGL command buffer via SUBMIT_3D. +/// +/// Uses a 3-descriptor chain matching the Linux kernel driver layout: +/// Desc 0: VirtioGpuCmdSubmit header (device-readable) +/// Desc 1: VirGL command data (device-readable) +/// Desc 2: Response header (device-writable) +fn virgl_submit_3d_cmd( + state: &mut GpuPciDeviceState, + ctx_id: u32, + cmds: &[u32], +) -> Result<(), &'static str> { + let payload_bytes = cmds.len() * 4; + + if payload_bytes > 16384 { + return Err("VirGL command buffer too large"); + } + + // Allocate a fence ID for this submission so the host signals completion + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + // Write the Submit3D header into PCI_CMD_BUF + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let hdr = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCmdSubmit); + *hdr = VirtioGpuCmdSubmit { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SUBMIT_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id, + padding: 0, + }, + size: payload_bytes as u32, + }; + } + + // Copy VirGL command data into PCI_3D_CMD_BUF + unsafe { + let buf_ptr = &raw mut 
PCI_3D_CMD_BUF; + let dst = (*buf_ptr).data.as_mut_ptr() as *mut u32; + core::ptr::copy_nonoverlapping(cmds.as_ptr(), dst, cmds.len()); + } + + let hdr_phys = virt_to_phys(&raw const PCI_CMD_BUF as u64); + let data_phys = virt_to_phys(&raw const PCI_3D_CMD_BUF as u64); + let resp_phys = virt_to_phys(&raw const PCI_RESP_BUF as u64); + let hdr_len = core::mem::size_of::() as u32; + let resp_len = core::mem::size_of::() as u32; + + // Build a 3-descriptor chain + unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + + // Desc 0: Submit3D header (device reads) + (*q).desc[0] = VirtqDesc { + addr: hdr_phys, + len: hdr_len, + flags: DESC_F_NEXT, + next: 1, + }; + + // Desc 1: VirGL command payload (device reads) + (*q).desc[1] = VirtqDesc { + addr: data_phys, + len: payload_bytes as u32, + flags: DESC_F_NEXT, + next: 2, + }; + + // Desc 2: Response (device writes) + (*q).desc[2] = VirtqDesc { + addr: resp_phys, + len: resp_len, + flags: DESC_F_WRITE, + next: 0, + }; + + // Add to available ring + let idx = (*q).avail.idx; + (*q).avail.ring[(idx % 16) as usize] = 0; // head of chain = desc 0 + fence(Ordering::SeqCst); + (*q).avail.idx = idx.wrapping_add(1); + fence(Ordering::SeqCst); + } + + // Notify and wait for completion — WFI if MSI available + GPU_CMD_COMPLETE.store(false, Ordering::Release); + state.device.notify_queue_fast(0); + + let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; + let mut timeout = 10_000_000u32; + loop { + fence(Ordering::Acquire); + let used_idx = unsafe { + let q = &raw const PCI_CTRL_QUEUE; + read_volatile(&(*q).used.idx) + }; + if used_idx != state.last_used_idx { + state.last_used_idx = used_idx; + break; + } + timeout -= 1; + if timeout == 0 { + return Err("SUBMIT_3D timeout"); + } + if use_msi { + #[cfg(target_arch = "aarch64")] + unsafe { core::arch::asm!("wfi", options(nomem, nostack)); } + #[cfg(not(target_arch = "aarch64"))] + core::hint::spin_loop(); + } else { + core::hint::spin_loop(); + } + } + + // Check response — read fence info to 
verify the host echoed our fence_id + let (resp_type, resp_flags, resp_fence) = unsafe { + let resp_ptr = &raw const PCI_RESP_BUF; + let hdr = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr); + ( + core::ptr::read_volatile(&hdr.type_), + core::ptr::read_volatile(&hdr.flags), + core::ptr::read_volatile(&hdr.fence_id), + ) + }; + if resp_type != cmd::RESP_OK_NODATA { + crate::serial_println!("[virtio-gpu-pci] SUBMIT_3D failed: resp={:#x} flags={:#x} fence={}", + resp_type, resp_flags, resp_fence); + return Err("SUBMIT_3D command failed"); + } + // Log fence acknowledgement periodically (init + every 500th frame) + if fence_id <= 5 || fence_id % 500 == 0 { + crate::serial_println!("[virgl] SUBMIT_3D OK: sent fence={} resp_flags={:#x} resp_fence={}", + fence_id, resp_flags, resp_fence); + } + Ok(()) +} + +/// Wait for the host to confirm a GPU fence has completed. +/// +/// Parallels returns SUBMIT_3D responses immediately (resp_flags=0x0) before +/// the GPU work finishes. The actual fence completion is reported in +/// subsequent command responses via resp_fence. This function sends NOP +/// SUBMIT_3D commands and polls until resp_fence >= target_fence_id. 
+#[allow(dead_code)] +fn virgl_fence_sync(state: &mut GpuPciDeviceState, target_fence_id: u64) -> Result<(), &'static str> { + use super::virgl::CommandBuffer; + + // Try up to 100 rounds of polling (each takes ~50-100us on Parallels) + for _ in 0..100 { + let mut cmdbuf = CommandBuffer::new(); + cmdbuf.set_sub_ctx(1); // NOP — just re-sets the active sub-context + + let payload = cmdbuf.as_slice(); + let payload_bytes = payload.len() * 4; + + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let hdr = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCmdSubmit); + *hdr = VirtioGpuCmdSubmit { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SUBMIT_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id: VIRGL_CTX_ID, + padding: 0, + }, + size: payload_bytes as u32, + }; + + let buf_ptr = &raw mut PCI_3D_CMD_BUF; + let dst = (*buf_ptr).data.as_mut_ptr() as *mut u32; + core::ptr::copy_nonoverlapping(payload.as_ptr(), dst, payload.len()); + } + + let hdr_phys = virt_to_phys(&raw const PCI_CMD_BUF as u64); + let data_phys = virt_to_phys(&raw const PCI_3D_CMD_BUF as u64); + let resp_phys = virt_to_phys(&raw const PCI_RESP_BUF as u64); + let hdr_len = core::mem::size_of::() as u32; + let resp_len = core::mem::size_of::() as u32; + + unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + (*q).desc[0] = VirtqDesc { addr: hdr_phys, len: hdr_len, flags: DESC_F_NEXT, next: 1 }; + (*q).desc[1] = VirtqDesc { addr: data_phys, len: payload_bytes as u32, flags: DESC_F_NEXT, next: 2 }; + (*q).desc[2] = VirtqDesc { addr: resp_phys, len: resp_len, flags: DESC_F_WRITE, next: 0 }; + let idx = (*q).avail.idx; + (*q).avail.ring[(idx % 16) as usize] = 0; + fence(Ordering::SeqCst); + (*q).avail.idx = idx.wrapping_add(1); + fence(Ordering::SeqCst); + } + + state.device.notify_queue(0); + + // Spin-wait for response + let mut timeout = 10_000_000u32; + loop { + fence(Ordering::SeqCst); + let used_idx = unsafe { + let q = &raw const 
PCI_CTRL_QUEUE; + read_volatile(&(*q).used.idx) + }; + if used_idx != state.last_used_idx { + state.last_used_idx = used_idx; + break; + } + timeout -= 1; + if timeout == 0 { return Err("fence sync timeout"); } + core::hint::spin_loop(); + } + + // Check if the host reported our target fence as complete + let resp_fence = unsafe { + let resp_ptr = &raw const PCI_RESP_BUF; + let hdr = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr); + core::ptr::read_volatile(&hdr.fence_id) + }; + + if resp_fence >= target_fence_id { + return Ok(()); + } + } + + Err("fence sync: target fence never completed") +} + +/// Set scanout to a specific resource ID (used for 3D render targets). +fn set_scanout_resource(state: &mut GpuPciDeviceState, resource_id: u32) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuSetScanout); + *cmd = VirtioGpuSetScanout { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SET_SCANOUT, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + r_x: 0, + r_y: 0, + r_width: state.width, + r_height: state.height, + scanout_id: 0, + resource_id, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +// ============================================================================= +// Public API (2D) // ============================================================================= /// Flush the entire framebuffer to the display. @@ -797,7 +1692,7 @@ pub fn dimensions() -> Option<(u32, u32)> { } } -/// Get a mutable reference to the framebuffer pixels. +/// Get a mutable reference to the PCI_FRAMEBUFFER pixels. 
#[allow(dead_code)] pub fn framebuffer() -> Option<&'static mut [u8]> { unsafe { @@ -811,3 +1706,424 @@ pub fn framebuffer() -> Option<&'static mut [u8]> { } } } + +// ============================================================================= +// Public API (3D / VirGL) +// ============================================================================= + +/// Ball descriptor passed from userspace for GPU rendering. +#[repr(C)] +#[derive(Clone, Copy, Default)] +pub struct VirglBall { + /// X position in pixels (f32 bits) + pub x: f32, + /// Y position in pixels (f32 bits) + pub y: f32, + /// Radius in pixels (f32 bits) + pub radius: f32, + /// Color as [R, G, B, A] each 0.0-1.0 + pub color: [f32; 4], +} + +/// Render a frame with the VirGL GPU pipeline. +/// +/// Clears to background color, draws circles for each ball, submits to host +/// GPU, then issues RESOURCE_FLUSH to display the result. +pub fn virgl_render_frame( + balls: &[VirglBall], + bg_r: f32, + bg_g: f32, + bg_b: f32, +) -> Result<(), &'static str> { + use super::virgl::{CommandBuffer, pipe}; + + static FRAME_COUNT: core::sync::atomic::AtomicU32 = core::sync::atomic::AtomicU32::new(0); + let frame = FRAME_COUNT.fetch_add(1, core::sync::atomic::Ordering::Relaxed); + let verbose = frame < 3 || frame % 500 == 0; + if verbose { + crate::serial_println!("[virgl] render_frame #{} ({} balls)", frame, balls.len()); + } + + if !is_virgl_enabled() { + return Err("VirGL not enabled"); + } + + if !VIRGL_SCANOUT_ACTIVE.load(Ordering::Acquire) { + VIRGL_SCANOUT_ACTIVE.store(true, Ordering::Release); + crate::serial_println!("[virgl] first VirGL frame #{}", frame); + } + + let (width, height) = match dimensions() { + Some(d) => d, + None => return Err("GPU not initialized"), + }; + let fw = width as f32; + let fh = height as f32; + + let mut cmdbuf = CommandBuffer::new(); + if verbose { + crate::serial_println!("[virgl] frame #{}: cmdbuf created, setting FB state", frame); + } + + cmdbuf.set_sub_ctx(1); + + // 
Re-emit ALL pipeline state each frame. Mesa's virgl driver re-emits + // dirty state before every draw; Parallels may reset context state between + // SUBMIT_3D batches, so we must not assume anything persists. + cmdbuf.bind_shader(1, pipe::SHADER_VERTEX); + cmdbuf.bind_shader(2, pipe::SHADER_FRAGMENT); + cmdbuf.bind_object(1, super::virgl::OBJ_BLEND); + cmdbuf.bind_object(1, super::virgl::OBJ_DSA); + cmdbuf.bind_object(1, super::virgl::OBJ_RASTERIZER); + cmdbuf.bind_object(1, super::virgl::OBJ_VERTEX_ELEMENTS); + cmdbuf.set_viewport(fw, fh); + cmdbuf.set_framebuffer_state(0, &[1]); // surface_handle=1, no depth + + // Clear to background color + cmdbuf.clear_color(bg_r, bg_g, bg_b, 1.0); + + // For each ball, generate a triangle fan and draw it + let ball_count = balls.len().min(MAX_CIRCLES); + if verbose { + crate::serial_println!("[virgl] frame #{}: drawing {} balls (with full state re-emit)", frame, ball_count); + } + + for (i, ball) in balls[..ball_count].iter().enumerate() { + let cx = ball.x; + let cy = ball.y; + let r = ball.radius; + let [cr, cg, cb, ca] = ball.color; + + // Convert pixel coords to NDC: x_ndc = (2*x/width - 1), y_ndc = (1 - 2*y/height) + // VirGL with our viewport transform already maps clip coords to screen pixels, + // but the vertex shader outputs POSITION in clip space. With our viewport of + // (width/2, -height/2) scale + (width/2, height/2) translate, clip space + // [-1,1] maps to [0,width] and [1,-1] maps to [0,height]. 
+ let cx_ndc = 2.0 * cx / fw - 1.0; + let cy_ndc = 1.0 - 2.0 * cy / fh; + let rx_ndc = 2.0 * r / fw; + let ry_ndc = 2.0 * r / fh; + + // Build triangle fan: center + CIRCLE_SEGMENTS perimeter + 1 closing vertex + let mut verts = [0u32; VERTS_PER_CIRCLE * 8]; // 8 u32 per vertex (pos4 + col4) + + // Center vertex + verts[0] = cx_ndc.to_bits(); + verts[1] = cy_ndc.to_bits(); + verts[2] = 0f32.to_bits(); // z = 0 + verts[3] = 1.0f32.to_bits(); // w = 1 + verts[4] = cr.to_bits(); + verts[5] = cg.to_bits(); + verts[6] = cb.to_bits(); + verts[7] = ca.to_bits(); + + // Perimeter vertices + closing vertex + // Precomputed cos/sin for 16-segment circle (2π/16 = π/8 increments) + const COS_TABLE: [f32; 17] = [ + 1.0, 0.92388, 0.70711, 0.38268, 0.0, + -0.38268, -0.70711, -0.92388, -1.0, + -0.92388, -0.70711, -0.38268, 0.0, + 0.38268, 0.70711, 0.92388, 1.0, // closing = first + ]; + const SIN_TABLE: [f32; 17] = [ + 0.0, 0.38268, 0.70711, 0.92388, 1.0, + 0.92388, 0.70711, 0.38268, 0.0, + -0.38268, -0.70711, -0.92388, -1.0, + -0.92388, -0.70711, -0.38268, 0.0, // closing = first + ]; + for seg in 0..=CIRCLE_SEGMENTS { + let cos_a = COS_TABLE[seg]; + let sin_a = SIN_TABLE[seg]; + let vx = cx_ndc + rx_ndc * cos_a; + let vy = cy_ndc + ry_ndc * sin_a; + let base = (seg + 1) * 8; + verts[base] = vx.to_bits(); + verts[base + 1] = vy.to_bits(); + verts[base + 2] = 0f32.to_bits(); + verts[base + 3] = 1.0f32.to_bits(); + verts[base + 4] = cr.to_bits(); + verts[base + 5] = cg.to_bits(); + verts[base + 6] = cb.to_bits(); + verts[base + 7] = ca.to_bits(); + } + + let vb_offset = (i * VERTS_PER_CIRCLE * BYTES_PER_VERTEX) as u32; + let vb_bytes = (VERTS_PER_CIRCLE * BYTES_PER_VERTEX) as u32; + + // Upload vertex data inline + cmdbuf.resource_inline_write( + RESOURCE_VB_ID, + vb_offset, + vb_bytes, + &verts[..VERTS_PER_CIRCLE * 8], + ); + + // Bind vertex buffer with correct offset for this circle + cmdbuf.set_vertex_buffers(&[(BYTES_PER_VERTEX as u32, vb_offset, RESOURCE_VB_ID)]); + + // Draw 
triangle fan + cmdbuf.draw_vbo( + 0, // start = 0 (relative to VB offset) + VERTS_PER_CIRCLE as u32, // count + pipe::PRIM_TRIANGLE_FAN, + (VERTS_PER_CIRCLE - 1) as u32, // max_index + ); + } + + // Submit VirGL commands to host GPU + if verbose { + crate::serial_println!("[virgl] frame #{}: submitting {} DWORDs ({} bytes)", + frame, cmdbuf.as_slice().len(), cmdbuf.byte_len()); + } + match virgl_submit(cmdbuf.as_slice()) { + Ok(()) => { + if verbose { + crate::serial_println!("[virgl] frame #{}: SUBMIT_3D done", frame); + } + } + Err(e) => { + crate::serial_println!("[virgl] frame #{}: SUBMIT_3D FAILED: {}", frame, e); + return Err(e); + } + } + + // SET_SCANOUT only on first frame (scanout target doesn't change between frames). + // RESOURCE_FLUSH every frame to tell the hypervisor to re-scan the texture. + static SCANOUT_SET: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false); + match with_device_state(|state| { + if !SCANOUT_SET.load(core::sync::atomic::Ordering::Relaxed) { + set_scanout_resource(state, RESOURCE_3D_ID)?; + SCANOUT_SET.store(true, core::sync::atomic::Ordering::Relaxed); + } + resource_flush_3d(state, RESOURCE_3D_ID) + }) { + Ok(()) => {} + Err(e) => { + crate::serial_println!("[virgl] frame #{}: SET_SCANOUT/FLUSH FAILED: {}", frame, e); + return Err(e); + } + } + + Ok(()) +} + +/// Submit a VirGL command buffer for the active 3D context. +/// +/// `cmds` is a slice of u32 DWORDs from a VirGL CommandBuffer. +pub fn virgl_submit(cmds: &[u32]) -> Result<(), &'static str> { + with_device_state(|state| { + virgl_submit_3d_cmd(state, VIRGL_CTX_ID, cmds) + }) +} + +/// Copy PCI_3D_FRAMEBUFFER (RAM) → BAR0 (display memory). +/// +/// After TRANSFER_FROM_HOST_3D copies GPU-rendered pixels to PCI_3D_FRAMEBUFFER, +/// this copies them to BAR0 so they appear on screen. 
+#[allow(dead_code)] +fn copy_3d_framebuffer_to_bar0(width: u32, height: u32) { + let bar0_virt = crate::graphics::arm64_fb::gop_framebuffer(); + let fb_bytes = (width * height * 4) as usize; + if let Some(bar0) = bar0_virt { + let copy_len = fb_bytes.min(bar0.len()).min(FB_3D_SIZE); + unsafe { + let src = &raw const PCI_3D_FRAMEBUFFER; + core::ptr::copy_nonoverlapping( + (*src).pixels.as_ptr(), + bar0.as_mut_ptr(), + copy_len, + ); + } + } +} + +/// Flush the VirGL render target to the display. +/// SET_SCANOUT + RESOURCE_FLUSH — matching Linux's display path. +pub fn virgl_flush() -> Result<(), &'static str> { + if !is_virgl_enabled() { + return Err("VirGL display not available"); + } + with_device_state(|state| { + set_scanout_resource(state, RESOURCE_3D_ID)?; + resource_flush_3d(state, RESOURCE_3D_ID) + }) +} + +// ============================================================================= +// VirGL Initialization (Phase 3: proof-of-pipeline) +// ============================================================================= + +/// Initialize the VirGL 3D pipeline if VIRGL was negotiated. +/// +/// Creates a 3D rendering context, a render target resource matching the +/// scanout dimensions, submits a clear command, and sets scanout to the +/// 3D resource. If successful, the screen shows a solid color rendered +/// entirely by the host GPU — no BAR0 pixel writes. 
+pub fn virgl_init() -> Result<(), &'static str> { + use super::virgl::{CommandBuffer, format as vfmt, pipe}; + + crate::serial_println!("[virgl_init] VIRGL_ENABLED={} addr={:#x}", + VIRGL_ENABLED.load(Ordering::Acquire), + &VIRGL_ENABLED as *const _ as usize); + if !is_virgl_enabled() { + return Err("VirGL not supported"); + } + + crate::serial_println!("[virtio-gpu-pci] Initializing VirGL 3D pipeline (v10: separate backing + fence sync + capset init)..."); + + let (width, height) = dimensions().ok_or("GPU not initialized")?; + + // Step 1: Create 3D context + with_device_state(|state| { + virgl_ctx_create_cmd(state, VIRGL_CTX_ID, b"breenix") + })?; + crate::serial_println!("[virgl] Step 1: context created (ctx_id={})", VIRGL_CTX_ID); + + // Step 2: Create 3D resource with bind flags matching Linux Mesa/virgl. + // Linux strace shows bind=0x0014000a = RENDER_TARGET|SAMPLER_VIEW|SCANOUT|SHARED. + // CRITICAL: Must use B8G8R8X8_UNORM (XRGB8888) — ARGB8888 causes EINVAL. + let bind_flags = pipe::BIND_RENDER_TARGET | pipe::BIND_SAMPLER_VIEW + | pipe::BIND_SCANOUT | pipe::BIND_SHARED; + with_device_state(|state| { + virgl_resource_create_3d_cmd( + state, + RESOURCE_3D_ID, + pipe::TEXTURE_2D, + vfmt::B8G8R8X8_UNORM, + bind_flags, + width, + height, + 1, // depth + 1, // array_size + ) + })?; + crate::serial_println!("[virgl] Step 2: 3D resource created (id={}, {}x{}, B8G8R8X8_UNORM, bind=0x{:08x})", RESOURCE_3D_ID, width, height, bind_flags); + + // Step 3: Attach SEPARATE backing memory (PCI_3D_FRAMEBUFFER, NOT shared with 2D resource) + with_device_state(|state| { + virgl_attach_backing_cmd(state, RESOURCE_3D_ID) + })?; + crate::serial_println!("[virgl] Step 3: separate backing attached"); + + // Step 4: Attach 3D resource to VirGL context + with_device_state(|state| { + virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, RESOURCE_3D_ID) + })?; + crate::serial_println!("[virgl] Step 4: 3D resource attached to context"); + + // Step 5: Create sub-context + pipeline 
state objects + let mut cmdbuf = CommandBuffer::new(); + cmdbuf.create_sub_ctx(1); + cmdbuf.set_sub_ctx(1); + + let vs_text = b"VERT\nDCL IN[0], POSITION\nDCL IN[1], GENERIC[0]\nDCL OUT[0], POSITION\nDCL OUT[1], GENERIC[0]\n 0: MOV OUT[0], IN[0]\n 1: MOV OUT[1], IN[1]\n 2: END\n"; + cmdbuf.create_shader(1, pipe::SHADER_VERTEX, vs_text); + let fs_text = b"FRAG\nDCL IN[0], GENERIC[0], PERSPECTIVE\nDCL OUT[0], COLOR\n 0: MOV OUT[0], IN[0]\n 1: END\n"; + cmdbuf.create_shader(2, pipe::SHADER_FRAGMENT, fs_text); + + cmdbuf.create_blend_simple(1); + cmdbuf.create_dsa_disabled(1); + cmdbuf.create_rasterizer_default(1); + cmdbuf.create_vertex_elements(1, &[ + (0, 0, 0, vfmt::R32G32B32A32_FLOAT), + (16, 0, 0, vfmt::R32G32B32A32_FLOAT), + ]); + + virgl_submit(cmdbuf.as_slice())?; + crate::serial_println!("[virgl] Step 5: pipeline state created"); + + // Step 6: Bind state, create surface on 3D resource, clear to cornflower blue + cmdbuf.clear(); + cmdbuf.set_sub_ctx(1); + cmdbuf.bind_shader(1, pipe::SHADER_VERTEX); + cmdbuf.bind_shader(2, pipe::SHADER_FRAGMENT); + cmdbuf.bind_object(1, super::virgl::OBJ_BLEND); + cmdbuf.bind_object(1, super::virgl::OBJ_DSA); + cmdbuf.bind_object(1, super::virgl::OBJ_RASTERIZER); + cmdbuf.bind_object(1, super::virgl::OBJ_VERTEX_ELEMENTS); + cmdbuf.set_viewport(width as f32, height as f32); + + let surface_handle = 1u32; + cmdbuf.create_surface(surface_handle, RESOURCE_3D_ID, vfmt::B8G8R8X8_UNORM, 0, 0); + cmdbuf.set_framebuffer_state(0, &[surface_handle]); + cmdbuf.clear_color(0.392, 0.584, 0.929, 1.0); + + virgl_submit(cmdbuf.as_slice())?; + crate::serial_println!("[virgl] Step 6: VirGL clear submitted to host GPU"); + + // Step 7: Parallels processes SUBMIT_3D synchronously — used ring completion + // means the GPU work is done. No fence sync needed (Parallels returns + // resp_fence=0 for all responses, so virgl_fence_sync doesn't work). 
+ crate::serial_println!("[virgl] Step 7: SUBMIT_3D sync completed (Parallels processes synchronously)"); + + // Step 8: SKIPPED — Green pixel fill removed. + // PCI_3D_FRAMEBUFFER in BSS overlaps with the Parallels boot stack at phys + // 0x42000000. Writing 7.5MB of pixel data overwrites the stack frames and + // corrupts return addresses. The VirGL clear in Step 6 already put cornflower + // blue in the host GPU texture. SET_SCANOUT + RESOURCE_FLUSH should display + // that if 3D resource scanout works on Parallels. + crate::serial_println!("[virgl] Step 8: skipped green fill (BSS overlaps Parallels boot stack)"); + + // Step 9: Switch display to 3D resource. + // First disable current scanout (resource_id=0), then enable with 3D resource. + // This mimics Linux DRM modesetting which does a full scanout reconfiguration. + with_device_state(|state| { + // Disable current scanout + crate::serial_println!("[virgl] Step 9: disabling current scanout (resource_id=0)..."); + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuSetScanout); + *cmd = VirtioGpuSetScanout { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SET_SCANOUT, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + r_x: 0, + r_y: 0, + r_width: 0, + r_height: 0, + scanout_id: 0, + resource_id: 0, // disable scanout + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32)?; + crate::serial_println!("[virgl] Step 9: scanout disabled"); + + // Enable scanout with 3D resource + set_scanout_resource(state, RESOURCE_3D_ID)?; + crate::serial_println!("[virgl] Step 9: scanout set to 3D resource (id={})", RESOURCE_3D_ID); + + resource_flush_3d(state, RESOURCE_3D_ID)?; + crate::serial_println!("[virgl] Step 9: RESOURCE_FLUSH done"); + Ok(()) + })?; + + // Step 10: VirGL clear rendered cornflower blue to host texture. + // SET_SCANOUT (Step 9) pointed display at the 3D resource. 
+ crate::serial_println!("[virgl] Step 10: display configured (cornflower blue if SET_SCANOUT works)"); + + // Step 12: Create vertex buffer resource + with_device_state(|state| { + virgl_resource_create_3d_cmd( + state, + RESOURCE_VB_ID, + pipe::BUFFER, + vfmt::R8G8B8A8_UNORM, + pipe::BIND_VERTEX_BUFFER, + VB_SIZE as u32, + 1, 1, 1, + ) + })?; + with_device_state(|state| { + virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, RESOURCE_VB_ID) + })?; + crate::serial_println!("[virgl] Step 12: vertex buffer created (id={}, {}B)", RESOURCE_VB_ID, VB_SIZE); + + crate::serial_println!("[virgl] VirGL 3D pipeline initialized (v10b: TRANSFER_TO_HOST_3D green test)"); + crate::serial_println!("[virgl_init] END: VIRGL_ENABLED={}", VIRGL_ENABLED.load(Ordering::Acquire)); + + Ok(()) +} diff --git a/kernel/src/drivers/virtio/mod.rs b/kernel/src/drivers/virtio/mod.rs index 7e5b8075..38ded38d 100644 --- a/kernel/src/drivers/virtio/mod.rs +++ b/kernel/src/drivers/virtio/mod.rs @@ -39,6 +39,8 @@ pub mod input_mmio; pub mod sound_mmio; #[cfg(target_arch = "aarch64")] pub mod gpu_pci; +#[cfg(target_arch = "aarch64")] +pub mod virgl; #[cfg(target_arch = "x86_64")] pub mod sound; diff --git a/kernel/src/drivers/virtio/pci_transport.rs b/kernel/src/drivers/virtio/pci_transport.rs index 7e3f0f97..e1bb0b1e 100644 --- a/kernel/src/drivers/virtio/pci_transport.rs +++ b/kernel/src/drivers/virtio/pci_transport.rs @@ -184,6 +184,9 @@ pub struct VirtioPciDevice { device_features: u64, /// VirtIO device type ID virtio_device_id: u32, + /// Cached queue notify virtual addresses (avoids 2 MMIO reads per notify). + /// Populated by `cache_queue_notify_addr()` after queue setup. 
+ cached_notify_addrs: [u64; 4], } impl VirtioPciDevice { @@ -294,6 +297,7 @@ impl VirtioPciDevice { device_cfg, device_features: 0, virtio_device_id, + cached_notify_addrs: [0; 4], }) } @@ -478,6 +482,35 @@ impl VirtioPciDevice { } } + /// Cache the notify address for a queue so `notify_queue_fast()` avoids + /// the two MMIO reads (COMMON_Q_SELECT + COMMON_Q_NOFF) on every notify. + /// + /// Must be called after `set_queue_ready(true)` for the given queue. + pub fn cache_queue_notify_addr(&mut self, queue: u32) { + if queue as usize >= self.cached_notify_addrs.len() { + return; + } + self.select_queue(queue); + let queue_notify_off = self.common.read_u16(COMMON_Q_NOFF) as u32; + let offset = (queue_notify_off * self.notify_off_multiplier) as u64; + self.cached_notify_addrs[queue as usize] = self.notify.virt_base + offset; + } + + /// Notify device using the cached notify address — single MMIO write. + /// + /// Falls back to `notify_queue()` if the address hasn't been cached. + #[inline(always)] + pub fn notify_queue_fast(&self, queue: u32) { + let idx = queue as usize; + if idx < self.cached_notify_addrs.len() && self.cached_notify_addrs[idx] != 0 { + unsafe { + core::ptr::write_volatile(self.cached_notify_addrs[idx] as *mut u16, queue as u16); + } + } else { + self.notify_queue(queue); + } + } + // ========================================================================= // Interrupt Handling // ========================================================================= @@ -526,6 +559,14 @@ impl VirtioPciDevice { self.device_cfg.read_u32(offset) } + /// Write a u32 to device-specific configuration. + pub fn write_config_u32(&self, offset: usize, value: u32) { + if !self.device_cfg.is_valid() { + return; + } + self.device_cfg.write_u32(offset, value); + } + /// Read a u64 from device-specific configuration (two u32 reads). 
pub fn read_config_u64(&self, offset: usize) -> u64 { let low = self.read_config_u32(offset) as u64; diff --git a/kernel/src/drivers/virtio/virgl.rs b/kernel/src/drivers/virtio/virgl.rs new file mode 100644 index 00000000..a8316687 --- /dev/null +++ b/kernel/src/drivers/virtio/virgl.rs @@ -0,0 +1,396 @@ +//! VirGL Command Encoder +//! +//! Encodes Gallium3D-style commands into the VirGL wire format for submission +//! via VirtIO GPU SUBMIT_3D. Each command is a sequence of 32-bit DWORDs with +//! a header encoding `(length << 16 | subcmd << 8 | object_type)`. + +// ============================================================================= +// VirGL Command Types (VIRGL_CCMD_*) +// ============================================================================= + +#[allow(dead_code)] +mod ccmd { + pub const NOP: u8 = 0; + pub const CREATE_OBJECT: u8 = 1; + pub const BIND_OBJECT: u8 = 2; + pub const DESTROY_OBJECT: u8 = 3; + pub const SET_VIEWPORT_STATE: u8 = 4; + pub const SET_FRAMEBUFFER_STATE: u8 = 5; + pub const SET_VERTEX_BUFFERS: u8 = 6; + pub const CLEAR: u8 = 7; + pub const DRAW_VBO: u8 = 8; + pub const RESOURCE_INLINE_WRITE: u8 = 9; + pub const SET_SAMPLER_VIEWS: u8 = 10; + pub const SET_INDEX_BUFFER: u8 = 11; + pub const SET_CONSTANT_BUFFER: u8 = 12; + pub const SET_STENCIL_REF: u8 = 13; + pub const SET_BLEND_COLOR: u8 = 14; + pub const SET_SCISSOR_STATE: u8 = 15; + pub const BLIT: u8 = 16; + pub const RESOURCE_COPY_REGION: u8 = 17; + pub const BIND_SAMPLER_STATES: u8 = 18; + pub const SET_SUB_CTX: u8 = 28; + pub const CREATE_SUB_CTX: u8 = 29; + pub const DESTROY_SUB_CTX: u8 = 30; + pub const BIND_SHADER: u8 = 31; +} + +// ============================================================================= +// VirGL Object Types (VIRGL_OBJECT_*) +// ============================================================================= + +#[allow(dead_code)] +mod obj { + pub const NULL: u8 = 0; + pub const BLEND: u8 = 1; + pub const RASTERIZER: u8 = 2; + pub const DSA: u8 = 
3; + pub const SHADER: u8 = 4; + pub const VERTEX_ELEMENTS: u8 = 5; + pub const SAMPLER_VIEW: u8 = 6; + pub const SAMPLER_STATE: u8 = 7; + pub const SURFACE: u8 = 8; + pub const QUERY: u8 = 9; + pub const STREAMOUT_TARGET: u8 = 10; +} + +// Public re-exports for bind_object() callers in gpu_pci.rs +pub const OBJ_BLEND: u8 = obj::BLEND; +pub const OBJ_DSA: u8 = obj::DSA; +pub const OBJ_RASTERIZER: u8 = obj::RASTERIZER; +pub const OBJ_VERTEX_ELEMENTS: u8 = obj::VERTEX_ELEMENTS; + +// ============================================================================= +// VirGL Format Constants (matches Gallium PIPE_FORMAT_*) +// ============================================================================= + +#[allow(dead_code)] +pub mod format { + pub const B8G8R8A8_UNORM: u32 = 1; + pub const B8G8R8X8_UNORM: u32 = 2; + pub const R8G8B8A8_UNORM: u32 = 67; + pub const R32_FLOAT: u32 = 28; + pub const R32G32_FLOAT: u32 = 29; + pub const R32G32B32_FLOAT: u32 = 30; + pub const R32G32B32A32_FLOAT: u32 = 31; +} + +// ============================================================================= +// Pipe Constants +// ============================================================================= + +#[allow(dead_code)] +pub mod pipe { + // Texture targets + pub const BUFFER: u32 = 0; + pub const TEXTURE_2D: u32 = 2; + + // Bind flags + pub const BIND_DEPTH_STENCIL: u32 = 1 << 0; + pub const BIND_RENDER_TARGET: u32 = 1 << 1; + pub const BIND_SAMPLER_VIEW: u32 = 1 << 3; + pub const BIND_VERTEX_BUFFER: u32 = 1 << 4; + pub const BIND_INDEX_BUFFER: u32 = 1 << 5; + pub const BIND_CONSTANT_BUFFER: u32 = 1 << 6; + pub const BIND_SCANOUT: u32 = 1 << 18; + pub const BIND_SHARED: u32 = 1 << 20; + + // Clear buffer flags + pub const CLEAR_DEPTH: u32 = 0x01; + pub const CLEAR_STENCIL: u32 = 0x02; + pub const CLEAR_COLOR0: u32 = 0x04; + + // Primitive types + pub const PRIM_POINTS: u32 = 0; + pub const PRIM_LINES: u32 = 1; + pub const PRIM_TRIANGLES: u32 = 4; + pub const PRIM_TRIANGLE_STRIP: u32 = 
5; + pub const PRIM_TRIANGLE_FAN: u32 = 6; + + // Shader types + pub const SHADER_VERTEX: u32 = 0; + pub const SHADER_FRAGMENT: u32 = 1; +} + +// ============================================================================= +// Command Buffer +// ============================================================================= + +/// Fixed-capacity VirGL command buffer. Accumulates u32 DWORDs for submission +/// via VIRTIO_GPU_CMD_SUBMIT_3D. +pub struct CommandBuffer { + data: [u32; 3072], // 12KB — large enough for 12 circle draws with inline vertex data + len: usize, +} + +impl CommandBuffer { + pub const fn new() -> Self { + Self { + data: [0u32; 3072], + len: 0, + } + } + + /// Reset the buffer for reuse. + pub fn clear(&mut self) { + self.len = 0; + } + + /// Get the command data as a u32 slice. + pub fn as_slice(&self) -> &[u32] { + &self.data[..self.len] + } + + /// Size in bytes. + pub fn byte_len(&self) -> usize { + self.len * 4 + } + + /// Push a single DWORD. + fn push(&mut self, val: u32) { + if self.len < self.data.len() { + self.data[self.len] = val; + self.len += 1; + } + } + + /// Push a slice of DWORDs. + fn push_slice(&mut self, vals: &[u32]) { + for &v in vals { + self.push(v); + } + } + + // ========================================================================= + // Command Encoders + // ========================================================================= + + /// Encode a VirGL command header. + #[inline] + fn cmd0(cmd: u8, obj: u8, len: u16) -> u32 { + (cmd as u32) | ((obj as u32) << 8) | ((len as u32) << 16) + } + + /// Create a sub-context. + pub fn create_sub_ctx(&mut self, sub_ctx_id: u32) { + self.push(Self::cmd0(ccmd::CREATE_SUB_CTX, 0, 1)); + self.push(sub_ctx_id); + } + + /// Set the active sub-context. + pub fn set_sub_ctx(&mut self, sub_ctx_id: u32) { + self.push(Self::cmd0(ccmd::SET_SUB_CTX, 0, 1)); + self.push(sub_ctx_id); + } + + /// Create a surface object wrapping a resource. 
+ pub fn create_surface(&mut self, handle: u32, res_handle: u32, fmt: u32, level: u32, layers: u32) { + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::SURFACE, 5)); + self.push(handle); + self.push(res_handle); + self.push(fmt); + self.push(level); + self.push(layers); // first_layer | (last_layer << 16) + } + + /// Create a blend state (no blending, write all color channels). + pub fn create_blend_simple(&mut self, handle: u32) { + // len=11: handle + S0 + S1 + S2[0..7] + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::BLEND, 11)); + self.push(handle); + self.push(0); // S0: no special features + self.push(0); // S1: logicop_func = 0 + // S2[0]: colormask=0xF (write RGBA), blend disabled + self.push(0xF << 27); + // S2[1..7]: unused render targets + for _ in 0..7 { + self.push(0); + } + } + + /// Create a depth-stencil-alpha state (all disabled). + pub fn create_dsa_disabled(&mut self, handle: u32) { + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::DSA, 5)); + self.push(handle); + self.push(0); // S0: depth/alpha disabled + self.push(0); // S1: front stencil disabled + self.push(0); // S2: back stencil disabled + self.push(0); // alpha_ref = 0.0 + } + + /// Create a basic rasterizer state (fill mode, depth clip, half-pixel center). + pub fn create_rasterizer_default(&mut self, handle: u32) { + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::RASTERIZER, 9)); + self.push(handle); + // S0: depth_clip(1<<1) | fill_front=FILL(0<<10) | fill_back=FILL(0<<12) | half_pixel_center(1<<29) + // PIPE_POLYGON_MODE: FILL=0, LINE=1, POINT=2. Fill fields are 0 so omitted. + self.push((1 << 1) | (1 << 29)); + self.push(0x3F800000u32); // point_size = 1.0f + self.push(0); // sprite_coord_enable + self.push(0); // S3 + self.push(0x3F800000u32); // line_width = 1.0f + self.push(0); // offset_units + self.push(0); // offset_scale + self.push(0); // offset_clamp + } + + /// Bind an object by type and handle. 
+ pub fn bind_object(&mut self, handle: u32, obj_type: u8) { + self.push(Self::cmd0(ccmd::BIND_OBJECT, obj_type, 1)); + self.push(handle); + } + + /// Create a shader from TGSI text. + pub fn create_shader(&mut self, handle: u32, shader_type: u32, tgsi_text: &[u8]) { + let text_len = tgsi_text.len() + 1; // include null terminator + let text_dwords = (text_len + 3) / 4; + // Header DWORDs: handle, type, offset, num_tokens, num_so_outputs = 5 + let payload_len = 5 + text_dwords; + + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::SHADER, payload_len as u16)); + self.push(handle); + self.push(shader_type); + self.push(1 << 31); // offset = 0, bit 31 = 1 (last chunk — triggers compilation) + self.push(text_len as u32); // byte length of TGSI text including null + self.push(0); // num_so_outputs = 0 + + // Pack TGSI text bytes into DWORDs (little-endian) + let mut i = 0; + while i < text_dwords { + let base = i * 4; + let mut dword = 0u32; + for b in 0..4 { + if base + b < tgsi_text.len() { + dword |= (tgsi_text[base + b] as u32) << (b * 8); + } + // else: zero (null terminator / padding) + } + self.push(dword); + i += 1; + } + } + + /// Bind a shader by handle and type. + pub fn bind_shader(&mut self, handle: u32, shader_type: u32) { + self.push(Self::cmd0(ccmd::BIND_SHADER, 0, 2)); + self.push(handle); + self.push(shader_type); + } + + /// Set framebuffer state (nr_cbufs color buffer surface handles, optional depth surface). + pub fn set_framebuffer_state(&mut self, zsurf_handle: u32, cbuf_handles: &[u32]) { + let nr_cbufs = cbuf_handles.len() as u32; + self.push(Self::cmd0(ccmd::SET_FRAMEBUFFER_STATE, 0, (nr_cbufs + 2) as u16)); + self.push(nr_cbufs); + self.push(zsurf_handle); + for &h in cbuf_handles { + self.push(h); + } + } + + /// Set viewport state for one viewport. 
+ pub fn set_viewport(&mut self, width: f32, height: f32) { + self.push(Self::cmd0(ccmd::SET_VIEWPORT_STATE, 0, 7)); + self.push(0); // start_slot = 0 + self.push(f32_bits(width / 2.0)); // scale_x + self.push(f32_bits(-height / 2.0)); // scale_y (negative for GL convention) + self.push(f32_bits(0.5)); // scale_z + self.push(f32_bits(width / 2.0)); // translate_x + self.push(f32_bits(height / 2.0)); // translate_y + self.push(f32_bits(0.5)); // translate_z + } + + /// Clear the framebuffer. + /// Color values are f32 (0.0-1.0), reinterpreted as u32 bits. + pub fn clear_color(&mut self, r: f32, g: f32, b: f32, a: f32) { + self.push(Self::cmd0(ccmd::CLEAR, 0, 8)); + self.push(pipe::CLEAR_COLOR0); // buffers = clear color only + self.push(f32_bits(r)); + self.push(f32_bits(g)); + self.push(f32_bits(b)); + self.push(f32_bits(a)); + // depth as f64 split into two u32s (0.0) + self.push(0); + self.push(0); + self.push(0); // stencil + } + + /// Create vertex elements describing vertex layout. + /// Each element: (src_offset, instance_divisor, vertex_buffer_index, src_format) + pub fn create_vertex_elements(&mut self, handle: u32, elements: &[(u32, u32, u32, u32)]) { + let len = 4 * elements.len() + 1; + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::VERTEX_ELEMENTS, len as u16)); + self.push(handle); + for &(offset, divisor, vb_index, fmt) in elements { + self.push(offset); + self.push(divisor); + self.push(vb_index); + self.push(fmt); + } + } + + /// Set vertex buffers: (stride, offset, resource_handle) per buffer. + /// VirGL protocol: payload is just [stride, offset, handle] * N. Host infers + /// num_buffers from length / 3. No start_slot field — buffers bind from slot 0. 
+ pub fn set_vertex_buffers(&mut self, buffers: &[(u32, u32, u32)]) { + let len = 3 * buffers.len(); + self.push(Self::cmd0(ccmd::SET_VERTEX_BUFFERS, 0, len as u16)); + for &(stride, offset, res_handle) in buffers { + self.push(stride); + self.push(offset); + self.push(res_handle); + } + } + + /// Inline write data into a resource (upload vertex/index data). + pub fn resource_inline_write( + &mut self, + res_handle: u32, + x: u32, w: u32, + data: &[u32], + ) { + let len = 11 + data.len(); + self.push(Self::cmd0(ccmd::RESOURCE_INLINE_WRITE, 0, len as u16)); + self.push(res_handle); + self.push(0); // level + self.push(0); // usage + self.push(0); // stride (0 for buffers) + self.push(0); // layer_stride + self.push(x); // x offset in bytes + self.push(0); // y + self.push(0); // z + self.push(w); // width in bytes + self.push(1); // h + self.push(1); // d + self.push_slice(data); + } + + /// Draw primitives. + pub fn draw_vbo( + &mut self, + start: u32, + count: u32, + mode: u32, + max_index: u32, + ) { + self.push(Self::cmd0(ccmd::DRAW_VBO, 0, 12)); + self.push(start); + self.push(count); + self.push(mode); + self.push(0); // indexed = false + self.push(1); // instance_count + self.push(0); // start_instance + self.push(0); // index_bias + self.push(0); // min_index + self.push(max_index); + self.push(0); // primitive_restart = disabled + self.push(0); // restart_index + self.push(0); // count_from_so + } +} + +/// Reinterpret f32 as u32 bits (IEEE 754). +#[inline] +fn f32_bits(f: f32) -> u32 { + f.to_bits() +} diff --git a/kernel/src/graphics/arm64_fb.rs b/kernel/src/graphics/arm64_fb.rs index 3f26cbb4..cd10e4df 100644 --- a/kernel/src/graphics/arm64_fb.rs +++ b/kernel/src/graphics/arm64_fb.rs @@ -132,6 +132,12 @@ pub fn flush_dirty_rect(x: u32, y: u32, w: u32, h: u32) -> Result<(), &'static s // CPU's write buffer is drained so stores are visible to the display // controller. 
Parallels scans BAR0 at its own refresh rate — no VirtIO // RESOURCE_FLUSH needed (it's synchronous and would add 10-50ms). + // + // VirtIO DMA (TRANSFER_TO_HOST_2D + RESOURCE_FLUSH) was benchmarked + // and is slower than direct BAR0 writes on Parallels: + // - Per-ball VirtIO DMA: 5-7 FPS (28 VirtIO round-trips/frame) + // - Full-pane VirtIO DMA: 4-8 FPS (2 round-trips but ~7.5MB DMA) + // - Direct BAR0 MMIO: 12 FPS (14 small per-ball copies, ~340KB total) unsafe { core::arch::asm!("dsb sy", options(nostack, preserves_flags)); } Ok(()) } else if crate::drivers::virtio::gpu_pci::is_initialized() { @@ -864,6 +870,19 @@ pub fn upgrade_to_double_buffer() { } } +/// Check if the double buffer (shadow buffer in cached RAM) is initialized. +/// +/// Used by the syscall flush path to decide whether to use the async path +/// (copy mmap → shadow, wake render thread) or fall back to direct BAR0 write. +pub fn has_double_buffer() -> bool { + if let Some(fb) = SHELL_FRAMEBUFFER.get() { + if let Some(guard) = fb.try_lock() { + return guard.double_buffer.is_some(); + } + } + false +} + /// Get the framebuffer dimensions pub fn dimensions() -> Option<(usize, usize)> { SHELL_FRAMEBUFFER.get().and_then(|fb| { diff --git a/kernel/src/platform_config.rs b/kernel/src/platform_config.rs index 7bdecb51..f4fd41ce 100644 --- a/kernel/src/platform_config.rs +++ b/kernel/src/platform_config.rs @@ -10,6 +10,10 @@ #[cfg(target_arch = "aarch64")] use core::sync::atomic::{AtomicU64, AtomicU8, Ordering}; +/// Next GICv2m SPI index to allocate (offset from GICV2M_SPI_BASE). 
+#[cfg(target_arch = "aarch64")] +static GICV2M_NEXT_SPI: AtomicU64 = AtomicU64::new(0); + // ============================================================================= // Hardware address atomics with QEMU virt defaults // ============================================================================= @@ -68,14 +72,14 @@ static XHCI_HCRST_DONE: AtomicU64 = AtomicU64::new(0); // Memory layout defaults (QEMU virt, 512MB RAM at 0x40000000) // Kernel image: 0x4000_0000 - 0x4100_0000 (16 MB) -// Per-CPU stacks: 0x4100_0000 - 0x4200_0000 (16 MB) -// Frame alloc: 0x4200_0000 - 0x5000_0000 (224 MB) +// BSS (incl FBs): 0x4100_0000 - 0x4300_0000 (32 MB, includes 7.5MB PCI_3D_FRAMEBUFFER) +// Frame alloc: 0x4300_0000 - 0x5000_0000 (208 MB) // DMA (NC): 0x5000_0000 - 0x501F_FFFF (2 MB, Non-Cacheable for xHCI) // Heap: 0x5020_0000 - 0x51FF_FFFF (30 MB) // Kernel stacks: 0x5200_0000 - 0x5400_0000 (32 MB) #[cfg(target_arch = "aarch64")] -static FRAME_ALLOC_START: AtomicU64 = AtomicU64::new(0x4200_0000); +static FRAME_ALLOC_START: AtomicU64 = AtomicU64::new(0x4300_0000); #[cfg(target_arch = "aarch64")] static FRAME_ALLOC_END: AtomicU64 = AtomicU64::new(0x5000_0000); @@ -220,6 +224,30 @@ pub fn probe_gicv2m(phys_base: u64) -> bool { true } +/// Allocate the next available GICv2m MSI SPI. +/// +/// Returns the SPI number (GIC INTID) for use with `configure_msi()` and +/// `gic::enable_spi()`. Returns 0 if GICv2m has not been probed or all +/// SPIs have been allocated. +/// +/// Thread-safe: uses atomic fetch_add so multiple drivers (xHCI, GPU, etc.) +/// can allocate SPIs without collision. 
+#[cfg(target_arch = "aarch64")] +pub fn allocate_msi_spi() -> u32 { + let base = GICV2M_SPI_BASE.load(Ordering::Relaxed); + let count = GICV2M_SPI_COUNT.load(Ordering::Relaxed); + if base == 0 || count == 0 { + return 0; + } + let idx = GICV2M_NEXT_SPI.fetch_add(1, Ordering::Relaxed); + if idx >= count { + // Roll back — no SPI available + GICV2M_NEXT_SPI.fetch_sub(1, Ordering::Relaxed); + return 0; + } + (base + idx) as u32 +} + /// PCI ECAM physical base address. 0 if no PCI. #[cfg(target_arch = "aarch64")] #[inline] @@ -480,8 +508,10 @@ pub fn init_from_parallels(config: &HardwareConfig) -> bool { } if best_size > 0 { - // Frame allocator starts after kernel + stacks (32 MB from RAM base) - let fa_start = best_base + 0x0200_0000; // +32 MB + // Frame allocator starts after kernel + BSS (48 MB from RAM base). + // BSS includes PCI_3D_FRAMEBUFFER (~7.5 MB) and kernel stacks, so + // the total image + BSS exceeds 32 MB. 48 MB gives margin for growth. + let fa_start = best_base + 0x0300_0000; // +48 MB // Frame allocator must end BEFORE the DMA NC region. // The .dma section starts at physical 0x5000_0000, so cap fa_end there. 
let fa_end = (best_base + best_size).min(0x5000_0000); diff --git a/kernel/src/signal/delivery.rs b/kernel/src/signal/delivery.rs index 6dd3711e..a336a70f 100644 --- a/kernel/src/signal/delivery.rs +++ b/kernel/src/signal/delivery.rs @@ -202,9 +202,10 @@ pub enum DeliverResult { fn deliver_default_action(process: &mut Process, sig: u32) -> DeliverResult { match default_action(sig) { SignalDefaultAction::Terminate => { - log::info!( - "Process {} terminated by signal {} ({})", + crate::serial_println!( + "[signal] Process {} ({}) terminated by signal {} ({})", process.id.as_u64(), + process.name, sig, signal_name(sig) ); @@ -235,9 +236,10 @@ fn deliver_default_action(process: &mut Process, sig: u32) -> DeliverResult { } } SignalDefaultAction::CoreDump => { - log::info!( - "Process {} killed (core dumped) by signal {} ({})", + crate::serial_println!( + "[signal] Process {} ({}) killed (core dump) by signal {} ({})", process.id.as_u64(), + process.name, sig, signal_name(sig) ); diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs index 3e4f3f40..d3cffc27 100644 --- a/kernel/src/syscall/graphics.rs +++ b/kernel/src/syscall/graphics.rs @@ -163,6 +163,10 @@ pub enum FbDrawOp { DrawLine = 5, /// Flush the framebuffer (for double-buffering) Flush = 6, + /// Submit a VirGL GPU-rendered frame (balls array + background color) + VirglSubmitFrame = 7, + /// Batch flush multiple dirty rects with one DSB barrier + FlushBatch = 8, } /// Draw command structure passed from userspace. @@ -549,40 +553,58 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult { let row_bytes = mmap_info.width * mmap_info.bpp; let x_byte_offset = mmap_info.x_offset * mmap_info.bpp; - // For GOP mode with double buffer, copy user → BAR0 directly - // (single copy) instead of user → shadow → BAR0 (double copy). - // The NC memory mapping makes BAR0 writes fast via write-combining. - // Also update the shadow buffer so terminal reads stay consistent. 
+ // When a dirty rect is specified, only copy the dirty columns + // instead of the full mmap width. For per-ball flushes this + // reduces the copy from ~3.4KB/row to ~336 bytes/row. + let (user_col_offset, shadow_col_offset, copy_row_bytes) = if has_rect { + let col_start = (cmd.p1.max(0) as usize).min(mmap_info.width); + let col_end = (cmd.p1.max(0) as usize + cmd.p3 as usize).min(mmap_info.width); + ( + col_start * mmap_info.bpp, + x_byte_offset + col_start * mmap_info.bpp, + (col_end - col_start) * mmap_info.bpp, + ) + } else { + (0, x_byte_offset, row_bytes) + }; + if crate::graphics::arm64_fb::is_gop_active() { + // GOP synchronous path: copy mmap → BAR0 directly with + // partial column copy. Each per-ball flush writes only + // ~27KB to BAR0 instead of the full bounding box (~3.7MB). + // Also update shadow buffer for consistency with terminal text. + // + // VirtIO DMA (PCI_FRAMEBUFFER → TRANSFER_TO_HOST_2D) was + // benchmarked and is slower: 5-7 FPS per-ball, 4-8 FPS + // full-pane, vs 12 FPS with direct BAR0 MMIO. 
if let Some(gop_buf) = crate::graphics::arm64_fb::gop_framebuffer() { for y in y_start..y_end { - let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride; - let target_row_offset = y * fb_stride_bytes + x_byte_offset; - - if target_row_offset + row_bytes <= gop_buf.len() { + let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride + user_col_offset; + let target_row_offset = y * fb_stride_bytes + shadow_col_offset; + if target_row_offset + copy_row_bytes <= gop_buf.len() { unsafe { core::ptr::copy_nonoverlapping( user_row_ptr as *const u8, gop_buf[target_row_offset..].as_mut_ptr(), - row_bytes, + copy_row_bytes, ); } } } - // Also update shadow buffer to keep it consistent - if let Some(db) = fb_guard.double_buffer_mut() { - let shadow = db.buffer_mut(); - for y in y_start..y_end { - let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride; - let target_row_offset = y * fb_stride_bytes + x_byte_offset; - if target_row_offset + row_bytes <= shadow.len() { - unsafe { - core::ptr::copy_nonoverlapping( - user_row_ptr as *const u8, - shadow[target_row_offset..].as_mut_ptr(), - row_bytes, - ); - } + } + // Update shadow buffer so terminal reads stay consistent + if let Some(db) = fb_guard.double_buffer_mut() { + let shadow = db.buffer_mut(); + for y in y_start..y_end { + let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride + user_col_offset; + let target_row_offset = y * fb_stride_bytes + shadow_col_offset; + if target_row_offset + copy_row_bytes <= shadow.len() { + unsafe { + core::ptr::copy_nonoverlapping( + user_row_ptr as *const u8, + shadow[target_row_offset..].as_mut_ptr(), + copy_row_bytes, + ); } } } @@ -607,21 +629,158 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult { } } - // Drop SHELL_FRAMEBUFFER lock BEFORE GPU flush to avoid holding - // both SHELL_FRAMEBUFFER + GPU_LOCK simultaneously. 
The pixel - // copy is done; the render thread can now access the framebuffer - // for terminal text while we submit GPU commands. + // Drop SHELL_FRAMEBUFFER lock before GPU flush drop(fb_guard); - // Synchronous GPU flush — submit resource_flush (or transfer_to_host + - // resource_flush for non-GOP) directly in the syscall. This eliminates - // scheduling latency: bounce's frame is displayed immediately rather - // than waiting for the render thread (5ms+ due to timer tick). + // Synchronous GPU flush — for GOP this is a DSB barrier ensuring + // BAR0 writes are visible to the display controller. For VirtIO + // this submits transfer_to_host + resource_flush. if let Some((fx, fy, fw, fh)) = flush_rect { let _ = crate::graphics::arm64_fb::flush_dirty_rect(fx, fy, fw, fh); } } } + 7 => { + // VirglSubmitFrame: GPU-rendered frame via VirGL + // p1:p2 = pointer to VirglFrameDesc (low:high 32-bit halves) + // color = background color (packed 0x00RRGGBB) + #[cfg(target_arch = "aarch64")] + { + // Drop FB lock — we don't need the software framebuffer for GPU rendering + drop(fb_guard); + + // Reconstruct 64-bit pointer from two i32 halves. + // Cast through u32 first to avoid sign extension. 
+ let desc_ptr = (cmd.p1 as u32 as u64) | ((cmd.p2 as u32 as u64) << 32);
+ if desc_ptr == 0 || desc_ptr >= USER_SPACE_MAX {
+ return SyscallResult::Err(super::ErrorCode::Fault as u64);
+ }
+
+ // Read ball count (first u32 at desc_ptr)
+ let ball_count = unsafe { core::ptr::read(desc_ptr as *const u32) } as usize;
+ if ball_count > 16 {
+ return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64);
+ }
+
+ // Read ball array starting at desc_ptr + 8 (skip count + padding)
+ let balls_ptr = (desc_ptr + 8) as *const crate::drivers::virtio::gpu_pci::VirglBall;
+ let balls_end = desc_ptr + 8 + (ball_count as u64) * core::mem::size_of::<crate::drivers::virtio::gpu_pci::VirglBall>() as u64;
+ if balls_end > USER_SPACE_MAX {
+ return SyscallResult::Err(super::ErrorCode::Fault as u64);
+ }
+
+ let balls = unsafe { core::slice::from_raw_parts(balls_ptr, ball_count) };
+
+ let bg_r = ((cmd.color >> 16) & 0xFF) as f32 / 255.0;
+ let bg_g = ((cmd.color >> 8) & 0xFF) as f32 / 255.0;
+ let bg_b = (cmd.color & 0xFF) as f32 / 255.0;
+
+ match crate::drivers::virtio::gpu_pci::virgl_render_frame(balls, bg_r, bg_g, bg_b) {
+ Ok(()) => {}
+ Err(e) => {
+ crate::serial_println!("[virgl-syscall] render_frame FAILED: {}", e);
+ return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64);
+ }
+ }
+ }
+ #[cfg(not(target_arch = "aarch64"))]
+ {
+ drop(fb_guard);
+ return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64);
+ }
+ }
+ 8 => {
+ // FlushBatch: batch flush multiple dirty rects with one DSB barrier.
+ // p1:p2 = 64-bit pointer to FlushRect array [(x, y, w, h); ...]
+ // p3 = count of rects (max 16)
+ // Copies each rect from mmap → BAR0, then ONE dsb sy.
+ // Saves 12+ syscall round-trips and DSB barriers per frame.
+ #[cfg(target_arch = "aarch64")] + { + FB_FLUSH_COUNT.fetch_add(1, core::sync::atomic::Ordering::Relaxed); + + let rects_ptr = (cmd.p1 as u32 as u64) | ((cmd.p2 as u32 as u64) << 32); + let count = (cmd.p3 as u32).min(16) as usize; + + // Drop FB lock immediately — batch flush only needs mmap_info + BAR0 + drop(fb_guard); + + if count == 0 { + return SyscallResult::Ok(0); + } + + if rects_ptr == 0 || rects_ptr >= USER_SPACE_MAX { + return SyscallResult::Err(super::ErrorCode::Fault as u64); + } + let rects_end = rects_ptr.saturating_add((count as u64) * 16); + if rects_end > USER_SPACE_MAX { + return SyscallResult::Err(super::ErrorCode::Fault as u64); + } + + #[repr(C)] + #[derive(Clone, Copy)] + struct FlushRect { x: i32, y: i32, w: i32, h: i32 } + + let rects = unsafe { + core::slice::from_raw_parts(rects_ptr as *const FlushRect, count) + }; + + let fb_mmap_info = fb_mmap_info_pre; + + if let Some(mmap_info) = fb_mmap_info { + if crate::graphics::arm64_fb::is_gop_active() { + // Use lock-free FbInfoCache for stride (no FB lock needed) + let fb_stride_bytes = crate::graphics::arm64_fb::FB_INFO_CACHE.get() + .map(|c| c.stride * c.bytes_per_pixel) + .unwrap_or(0); + + if fb_stride_bytes > 0 { + if let Some(gop_buf) = crate::graphics::arm64_fb::gop_framebuffer() { + let x_byte_offset = mmap_info.x_offset * mmap_info.bpp; + + for rect in rects { + if rect.w <= 0 || rect.h <= 0 { continue; } + + let col_start = (rect.x.max(0) as usize).min(mmap_info.width); + let col_end = (rect.x.max(0) as usize + rect.w as usize).min(mmap_info.width); + let y_start = (rect.y.max(0) as usize).min(mmap_info.height); + let y_end = (rect.y.max(0) as usize + rect.h as usize).min(mmap_info.height); + + let user_col_byte = col_start * mmap_info.bpp; + let target_col_byte = x_byte_offset + col_start * mmap_info.bpp; + let copy_row_bytes = (col_end - col_start) * mmap_info.bpp; + + if copy_row_bytes == 0 { continue; } + + for y in y_start..y_end { + let user_row_ptr = (mmap_info.user_addr as 
usize) + + y * mmap_info.user_stride + user_col_byte; + let target_row_offset = y * fb_stride_bytes + target_col_byte; + if target_row_offset + copy_row_bytes <= gop_buf.len() { + unsafe { + core::ptr::copy_nonoverlapping( + user_row_ptr as *const u8, + gop_buf[target_row_offset..].as_mut_ptr(), + copy_row_bytes, + ); + } + } + } + } + } + + // ONE DSB for all BAR0 writes + unsafe { core::arch::asm!("dsb sy", options(nostack, preserves_flags)); } + } + } + } + } + #[cfg(not(target_arch = "aarch64"))] + { + drop(fb_guard); + return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64); + } + } _ => { return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64); } diff --git a/libs/libbreenix/src/graphics.rs b/libs/libbreenix/src/graphics.rs index 0ecbc143..4e00bcc4 100644 --- a/libs/libbreenix/src/graphics.rs +++ b/libs/libbreenix/src/graphics.rs @@ -103,6 +103,29 @@ pub mod draw_op { pub const DRAW_LINE: u32 = 5; /// Flush the framebuffer (for double-buffering) pub const FLUSH: u32 = 6; + /// Submit a VirGL GPU-rendered frame + pub const VIRGL_SUBMIT_FRAME: u32 = 7; + /// Batch flush multiple dirty rects with one DSB barrier + pub const FLUSH_BATCH: u32 = 8; +} + +/// Ball descriptor for VirGL GPU rendering. +/// Must match kernel's VirglBall in drivers/virtio/gpu_pci.rs. +#[repr(C)] +#[derive(Clone, Copy, Default)] +pub struct VirglBall { + pub x: f32, + pub y: f32, + pub radius: f32, + pub color: [f32; 4], +} + +/// Frame descriptor passed to the VirglSubmitFrame syscall. +#[repr(C)] +pub struct VirglFrameDesc { + pub ball_count: u32, + pub _pad: u32, + pub balls: [VirglBall; 16], } /// Pack RGB color into u32 @@ -238,6 +261,62 @@ pub fn fb_flush_rect(x: i32, y: i32, w: i32, h: i32) -> Result<(), Error> { fbdraw(&cmd) } +/// A dirty rectangle for batch flushing. 
+#[repr(C)] +#[derive(Clone, Copy)] +pub struct FlushRect { + pub x: i32, + pub y: i32, + pub w: i32, + pub h: i32, +} + +/// Batch flush multiple dirty rectangles with a single syscall and DSB barrier. +/// +/// Instead of calling `fb_flush_rect()` N times (N syscalls, N DSB barriers), +/// this sends all dirty rects at once: 1 syscall, 1 DSB. Each rect is copied +/// from the mmap buffer to BAR0 in sequence. +pub fn fb_flush_rects(rects: &[FlushRect]) -> Result<(), Error> { + if rects.is_empty() { + return Ok(()); + } + let rects_ptr = rects.as_ptr() as u64; + let cmd = FbDrawCmd { + op: draw_op::FLUSH_BATCH, + p1: rects_ptr as i32, + p2: (rects_ptr >> 32) as i32, + p3: rects.len() as i32, + p4: 0, + color: 0, + }; + fbdraw(&cmd) +} + +/// Submit a VirGL GPU-rendered frame. +/// +/// Sends ball positions/colors to the kernel, which renders them via the host +/// GPU and DMA-copies the result to display memory. Zero guest CPU pixel writes. +pub fn virgl_submit_frame(balls: &[VirglBall], bg_color: u32) -> Result<(), Error> { + let mut desc = VirglFrameDesc { + ball_count: balls.len().min(16) as u32, + _pad: 0, + balls: [VirglBall::default(); 16], + }; + for (i, ball) in balls.iter().take(16).enumerate() { + desc.balls[i] = *ball; + } + let desc_ptr = &desc as *const VirglFrameDesc as u64; + let cmd = FbDrawCmd { + op: draw_op::VIRGL_SUBMIT_FRAME, + p1: desc_ptr as i32, // low 32 bits + p2: (desc_ptr >> 32) as i32, // high 32 bits + p3: 0, + p4: 0, + color: bg_color, + }; + fbdraw(&cmd) +} + /// Get the current mouse cursor position. 
///
/// # Returns
diff --git a/scripts/parallels/gl_bench.c b/scripts/parallels/gl_bench.c
new file mode 100644
index 00000000..ff8c8ef3
--- /dev/null
+++ b/scripts/parallels/gl_bench.c
@@ -0,0 +1,423 @@
+/*
+ * gl_bench.c — Minimal OpenGL ES 2.0 benchmark using GBM + EGL + GLES2
+ *
+ * Renders filled circles (similar to the Breenix bounce demo) on the GPU
+ * using VirGL, measures FPS, and optionally reads back pixels to verify
+ * rendering actually works.
+ *
+ * This runs headless (no window system) using GBM + EGL, which is exactly
+ * how a kernel's VirGL implementation would work.
+ *
+ * Build: gcc -O2 -o gl_bench gl_bench.c -lEGL -lGLESv2 -lgbm -lm
+ * Run: ./gl_bench [frames] (default: 300 frames)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <fcntl.h>
+#include <time.h>
+#include <errno.h>
+
+#include <gbm.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <GLES2/gl2.h>
+
+/* ------------------------------------------------------------------ */
+/* Timing helpers */
+/* ------------------------------------------------------------------ */
+
+static uint64_t now_ns(void) {
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
+}
+
+/* ------------------------------------------------------------------ */
+/* Shader source */
+/* ------------------------------------------------------------------ */
+
+static const char *vert_src =
+ "attribute vec2 a_pos;\n"
+ "attribute vec4 a_color;\n"
+ "varying vec4 v_color;\n"
+ "uniform vec2 u_resolution;\n"
+ "void main() {\n"
+ " vec2 clip = (a_pos / u_resolution) * 2.0 - 1.0;\n"
+ " clip.y = -clip.y;\n"
+ " gl_Position = vec4(clip, 0.0, 1.0);\n"
+ " v_color = a_color;\n"
+ "}\n";
+
+static const char *frag_src =
+ "precision mediump float;\n"
+ "varying vec4 v_color;\n"
+ "void main() {\n"
+ " gl_FragColor = v_color;\n"
+ "}\n";
+
+/* ------------------------------------------------------------------ */
+/* Ball definition */
+/*
------------------------------------------------------------------ */ + +#define NUM_BALLS 12 +#define CIRCLE_SEGMENTS 20 +#define WIDTH 864 +#define HEIGHT 1080 + +struct Ball { + float x, y; /* position in pixels */ + float vx, vy; /* velocity in pixels/frame */ + float radius; + float r, g, b; + float mass; +}; + +static struct Ball balls[NUM_BALLS] = { + { 100, 100, 11.0, 8.0, 38, 1.0, 0.2, 0.2, 38 }, + { 300, 200, -10.0, 7.0, 33, 0.2, 1.0, 0.2, 33 }, + { 200, 400, 9.0, -9.5, 42, 0.2, 0.2, 1.0, 42 }, + { 400, 300, -8.5, -8.0, 28, 1.0, 1.0, 0.2, 28 }, + { 150, 300, 10.5, 6.0, 24, 1.0, 0.2, 1.0, 24 }, + { 350, 150, -9.0, 7.5, 26, 0.2, 1.0, 1.0, 26 }, + { 450, 500, 8.0, -7.0, 35, 1.0, 0.6, 0.2, 35 }, + { 250, 550, -7.5, 8.5, 30, 0.6, 0.2, 1.0, 30 }, + { 500, 100, 9.5, 9.5, 22, 0.8, 0.8, 0.8, 22 }, + { 120, 500, -11.0, -6.5, 20, 1.0, 0.4, 0.4, 20 }, + { 380, 450, 7.0, 9.0, 32, 0.4, 1.0, 0.4, 32 }, + { 520, 350, -8.0, -8.5, 27, 0.4, 0.6, 1.0, 27 }, +}; + +/* ------------------------------------------------------------------ */ +/* Physics (identical to Breenix bounce demo) */ +/* ------------------------------------------------------------------ */ + +static void ball_step(struct Ball *b) { + b->x += b->vx; + b->y += b->vy; +} + +static void ball_bounce(struct Ball *b) { + if (b->x - b->radius < 0) { b->x = b->radius; b->vx = -b->vx; } + if (b->x + b->radius >= WIDTH) { b->x = WIDTH - b->radius - 1; b->vx = -b->vx; } + if (b->y - b->radius < 0) { b->y = b->radius; b->vy = -b->vy; } + if (b->y + b->radius >= HEIGHT){ b->y = HEIGHT - b->radius - 1; b->vy = -b->vy; } +} + +static void check_collision(struct Ball *a, struct Ball *b) { + float dx = b->x - a->x; + float dy = b->y - a->y; + float touch = a->radius + b->radius; + float dist_sq = dx*dx + dy*dy; + if (dist_sq >= touch*touch || dist_sq == 0) return; + + float dist = sqrtf(dist_sq); + if (dist == 0) { a->x -= 1; b->x += 1; return; } + + float nx = dx / dist; + float ny = dy / dist; + float v1n = a->vx*nx + 
a->vy*ny; + float v2n = b->vx*nx + b->vy*ny; + if (v1n <= v2n) return; + + float m1 = a->mass, m2 = b->mass, mt = m1 + m2; + float v1n_new = ((m1-m2)*v1n + 2*m2*v2n) / mt; + float v2n_new = ((m2-m1)*v2n + 2*m1*v1n) / mt; + float dv1 = v1n_new - v1n; + float dv2 = v2n_new - v2n; + a->vx += dv1*nx; a->vy += dv1*ny; + b->vx += dv2*nx; b->vy += dv2*ny; + + float overlap = touch - dist + 0.5f; + float push1 = overlap * m2 / mt; + float push2 = overlap * m1 / mt; + a->x -= push1*nx; a->y -= push1*ny; + b->x += push2*nx; b->y += push2*ny; +} + +/* ------------------------------------------------------------------ */ +/* GL helpers */ +/* ------------------------------------------------------------------ */ + +static GLuint compile_shader(GLenum type, const char *src) { + GLuint s = glCreateShader(type); + glShaderSource(s, 1, &src, NULL); + glCompileShader(s); + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + if (!ok) { + char buf[512]; + glGetShaderInfoLog(s, sizeof(buf), NULL, buf); + fprintf(stderr, "Shader compile error: %s\n", buf); + exit(1); + } + return s; +} + +/* Build a triangle fan for a circle: center + N+1 edge vertices */ +static int build_circle_vertices(float *verts, float cx, float cy, float r, + float cr, float cg, float cb) { + int n = 0; + /* Center vertex: x, y, r, g, b, a */ + verts[n++] = cx; verts[n++] = cy; + verts[n++] = cr; verts[n++] = cg; verts[n++] = cb; verts[n++] = 1.0f; + + for (int i = 0; i <= CIRCLE_SEGMENTS; i++) { + float angle = (float)i / CIRCLE_SEGMENTS * 2.0f * M_PI; + verts[n++] = cx + r * cosf(angle); + verts[n++] = cy + r * sinf(angle); + verts[n++] = cr; verts[n++] = cg; verts[n++] = cb; verts[n++] = 1.0f; + } + return (CIRCLE_SEGMENTS + 2); /* vertex count */ +} + +/* ------------------------------------------------------------------ */ +/* Main */ +/* ------------------------------------------------------------------ */ + +int main(int argc, char *argv[]) { + int total_frames = 300; + if (argc > 1) total_frames = 
atoi(argv[1]); + if (total_frames <= 0) total_frames = 300; + + printf("=== GL Bench: %d frames, %dx%d, %d balls ===\n", + total_frames, WIDTH, HEIGHT, NUM_BALLS); + + /* ---- Open DRM device ---- */ + int drm_fd = -1; + const char *cards[] = {"/dev/dri/renderD128", "/dev/dri/card0", "/dev/dri/card1", NULL}; + for (int i = 0; cards[i]; i++) { + drm_fd = open(cards[i], O_RDWR); + if (drm_fd >= 0) { + printf("Opened DRM device: %s\n", cards[i]); + break; + } + } + if (drm_fd < 0) { + fprintf(stderr, "Failed to open any DRM device: %s\n", strerror(errno)); + return 1; + } + + /* ---- GBM device ---- */ + struct gbm_device *gbm = gbm_create_device(drm_fd); + if (!gbm) { + fprintf(stderr, "Failed to create GBM device\n"); + return 1; + } + printf("GBM device created\n"); + + /* ---- EGL setup ---- */ + /* Use eglGetPlatformDisplay (EGL 1.5) with GBM platform */ + EGLDisplay dpy = eglGetPlatformDisplay(EGL_PLATFORM_GBM_MESA, gbm, NULL); + if (dpy == EGL_NO_DISPLAY) { + /* Fallback to legacy eglGetDisplay */ + dpy = eglGetDisplay((EGLNativeDisplayType)gbm); + } + if (dpy == EGL_NO_DISPLAY) { + fprintf(stderr, "Failed to get EGL display\n"); + return 1; + } + + EGLint major, minor; + if (!eglInitialize(dpy, &major, &minor)) { + fprintf(stderr, "eglInitialize failed: 0x%x\n", eglGetError()); + return 1; + } + printf("EGL %d.%d initialized\n", major, minor); + + const char *egl_vendor = eglQueryString(dpy, EGL_VENDOR); + const char *egl_version = eglQueryString(dpy, EGL_VERSION); + printf("EGL vendor: %s\n", egl_vendor ? egl_vendor : "unknown"); + printf("EGL version: %s\n", egl_version ? 
egl_version : "unknown"); + + eglBindAPI(EGL_OPENGL_ES_API); + + EGLint cfg_attribs[] = { + EGL_SURFACE_TYPE, EGL_WINDOW_BIT, + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL_RED_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_BLUE_SIZE, 8, + EGL_ALPHA_SIZE, 8, + EGL_NONE + }; + EGLConfig config; + EGLint num_configs; + if (!eglChooseConfig(dpy, cfg_attribs, &config, 1, &num_configs) || num_configs == 0) { + fprintf(stderr, "eglChooseConfig failed\n"); + return 1; + } + + EGLint ctx_attribs[] = { EGL_CONTEXT_CLIENT_VERSION, 2, EGL_NONE }; + EGLContext ctx = eglCreateContext(dpy, config, EGL_NO_CONTEXT, ctx_attribs); + if (ctx == EGL_NO_CONTEXT) { + fprintf(stderr, "eglCreateContext failed: 0x%x\n", eglGetError()); + return 1; + } + + /* Create GBM surface for offscreen rendering */ + struct gbm_surface *gbm_surf = gbm_surface_create(gbm, WIDTH, HEIGHT, + GBM_FORMAT_ARGB8888, + GBM_BO_USE_RENDERING); + if (!gbm_surf) { + fprintf(stderr, "gbm_surface_create failed\n"); + return 1; + } + + EGLSurface egl_surf = eglCreateWindowSurface(dpy, config, + (EGLNativeWindowType)gbm_surf, NULL); + if (egl_surf == EGL_NO_SURFACE) { + fprintf(stderr, "eglCreateWindowSurface failed: 0x%x\n", eglGetError()); + return 1; + } + + if (!eglMakeCurrent(dpy, egl_surf, egl_surf, ctx)) { + fprintf(stderr, "eglMakeCurrent failed: 0x%x\n", eglGetError()); + return 1; + } + + printf("GL_RENDERER: %s\n", glGetString(GL_RENDERER)); + printf("GL_VENDOR: %s\n", glGetString(GL_VENDOR)); + printf("GL_VERSION: %s\n", glGetString(GL_VERSION)); + + /* ---- Compile shaders ---- */ + GLuint vs = compile_shader(GL_VERTEX_SHADER, vert_src); + GLuint fs = compile_shader(GL_FRAGMENT_SHADER, frag_src); + GLuint prog = glCreateProgram(); + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glBindAttribLocation(prog, 0, "a_pos"); + glBindAttribLocation(prog, 1, "a_color"); + glLinkProgram(prog); + GLint link_ok; + glGetProgramiv(prog, GL_LINK_STATUS, &link_ok); + if (!link_ok) { + char buf[512]; + 
glGetProgramInfoLog(prog, sizeof(buf), NULL, buf); + fprintf(stderr, "Link error: %s\n", buf); + return 1; + } + glUseProgram(prog); + GLint u_res = glGetUniformLocation(prog, "u_resolution"); + glUniform2f(u_res, (float)WIDTH, (float)HEIGHT); + + glViewport(0, 0, WIDTH, HEIGHT); + glDisable(GL_DEPTH_TEST); + + printf("Shaders compiled, rendering %d frames...\n\n", total_frames); + + /* ---- Vertex buffer for circles ---- */ + /* Max vertices: NUM_BALLS * (CIRCLE_SEGMENTS + 2) * 6 floats */ + float *verts = malloc(NUM_BALLS * (CIRCLE_SEGMENTS + 2) * 6 * sizeof(float)); + if (!verts) { perror("malloc"); return 1; } + + GLuint vbo; + glGenBuffers(1, &vbo); + + /* ---- Render loop ---- */ + uint64_t t_start = now_ns(); + uint64_t t_last_print = t_start; + int frames_since_print = 0; + + for (int frame = 0; frame < total_frames; frame++) { + /* Physics */ + for (int s = 0; s < 16; s++) { + for (int i = 0; i < NUM_BALLS; i++) { + balls[i].x += balls[i].vx / 16.0f; + balls[i].y += balls[i].vy / 16.0f; + } + for (int i = 0; i < NUM_BALLS; i++) ball_bounce(&balls[i]); + for (int i = 0; i < NUM_BALLS; i++) + for (int j = i+1; j < NUM_BALLS; j++) + check_collision(&balls[i], &balls[j]); + } + + /* Clear */ + glClearColor(15.0f/255.0f, 15.0f/255.0f, 30.0f/255.0f, 1.0f); + glClear(GL_COLOR_BUFFER_BIT); + + /* Draw each ball as a triangle fan */ + int total_verts_offset = 0; + for (int i = 0; i < NUM_BALLS; i++) { + int nv = build_circle_vertices( + verts + total_verts_offset * 6, + balls[i].x, balls[i].y, balls[i].radius, + balls[i].r, balls[i].g, balls[i].b); + + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, nv * 6 * sizeof(float), + verts + total_verts_offset * 6, GL_DYNAMIC_DRAW); + glEnableVertexAttribArray(0); + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)0); + glEnableVertexAttribArray(1); + glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)(2*sizeof(float))); + glDrawArrays(GL_TRIANGLE_FAN, 0, nv); 
+ } + + /* Swap / present */ + eglSwapBuffers(dpy, egl_surf); + + /* Release the GBM buffer so we can lock the next one */ + struct gbm_bo *bo = gbm_surface_lock_front_buffer(gbm_surf); + if (bo) gbm_surface_release_buffer(gbm_surf, bo); + + frames_since_print++; + + /* Print FPS every 16 frames */ + if (frames_since_print >= 16) { + uint64_t now = now_ns(); + uint64_t elapsed = now - t_last_print; + if (elapsed > 0) { + double fps = (double)frames_since_print * 1e9 / (double)elapsed; + double ms = (double)elapsed / (double)frames_since_print / 1e6; + printf("[frame %4d] FPS: %.1f (%.2f ms/frame)\n", frame, fps, ms); + } + frames_since_print = 0; + t_last_print = now; + } + + /* Readback a few pixels on first frame to verify rendering */ + if (frame == 0) { + unsigned char pixel[4]; + /* Read center of screen — should be background color */ + glReadPixels(WIDTH/2, HEIGHT/2, 1, 1, GL_RGBA, GL_UNSIGNED_BYTE, pixel); + printf(" Pixel at center: RGBA(%d, %d, %d, %d) — expect ~(15, 15, 30, 255)\n", + pixel[0], pixel[1], pixel[2], pixel[3]); + + /* Read where first ball should be */ + int bx = (int)balls[0].x; + int by = HEIGHT - (int)balls[0].y; /* GL flips Y */ + if (bx >= 0 && bx < WIDTH && by >= 0 && by < HEIGHT) { + glReadPixels(bx, by, 1, 1, GL_RGBA, GL_UNSIGNED_BYTE, pixel); + printf(" Pixel at ball[0] (%d,%d): RGBA(%d, %d, %d, %d) — expect red-ish\n", + bx, by, pixel[0], pixel[1], pixel[2], pixel[3]); + } + } + } + + uint64_t t_end = now_ns(); + double total_secs = (double)(t_end - t_start) / 1e9; + double avg_fps = (double)total_frames / total_secs; + + printf("\n=== Results ===\n"); + printf("Total frames: %d\n", total_frames); + printf("Total time: %.2f s\n", total_secs); + printf("Average FPS: %.1f\n", avg_fps); + printf("Avg ms/frame: %.2f\n", total_secs * 1000.0 / total_frames); + + /* Cleanup */ + glDeleteBuffers(1, &vbo); + glDeleteProgram(prog); + free(verts); + eglDestroySurface(dpy, egl_surf); + gbm_surface_destroy(gbm_surf); + eglDestroyContext(dpy, 
ctx); + eglTerminate(dpy); + gbm_device_destroy(gbm); + close(drm_fd); + + return 0; +} diff --git a/scripts/parallels/gl_display.c b/scripts/parallels/gl_display.c new file mode 100644 index 00000000..a71807db --- /dev/null +++ b/scripts/parallels/gl_display.c @@ -0,0 +1,470 @@ +/* + * gl_display.c — VirGL rendering to the physical display via DRM/KMS + EGL + * + * This is the critical test: renders circles on the GPU via VirGL and + * presents them to the physical display using DRM page flipping. + * This is what Breenix needs to replicate. + * + * Build: gcc -O2 -o gl_display gl_display.c -lEGL -lGLESv2 -lgbm -ldrm -lm + * Run: ./gl_display [frames] (default: 120) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* Timing */ +/* ------------------------------------------------------------------ */ + +static uint64_t now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; +} + +/* ------------------------------------------------------------------ */ +/* Shaders */ +/* ------------------------------------------------------------------ */ + +static const char *vert_src = + "attribute vec2 a_pos;\n" + "attribute vec4 a_color;\n" + "varying vec4 v_color;\n" + "uniform vec2 u_resolution;\n" + "void main() {\n" + " vec2 clip = (a_pos / u_resolution) * 2.0 - 1.0;\n" + " clip.y = -clip.y;\n" + " gl_Position = vec4(clip, 0.0, 1.0);\n" + " v_color = a_color;\n" + "}\n"; + +static const char *frag_src = + "precision mediump float;\n" + "varying vec4 v_color;\n" + "void main() {\n" + " gl_FragColor = v_color;\n" + "}\n"; + +/* ------------------------------------------------------------------ */ +/* Balls */ +/* ------------------------------------------------------------------ */ + +#define 
NUM_BALLS 12 +#define CIRCLE_SEGMENTS 20 + +struct Ball { + float x, y, vx, vy, radius, r, g, b, mass; +}; + +static struct Ball balls[NUM_BALLS]; +static int fb_width, fb_height; + +static void init_balls(int w, int h) { + fb_width = w; + fb_height = h; + struct Ball defaults[NUM_BALLS] = { + { 100, 100, 11.0, 8.0, 38, 1.0, 0.2, 0.2, 38 }, + { 300, 200, -10.0, 7.0, 33, 0.2, 1.0, 0.2, 33 }, + { 200, 400, 9.0, -9.5, 42, 0.2, 0.2, 1.0, 42 }, + { 400, 300, -8.5, -8.0, 28, 1.0, 1.0, 0.2, 28 }, + { 150, 300, 10.5, 6.0, 24, 1.0, 0.2, 1.0, 24 }, + { 350, 150, -9.0, 7.5, 26, 0.2, 1.0, 1.0, 26 }, + { 450, 500, 8.0, -7.0, 35, 1.0, 0.6, 0.2, 35 }, + { 250, 550, -7.5, 8.5, 30, 0.6, 0.2, 1.0, 30 }, + { 500, 100, 9.5, 9.5, 22, 0.8, 0.8, 0.8, 22 }, + { 120, 500, -11.0, -6.5, 20, 1.0, 0.4, 0.4, 20 }, + { 380, 450, 7.0, 9.0, 32, 0.4, 1.0, 0.4, 32 }, + { 520, 350, -8.0, -8.5, 27, 0.4, 0.6, 1.0, 27 }, + }; + memcpy(balls, defaults, sizeof(defaults)); +} + +static void ball_bounce(struct Ball *b) { + if (b->x - b->radius < 0) { b->x = b->radius; b->vx = -b->vx; } + if (b->x + b->radius >= fb_width) { b->x = fb_width - b->radius - 1; b->vx = -b->vx; } + if (b->y - b->radius < 0) { b->y = b->radius; b->vy = -b->vy; } + if (b->y + b->radius >= fb_height) { b->y = fb_height - b->radius - 1; b->vy = -b->vy; } +} + +static void check_collision(struct Ball *a, struct Ball *b) { + float dx = b->x - a->x, dy = b->y - a->y; + float touch = a->radius + b->radius; + float dist_sq = dx*dx + dy*dy; + if (dist_sq >= touch*touch || dist_sq == 0) return; + float dist = sqrtf(dist_sq); + if (dist == 0) { a->x -= 1; b->x += 1; return; } + float nx = dx/dist, ny = dy/dist; + float v1n = a->vx*nx + a->vy*ny; + float v2n = b->vx*nx + b->vy*ny; + if (v1n <= v2n) return; + float m1 = a->mass, m2 = b->mass, mt = m1+m2; + float v1n_new = ((m1-m2)*v1n + 2*m2*v2n)/mt; + float v2n_new = ((m2-m1)*v2n + 2*m1*v1n)/mt; + a->vx += (v1n_new-v1n)*nx; a->vy += (v1n_new-v1n)*ny; + b->vx += (v2n_new-v2n)*nx; b->vy += 
(v2n_new-v2n)*ny; + float overlap = touch - dist + 0.5f; + a->x -= overlap*m2/mt*nx; a->y -= overlap*m2/mt*ny; + b->x += overlap*m1/mt*nx; b->y += overlap*m1/mt*ny; +} + +static void physics_step(void) { + for (int s = 0; s < 16; s++) { + for (int i = 0; i < NUM_BALLS; i++) { + balls[i].x += balls[i].vx/16.0f; + balls[i].y += balls[i].vy/16.0f; + } + for (int i = 0; i < NUM_BALLS; i++) ball_bounce(&balls[i]); + for (int i = 0; i < NUM_BALLS; i++) + for (int j = i+1; j < NUM_BALLS; j++) + check_collision(&balls[i], &balls[j]); + } +} + +/* ------------------------------------------------------------------ */ +/* GL helpers */ +/* ------------------------------------------------------------------ */ + +static GLuint compile_shader(GLenum type, const char *src) { + GLuint s = glCreateShader(type); + glShaderSource(s, 1, &src, NULL); + glCompileShader(s); + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + if (!ok) { + char buf[512]; + glGetShaderInfoLog(s, sizeof(buf), NULL, buf); + fprintf(stderr, "Shader error: %s\n", buf); + exit(1); + } + return s; +} + +static int build_circle_verts(float *v, float cx, float cy, float r, + float cr, float cg, float cb) { + int n = 0; + v[n++] = cx; v[n++] = cy; + v[n++] = cr; v[n++] = cg; v[n++] = cb; v[n++] = 1.0f; + for (int i = 0; i <= CIRCLE_SEGMENTS; i++) { + float angle = (float)i / CIRCLE_SEGMENTS * 2.0f * M_PI; + v[n++] = cx + r * cosf(angle); + v[n++] = cy + r * sinf(angle); + v[n++] = cr; v[n++] = cg; v[n++] = cb; v[n++] = 1.0f; + } + return CIRCLE_SEGMENTS + 2; +} + +/* ------------------------------------------------------------------ */ +/* DRM + GBM + EGL display setup */ +/* ------------------------------------------------------------------ */ + +struct drm_state { + int fd; + drmModeConnector *connector; + drmModeEncoder *encoder; + drmModeCrtc *saved_crtc; + uint32_t crtc_id; + drmModeModeInfo mode; +}; + +static int find_drm_display(struct drm_state *drm) { + const char *cards[] = {"/dev/dri/card0", 
"/dev/dri/card1", NULL}; + for (int i = 0; cards[i]; i++) { + drm->fd = open(cards[i], O_RDWR | O_CLOEXEC); + if (drm->fd < 0) { + fprintf(stderr, " Cannot open %s: %s\n", cards[i], strerror(errno)); + continue; + } + + /* Need master for modesetting */ + drmSetMaster(drm->fd); + + drmModeRes *res = drmModeGetResources(drm->fd); + if (!res) { + fprintf(stderr, " %s: drmModeGetResources failed: %s\n", cards[i], strerror(errno)); + close(drm->fd); + continue; + } + fprintf(stderr, " %s: %d connectors, %d crtcs, %d encoders\n", + cards[i], res->count_connectors, res->count_crtcs, res->count_encoders); + + /* Find connected connector */ + for (int c = 0; c < res->count_connectors; c++) { + drm->connector = drmModeGetConnector(drm->fd, res->connectors[c]); + if (!drm->connector) continue; + if (drm->connector->connection == DRM_MODE_CONNECTED && + drm->connector->count_modes > 0) { + printf("Found connector %d: %s, %dx%d\n", + drm->connector->connector_id, + drm->connector->count_modes > 0 ? 
"has modes" : "no modes", + drm->connector->modes[0].hdisplay, + drm->connector->modes[0].vdisplay); + drm->mode = drm->connector->modes[0]; + break; + } + drmModeFreeConnector(drm->connector); + drm->connector = NULL; + } + + if (!drm->connector) { + fprintf(stderr, " %s: no connected connector found\n", cards[i]); + drmModeFreeResources(res); + close(drm->fd); + continue; + } + + /* Find encoder + CRTC */ + drm->encoder = drmModeGetEncoder(drm->fd, drm->connector->encoder_id); + if (!drm->encoder) { + /* Try first encoder */ + for (int e = 0; e < res->count_encoders; e++) { + drm->encoder = drmModeGetEncoder(drm->fd, res->encoders[e]); + if (drm->encoder) break; + } + } + if (!drm->encoder) { + fprintf(stderr, "No encoder found\n"); + drmModeFreeResources(res); + continue; + } + + drm->crtc_id = drm->encoder->crtc_id; + if (!drm->crtc_id && res->count_crtcs > 0) + drm->crtc_id = res->crtcs[0]; + + drm->saved_crtc = drmModeGetCrtc(drm->fd, drm->crtc_id); + + printf("Using DRM device: %s\n", cards[i]); + printf("Display: %dx%d @ %dHz\n", + drm->mode.hdisplay, drm->mode.vdisplay, drm->mode.vrefresh); + + drmModeFreeResources(res); + return 0; + } + return -1; +} + +/* ------------------------------------------------------------------ */ +/* Main */ +/* ------------------------------------------------------------------ */ + +int main(int argc, char *argv[]) { + int total_frames = 120; + if (argc > 1) total_frames = atoi(argv[1]); + if (total_frames <= 0) total_frames = 120; + + printf("=== GL Display: %d frames, DRM/KMS page-flip ===\n", total_frames); + + /* ---- Find DRM display ---- */ + struct drm_state drm = {0}; + if (find_drm_display(&drm) < 0) { + fprintf(stderr, "No DRM display found\n"); + return 1; + } + + int width = drm.mode.hdisplay; + int height = drm.mode.vdisplay; + init_balls(width, height); + + /* ---- GBM device ---- */ + struct gbm_device *gbm = gbm_create_device(drm.fd); + if (!gbm) { fprintf(stderr, "GBM failed\n"); return 1; } + + /* ---- GBM 
surface ---- */ + struct gbm_surface *gbm_surf = gbm_surface_create(gbm, width, height, + GBM_FORMAT_XRGB8888, + GBM_BO_USE_SCANOUT | GBM_BO_USE_RENDERING); + if (!gbm_surf) { fprintf(stderr, "GBM surface failed\n"); return 1; } + + /* ---- EGL ---- */ + EGLDisplay dpy = eglGetPlatformDisplay(EGL_PLATFORM_GBM_MESA, gbm, NULL); + if (dpy == EGL_NO_DISPLAY) + dpy = eglGetDisplay((EGLNativeDisplayType)gbm); + if (dpy == EGL_NO_DISPLAY) { fprintf(stderr, "No EGL display\n"); return 1; } + + EGLint major, minor; + eglInitialize(dpy, &major, &minor); + eglBindAPI(EGL_OPENGL_ES_API); + + /* Enumerate ALL EGL configs and find one matching XRGB8888. + * eglChooseConfig returns ARGB configs by default, which cause + * drmModeSetCrtc EINVAL on Parallels — XRGB8888 is required. */ + EGLint total_configs = 0; + eglGetConfigs(dpy, NULL, 0, &total_configs); + EGLConfig *all_configs = malloc(total_configs * sizeof(EGLConfig)); + eglGetConfigs(dpy, all_configs, total_configs, &total_configs); + printf("Scanning %d EGL configs for XRGB8888 match...\n", total_configs); + + EGLConfig config = NULL; + for (int i = 0; i < total_configs; i++) { + EGLint native_visual, render_type, surf_type, alpha_size; + eglGetConfigAttrib(dpy, all_configs[i], EGL_NATIVE_VISUAL_ID, &native_visual); + eglGetConfigAttrib(dpy, all_configs[i], EGL_RENDERABLE_TYPE, &render_type); + eglGetConfigAttrib(dpy, all_configs[i], EGL_SURFACE_TYPE, &surf_type); + eglGetConfigAttrib(dpy, all_configs[i], EGL_ALPHA_SIZE, &alpha_size); + if (native_visual == (int)GBM_FORMAT_XRGB8888 && + (render_type & EGL_OPENGL_ES2_BIT) && + (surf_type & EGL_WINDOW_BIT)) { + config = all_configs[i]; + printf(" Found XRGB8888 config #%d (alpha=%d)\n", i, alpha_size); + break; + } + } + free(all_configs); + if (!config) { fprintf(stderr, "No XRGB8888 EGL config found\n"); return 1; } + + EGLint ctx_attrs[] = { EGL_CONTEXT_CLIENT_VERSION, 2, EGL_NONE }; + EGLContext ctx = eglCreateContext(dpy, config, EGL_NO_CONTEXT, ctx_attrs); + if (ctx 
== EGL_NO_CONTEXT) { fprintf(stderr, "eglCreateContext failed: 0x%x\n", eglGetError()); return 1; } + + EGLSurface egl_surf = eglCreateWindowSurface(dpy, config, + (EGLNativeWindowType)gbm_surf, NULL); + if (egl_surf == EGL_NO_SURFACE) { fprintf(stderr, "eglCreateWindowSurface failed: 0x%x\n", eglGetError()); return 1; } + + eglMakeCurrent(dpy, egl_surf, egl_surf, ctx); + + printf("GL_RENDERER: %s\n", glGetString(GL_RENDERER)); + printf("GL_VERSION: %s\n", glGetString(GL_VERSION)); + + /* ---- Compile shaders ---- */ + GLuint vs = compile_shader(GL_VERTEX_SHADER, vert_src); + GLuint fs = compile_shader(GL_FRAGMENT_SHADER, frag_src); + GLuint prog = glCreateProgram(); + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glBindAttribLocation(prog, 0, "a_pos"); + glBindAttribLocation(prog, 1, "a_color"); + glLinkProgram(prog); + glUseProgram(prog); + glUniform2f(glGetUniformLocation(prog, "u_resolution"), (float)width, (float)height); + glViewport(0, 0, width, height); + glDisable(GL_DEPTH_TEST); + + float *verts = malloc(NUM_BALLS * (CIRCLE_SEGMENTS + 2) * 6 * sizeof(float)); + GLuint vbo; + glGenBuffers(1, &vbo); + + printf("Setup complete. 
Rendering %d frames to display...\n\n", total_frames); + + /* ---- Render loop with DRM page flipping ---- */ + uint64_t t_start = now_ns(); + uint64_t t_last_print = t_start; + int frames_since_print = 0; + struct gbm_bo *prev_bo = NULL; + uint32_t prev_fb_id = 0; + + for (int frame = 0; frame < total_frames; frame++) { + physics_step(); + + /* Clear + draw */ + glClearColor(15.0f/255.0f, 15.0f/255.0f, 30.0f/255.0f, 1.0f); + glClear(GL_COLOR_BUFFER_BIT); + + for (int i = 0; i < NUM_BALLS; i++) { + int nv = build_circle_verts(verts, balls[i].x, balls[i].y, + balls[i].radius, balls[i].r, balls[i].g, balls[i].b); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, nv * 6 * sizeof(float), verts, GL_DYNAMIC_DRAW); + glEnableVertexAttribArray(0); + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)0); + glEnableVertexAttribArray(1); + glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)(2*sizeof(float))); + glDrawArrays(GL_TRIANGLE_FAN, 0, nv); + } + + /* eglSwapBuffers triggers the GPU render */ + eglSwapBuffers(dpy, egl_surf); + + /* Get the front buffer GBM BO */ + struct gbm_bo *bo = gbm_surface_lock_front_buffer(gbm_surf); + if (!bo) { + fprintf(stderr, "Failed to lock front buffer\n"); + continue; + } + + uint32_t handle = gbm_bo_get_handle(bo).u32; + uint32_t stride = gbm_bo_get_stride(bo); + uint32_t fb_id = 0; + + /* Create DRM framebuffer from the GBM BO */ + int ret = drmModeAddFB(drm.fd, width, height, 24, 32, stride, handle, &fb_id); + if (ret) { + fprintf(stderr, "drmModeAddFB failed: %s\n", strerror(errno)); + gbm_surface_release_buffer(gbm_surf, bo); + continue; + } + + /* Set this buffer as the CRTC scanout (blocking page flip) */ + ret = drmModeSetCrtc(drm.fd, drm.crtc_id, fb_id, 0, 0, + &drm.connector->connector_id, 1, &drm.mode); + if (ret) { + fprintf(stderr, "drmModeSetCrtc failed: %s (frame %d)\n", strerror(errno), frame); + } + + /* Release previous buffer */ + if (prev_bo) { + 
drmModeRmFB(drm.fd, prev_fb_id); + gbm_surface_release_buffer(gbm_surf, prev_bo); + } + prev_bo = bo; + prev_fb_id = fb_id; + + frames_since_print++; + if (frames_since_print >= 16) { + uint64_t now = now_ns(); + uint64_t elapsed = now - t_last_print; + if (elapsed > 0) { + double fps = (double)frames_since_print * 1e9 / (double)elapsed; + double ms = (double)elapsed / (double)frames_since_print / 1e6; + printf("[frame %4d] FPS: %.1f (%.2f ms/frame)\n", frame, fps, ms); + } + frames_since_print = 0; + t_last_print = now; + } + } + + uint64_t t_end = now_ns(); + double total_secs = (double)(t_end - t_start) / 1e9; + printf("\n=== Results ===\n"); + printf("Total frames: %d\n", total_frames); + printf("Total time: %.2f s\n", total_secs); + printf("Average FPS: %.1f\n", (double)total_frames / total_secs); + + /* Restore original display */ + if (drm.saved_crtc) { + drmModeSetCrtc(drm.fd, drm.saved_crtc->crtc_id, drm.saved_crtc->buffer_id, + drm.saved_crtc->x, drm.saved_crtc->y, + &drm.connector->connector_id, 1, &drm.saved_crtc->mode); + drmModeFreeCrtc(drm.saved_crtc); + } + + /* Cleanup */ + if (prev_bo) { + drmModeRmFB(drm.fd, prev_fb_id); + gbm_surface_release_buffer(gbm_surf, prev_bo); + } + glDeleteBuffers(1, &vbo); + free(verts); + eglDestroySurface(dpy, egl_surf); + gbm_surface_destroy(gbm_surf); + eglDestroyContext(dpy, ctx); + eglTerminate(dpy); + gbm_device_destroy(gbm); + close(drm.fd); + + return 0; +} diff --git a/scripts/parallels/vm-type.sh b/scripts/parallels/vm-type.sh new file mode 100755 index 00000000..3ad98cbe --- /dev/null +++ b/scripts/parallels/vm-type.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Type a string into a Parallels VM console using keyboard scancodes. 
+# Usage: ./vm-type.sh +# Special strings: ENTER, SPACE, TAB, BACKSPACE + +VM="$1" +shift +TEXT="$*" + +if [ -z "$VM" ] || [ -z "$TEXT" ]; then + echo "Usage: $0 " + exit 1 +fi + +send_key() { + local sc="$1" + prlctl send-key-event "$VM" --scancode "$sc" --event press --delay 30 2>/dev/null + prlctl send-key-event "$VM" --scancode "$sc" --event release 2>/dev/null +} + +send_shift_key() { + local sc="$1" + # Press shift + prlctl send-key-event "$VM" --scancode 42 --event press 2>/dev/null + sleep 0.05 + prlctl send-key-event "$VM" --scancode "$sc" --event press --delay 30 2>/dev/null + prlctl send-key-event "$VM" --scancode "$sc" --event release 2>/dev/null + # Release shift + prlctl send-key-event "$VM" --scancode 42 --event release 2>/dev/null +} + +# Handle special words +if [ "$TEXT" = "ENTER" ]; then + send_key 28 + exit 0 +fi + +# Type each character +for (( i=0; i<${#TEXT}; i++ )); do + c="${TEXT:$i:1}" + case "$c" in + a) send_key 30 ;; b) send_key 48 ;; c) send_key 46 ;; d) send_key 32 ;; + e) send_key 18 ;; f) send_key 33 ;; g) send_key 34 ;; h) send_key 35 ;; + i) send_key 23 ;; j) send_key 36 ;; k) send_key 37 ;; l) send_key 38 ;; + m) send_key 50 ;; n) send_key 49 ;; o) send_key 24 ;; p) send_key 25 ;; + q) send_key 16 ;; r) send_key 19 ;; s) send_key 31 ;; t) send_key 20 ;; + u) send_key 22 ;; v) send_key 47 ;; w) send_key 17 ;; x) send_key 45 ;; + y) send_key 21 ;; z) send_key 44 ;; + A) send_shift_key 30 ;; B) send_shift_key 48 ;; C) send_shift_key 46 ;; + D) send_shift_key 32 ;; E) send_shift_key 18 ;; F) send_shift_key 33 ;; + G) send_shift_key 34 ;; H) send_shift_key 35 ;; I) send_shift_key 23 ;; + J) send_shift_key 36 ;; K) send_shift_key 37 ;; L) send_shift_key 38 ;; + M) send_shift_key 50 ;; N) send_shift_key 49 ;; O) send_shift_key 24 ;; + P) send_shift_key 25 ;; Q) send_shift_key 16 ;; R) send_shift_key 19 ;; + S) send_shift_key 31 ;; T) send_shift_key 20 ;; U) send_shift_key 22 ;; + V) send_shift_key 47 ;; W) send_shift_key 17 ;; X) 
send_shift_key 45 ;; + Y) send_shift_key 21 ;; Z) send_shift_key 44 ;; + 0) send_key 11 ;; 1) send_key 2 ;; 2) send_key 3 ;; 3) send_key 4 ;; + 4) send_key 5 ;; 5) send_key 6 ;; 6) send_key 7 ;; 7) send_key 8 ;; + 8) send_key 9 ;; 9) send_key 10 ;; + ' ') send_key 57 ;; # space + '-') send_key 12 ;; + '=') send_key 13 ;; + '[') send_key 26 ;; + ']') send_key 27 ;; + '\\') send_key 43 ;; + ';') send_key 39 ;; + "'") send_key 40 ;; + '`') send_key 41 ;; + ',') send_key 51 ;; + '.') send_key 52 ;; + '/') send_key 53 ;; + '!') send_shift_key 2 ;; + '@') send_shift_key 3 ;; + '#') send_shift_key 4 ;; + '$') send_shift_key 5 ;; + '%') send_shift_key 6 ;; + '^') send_shift_key 7 ;; + '&') send_shift_key 8 ;; + '*') send_shift_key 9 ;; + '(') send_shift_key 10 ;; + ')') send_shift_key 11 ;; + '_') send_shift_key 12 ;; + '+') send_shift_key 13 ;; + '{') send_shift_key 26 ;; + '}') send_shift_key 27 ;; + '|') send_shift_key 43 ;; + ':') send_shift_key 39 ;; + '"') send_shift_key 40 ;; + '~') send_shift_key 41 ;; + '<') send_shift_key 51 ;; + '>') send_shift_key 52 ;; + '?') send_shift_key 53 ;; + *) echo "Warning: unmapped char '$c'" >&2 ;; + esac + sleep 0.02 +done diff --git a/userspace/programs/src/bounce.rs b/userspace/programs/src/bounce.rs index cb9408e9..eecdaf87 100644 --- a/userspace/programs/src/bounce.rs +++ b/userspace/programs/src/bounce.rs @@ -1,14 +1,18 @@ //! Bouncing balls with collision detection demo for Breenix (std version) //! -//! Uses mmap'd framebuffer for zero-syscall drawing via libgfx. All pixel -//! writes go directly to a userspace buffer; only flush (1 syscall/frame) -//! copies the dirty region to VRAM. +//! Two rendering paths: +//! - **VirGL GPU** (preferred): All rendering on host GPU via VirGL 3D pipeline. +//! Guest sends ~1KB of draw commands, host renders, DMA copies to BAR0. +//! Expected: 60+ FPS. +//! - **mmap fallback**: Software rendering to mmap'd framebuffer with per-ball flush. +//! Guest CPU writes ~340KB to BAR0 per frame. 
Achieves ~12 FPS on Parallels. //! //! Created for Gus! use std::process; use libbreenix::graphics; +use libbreenix::graphics::{FlushRect, VirglBall}; use libbreenix::time; use libgfx::color::Color; @@ -131,6 +135,9 @@ impl FpsCounter { if elapsed > 0 { self.display_fps = (self.frame_count as u64 * 1_000_000_000 / elapsed) as u32; } + // Log FPS to serial so we can verify from the log + println!("[bounce] FPS: {} ({}ms/frame)", self.display_fps, + elapsed / (self.frame_count as u64 * 1_000_000)); self.frame_count = 0; self.last_time_ns = now; } @@ -163,15 +170,105 @@ impl FpsCounter { // --------------------------------------------------------------------------- fn main() { - println!("Bounce demo starting (for Gus!)"); + // Unique boot ID from monotonic clock — different every boot, proves we're + // running the latest binary (check this value in serial logs). + let boot_id = clock_monotonic_ns(); + println!("Bounce demo starting (for Gus!) [boot_id={:016x}]", boot_id); let info = match graphics::fbinfo() { Ok(info) => info, Err(_e) => { println!("Error: Could not get framebuffer info"); process::exit(1); } }; - let width = info.left_pane_width() as i32; let height = info.height as i32; + + // VirGL uses full viewport (GPU renders everything), mmap uses left pane only + let virgl_width = info.width as i32; + let mmap_width = info.left_pane_width() as i32; + + // 12 balls, fast velocities. Sub-stepping catches edge collisions. 
+ let mut balls = [ + Ball::new(100, 100, 1100, 800, 38, Color::rgb(255, 50, 50)), // Red + Ball::new(300, 200, -1000, 700, 33, Color::rgb( 50, 255, 50)), // Green + Ball::new(200, 400, 900, -950, 42, Color::rgb( 50, 50, 255)), // Blue + Ball::new(400, 300, -850, -800, 28, Color::rgb(255, 255, 50)), // Yellow + Ball::new(150, 300, 1050, 600, 24, Color::rgb(255, 50, 255)), // Magenta + Ball::new(350, 150, -900, 750, 26, Color::rgb( 50, 255, 255)), // Cyan + Ball::new(450, 500, 800, -700, 35, Color::rgb(255, 150, 50)), // Orange + Ball::new(250, 550, -750, 850, 30, Color::rgb(150, 50, 255)), // Purple + Ball::new(500, 100, 950, 950, 22, Color::rgb(200, 200, 200)), // White + Ball::new(120, 500, -1100, -650, 20, Color::rgb(255, 100, 100)), // Salmon + Ball::new(380, 450, 700, 900, 32, Color::rgb(100, 255, 100)), // Lime + Ball::new(520, 350, -800, -850, 27, Color::rgb(100, 150, 255)), // Sky + ]; + + let bg = Color::rgb(15, 15, 30); + let bg_packed = graphics::rgb(15, 15, 30); + + // Try VirGL GPU rendering first. If the first frame succeeds, use GPU path. + let virgl_balls = build_virgl_balls(&balls); + let use_virgl = graphics::virgl_submit_frame(&virgl_balls[..balls.len()], bg_packed).is_ok(); + + if use_virgl { + println!("Starting VirGL GPU-rendered demo (12 balls, {}x{}) [boot_id={:016x}]", + virgl_width, height, boot_id); + run_virgl_loop(&mut balls, virgl_width, height, bg_packed); + } else { + println!("VirGL unavailable, falling back to mmap rendering [boot_id={:016x}]", boot_id); + run_mmap_loop(&mut balls, mmap_width, height, &info, bg); + } +} + +/// Convert Ball array to VirglBall descriptors for GPU rendering. 
+fn build_virgl_balls(balls: &[Ball]) -> [VirglBall; 12] { + let mut vb = [VirglBall::default(); 12]; + for (i, ball) in balls.iter().enumerate().take(12) { + let c = ball.color; + vb[i] = VirglBall { + x: ball.px() as f32, + y: ball.py() as f32, + radius: ball.radius as f32, + color: [ + c.r as f32 / 255.0, + c.g as f32 / 255.0, + c.b as f32 / 255.0, + 1.0, + ], + }; + } + vb +} + +/// VirGL GPU rendering loop — all rendering on host GPU, zero guest pixel writes. +fn run_virgl_loop(balls: &mut [Ball; 12], width: i32, height: i32, bg_packed: u32) { + const SUBSTEPS: i32 = 16; + let mut fps = FpsCounter::new(); + + loop { + // Sub-step physics + for _ in 0..SUBSTEPS { + for ball in balls.iter_mut() { ball.step(SUBSTEPS); } + for ball in balls.iter_mut() { ball.bounce_walls(width, height); } + for i in 0..balls.len() { + for j in (i + 1)..balls.len() { + let (left, right) = balls.split_at_mut(j); + check_collision(&mut left[i], &mut right[0]); + } + } + } + + // Build VirGL ball descriptors from current positions + let vb = build_virgl_balls(balls); + + // Submit to GPU — one syscall renders everything + let _ = graphics::virgl_submit_frame(&vb[..balls.len()], bg_packed); + + fps.tick(); + } +} + +/// Mmap software rendering loop — fallback when VirGL is unavailable. +fn run_mmap_loop(balls: &mut [Ball; 12], width: i32, height: i32, info: &graphics::FbInfo, bg: Color) { let bpp = info.bytes_per_pixel as usize; let fb_ptr = match graphics::fb_mmap() { @@ -190,31 +287,18 @@ fn main() { ) }; - println!("Starting collision demo (12 balls, mmap mode)..."); - - let bg = Color::rgb(15, 15, 30); - - // 12 balls, fast velocities. Sub-stepping catches edge collisions. 
- let mut balls = [ - Ball::new(100, 100, 1100, 800, 38, Color::rgb(255, 50, 50)), // Red - Ball::new(300, 200, -1000, 700, 33, Color::rgb( 50, 255, 50)), // Green - Ball::new(200, 400, 900, -950, 42, Color::rgb( 50, 50, 255)), // Blue - Ball::new(400, 300, -850, -800, 28, Color::rgb(255, 255, 50)), // Yellow - Ball::new(150, 300, 1050, 600, 24, Color::rgb(255, 50, 255)), // Magenta - Ball::new(350, 150, -900, 750, 26, Color::rgb( 50, 255, 255)), // Cyan - Ball::new(450, 500, 800, -700, 35, Color::rgb(255, 150, 50)), // Orange - Ball::new(250, 550, -750, 850, 30, Color::rgb(150, 50, 255)), // Purple - Ball::new(500, 100, 950, 950, 22, Color::rgb(200, 200, 200)), // White - Ball::new(120, 500, -1100, -650, 20, Color::rgb(255, 100, 100)), // Salmon - Ball::new(380, 450, 700, 900, 32, Color::rgb(100, 255, 100)), // Lime - Ball::new(520, 350, -800, -850, 27, Color::rgb(100, 150, 255)), // Sky - ]; + println!("Starting collision demo (12 balls, {}x{}, mmap, batch flush)", width, height); // With velocities ~1000 (10 px/frame), 16 sub-steps = ~0.6 px per step. const SUBSTEPS: i32 = 16; let mut fps = FpsCounter::new(); + // Track previous frame ball positions for per-ball flushing. 
+ let mut prev: [(i32, i32, i32); 12] = [(0, 0, 0); 12]; + let mut first_frame = true; + const PAD: i32 = 2; + loop { // Sub-step physics for _ in 0..SUBSTEPS { @@ -232,21 +316,68 @@ fn main() { } } - // Draw — libgfx tracks dirty rects automatically - fb.clear(bg); - for ball in balls.iter() { - ball.draw(&mut fb); - } - fps.tick(); - fps.draw(&mut fb); - - // Flush only the dirty region - if let Some(dirty) = fb.take_dirty() { - let _ = graphics::fb_flush_rect(dirty.x, dirty.y, dirty.w, dirty.h); + if first_frame { + // First frame: full clear + single flush + fb.clear(bg); + for ball in balls.iter() { ball.draw(&mut fb); } + for (i, ball) in balls.iter().enumerate() { + prev[i] = (ball.px(), ball.py(), ball.radius); + } + fps.tick(); + fps.draw(&mut fb); + if let Some(dirty) = fb.take_dirty() { + let _ = graphics::fb_flush_rect(dirty.x, dirty.y, dirty.w, dirty.h); + } + first_frame = false; } else { - let _ = graphics::fb_flush(); + // Phase 1: Erase all previous ball positions + for &(px, py, r) in prev.iter() { + if r > 0 { + shapes::fill_rect(&mut fb, + (px - r - PAD).max(0), (py - r - PAD).max(0), + (r + PAD) * 2 + 1, (r + PAD) * 2 + 1, bg); + } + } + let fps_y = (height - 40).max(0); + shapes::fill_rect(&mut fb, 0, fps_y, 340, 40, bg); + + // Phase 2: Draw all new ball positions + for ball in balls.iter() { ball.draw(&mut fb); } + fps.tick(); + fps.draw(&mut fb); + + // Discard accumulated dirty rect — we use batch flush below + let _ = fb.take_dirty(); + + // Phase 3: Batch flush — all dirty rects in ONE syscall, ONE DSB barrier. + // Saves 12 syscall round-trips + 12 DSB stalls vs per-ball flushing. 
+ let mut flush_rects = [FlushRect { x: 0, y: 0, w: 0, h: 0 }; 13]; + let mut rect_count = 0usize; + for (i, ball) in balls.iter().enumerate() { + let (opx, opy, or) = prev[i]; + let npx = ball.px(); + let npy = ball.py(); + let nr = ball.radius; + let x1 = (opx - or - PAD).min(npx - nr - PAD).max(0); + let y1 = (opy - or - PAD).min(npy - nr - PAD).max(0); + let x2 = (opx + or + PAD + 1).max(npx + nr + PAD + 1).min(width); + let y2 = (opy + or + PAD + 1).max(npy + nr + PAD + 1).min(height); + if x2 > x1 && y2 > y1 { + flush_rects[rect_count] = FlushRect { x: x1, y: y1, w: x2 - x1, h: y2 - y1 }; + rect_count += 1; + } + } + // FPS region + flush_rects[rect_count] = FlushRect { x: 0, y: fps_y, w: 340, h: 40 }; + rect_count += 1; + let _ = graphics::fb_flush_rects(&flush_rects[..rect_count]); + + // Save new positions for next frame's erase + for (i, ball) in balls.iter().enumerate() { + prev[i] = (ball.px(), ball.py(), ball.radius); + } } - let _ = time::sleep_ms(16); // ~60 FPS target + let _ = time::sleep_ms(1); // Yield CPU briefly without wasting frame time } }