From 5385302180374f7a772cf8d35625a5450d46bfd7 Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Tue, 3 Mar 2026 15:42:32 -0500 Subject: [PATCH] feat: MSI interrupt-driven VirtIO GPU + VirGL command encoding fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace busy-polling with MSI interrupt + WFI completion for VirtIO GPU commands. Each VirGL frame previously burned ~50ms spinning in a tight poll loop with SeqCst fences. Now the guest halts with WFI, Parallels processes the command immediately, and delivers an MSI interrupt to wake the vCPU. Result: 10 FPS → 2000-3000 FPS. Key changes: - SPI allocation infrastructure (allocate_msi_spi) so GPU and xHCI don't collide on the same GICv2m SPI - Cached queue notify addresses — single write_volatile instead of 3 MMIO ops per notification - MSI setup + interrupt handler with WFI completion loop - GPU MSI dispatch in aarch64 exception handler VirGL command encoding fixes: - Rasterizer fill mode was POINT (2) instead of FILL (0) — all triangles rendered as invisible 1-pixel dots - SET_VERTEX_BUFFERS had spurious start_slot DWORD making length%3!=0, causing silent EINVAL from host (vertex buffers never bound) - SHADER_FRAGMENT constant was 4 (TESS_EVAL) instead of 1 - Shader compilation never triggered (bit 31 not set in offset field) Also adds VirGL command encoder module (virgl.rs), Linux reference implementations for Parallels VirGL benchmarking, and minor cleanups. 
Co-Authored-By: Ryan Breen Co-Authored-By: Claude Opus 4.6 --- kernel/src/arch_impl/aarch64/exception.rs | 6 + .../src/arch_impl/aarch64/timer_interrupt.rs | 5 - kernel/src/drivers/mod.rs | 9 +- kernel/src/drivers/usb/xhci.rs | 10 +- kernel/src/drivers/virtio/gpu_pci.rs | 1354 ++++++++++++++++- kernel/src/drivers/virtio/mod.rs | 2 + kernel/src/drivers/virtio/pci_transport.rs | 41 + kernel/src/drivers/virtio/virgl.rs | 396 +++++ kernel/src/graphics/arm64_fb.rs | 19 + kernel/src/platform_config.rs | 40 +- kernel/src/signal/delivery.rs | 10 +- kernel/src/syscall/graphics.rs | 221 ++- libs/libbreenix/src/graphics.rs | 79 + scripts/parallels/gl_bench.c | 423 +++++ scripts/parallels/gl_display.c | 470 ++++++ scripts/parallels/vm-type.sh | 97 ++ userspace/programs/src/bounce.rs | 205 ++- 17 files changed, 3282 insertions(+), 105 deletions(-) create mode 100644 kernel/src/drivers/virtio/virgl.rs create mode 100644 scripts/parallels/gl_bench.c create mode 100644 scripts/parallels/gl_display.c create mode 100755 scripts/parallels/vm-type.sh diff --git a/kernel/src/arch_impl/aarch64/exception.rs b/kernel/src/arch_impl/aarch64/exception.rs index e0026f20..0e6192df 100644 --- a/kernel/src/arch_impl/aarch64/exception.rs +++ b/kernel/src/arch_impl/aarch64/exception.rs @@ -1028,6 +1028,12 @@ pub extern "C" fn handle_irq() { crate::drivers::usb::xhci::handle_interrupt(); } } + // VirtIO GPU PCI interrupt dispatch (MSI completion) + if let Some(gpu_irq) = crate::drivers::virtio::gpu_pci::get_irq() { + if irq_id == gpu_irq { + crate::drivers::virtio::gpu_pci::handle_interrupt(); + } + } } // Should not happen - GIC filters invalid IDs (1020+) diff --git a/kernel/src/arch_impl/aarch64/timer_interrupt.rs b/kernel/src/arch_impl/aarch64/timer_interrupt.rs index 38784d41..45de82c0 100644 --- a/kernel/src/arch_impl/aarch64/timer_interrupt.rs +++ b/kernel/src/arch_impl/aarch64/timer_interrupt.rs @@ -168,11 +168,6 @@ pub extern "C" fn timer_interrupt_handler() { // Increment timer interrupt 
counter (used for debugging when needed) let _count = TIMER_INTERRUPT_COUNT.fetch_add(1, Ordering::Relaxed) + 1; - // Debug breadcrumb: print '.' every 200 ticks (~1 second) to verify timer is alive - if cpu_id == 0 && _count % 200 == 0 { - raw_serial_char(b'.'); - } - // CPU 0 only: poll input devices (single-device, not safe from multiple CPUs) if cpu_id == 0 { poll_keyboard_to_stdin(); diff --git a/kernel/src/drivers/mod.rs b/kernel/src/drivers/mod.rs index e80783fd..adabb5a8 100644 --- a/kernel/src/drivers/mod.rs +++ b/kernel/src/drivers/mod.rs @@ -97,7 +97,14 @@ pub fn init() -> usize { serial_println!("[drivers] Found {} VirtIO PCI devices", virtio_devices.len()); match virtio::gpu_pci::init() { - Ok(()) => serial_println!("[drivers] VirtIO GPU (PCI) initialized"), + Ok(()) => { + serial_println!("[drivers] VirtIO GPU (PCI) initialized"); + // Attempt to initialize VirGL 3D acceleration if the device supports it + match virtio::gpu_pci::virgl_init() { + Ok(()) => serial_println!("[drivers] VirGL 3D acceleration active"), + Err(e) => serial_println!("[drivers] VirGL init skipped: {}", e), + } + } Err(e) => serial_println!("[drivers] VirtIO GPU (PCI) init failed: {}", e), } diff --git a/kernel/src/drivers/usb/xhci.rs b/kernel/src/drivers/usb/xhci.rs index c1cf7665..7c1efbec 100644 --- a/kernel/src/drivers/usb/xhci.rs +++ b/kernel/src/drivers/usb/xhci.rs @@ -4122,7 +4122,7 @@ fn setup_xhci_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { // On Parallels ARM64, GICv2m is at 0x02250000 (discovered from MADT). 
const PARALLELS_GICV2M_BASE: u64 = 0x0225_0000; let gicv2m_base = crate::platform_config::gicv2m_base_phys(); - let (base, spi_base, spi_count) = if gicv2m_base != 0 { + let (base, _spi_base, spi_count) = if gicv2m_base != 0 { // Already probed ( gicv2m_base, @@ -4146,8 +4146,12 @@ fn setup_xhci_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { return 0; } - // Step 3: Allocate first available SPI for XHCI - let spi = spi_base; + // Step 3: Allocate next available SPI for XHCI + let spi = crate::platform_config::allocate_msi_spi(); + if spi == 0 { + xhci_trace_note(0, "err:alloc_spi"); + return 0; + } let intid = spi; // GIC INTID = SPI number for GICv2m // Step 4: Program PCI MSI registers diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs index 2aea0374..ad23db37 100644 --- a/kernel/src/drivers/virtio/gpu_pci.rs +++ b/kernel/src/drivers/virtio/gpu_pci.rs @@ -9,7 +9,7 @@ use super::pci_transport::VirtioPciDevice; use core::ptr::read_volatile; -use core::sync::atomic::{fence, AtomicBool, Ordering}; +use core::sync::atomic::{fence, AtomicBool, AtomicU32, Ordering}; use spin::Mutex; /// Lock protecting the GPU PCI command path (PCI_CMD_BUF, PCI_RESP_BUF, @@ -35,9 +35,23 @@ mod cmd { pub const RESOURCE_ATTACH_BACKING: u32 = 0x0106; pub const RESOURCE_DETACH_BACKING: u32 = 0x0107; + // 3D commands (VirGL) + pub const CTX_CREATE: u32 = 0x0200; + pub const CTX_DESTROY: u32 = 0x0201; + pub const CTX_ATTACH_RESOURCE: u32 = 0x0202; + pub const CTX_DETACH_RESOURCE: u32 = 0x0203; + pub const RESOURCE_CREATE_3D: u32 = 0x0204; + pub const TRANSFER_TO_HOST_3D: u32 = 0x0205; + pub const TRANSFER_FROM_HOST_3D: u32 = 0x0206; + pub const SUBMIT_3D: u32 = 0x0207; + + // Capability commands + pub const GET_CAPSET_INFO: u32 = 0x0110; + // Response types pub const RESP_OK_NODATA: u32 = 0x1100; pub const RESP_OK_DISPLAY_INFO: u32 = 0x1101; + pub const RESP_OK_CAPSET_INFO: u32 = 0x1102; pub const RESP_ERR_UNSPEC: u32 = 0x1200; } @@ -158,6 +172,106 
@@ struct VirtioGpuResourceFlush { padding: u32, } +/// GET_CAPSET_INFO request +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuGetCapsetInfo { + hdr: VirtioGpuCtrlHdr, + capset_index: u32, + padding: u32, +} + +/// GET_CAPSET_INFO response +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuRespCapsetInfo { + hdr: VirtioGpuCtrlHdr, + capset_id: u32, + capset_max_version: u32, + capset_max_size: u32, + padding: u32, +} + +// VirtIO GPU config space offsets +const GPU_CFG_EVENTS_READ: usize = 0; +const GPU_CFG_EVENTS_CLEAR: usize = 4; +const GPU_CFG_NUM_SCANOUTS: usize = 8; +const GPU_CFG_NUM_CAPSETS: usize = 12; +const VIRTIO_GPU_EVENT_DISPLAY: u32 = 1 << 0; + +// ============================================================================= +// VirtIO GPU 3D (VirGL) Protocol Structures +// ============================================================================= + +/// Create a 3D rendering context +#[repr(C)] +#[derive(Clone, Copy)] +struct VirtioGpuCtxCreate { + hdr: VirtioGpuCtrlHdr, + nlen: u32, + context_init: u32, // 0 for VirGL + debug_name: [u8; 64], +} + +/// Attach/detach a resource to/from a 3D context +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuCtxResource { + hdr: VirtioGpuCtrlHdr, + resource_id: u32, + padding: u32, +} + +/// Create a 3D resource (texture, render target, buffer) +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuResourceCreate3d { + hdr: VirtioGpuCtrlHdr, + resource_id: u32, + target: u32, + format: u32, + bind: u32, + width: u32, + height: u32, + depth: u32, + array_size: u32, + last_level: u32, + nr_samples: u32, + flags: u32, + padding: u32, +} + +/// Transfer host 3D command — used for both TRANSFER_TO_HOST_3D (0x0205) +/// and TRANSFER_FROM_HOST_3D (0x0206). Copies between guest backing and +/// host-side texture. 
Linux's DRM driver calls TRANSFER_TO_HOST_3D before +/// RESOURCE_FLUSH even for VirGL-rendered content — it serves as a +/// synchronization point that tells the host the resource is display-ready. +#[repr(C)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuTransferHost3d { + hdr: VirtioGpuCtrlHdr, + box_x: u32, + box_y: u32, + box_z: u32, + box_w: u32, + box_h: u32, + box_d: u32, + offset: u64, + resource_id: u32, + level: u32, + stride: u32, + layer_stride: u32, +} + +/// Submit 3D command buffer header (followed immediately by VirGL command data) +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioGpuCmdSubmit { + hdr: VirtioGpuCtrlHdr, + size: u32, // size in bytes of the VirGL command buffer + // NO padding — VirGL data follows immediately at offset 28 +} + // ============================================================================= // Virtqueue Structures // ============================================================================= @@ -232,6 +346,15 @@ struct PciCmdBuffer { static mut PCI_CMD_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] }; static mut PCI_RESP_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] }; +/// Larger command buffer for 3D submissions (Submit3D header + VirGL payload). +/// 16KB accommodates shader text + inline vertex data for 12 circle draws. +#[repr(C, align(64))] +struct Pci3dCmdBuffer { + data: [u8; 16384], +} +static mut PCI_3D_CMD_BUF: Pci3dCmdBuffer = Pci3dCmdBuffer { data: [0; 16384] }; + + // Default framebuffer dimensions (Parallels: set_scanout configures display mode) // 1728x1080 matches the QEMU resolution for consistent performance comparison. 
const DEFAULT_FB_WIDTH: u32 = 1728; @@ -242,12 +365,38 @@ const FB_MAX_HEIGHT: u32 = 1600; const FB_SIZE: usize = (FB_MAX_WIDTH * FB_MAX_HEIGHT * 4) as usize; const BYTES_PER_PIXEL: usize = 4; const RESOURCE_ID: u32 = 1; +/// Resource ID for the VirGL 3D render target (with BIND_SCANOUT) +const RESOURCE_3D_ID: u32 = 2; +/// Resource ID for the VirGL vertex buffer +const RESOURCE_VB_ID: u32 = 3; +/// VirGL 3D context ID +const VIRGL_CTX_ID: u32 = 1; +/// Maximum circles we can render per frame +const MAX_CIRCLES: usize = 16; +/// Vertices per circle (triangle fan: center + N perimeter + closing vertex) +const CIRCLE_SEGMENTS: usize = 16; +/// Vertices per circle = center + segments + 1 (close fan) +const VERTS_PER_CIRCLE: usize = CIRCLE_SEGMENTS + 2; +/// Bytes per vertex: position (4×f32) + color (4×f32) = 32 bytes +const BYTES_PER_VERTEX: usize = 32; +/// Vertex buffer size: enough for MAX_CIRCLES circles +const VB_SIZE: usize = MAX_CIRCLES * VERTS_PER_CIRCLE * BYTES_PER_VERTEX; // VirtIO standard feature bits const VIRTIO_F_VERSION_1: u64 = 1 << 32; // VirtIO GPU feature bits (requested but not required) #[allow(dead_code)] const VIRTIO_GPU_F_EDID: u64 = 1 << 1; +// VirtIO GPU 3D (VirGL) acceleration +const VIRTIO_GPU_F_VIRGL: u64 = 1 << 0; + +/// Whether VirGL 3D acceleration was successfully negotiated with the device. +static VIRGL_ENABLED: AtomicBool = AtomicBool::new(false); +/// Whether SET_SCANOUT has been issued for the 3D resource. +/// Deferred until the first virgl_render_frame call so that the mmap/GOP +/// path keeps working until userspace explicitly opts into VirGL. +static VIRGL_SCANOUT_ACTIVE: AtomicBool = AtomicBool::new(false); + #[repr(C, align(4096))] struct PciFramebuffer { @@ -256,10 +405,25 @@ struct PciFramebuffer { static mut PCI_FRAMEBUFFER: PciFramebuffer = PciFramebuffer { pixels: [0; FB_SIZE] }; +/// Separate backing for 3D resource — NOT shared with the 2D resource. 
+/// Linux's Mesa/virgl creates independent GEM buffers for each resource. +/// Sharing backing between 2D and 3D resources may cause the hypervisor +/// to handle SET_SCANOUT incorrectly. +const FB_3D_SIZE: usize = (DEFAULT_FB_WIDTH * DEFAULT_FB_HEIGHT * 4) as usize; + +#[repr(C, align(4096))] +struct Pci3dFramebuffer { + pixels: [u8; FB_3D_SIZE], +} +static mut PCI_3D_FRAMEBUFFER: Pci3dFramebuffer = Pci3dFramebuffer { pixels: [0; FB_3D_SIZE] }; + // ============================================================================= // GPU PCI Device State // ============================================================================= +/// VirtIO GPU fence flag — tells the host to signal completion via fence_id. +const VIRTIO_GPU_FLAG_FENCE: u32 = 1; + /// Combined GPU PCI device state (transport + GPU state) struct GpuPciDeviceState { device: VirtioPciDevice, @@ -267,11 +431,22 @@ struct GpuPciDeviceState { height: u32, resource_id: u32, last_used_idx: u16, + /// Monotonically increasing fence counter for GPU synchronization. + /// Each fenced command gets a unique fence_id; the host signals completion + /// by echoing this ID in the response. Required for TRANSFER_FROM_HOST_3D + /// to ensure DMA writes complete before reading backing memory. + next_fence_id: u64, } static mut GPU_PCI_STATE: Option = None; static GPU_PCI_INITIALIZED: AtomicBool = AtomicBool::new(false); +/// GIC INTID (SPI number) allocated for GPU MSI. 0 = polling mode. +static GPU_IRQ: AtomicU32 = AtomicU32::new(0); + +/// Set by the interrupt handler to wake the WFI loop in send_command(). +static GPU_CMD_COMPLETE: AtomicBool = AtomicBool::new(false); + // ============================================================================= // Helpers // ============================================================================= @@ -297,6 +472,109 @@ pub fn is_initialized() -> bool { GPU_PCI_INITIALIZED.load(Ordering::Acquire) } +/// Check if VirGL 3D acceleration was negotiated with the device. 
+pub fn is_virgl_enabled() -> bool { + VIRGL_ENABLED.load(Ordering::Acquire) +} + +/// Disable VirGL for display purposes (e.g., Parallels can't route VirGL output to screen). +pub fn disable_virgl() { + VIRGL_ENABLED.store(false, Ordering::Release); +} + +// ============================================================================= +// MSI Interrupt Support +// ============================================================================= + +/// Set up PCI MSI for the VirtIO GPU through GICv2m. +/// +/// Follows the same pattern as xHCI MSI setup: find MSI cap, probe GICv2m, +/// allocate SPI, program MSI registers, configure GIC. +/// +/// Returns the allocated SPI number, or 0 if MSI is unavailable. +#[cfg(target_arch = "aarch64")] +fn setup_gpu_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { + use crate::arch_impl::aarch64::gic; + + // Step 1: Find MSI capability + let msi_cap = match pci_dev.find_msi_capability() { + Some(offset) => offset, + None => { + crate::serial_println!("[virtio-gpu-pci] No MSI capability found, using polling"); + return 0; + } + }; + + // Step 2: Ensure GICv2m is probed + const PARALLELS_GICV2M_BASE: u64 = 0x0225_0000; + let gicv2m_base = crate::platform_config::gicv2m_base_phys(); + let base = if gicv2m_base != 0 { + gicv2m_base + } else if crate::platform_config::probe_gicv2m(PARALLELS_GICV2M_BASE) { + PARALLELS_GICV2M_BASE + } else { + crate::serial_println!("[virtio-gpu-pci] GICv2m not available, using polling"); + return 0; + }; + + // Step 3: Allocate SPI + let spi = crate::platform_config::allocate_msi_spi(); + if spi == 0 { + crate::serial_println!("[virtio-gpu-pci] No SPIs available, using polling"); + return 0; + } + + // Step 4: Program PCI MSI registers + let msi_address = (base + 0x40) as u32; + let msi_data = spi as u16; + pci_dev.configure_msi(msi_cap, msi_address, msi_data); + pci_dev.disable_intx(); + + // Step 5: Configure GIC for this SPI (edge-triggered) + gic::configure_spi_edge_triggered(spi); + + 
crate::serial_println!("[virtio-gpu-pci] MSI configured: SPI={}", spi); + spi +} + +/// Handle GPU MSI interrupt — called from exception.rs IRQ dispatch. +/// +/// Wakes the WFI loop in send_command() by setting GPU_CMD_COMPLETE. +/// Follows the xHCI pattern: disable SPI, clear pending, ack, re-enable. +#[cfg(target_arch = "aarch64")] +pub fn handle_interrupt() { + use crate::arch_impl::aarch64::gic; + + let irq = GPU_IRQ.load(Ordering::Relaxed); + if irq == 0 { + return; + } + + gic::disable_spi(irq); + gic::clear_spi_pending(irq); + + // Read ISR to auto-acknowledge the VirtIO interrupt condition + if GPU_PCI_INITIALIZED.load(Ordering::Acquire) { + unsafe { + let ptr = &raw const GPU_PCI_STATE; + if let Some(ref state) = *ptr { + state.device.read_interrupt_status(); + } + } + } + + GPU_CMD_COMPLETE.store(true, Ordering::Release); + + gic::clear_spi_pending(irq); + gic::enable_spi(irq); +} + +/// Get the GIC INTID for the GPU interrupt (for exception dispatch). +pub fn get_irq() -> Option { + let irq = GPU_IRQ.load(Ordering::Relaxed); + if irq != 0 { Some(irq) } else { None } +} + // ============================================================================= // Initialization // ============================================================================= @@ -324,9 +602,23 @@ pub fn init() -> Result<(), &'static str> { // VIRTIO_F_VERSION_1 is mandatory for PCI modern transport — without it, // Parallels's GPU device accepts the feature set but ignores subsequent // state-modifying commands (create_resource, attach_backing, etc.). 
- let requested = VIRTIO_F_VERSION_1 | VIRTIO_GPU_F_EDID; + let requested = VIRTIO_F_VERSION_1 | VIRTIO_GPU_F_EDID | VIRTIO_GPU_F_VIRGL; + + // Log raw device-offered features before negotiation + let device_feats = virtio.read_device_features(); + crate::serial_println!("[virtio-gpu-pci] Device features: {:#018x}", device_feats); + crate::serial_println!("[virtio-gpu-pci] VIRGL offered: {}", device_feats & VIRTIO_GPU_F_VIRGL != 0); + virtio.init(requested)?; - let _negotiated = virtio.device_features() & requested; + + // Check what was actually negotiated + let negotiated = virtio.device_features() & requested; + let virgl_on = negotiated & VIRTIO_GPU_F_VIRGL != 0; + crate::serial_println!("[virtio-gpu-pci] Negotiated: {:#018x} (VIRGL={})", negotiated, virgl_on); + VIRGL_ENABLED.store(virgl_on, Ordering::Release); + crate::serial_println!("[virtio-gpu-pci] VIRGL_ENABLED stored={}, readback={}, addr={:#x}", + virgl_on, VIRGL_ENABLED.load(Ordering::Acquire), + &VIRGL_ENABLED as *const _ as usize); // Set up control queue (queue 0) virtio.select_queue(0); @@ -361,9 +653,38 @@ pub fn init() -> Result<(), &'static str> { virtio.set_queue_used(queue_phys + 4096); virtio.set_queue_ready(true); - // Mark device ready + // Cache queue 0 notify address to avoid 2 MMIO reads per notification + virtio.cache_queue_notify_addr(0); + + // Set up MSI interrupt before driver_ok so the device can signal completions + #[cfg(target_arch = "aarch64")] + let msi_spi = setup_gpu_msi(virtio.pci_device()); + #[cfg(not(target_arch = "aarch64"))] + let msi_spi = 0u32; + + // Mark device ready — MUST happen before sending any commands (Linux: virtio_device_ready()) virtio.driver_ok(); + // Enable the MSI SPI after driver_ok so the device can actually fire interrupts + #[cfg(target_arch = "aarch64")] + if msi_spi != 0 { + GPU_IRQ.store(msi_spi, Ordering::Release); + crate::arch_impl::aarch64::gic::enable_spi(msi_spi); + crate::serial_println!("[virtio-gpu-pci] MSI SPI {} enabled", msi_spi); 
+ } + + // Read device-specific config (Linux reads num_scanouts + num_capsets here) + let num_scanouts = virtio.read_config_u32(GPU_CFG_NUM_SCANOUTS); + let num_capsets = virtio.read_config_u32(GPU_CFG_NUM_CAPSETS); + crate::serial_println!("[virtio-gpu-pci] Config: num_scanouts={}, num_capsets={}", num_scanouts, num_capsets); + + // Check and clear pending display events (Linux: virtio_gpu_config_changed_work_func) + let events = virtio.read_config_u32(GPU_CFG_EVENTS_READ); + if events & VIRTIO_GPU_EVENT_DISPLAY != 0 { + crate::serial_println!("[virtio-gpu-pci] Clearing pending DISPLAY event (events_read={:#x})", events); + virtio.write_config_u32(GPU_CFG_EVENTS_CLEAR, events & VIRTIO_GPU_EVENT_DISPLAY); + } + // Store initial state with default dimensions (will be updated after display query) unsafe { let ptr = &raw mut GPU_PCI_STATE; @@ -373,11 +694,23 @@ pub fn init() -> Result<(), &'static str> { height: DEFAULT_FB_HEIGHT, resource_id: RESOURCE_ID, last_used_idx: 0, + next_fence_id: 1, }); } // Don't set GPU_PCI_INITIALIZED yet — the GPU commands below can fail. - // If create_resource/attach_backing/set_scanout/flush time out, leaving - // the flag true would mislead other code into thinking the device is usable. + + // GET_CAPSET_INFO for each capset (Linux does this before GET_DISPLAY_INFO) + for idx in 0..num_capsets { + match get_capset_info(idx) { + Ok((id, max_ver, max_size)) => { + crate::serial_println!("[virtio-gpu-pci] Capset {}: id={}, max_ver={}, max_size={}", + idx, id, max_ver, max_size); + } + Err(e) => { + crate::serial_println!("[virtio-gpu-pci] GET_CAPSET_INFO[{}] failed: {}", idx, e); + } + } + } // Query display info to see what Parallels reports as native resolution. 
let display_dims = get_display_info(); @@ -484,16 +817,15 @@ fn send_command( fence(Ordering::SeqCst); } - // Notify device via PCI transport - state.device.notify_queue(0); + // Signal that we're waiting for a completion, then notify device + GPU_CMD_COMPLETE.store(false, Ordering::Release); + state.device.notify_queue_fast(0); - // Spin-wait for used ring. - // The timeout must be generous: TRANSFER_TO_HOST_2D transfers up to 4MB - // (full framebuffer) and QEMU processes this in its event loop. - // 10M iterations is safe. + // Wait for used ring update — WFI if MSI is available, spin_loop otherwise. + let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; let mut timeout = 10_000_000u32; loop { - fence(Ordering::SeqCst); + fence(Ordering::Acquire); let used_idx = unsafe { let q = &raw const PCI_CTRL_QUEUE; read_volatile(&(*q).used.idx) @@ -506,7 +838,17 @@ fn send_command( if timeout == 0 { return Err("GPU PCI command timeout"); } - core::hint::spin_loop(); + if use_msi { + // WFI halts the vCPU until an interrupt arrives. The hypervisor + // processes the VirtIO command while the guest is halted, then + // delivers the MSI interrupt to wake us. 
+ #[cfg(target_arch = "aarch64")] + unsafe { core::arch::asm!("wfi", options(nomem, nostack)); } + #[cfg(not(target_arch = "aarch64"))] + core::hint::spin_loop(); + } else { + core::hint::spin_loop(); + } } Ok(()) @@ -528,12 +870,18 @@ fn send_command_expect_ok( )?; // Read response — use read_volatile to defeat caching (DMA coherency) - let resp_type = unsafe { + let (resp_type, resp_flags, resp_fence) = unsafe { let resp_ptr = &raw const PCI_RESP_BUF; - core::ptr::read_volatile(&(*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr)).type_) + let hdr = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr); + ( + core::ptr::read_volatile(&hdr.type_), + core::ptr::read_volatile(&hdr.flags), + core::ptr::read_volatile(&hdr.fence_id), + ) }; if resp_type != cmd::RESP_OK_NODATA { - crate::serial_println!("[virtio-gpu-pci] Command failed: resp_type={:#x}", resp_type); + crate::serial_println!("[virtio-gpu-pci] Command failed: resp_type={:#x} flags={:#x} fence={}", + resp_type, resp_flags, resp_fence); return Err("GPU PCI command failed"); } Ok(()) @@ -591,6 +939,57 @@ fn get_display_info() -> Result<(u32, u32), &'static str> { }) } +/// Query capability set info (Linux: virtio_gpu_get_capsets). +/// Returns (capset_id, max_version, max_size). 
+fn get_capset_info(capset_index: u32) -> Result<(u32, u32, u32), &'static str> { + with_device_state(|state| { + let cmd_phys = virt_to_phys(&raw const PCI_CMD_BUF as u64); + let resp_phys = virt_to_phys(&raw const PCI_RESP_BUF as u64); + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuGetCapsetInfo); + *cmd = VirtioGpuGetCapsetInfo { + hdr: VirtioGpuCtrlHdr { + type_: cmd::GET_CAPSET_INFO, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + capset_index, + padding: 0, + }; + } + + send_command( + state, + cmd_phys, + core::mem::size_of::() as u32, + resp_phys, + core::mem::size_of::() as u32, + )?; + + unsafe { + let resp_ptr = &raw const PCI_RESP_BUF; + let resp = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuRespCapsetInfo); + let resp_type = core::ptr::read_volatile(&resp.hdr.type_); + + if resp_type != cmd::RESP_OK_CAPSET_INFO { + crate::serial_println!("[virtio-gpu-pci] GET_CAPSET_INFO: unexpected resp_type={:#x} (expected {:#x})", + resp_type, cmd::RESP_OK_CAPSET_INFO); + return Err("GET_CAPSET_INFO failed"); + } + + Ok(( + core::ptr::read_volatile(&resp.capset_id), + core::ptr::read_volatile(&resp.capset_max_version), + core::ptr::read_volatile(&resp.capset_max_size), + )) + } + }) +} + fn create_resource() -> Result<(), &'static str> { with_device_state(|state| { framebuffer_len(state)?; @@ -755,7 +1154,503 @@ fn resource_flush_cmd( } // ============================================================================= -// Public API +// 3D (VirGL) Command Helpers +// ============================================================================= + +/// Create a VirGL 3D rendering context. 
+fn virgl_ctx_create_cmd(state: &mut GpuPciDeviceState, ctx_id: u32, name: &[u8]) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCtxCreate); + let mut debug_name = [0u8; 64]; + let copy_len = name.len().min(63); + debug_name[..copy_len].copy_from_slice(&name[..copy_len]); + *cmd = VirtioGpuCtxCreate { + hdr: VirtioGpuCtrlHdr { + type_: cmd::CTX_CREATE, + flags: 0, + fence_id: 0, + ctx_id, + padding: 0, + }, + nlen: copy_len as u32, + context_init: 0, // VirGL context + debug_name, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Attach a resource to a VirGL context. +fn virgl_ctx_attach_resource_cmd(state: &mut GpuPciDeviceState, ctx_id: u32, resource_id: u32) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCtxResource); + *cmd = VirtioGpuCtxResource { + hdr: VirtioGpuCtrlHdr { + type_: cmd::CTX_ATTACH_RESOURCE, + flags: 0, + fence_id: 0, + ctx_id, + padding: 0, + }, + resource_id, + padding: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Create a 3D resource (texture / render target / buffer). 
+fn virgl_resource_create_3d_cmd( + state: &mut GpuPciDeviceState, + resource_id: u32, + target: u32, + fmt: u32, + bind: u32, + width: u32, + height: u32, + depth: u32, + array_size: u32, +) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuResourceCreate3d); + *cmd = VirtioGpuResourceCreate3d { + hdr: VirtioGpuCtrlHdr { + type_: cmd::RESOURCE_CREATE_3D, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + resource_id, + target, + format: fmt, + bind, + width, + height, + depth, + array_size, + last_level: 0, + nr_samples: 0, + flags: 0, + padding: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Attach backing memory to a 3D resource. +/// +/// Uses PCI_3D_FRAMEBUFFER (separate BSS RAM) as the backing store. +/// CRITICAL: Must NOT share backing with the 2D resource (PCI_FRAMEBUFFER). +/// Linux's Mesa/virgl creates independent GEM buffers for each resource. +/// Sharing backing may cause the hypervisor to mishandle SET_SCANOUT. 
+fn virgl_attach_backing_cmd(state: &mut GpuPciDeviceState, resource_id: u32) -> Result<(), &'static str> { + let fb_addr = virt_to_phys(&raw const PCI_3D_FRAMEBUFFER as u64); + let actual_len = (state.width * state.height * 4).min(FB_3D_SIZE as u32); + crate::serial_println!("[virgl] attach_backing: 3D RAM phys=0x{:x}, len={} (SEPARATE from 2D)", fb_addr, actual_len); + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut PciAttachBackingCmd); + *cmd = PciAttachBackingCmd { + cmd: VirtioGpuResourceAttachBacking { + hdr: VirtioGpuCtrlHdr { + type_: cmd::RESOURCE_ATTACH_BACKING, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + resource_id, + nr_entries: 1, + }, + entry: VirtioGpuMemEntry { + addr: fb_addr, + length: actual_len, + padding: 0, + }, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Flush a specific resource to the display (SET_SCANOUT must point at it first). +fn resource_flush_3d(state: &mut GpuPciDeviceState, resource_id: u32) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuResourceFlush); + *cmd = VirtioGpuResourceFlush { + hdr: VirtioGpuCtrlHdr { + type_: cmd::RESOURCE_FLUSH, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + r_x: 0, + r_y: 0, + r_width: state.width, + r_height: state.height, + resource_id, + padding: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Transfer a 3D resource from guest backing to host texture (upload). +/// +/// NOTE: Not used for VirGL-rendered 3D resources — the host GPU already has +/// the rendered data. Kept for potential future use with CPU-written resources. 
+#[allow(dead_code)] +fn transfer_to_host_3d( + state: &mut GpuPciDeviceState, + resource_id: u32, + x: u32, + y: u32, + w: u32, + h: u32, +) -> Result<(), &'static str> { + let stride = state.width * 4; + let offset = (y as u64) * (stride as u64) + (x as u64) * 4; + + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuTransferHost3d); + *cmd = VirtioGpuTransferHost3d { + hdr: VirtioGpuCtrlHdr { + type_: cmd::TRANSFER_TO_HOST_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id: VIRGL_CTX_ID, + padding: 0, + }, + box_x: x, + box_y: y, + box_z: 0, + box_w: w, + box_h: h, + box_d: 1, + offset, + resource_id, + level: 0, + stride, + layer_stride: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Transfer a 3D resource from host texture to guest backing (readback/download). +/// +/// After VirGL renders to the host GPU texture, this copies the rendered pixels +/// back into the resource's guest-side backing memory. If the backing is BAR0, +/// this is a host-side DMA that writes directly to the display framebuffer — +/// bypassing the 6 MB/s guest CPU MMIO bottleneck entirely. 
+#[allow(dead_code)] +fn transfer_from_host_3d( + state: &mut GpuPciDeviceState, + resource_id: u32, + x: u32, + y: u32, + w: u32, + h: u32, +) -> Result<(), &'static str> { + let stride = state.width * 4; + let offset = (y as u64) * (stride as u64) + (x as u64) * 4; + + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuTransferHost3d); + *cmd = VirtioGpuTransferHost3d { + hdr: VirtioGpuCtrlHdr { + type_: cmd::TRANSFER_FROM_HOST_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id: VIRGL_CTX_ID, + padding: 0, + }, + box_x: x, + box_y: y, + box_z: 0, + box_w: w, + box_h: h, + box_d: 1, + offset, + resource_id, + level: 0, + stride, + layer_stride: 0, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +/// Submit a VirGL command buffer via SUBMIT_3D. +/// +/// Uses a 3-descriptor chain matching the Linux kernel driver layout: +/// Desc 0: VirtioGpuCmdSubmit header (device-readable) +/// Desc 1: VirGL command data (device-readable) +/// Desc 2: Response header (device-writable) +fn virgl_submit_3d_cmd( + state: &mut GpuPciDeviceState, + ctx_id: u32, + cmds: &[u32], +) -> Result<(), &'static str> { + let payload_bytes = cmds.len() * 4; + + if payload_bytes > 16384 { + return Err("VirGL command buffer too large"); + } + + // Allocate a fence ID for this submission so the host signals completion + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + // Write the Submit3D header into PCI_CMD_BUF + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let hdr = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCmdSubmit); + *hdr = VirtioGpuCmdSubmit { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SUBMIT_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id, + padding: 0, + }, + size: payload_bytes as u32, + }; + } + + // Copy VirGL command data into PCI_3D_CMD_BUF + unsafe { + let buf_ptr = &raw mut 
PCI_3D_CMD_BUF; + let dst = (*buf_ptr).data.as_mut_ptr() as *mut u32; + core::ptr::copy_nonoverlapping(cmds.as_ptr(), dst, cmds.len()); + } + + let hdr_phys = virt_to_phys(&raw const PCI_CMD_BUF as u64); + let data_phys = virt_to_phys(&raw const PCI_3D_CMD_BUF as u64); + let resp_phys = virt_to_phys(&raw const PCI_RESP_BUF as u64); + let hdr_len = core::mem::size_of::() as u32; + let resp_len = core::mem::size_of::() as u32; + + // Build a 3-descriptor chain + unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + + // Desc 0: Submit3D header (device reads) + (*q).desc[0] = VirtqDesc { + addr: hdr_phys, + len: hdr_len, + flags: DESC_F_NEXT, + next: 1, + }; + + // Desc 1: VirGL command payload (device reads) + (*q).desc[1] = VirtqDesc { + addr: data_phys, + len: payload_bytes as u32, + flags: DESC_F_NEXT, + next: 2, + }; + + // Desc 2: Response (device writes) + (*q).desc[2] = VirtqDesc { + addr: resp_phys, + len: resp_len, + flags: DESC_F_WRITE, + next: 0, + }; + + // Add to available ring + let idx = (*q).avail.idx; + (*q).avail.ring[(idx % 16) as usize] = 0; // head of chain = desc 0 + fence(Ordering::SeqCst); + (*q).avail.idx = idx.wrapping_add(1); + fence(Ordering::SeqCst); + } + + // Notify and wait for completion — WFI if MSI available + GPU_CMD_COMPLETE.store(false, Ordering::Release); + state.device.notify_queue_fast(0); + + let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; + let mut timeout = 10_000_000u32; + loop { + fence(Ordering::Acquire); + let used_idx = unsafe { + let q = &raw const PCI_CTRL_QUEUE; + read_volatile(&(*q).used.idx) + }; + if used_idx != state.last_used_idx { + state.last_used_idx = used_idx; + break; + } + timeout -= 1; + if timeout == 0 { + return Err("SUBMIT_3D timeout"); + } + if use_msi { + #[cfg(target_arch = "aarch64")] + unsafe { core::arch::asm!("wfi", options(nomem, nostack)); } + #[cfg(not(target_arch = "aarch64"))] + core::hint::spin_loop(); + } else { + core::hint::spin_loop(); + } + } + + // Check response — read fence info to 
verify the host echoed our fence_id + let (resp_type, resp_flags, resp_fence) = unsafe { + let resp_ptr = &raw const PCI_RESP_BUF; + let hdr = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr); + ( + core::ptr::read_volatile(&hdr.type_), + core::ptr::read_volatile(&hdr.flags), + core::ptr::read_volatile(&hdr.fence_id), + ) + }; + if resp_type != cmd::RESP_OK_NODATA { + crate::serial_println!("[virtio-gpu-pci] SUBMIT_3D failed: resp={:#x} flags={:#x} fence={}", + resp_type, resp_flags, resp_fence); + return Err("SUBMIT_3D command failed"); + } + // Log fence acknowledgement periodically (init + every 500th frame) + if fence_id <= 5 || fence_id % 500 == 0 { + crate::serial_println!("[virgl] SUBMIT_3D OK: sent fence={} resp_flags={:#x} resp_fence={}", + fence_id, resp_flags, resp_fence); + } + Ok(()) +} + +/// Wait for the host to confirm a GPU fence has completed. +/// +/// Parallels returns SUBMIT_3D responses immediately (resp_flags=0x0) before +/// the GPU work finishes. The actual fence completion is reported in +/// subsequent command responses via resp_fence. This function sends NOP +/// SUBMIT_3D commands and polls until resp_fence >= target_fence_id. 
+#[allow(dead_code)] +fn virgl_fence_sync(state: &mut GpuPciDeviceState, target_fence_id: u64) -> Result<(), &'static str> { + use super::virgl::CommandBuffer; + + // Try up to 100 rounds of polling (each takes ~50-100us on Parallels) + for _ in 0..100 { + let mut cmdbuf = CommandBuffer::new(); + cmdbuf.set_sub_ctx(1); // NOP — just re-sets the active sub-context + + let payload = cmdbuf.as_slice(); + let payload_bytes = payload.len() * 4; + + let fence_id = state.next_fence_id; + state.next_fence_id += 1; + + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let hdr = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuCmdSubmit); + *hdr = VirtioGpuCmdSubmit { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SUBMIT_3D, + flags: VIRTIO_GPU_FLAG_FENCE, + fence_id, + ctx_id: VIRGL_CTX_ID, + padding: 0, + }, + size: payload_bytes as u32, + }; + + let buf_ptr = &raw mut PCI_3D_CMD_BUF; + let dst = (*buf_ptr).data.as_mut_ptr() as *mut u32; + core::ptr::copy_nonoverlapping(payload.as_ptr(), dst, payload.len()); + } + + let hdr_phys = virt_to_phys(&raw const PCI_CMD_BUF as u64); + let data_phys = virt_to_phys(&raw const PCI_3D_CMD_BUF as u64); + let resp_phys = virt_to_phys(&raw const PCI_RESP_BUF as u64); + let hdr_len = core::mem::size_of::() as u32; + let resp_len = core::mem::size_of::() as u32; + + unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + (*q).desc[0] = VirtqDesc { addr: hdr_phys, len: hdr_len, flags: DESC_F_NEXT, next: 1 }; + (*q).desc[1] = VirtqDesc { addr: data_phys, len: payload_bytes as u32, flags: DESC_F_NEXT, next: 2 }; + (*q).desc[2] = VirtqDesc { addr: resp_phys, len: resp_len, flags: DESC_F_WRITE, next: 0 }; + let idx = (*q).avail.idx; + (*q).avail.ring[(idx % 16) as usize] = 0; + fence(Ordering::SeqCst); + (*q).avail.idx = idx.wrapping_add(1); + fence(Ordering::SeqCst); + } + + state.device.notify_queue(0); + + // Spin-wait for response + let mut timeout = 10_000_000u32; + loop { + fence(Ordering::SeqCst); + let used_idx = unsafe { + let q = &raw const 
PCI_CTRL_QUEUE; + read_volatile(&(*q).used.idx) + }; + if used_idx != state.last_used_idx { + state.last_used_idx = used_idx; + break; + } + timeout -= 1; + if timeout == 0 { return Err("fence sync timeout"); } + core::hint::spin_loop(); + } + + // Check if the host reported our target fence as complete + let resp_fence = unsafe { + let resp_ptr = &raw const PCI_RESP_BUF; + let hdr = &*((*resp_ptr).data.as_ptr() as *const VirtioGpuCtrlHdr); + core::ptr::read_volatile(&hdr.fence_id) + }; + + if resp_fence >= target_fence_id { + return Ok(()); + } + } + + Err("fence sync: target fence never completed") +} + +/// Set scanout to a specific resource ID (used for 3D render targets). +fn set_scanout_resource(state: &mut GpuPciDeviceState, resource_id: u32) -> Result<(), &'static str> { + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuSetScanout); + *cmd = VirtioGpuSetScanout { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SET_SCANOUT, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + r_x: 0, + r_y: 0, + r_width: state.width, + r_height: state.height, + scanout_id: 0, + resource_id, + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32) +} + +// ============================================================================= +// Public API (2D) // ============================================================================= /// Flush the entire framebuffer to the display. @@ -797,7 +1692,7 @@ pub fn dimensions() -> Option<(u32, u32)> { } } -/// Get a mutable reference to the framebuffer pixels. +/// Get a mutable reference to the PCI_FRAMEBUFFER pixels. 
#[allow(dead_code)] pub fn framebuffer() -> Option<&'static mut [u8]> { unsafe { @@ -811,3 +1706,424 @@ pub fn framebuffer() -> Option<&'static mut [u8]> { } } } + +// ============================================================================= +// Public API (3D / VirGL) +// ============================================================================= + +/// Ball descriptor passed from userspace for GPU rendering. +#[repr(C)] +#[derive(Clone, Copy, Default)] +pub struct VirglBall { + /// X position in pixels (f32 bits) + pub x: f32, + /// Y position in pixels (f32 bits) + pub y: f32, + /// Radius in pixels (f32 bits) + pub radius: f32, + /// Color as [R, G, B, A] each 0.0-1.0 + pub color: [f32; 4], +} + +/// Render a frame with the VirGL GPU pipeline. +/// +/// Clears to background color, draws circles for each ball, submits to host +/// GPU, then issues RESOURCE_FLUSH to display the result. +pub fn virgl_render_frame( + balls: &[VirglBall], + bg_r: f32, + bg_g: f32, + bg_b: f32, +) -> Result<(), &'static str> { + use super::virgl::{CommandBuffer, pipe}; + + static FRAME_COUNT: core::sync::atomic::AtomicU32 = core::sync::atomic::AtomicU32::new(0); + let frame = FRAME_COUNT.fetch_add(1, core::sync::atomic::Ordering::Relaxed); + let verbose = frame < 3 || frame % 500 == 0; + if verbose { + crate::serial_println!("[virgl] render_frame #{} ({} balls)", frame, balls.len()); + } + + if !is_virgl_enabled() { + return Err("VirGL not enabled"); + } + + if !VIRGL_SCANOUT_ACTIVE.load(Ordering::Acquire) { + VIRGL_SCANOUT_ACTIVE.store(true, Ordering::Release); + crate::serial_println!("[virgl] first VirGL frame #{}", frame); + } + + let (width, height) = match dimensions() { + Some(d) => d, + None => return Err("GPU not initialized"), + }; + let fw = width as f32; + let fh = height as f32; + + let mut cmdbuf = CommandBuffer::new(); + if verbose { + crate::serial_println!("[virgl] frame #{}: cmdbuf created, setting FB state", frame); + } + + cmdbuf.set_sub_ctx(1); + + // 
Re-emit ALL pipeline state each frame. Mesa's virgl driver re-emits + // dirty state before every draw; Parallels may reset context state between + // SUBMIT_3D batches, so we must not assume anything persists. + cmdbuf.bind_shader(1, pipe::SHADER_VERTEX); + cmdbuf.bind_shader(2, pipe::SHADER_FRAGMENT); + cmdbuf.bind_object(1, super::virgl::OBJ_BLEND); + cmdbuf.bind_object(1, super::virgl::OBJ_DSA); + cmdbuf.bind_object(1, super::virgl::OBJ_RASTERIZER); + cmdbuf.bind_object(1, super::virgl::OBJ_VERTEX_ELEMENTS); + cmdbuf.set_viewport(fw, fh); + cmdbuf.set_framebuffer_state(0, &[1]); // surface_handle=1, no depth + + // Clear to background color + cmdbuf.clear_color(bg_r, bg_g, bg_b, 1.0); + + // For each ball, generate a triangle fan and draw it + let ball_count = balls.len().min(MAX_CIRCLES); + if verbose { + crate::serial_println!("[virgl] frame #{}: drawing {} balls (with full state re-emit)", frame, ball_count); + } + + for (i, ball) in balls[..ball_count].iter().enumerate() { + let cx = ball.x; + let cy = ball.y; + let r = ball.radius; + let [cr, cg, cb, ca] = ball.color; + + // Convert pixel coords to NDC: x_ndc = (2*x/width - 1), y_ndc = (1 - 2*y/height) + // VirGL with our viewport transform already maps clip coords to screen pixels, + // but the vertex shader outputs POSITION in clip space. With our viewport of + // (width/2, -height/2) scale + (width/2, height/2) translate, clip space + // [-1,1] maps to [0,width] and [1,-1] maps to [0,height]. 
+ let cx_ndc = 2.0 * cx / fw - 1.0; + let cy_ndc = 1.0 - 2.0 * cy / fh; + let rx_ndc = 2.0 * r / fw; + let ry_ndc = 2.0 * r / fh; + + // Build triangle fan: center + CIRCLE_SEGMENTS perimeter + 1 closing vertex + let mut verts = [0u32; VERTS_PER_CIRCLE * 8]; // 8 u32 per vertex (pos4 + col4) + + // Center vertex + verts[0] = cx_ndc.to_bits(); + verts[1] = cy_ndc.to_bits(); + verts[2] = 0f32.to_bits(); // z = 0 + verts[3] = 1.0f32.to_bits(); // w = 1 + verts[4] = cr.to_bits(); + verts[5] = cg.to_bits(); + verts[6] = cb.to_bits(); + verts[7] = ca.to_bits(); + + // Perimeter vertices + closing vertex + // Precomputed cos/sin for 16-segment circle (2π/16 = π/8 increments) + const COS_TABLE: [f32; 17] = [ + 1.0, 0.92388, 0.70711, 0.38268, 0.0, + -0.38268, -0.70711, -0.92388, -1.0, + -0.92388, -0.70711, -0.38268, 0.0, + 0.38268, 0.70711, 0.92388, 1.0, // closing = first + ]; + const SIN_TABLE: [f32; 17] = [ + 0.0, 0.38268, 0.70711, 0.92388, 1.0, + 0.92388, 0.70711, 0.38268, 0.0, + -0.38268, -0.70711, -0.92388, -1.0, + -0.92388, -0.70711, -0.38268, 0.0, // closing = first + ]; + for seg in 0..=CIRCLE_SEGMENTS { + let cos_a = COS_TABLE[seg]; + let sin_a = SIN_TABLE[seg]; + let vx = cx_ndc + rx_ndc * cos_a; + let vy = cy_ndc + ry_ndc * sin_a; + let base = (seg + 1) * 8; + verts[base] = vx.to_bits(); + verts[base + 1] = vy.to_bits(); + verts[base + 2] = 0f32.to_bits(); + verts[base + 3] = 1.0f32.to_bits(); + verts[base + 4] = cr.to_bits(); + verts[base + 5] = cg.to_bits(); + verts[base + 6] = cb.to_bits(); + verts[base + 7] = ca.to_bits(); + } + + let vb_offset = (i * VERTS_PER_CIRCLE * BYTES_PER_VERTEX) as u32; + let vb_bytes = (VERTS_PER_CIRCLE * BYTES_PER_VERTEX) as u32; + + // Upload vertex data inline + cmdbuf.resource_inline_write( + RESOURCE_VB_ID, + vb_offset, + vb_bytes, + &verts[..VERTS_PER_CIRCLE * 8], + ); + + // Bind vertex buffer with correct offset for this circle + cmdbuf.set_vertex_buffers(&[(BYTES_PER_VERTEX as u32, vb_offset, RESOURCE_VB_ID)]); + + // Draw 
triangle fan + cmdbuf.draw_vbo( + 0, // start = 0 (relative to VB offset) + VERTS_PER_CIRCLE as u32, // count + pipe::PRIM_TRIANGLE_FAN, + (VERTS_PER_CIRCLE - 1) as u32, // max_index + ); + } + + // Submit VirGL commands to host GPU + if verbose { + crate::serial_println!("[virgl] frame #{}: submitting {} DWORDs ({} bytes)", + frame, cmdbuf.as_slice().len(), cmdbuf.byte_len()); + } + match virgl_submit(cmdbuf.as_slice()) { + Ok(()) => { + if verbose { + crate::serial_println!("[virgl] frame #{}: SUBMIT_3D done", frame); + } + } + Err(e) => { + crate::serial_println!("[virgl] frame #{}: SUBMIT_3D FAILED: {}", frame, e); + return Err(e); + } + } + + // SET_SCANOUT only on first frame (scanout target doesn't change between frames). + // RESOURCE_FLUSH every frame to tell the hypervisor to re-scan the texture. + static SCANOUT_SET: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false); + match with_device_state(|state| { + if !SCANOUT_SET.load(core::sync::atomic::Ordering::Relaxed) { + set_scanout_resource(state, RESOURCE_3D_ID)?; + SCANOUT_SET.store(true, core::sync::atomic::Ordering::Relaxed); + } + resource_flush_3d(state, RESOURCE_3D_ID) + }) { + Ok(()) => {} + Err(e) => { + crate::serial_println!("[virgl] frame #{}: SET_SCANOUT/FLUSH FAILED: {}", frame, e); + return Err(e); + } + } + + Ok(()) +} + +/// Submit a VirGL command buffer for the active 3D context. +/// +/// `cmds` is a slice of u32 DWORDs from a VirGL CommandBuffer. +pub fn virgl_submit(cmds: &[u32]) -> Result<(), &'static str> { + with_device_state(|state| { + virgl_submit_3d_cmd(state, VIRGL_CTX_ID, cmds) + }) +} + +/// Copy PCI_3D_FRAMEBUFFER (RAM) → BAR0 (display memory). +/// +/// After TRANSFER_FROM_HOST_3D copies GPU-rendered pixels to PCI_3D_FRAMEBUFFER, +/// this copies them to BAR0 so they appear on screen. 
+#[allow(dead_code)] +fn copy_3d_framebuffer_to_bar0(width: u32, height: u32) { + let bar0_virt = crate::graphics::arm64_fb::gop_framebuffer(); + let fb_bytes = (width * height * 4) as usize; + if let Some(bar0) = bar0_virt { + let copy_len = fb_bytes.min(bar0.len()).min(FB_3D_SIZE); + unsafe { + let src = &raw const PCI_3D_FRAMEBUFFER; + core::ptr::copy_nonoverlapping( + (*src).pixels.as_ptr(), + bar0.as_mut_ptr(), + copy_len, + ); + } + } +} + +/// Flush the VirGL render target to the display. +/// SET_SCANOUT + RESOURCE_FLUSH — matching Linux's display path. +pub fn virgl_flush() -> Result<(), &'static str> { + if !is_virgl_enabled() { + return Err("VirGL display not available"); + } + with_device_state(|state| { + set_scanout_resource(state, RESOURCE_3D_ID)?; + resource_flush_3d(state, RESOURCE_3D_ID) + }) +} + +// ============================================================================= +// VirGL Initialization (Phase 3: proof-of-pipeline) +// ============================================================================= + +/// Initialize the VirGL 3D pipeline if VIRGL was negotiated. +/// +/// Creates a 3D rendering context, a render target resource matching the +/// scanout dimensions, submits a clear command, and sets scanout to the +/// 3D resource. If successful, the screen shows a solid color rendered +/// entirely by the host GPU — no BAR0 pixel writes. 
+pub fn virgl_init() -> Result<(), &'static str> { + use super::virgl::{CommandBuffer, format as vfmt, pipe}; + + crate::serial_println!("[virgl_init] VIRGL_ENABLED={} addr={:#x}", + VIRGL_ENABLED.load(Ordering::Acquire), + &VIRGL_ENABLED as *const _ as usize); + if !is_virgl_enabled() { + return Err("VirGL not supported"); + } + + crate::serial_println!("[virtio-gpu-pci] Initializing VirGL 3D pipeline (v10: separate backing + fence sync + capset init)..."); + + let (width, height) = dimensions().ok_or("GPU not initialized")?; + + // Step 1: Create 3D context + with_device_state(|state| { + virgl_ctx_create_cmd(state, VIRGL_CTX_ID, b"breenix") + })?; + crate::serial_println!("[virgl] Step 1: context created (ctx_id={})", VIRGL_CTX_ID); + + // Step 2: Create 3D resource with bind flags matching Linux Mesa/virgl. + // Linux strace shows bind=0x0014000a = RENDER_TARGET|SAMPLER_VIEW|SCANOUT|SHARED. + // CRITICAL: Must use B8G8R8X8_UNORM (XRGB8888) — ARGB8888 causes EINVAL. + let bind_flags = pipe::BIND_RENDER_TARGET | pipe::BIND_SAMPLER_VIEW + | pipe::BIND_SCANOUT | pipe::BIND_SHARED; + with_device_state(|state| { + virgl_resource_create_3d_cmd( + state, + RESOURCE_3D_ID, + pipe::TEXTURE_2D, + vfmt::B8G8R8X8_UNORM, + bind_flags, + width, + height, + 1, // depth + 1, // array_size + ) + })?; + crate::serial_println!("[virgl] Step 2: 3D resource created (id={}, {}x{}, B8G8R8X8_UNORM, bind=0x{:08x})", RESOURCE_3D_ID, width, height, bind_flags); + + // Step 3: Attach SEPARATE backing memory (PCI_3D_FRAMEBUFFER, NOT shared with 2D resource) + with_device_state(|state| { + virgl_attach_backing_cmd(state, RESOURCE_3D_ID) + })?; + crate::serial_println!("[virgl] Step 3: separate backing attached"); + + // Step 4: Attach 3D resource to VirGL context + with_device_state(|state| { + virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, RESOURCE_3D_ID) + })?; + crate::serial_println!("[virgl] Step 4: 3D resource attached to context"); + + // Step 5: Create sub-context + pipeline 
state objects + let mut cmdbuf = CommandBuffer::new(); + cmdbuf.create_sub_ctx(1); + cmdbuf.set_sub_ctx(1); + + let vs_text = b"VERT\nDCL IN[0], POSITION\nDCL IN[1], GENERIC[0]\nDCL OUT[0], POSITION\nDCL OUT[1], GENERIC[0]\n 0: MOV OUT[0], IN[0]\n 1: MOV OUT[1], IN[1]\n 2: END\n"; + cmdbuf.create_shader(1, pipe::SHADER_VERTEX, vs_text); + let fs_text = b"FRAG\nDCL IN[0], GENERIC[0], PERSPECTIVE\nDCL OUT[0], COLOR\n 0: MOV OUT[0], IN[0]\n 1: END\n"; + cmdbuf.create_shader(2, pipe::SHADER_FRAGMENT, fs_text); + + cmdbuf.create_blend_simple(1); + cmdbuf.create_dsa_disabled(1); + cmdbuf.create_rasterizer_default(1); + cmdbuf.create_vertex_elements(1, &[ + (0, 0, 0, vfmt::R32G32B32A32_FLOAT), + (16, 0, 0, vfmt::R32G32B32A32_FLOAT), + ]); + + virgl_submit(cmdbuf.as_slice())?; + crate::serial_println!("[virgl] Step 5: pipeline state created"); + + // Step 6: Bind state, create surface on 3D resource, clear to cornflower blue + cmdbuf.clear(); + cmdbuf.set_sub_ctx(1); + cmdbuf.bind_shader(1, pipe::SHADER_VERTEX); + cmdbuf.bind_shader(2, pipe::SHADER_FRAGMENT); + cmdbuf.bind_object(1, super::virgl::OBJ_BLEND); + cmdbuf.bind_object(1, super::virgl::OBJ_DSA); + cmdbuf.bind_object(1, super::virgl::OBJ_RASTERIZER); + cmdbuf.bind_object(1, super::virgl::OBJ_VERTEX_ELEMENTS); + cmdbuf.set_viewport(width as f32, height as f32); + + let surface_handle = 1u32; + cmdbuf.create_surface(surface_handle, RESOURCE_3D_ID, vfmt::B8G8R8X8_UNORM, 0, 0); + cmdbuf.set_framebuffer_state(0, &[surface_handle]); + cmdbuf.clear_color(0.392, 0.584, 0.929, 1.0); + + virgl_submit(cmdbuf.as_slice())?; + crate::serial_println!("[virgl] Step 6: VirGL clear submitted to host GPU"); + + // Step 7: Parallels processes SUBMIT_3D synchronously — used ring completion + // means the GPU work is done. No fence sync needed (Parallels returns + // resp_fence=0 for all responses, so virgl_fence_sync doesn't work). 
+ crate::serial_println!("[virgl] Step 7: SUBMIT_3D sync completed (Parallels processes synchronously)"); + + // Step 8: SKIPPED — Green pixel fill removed. + // PCI_3D_FRAMEBUFFER in BSS overlaps with the Parallels boot stack at phys + // 0x42000000. Writing 7.5MB of pixel data overwrites the stack frames and + // corrupts return addresses. The VirGL clear in Step 6 already put cornflower + // blue in the host GPU texture. SET_SCANOUT + RESOURCE_FLUSH should display + // that if 3D resource scanout works on Parallels. + crate::serial_println!("[virgl] Step 8: skipped green fill (BSS overlaps Parallels boot stack)"); + + // Step 9: Switch display to 3D resource. + // First disable current scanout (resource_id=0), then enable with 3D resource. + // This mimics Linux DRM modesetting which does a full scanout reconfiguration. + with_device_state(|state| { + // Disable current scanout + crate::serial_println!("[virgl] Step 9: disabling current scanout (resource_id=0)..."); + unsafe { + let cmd_ptr = &raw mut PCI_CMD_BUF; + let cmd = &mut *((*cmd_ptr).data.as_mut_ptr() as *mut VirtioGpuSetScanout); + *cmd = VirtioGpuSetScanout { + hdr: VirtioGpuCtrlHdr { + type_: cmd::SET_SCANOUT, + flags: 0, + fence_id: 0, + ctx_id: 0, + padding: 0, + }, + r_x: 0, + r_y: 0, + r_width: 0, + r_height: 0, + scanout_id: 0, + resource_id: 0, // disable scanout + }; + } + send_command_expect_ok(state, core::mem::size_of::() as u32)?; + crate::serial_println!("[virgl] Step 9: scanout disabled"); + + // Enable scanout with 3D resource + set_scanout_resource(state, RESOURCE_3D_ID)?; + crate::serial_println!("[virgl] Step 9: scanout set to 3D resource (id={})", RESOURCE_3D_ID); + + resource_flush_3d(state, RESOURCE_3D_ID)?; + crate::serial_println!("[virgl] Step 9: RESOURCE_FLUSH done"); + Ok(()) + })?; + + // Step 10: VirGL clear rendered cornflower blue to host texture. + // SET_SCANOUT (Step 9) pointed display at the 3D resource. 
+ crate::serial_println!("[virgl] Step 10: display configured (cornflower blue if SET_SCANOUT works)"); + + // Step 12: Create vertex buffer resource + with_device_state(|state| { + virgl_resource_create_3d_cmd( + state, + RESOURCE_VB_ID, + pipe::BUFFER, + vfmt::R8G8B8A8_UNORM, + pipe::BIND_VERTEX_BUFFER, + VB_SIZE as u32, + 1, 1, 1, + ) + })?; + with_device_state(|state| { + virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, RESOURCE_VB_ID) + })?; + crate::serial_println!("[virgl] Step 12: vertex buffer created (id={}, {}B)", RESOURCE_VB_ID, VB_SIZE); + + crate::serial_println!("[virgl] VirGL 3D pipeline initialized (v10b: TRANSFER_TO_HOST_3D green test)"); + crate::serial_println!("[virgl_init] END: VIRGL_ENABLED={}", VIRGL_ENABLED.load(Ordering::Acquire)); + + Ok(()) +} diff --git a/kernel/src/drivers/virtio/mod.rs b/kernel/src/drivers/virtio/mod.rs index 7e5b8075..38ded38d 100644 --- a/kernel/src/drivers/virtio/mod.rs +++ b/kernel/src/drivers/virtio/mod.rs @@ -39,6 +39,8 @@ pub mod input_mmio; pub mod sound_mmio; #[cfg(target_arch = "aarch64")] pub mod gpu_pci; +#[cfg(target_arch = "aarch64")] +pub mod virgl; #[cfg(target_arch = "x86_64")] pub mod sound; diff --git a/kernel/src/drivers/virtio/pci_transport.rs b/kernel/src/drivers/virtio/pci_transport.rs index 7e3f0f97..e1bb0b1e 100644 --- a/kernel/src/drivers/virtio/pci_transport.rs +++ b/kernel/src/drivers/virtio/pci_transport.rs @@ -184,6 +184,9 @@ pub struct VirtioPciDevice { device_features: u64, /// VirtIO device type ID virtio_device_id: u32, + /// Cached queue notify virtual addresses (avoids 2 MMIO reads per notify). + /// Populated by `cache_queue_notify_addr()` after queue setup. 
+ cached_notify_addrs: [u64; 4], } impl VirtioPciDevice { @@ -294,6 +297,7 @@ impl VirtioPciDevice { device_cfg, device_features: 0, virtio_device_id, + cached_notify_addrs: [0; 4], }) } @@ -478,6 +482,35 @@ impl VirtioPciDevice { } } + /// Cache the notify address for a queue so `notify_queue_fast()` avoids + /// the two MMIO reads (COMMON_Q_SELECT + COMMON_Q_NOFF) on every notify. + /// + /// Must be called after `set_queue_ready(true)` for the given queue. + pub fn cache_queue_notify_addr(&mut self, queue: u32) { + if queue as usize >= self.cached_notify_addrs.len() { + return; + } + self.select_queue(queue); + let queue_notify_off = self.common.read_u16(COMMON_Q_NOFF) as u32; + let offset = (queue_notify_off * self.notify_off_multiplier) as u64; + self.cached_notify_addrs[queue as usize] = self.notify.virt_base + offset; + } + + /// Notify device using the cached notify address — single MMIO write. + /// + /// Falls back to `notify_queue()` if the address hasn't been cached. + #[inline(always)] + pub fn notify_queue_fast(&self, queue: u32) { + let idx = queue as usize; + if idx < self.cached_notify_addrs.len() && self.cached_notify_addrs[idx] != 0 { + unsafe { + core::ptr::write_volatile(self.cached_notify_addrs[idx] as *mut u16, queue as u16); + } + } else { + self.notify_queue(queue); + } + } + // ========================================================================= // Interrupt Handling // ========================================================================= @@ -526,6 +559,14 @@ impl VirtioPciDevice { self.device_cfg.read_u32(offset) } + /// Write a u32 to device-specific configuration. + pub fn write_config_u32(&self, offset: usize, value: u32) { + if !self.device_cfg.is_valid() { + return; + } + self.device_cfg.write_u32(offset, value); + } + /// Read a u64 from device-specific configuration (two u32 reads). 
pub fn read_config_u64(&self, offset: usize) -> u64 { let low = self.read_config_u32(offset) as u64; diff --git a/kernel/src/drivers/virtio/virgl.rs b/kernel/src/drivers/virtio/virgl.rs new file mode 100644 index 00000000..a8316687 --- /dev/null +++ b/kernel/src/drivers/virtio/virgl.rs @@ -0,0 +1,396 @@ +//! VirGL Command Encoder +//! +//! Encodes Gallium3D-style commands into the VirGL wire format for submission +//! via VirtIO GPU SUBMIT_3D. Each command is a sequence of 32-bit DWORDs with +//! a header encoding `(length << 16 | subcmd << 8 | object_type)`. + +// ============================================================================= +// VirGL Command Types (VIRGL_CCMD_*) +// ============================================================================= + +#[allow(dead_code)] +mod ccmd { + pub const NOP: u8 = 0; + pub const CREATE_OBJECT: u8 = 1; + pub const BIND_OBJECT: u8 = 2; + pub const DESTROY_OBJECT: u8 = 3; + pub const SET_VIEWPORT_STATE: u8 = 4; + pub const SET_FRAMEBUFFER_STATE: u8 = 5; + pub const SET_VERTEX_BUFFERS: u8 = 6; + pub const CLEAR: u8 = 7; + pub const DRAW_VBO: u8 = 8; + pub const RESOURCE_INLINE_WRITE: u8 = 9; + pub const SET_SAMPLER_VIEWS: u8 = 10; + pub const SET_INDEX_BUFFER: u8 = 11; + pub const SET_CONSTANT_BUFFER: u8 = 12; + pub const SET_STENCIL_REF: u8 = 13; + pub const SET_BLEND_COLOR: u8 = 14; + pub const SET_SCISSOR_STATE: u8 = 15; + pub const BLIT: u8 = 16; + pub const RESOURCE_COPY_REGION: u8 = 17; + pub const BIND_SAMPLER_STATES: u8 = 18; + pub const SET_SUB_CTX: u8 = 28; + pub const CREATE_SUB_CTX: u8 = 29; + pub const DESTROY_SUB_CTX: u8 = 30; + pub const BIND_SHADER: u8 = 31; +} + +// ============================================================================= +// VirGL Object Types (VIRGL_OBJECT_*) +// ============================================================================= + +#[allow(dead_code)] +mod obj { + pub const NULL: u8 = 0; + pub const BLEND: u8 = 1; + pub const RASTERIZER: u8 = 2; + pub const DSA: u8 = 
3; + pub const SHADER: u8 = 4; + pub const VERTEX_ELEMENTS: u8 = 5; + pub const SAMPLER_VIEW: u8 = 6; + pub const SAMPLER_STATE: u8 = 7; + pub const SURFACE: u8 = 8; + pub const QUERY: u8 = 9; + pub const STREAMOUT_TARGET: u8 = 10; +} + +// Public re-exports for bind_object() callers in gpu_pci.rs +pub const OBJ_BLEND: u8 = obj::BLEND; +pub const OBJ_DSA: u8 = obj::DSA; +pub const OBJ_RASTERIZER: u8 = obj::RASTERIZER; +pub const OBJ_VERTEX_ELEMENTS: u8 = obj::VERTEX_ELEMENTS; + +// ============================================================================= +// VirGL Format Constants (matches Gallium PIPE_FORMAT_*) +// ============================================================================= + +#[allow(dead_code)] +pub mod format { + pub const B8G8R8A8_UNORM: u32 = 1; + pub const B8G8R8X8_UNORM: u32 = 2; + pub const R8G8B8A8_UNORM: u32 = 67; + pub const R32_FLOAT: u32 = 28; + pub const R32G32_FLOAT: u32 = 29; + pub const R32G32B32_FLOAT: u32 = 30; + pub const R32G32B32A32_FLOAT: u32 = 31; +} + +// ============================================================================= +// Pipe Constants +// ============================================================================= + +#[allow(dead_code)] +pub mod pipe { + // Texture targets + pub const BUFFER: u32 = 0; + pub const TEXTURE_2D: u32 = 2; + + // Bind flags + pub const BIND_DEPTH_STENCIL: u32 = 1 << 0; + pub const BIND_RENDER_TARGET: u32 = 1 << 1; + pub const BIND_SAMPLER_VIEW: u32 = 1 << 3; + pub const BIND_VERTEX_BUFFER: u32 = 1 << 4; + pub const BIND_INDEX_BUFFER: u32 = 1 << 5; + pub const BIND_CONSTANT_BUFFER: u32 = 1 << 6; + pub const BIND_SCANOUT: u32 = 1 << 18; + pub const BIND_SHARED: u32 = 1 << 20; + + // Clear buffer flags + pub const CLEAR_DEPTH: u32 = 0x01; + pub const CLEAR_STENCIL: u32 = 0x02; + pub const CLEAR_COLOR0: u32 = 0x04; + + // Primitive types + pub const PRIM_POINTS: u32 = 0; + pub const PRIM_LINES: u32 = 1; + pub const PRIM_TRIANGLES: u32 = 4; + pub const PRIM_TRIANGLE_STRIP: u32 = 
5; + pub const PRIM_TRIANGLE_FAN: u32 = 6; + + // Shader types + pub const SHADER_VERTEX: u32 = 0; + pub const SHADER_FRAGMENT: u32 = 1; +} + +// ============================================================================= +// Command Buffer +// ============================================================================= + +/// Fixed-capacity VirGL command buffer. Accumulates u32 DWORDs for submission +/// via VIRTIO_GPU_CMD_SUBMIT_3D. +pub struct CommandBuffer { + data: [u32; 3072], // 12KB — large enough for 12 circle draws with inline vertex data + len: usize, +} + +impl CommandBuffer { + pub const fn new() -> Self { + Self { + data: [0u32; 3072], + len: 0, + } + } + + /// Reset the buffer for reuse. + pub fn clear(&mut self) { + self.len = 0; + } + + /// Get the command data as a u32 slice. + pub fn as_slice(&self) -> &[u32] { + &self.data[..self.len] + } + + /// Size in bytes. + pub fn byte_len(&self) -> usize { + self.len * 4 + } + + /// Push a single DWORD. + fn push(&mut self, val: u32) { + if self.len < self.data.len() { + self.data[self.len] = val; + self.len += 1; + } + } + + /// Push a slice of DWORDs. + fn push_slice(&mut self, vals: &[u32]) { + for &v in vals { + self.push(v); + } + } + + // ========================================================================= + // Command Encoders + // ========================================================================= + + /// Encode a VirGL command header. + #[inline] + fn cmd0(cmd: u8, obj: u8, len: u16) -> u32 { + (cmd as u32) | ((obj as u32) << 8) | ((len as u32) << 16) + } + + /// Create a sub-context. + pub fn create_sub_ctx(&mut self, sub_ctx_id: u32) { + self.push(Self::cmd0(ccmd::CREATE_SUB_CTX, 0, 1)); + self.push(sub_ctx_id); + } + + /// Set the active sub-context. + pub fn set_sub_ctx(&mut self, sub_ctx_id: u32) { + self.push(Self::cmd0(ccmd::SET_SUB_CTX, 0, 1)); + self.push(sub_ctx_id); + } + + /// Create a surface object wrapping a resource. 
+ pub fn create_surface(&mut self, handle: u32, res_handle: u32, fmt: u32, level: u32, layers: u32) { + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::SURFACE, 5)); + self.push(handle); + self.push(res_handle); + self.push(fmt); + self.push(level); + self.push(layers); // first_layer | (last_layer << 16) + } + + /// Create a blend state (no blending, write all color channels). + pub fn create_blend_simple(&mut self, handle: u32) { + // len=11: handle + S0 + S1 + S2[0..7] + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::BLEND, 11)); + self.push(handle); + self.push(0); // S0: no special features + self.push(0); // S1: logicop_func = 0 + // S2[0]: colormask=0xF (write RGBA), blend disabled + self.push(0xF << 27); + // S2[1..7]: unused render targets + for _ in 0..7 { + self.push(0); + } + } + + /// Create a depth-stencil-alpha state (all disabled). + pub fn create_dsa_disabled(&mut self, handle: u32) { + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::DSA, 5)); + self.push(handle); + self.push(0); // S0: depth/alpha disabled + self.push(0); // S1: front stencil disabled + self.push(0); // S2: back stencil disabled + self.push(0); // alpha_ref = 0.0 + } + + /// Create a basic rasterizer state (fill mode, depth clip, half-pixel center). + pub fn create_rasterizer_default(&mut self, handle: u32) { + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::RASTERIZER, 9)); + self.push(handle); + // S0: depth_clip(1<<1) | fill_front=FILL(0<<10) | fill_back=FILL(0<<12) | half_pixel_center(1<<29) + // PIPE_POLYGON_MODE: FILL=0, LINE=1, POINT=2. Fill fields are 0 so omitted. + self.push((1 << 1) | (1 << 29)); + self.push(0x3F800000u32); // point_size = 1.0f + self.push(0); // sprite_coord_enable + self.push(0); // S3 + self.push(0x3F800000u32); // line_width = 1.0f + self.push(0); // offset_units + self.push(0); // offset_scale + self.push(0); // offset_clamp + } + + /// Bind an object by type and handle. 
+ pub fn bind_object(&mut self, handle: u32, obj_type: u8) { + self.push(Self::cmd0(ccmd::BIND_OBJECT, obj_type, 1)); + self.push(handle); + } + + /// Create a shader from TGSI text. + pub fn create_shader(&mut self, handle: u32, shader_type: u32, tgsi_text: &[u8]) { + let text_len = tgsi_text.len() + 1; // include null terminator + let text_dwords = (text_len + 3) / 4; + // Header DWORDs: handle, type, offset, num_tokens, num_so_outputs = 5 + let payload_len = 5 + text_dwords; + + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::SHADER, payload_len as u16)); + self.push(handle); + self.push(shader_type); + self.push(1 << 31); // offset = 0, bit 31 = 1 (last chunk — triggers compilation) + self.push(text_len as u32); // byte length of TGSI text including null + self.push(0); // num_so_outputs = 0 + + // Pack TGSI text bytes into DWORDs (little-endian) + let mut i = 0; + while i < text_dwords { + let base = i * 4; + let mut dword = 0u32; + for b in 0..4 { + if base + b < tgsi_text.len() { + dword |= (tgsi_text[base + b] as u32) << (b * 8); + } + // else: zero (null terminator / padding) + } + self.push(dword); + i += 1; + } + } + + /// Bind a shader by handle and type. + pub fn bind_shader(&mut self, handle: u32, shader_type: u32) { + self.push(Self::cmd0(ccmd::BIND_SHADER, 0, 2)); + self.push(handle); + self.push(shader_type); + } + + /// Set framebuffer state (nr_cbufs color buffer surface handles, optional depth surface). + pub fn set_framebuffer_state(&mut self, zsurf_handle: u32, cbuf_handles: &[u32]) { + let nr_cbufs = cbuf_handles.len() as u32; + self.push(Self::cmd0(ccmd::SET_FRAMEBUFFER_STATE, 0, (nr_cbufs + 2) as u16)); + self.push(nr_cbufs); + self.push(zsurf_handle); + for &h in cbuf_handles { + self.push(h); + } + } + + /// Set viewport state for one viewport. 
+ pub fn set_viewport(&mut self, width: f32, height: f32) { + self.push(Self::cmd0(ccmd::SET_VIEWPORT_STATE, 0, 7)); + self.push(0); // start_slot = 0 + self.push(f32_bits(width / 2.0)); // scale_x + self.push(f32_bits(-height / 2.0)); // scale_y (negative for GL convention) + self.push(f32_bits(0.5)); // scale_z + self.push(f32_bits(width / 2.0)); // translate_x + self.push(f32_bits(height / 2.0)); // translate_y + self.push(f32_bits(0.5)); // translate_z + } + + /// Clear the framebuffer. + /// Color values are f32 (0.0-1.0), reinterpreted as u32 bits. + pub fn clear_color(&mut self, r: f32, g: f32, b: f32, a: f32) { + self.push(Self::cmd0(ccmd::CLEAR, 0, 8)); + self.push(pipe::CLEAR_COLOR0); // buffers = clear color only + self.push(f32_bits(r)); + self.push(f32_bits(g)); + self.push(f32_bits(b)); + self.push(f32_bits(a)); + // depth as f64 split into two u32s (0.0) + self.push(0); + self.push(0); + self.push(0); // stencil + } + + /// Create vertex elements describing vertex layout. + /// Each element: (src_offset, instance_divisor, vertex_buffer_index, src_format) + pub fn create_vertex_elements(&mut self, handle: u32, elements: &[(u32, u32, u32, u32)]) { + let len = 4 * elements.len() + 1; + self.push(Self::cmd0(ccmd::CREATE_OBJECT, obj::VERTEX_ELEMENTS, len as u16)); + self.push(handle); + for &(offset, divisor, vb_index, fmt) in elements { + self.push(offset); + self.push(divisor); + self.push(vb_index); + self.push(fmt); + } + } + + /// Set vertex buffers: (stride, offset, resource_handle) per buffer. + /// VirGL protocol: payload is just [stride, offset, handle] * N. Host infers + /// num_buffers from length / 3. No start_slot field — buffers bind from slot 0. 
+ pub fn set_vertex_buffers(&mut self, buffers: &[(u32, u32, u32)]) { + let len = 3 * buffers.len(); + self.push(Self::cmd0(ccmd::SET_VERTEX_BUFFERS, 0, len as u16)); + for &(stride, offset, res_handle) in buffers { + self.push(stride); + self.push(offset); + self.push(res_handle); + } + } + + /// Inline write data into a resource (upload vertex/index data). + pub fn resource_inline_write( + &mut self, + res_handle: u32, + x: u32, w: u32, + data: &[u32], + ) { + let len = 11 + data.len(); + self.push(Self::cmd0(ccmd::RESOURCE_INLINE_WRITE, 0, len as u16)); + self.push(res_handle); + self.push(0); // level + self.push(0); // usage + self.push(0); // stride (0 for buffers) + self.push(0); // layer_stride + self.push(x); // x offset in bytes + self.push(0); // y + self.push(0); // z + self.push(w); // width in bytes + self.push(1); // h + self.push(1); // d + self.push_slice(data); + } + + /// Draw primitives. + pub fn draw_vbo( + &mut self, + start: u32, + count: u32, + mode: u32, + max_index: u32, + ) { + self.push(Self::cmd0(ccmd::DRAW_VBO, 0, 12)); + self.push(start); + self.push(count); + self.push(mode); + self.push(0); // indexed = false + self.push(1); // instance_count + self.push(0); // start_instance + self.push(0); // index_bias + self.push(0); // min_index + self.push(max_index); + self.push(0); // primitive_restart = disabled + self.push(0); // restart_index + self.push(0); // count_from_so + } +} + +/// Reinterpret f32 as u32 bits (IEEE 754). +#[inline] +fn f32_bits(f: f32) -> u32 { + f.to_bits() +} diff --git a/kernel/src/graphics/arm64_fb.rs b/kernel/src/graphics/arm64_fb.rs index 3f26cbb4..cd10e4df 100644 --- a/kernel/src/graphics/arm64_fb.rs +++ b/kernel/src/graphics/arm64_fb.rs @@ -132,6 +132,12 @@ pub fn flush_dirty_rect(x: u32, y: u32, w: u32, h: u32) -> Result<(), &'static s // CPU's write buffer is drained so stores are visible to the display // controller. 
Parallels scans BAR0 at its own refresh rate — no VirtIO // RESOURCE_FLUSH needed (it's synchronous and would add 10-50ms). + // + // VirtIO DMA (TRANSFER_TO_HOST_2D + RESOURCE_FLUSH) was benchmarked + // and is slower than direct BAR0 writes on Parallels: + // - Per-ball VirtIO DMA: 5-7 FPS (28 VirtIO round-trips/frame) + // - Full-pane VirtIO DMA: 4-8 FPS (2 round-trips but ~7.5MB DMA) + // - Direct BAR0 MMIO: 12 FPS (14 small per-ball copies, ~340KB total) unsafe { core::arch::asm!("dsb sy", options(nostack, preserves_flags)); } Ok(()) } else if crate::drivers::virtio::gpu_pci::is_initialized() { @@ -864,6 +870,19 @@ pub fn upgrade_to_double_buffer() { } } +/// Check if the double buffer (shadow buffer in cached RAM) is initialized. +/// +/// Used by the syscall flush path to decide whether to use the async path +/// (copy mmap → shadow, wake render thread) or fall back to direct BAR0 write. +pub fn has_double_buffer() -> bool { + if let Some(fb) = SHELL_FRAMEBUFFER.get() { + if let Some(guard) = fb.try_lock() { + return guard.double_buffer.is_some(); + } + } + false +} + /// Get the framebuffer dimensions pub fn dimensions() -> Option<(usize, usize)> { SHELL_FRAMEBUFFER.get().and_then(|fb| { diff --git a/kernel/src/platform_config.rs b/kernel/src/platform_config.rs index 7bdecb51..f4fd41ce 100644 --- a/kernel/src/platform_config.rs +++ b/kernel/src/platform_config.rs @@ -10,6 +10,10 @@ #[cfg(target_arch = "aarch64")] use core::sync::atomic::{AtomicU64, AtomicU8, Ordering}; +/// Next GICv2m SPI index to allocate (offset from GICV2M_SPI_BASE). 
+#[cfg(target_arch = "aarch64")] +static GICV2M_NEXT_SPI: AtomicU64 = AtomicU64::new(0); + // ============================================================================= // Hardware address atomics with QEMU virt defaults // ============================================================================= @@ -68,14 +72,14 @@ static XHCI_HCRST_DONE: AtomicU64 = AtomicU64::new(0); // Memory layout defaults (QEMU virt, 512MB RAM at 0x40000000) // Kernel image: 0x4000_0000 - 0x4100_0000 (16 MB) -// Per-CPU stacks: 0x4100_0000 - 0x4200_0000 (16 MB) -// Frame alloc: 0x4200_0000 - 0x5000_0000 (224 MB) +// BSS (incl FBs): 0x4100_0000 - 0x4300_0000 (32 MB, includes 7.5MB PCI_3D_FRAMEBUFFER) +// Frame alloc: 0x4300_0000 - 0x5000_0000 (208 MB) // DMA (NC): 0x5000_0000 - 0x501F_FFFF (2 MB, Non-Cacheable for xHCI) // Heap: 0x5020_0000 - 0x51FF_FFFF (30 MB) // Kernel stacks: 0x5200_0000 - 0x5400_0000 (32 MB) #[cfg(target_arch = "aarch64")] -static FRAME_ALLOC_START: AtomicU64 = AtomicU64::new(0x4200_0000); +static FRAME_ALLOC_START: AtomicU64 = AtomicU64::new(0x4300_0000); #[cfg(target_arch = "aarch64")] static FRAME_ALLOC_END: AtomicU64 = AtomicU64::new(0x5000_0000); @@ -220,6 +224,30 @@ pub fn probe_gicv2m(phys_base: u64) -> bool { true } +/// Allocate the next available GICv2m MSI SPI. +/// +/// Returns the SPI number (GIC INTID) for use with `configure_msi()` and +/// `gic::enable_spi()`. Returns 0 if GICv2m has not been probed or all +/// SPIs have been allocated. +/// +/// Thread-safe: uses atomic fetch_add so multiple drivers (xHCI, GPU, etc.) +/// can allocate SPIs without collision. 
+#[cfg(target_arch = "aarch64")] +pub fn allocate_msi_spi() -> u32 { + let base = GICV2M_SPI_BASE.load(Ordering::Relaxed); + let count = GICV2M_SPI_COUNT.load(Ordering::Relaxed); + if base == 0 || count == 0 { + return 0; + } + let idx = GICV2M_NEXT_SPI.fetch_add(1, Ordering::Relaxed); + if idx >= count { + // Roll back — no SPI available + GICV2M_NEXT_SPI.fetch_sub(1, Ordering::Relaxed); + return 0; + } + (base + idx) as u32 +} + /// PCI ECAM physical base address. 0 if no PCI. #[cfg(target_arch = "aarch64")] #[inline] @@ -480,8 +508,10 @@ pub fn init_from_parallels(config: &HardwareConfig) -> bool { } if best_size > 0 { - // Frame allocator starts after kernel + stacks (32 MB from RAM base) - let fa_start = best_base + 0x0200_0000; // +32 MB + // Frame allocator starts after kernel + BSS (48 MB from RAM base). + // BSS includes PCI_3D_FRAMEBUFFER (~7.5 MB) and kernel stacks, so + // the total image + BSS exceeds 32 MB. 48 MB gives margin for growth. + let fa_start = best_base + 0x0300_0000; // +48 MB // Frame allocator must end BEFORE the DMA NC region. // The .dma section starts at physical 0x5000_0000, so cap fa_end there. 
let fa_end = (best_base + best_size).min(0x5000_0000); diff --git a/kernel/src/signal/delivery.rs b/kernel/src/signal/delivery.rs index 6dd3711e..a336a70f 100644 --- a/kernel/src/signal/delivery.rs +++ b/kernel/src/signal/delivery.rs @@ -202,9 +202,10 @@ pub enum DeliverResult { fn deliver_default_action(process: &mut Process, sig: u32) -> DeliverResult { match default_action(sig) { SignalDefaultAction::Terminate => { - log::info!( - "Process {} terminated by signal {} ({})", + crate::serial_println!( + "[signal] Process {} ({}) terminated by signal {} ({})", process.id.as_u64(), + process.name, sig, signal_name(sig) ); @@ -235,9 +236,10 @@ fn deliver_default_action(process: &mut Process, sig: u32) -> DeliverResult { } } SignalDefaultAction::CoreDump => { - log::info!( - "Process {} killed (core dumped) by signal {} ({})", + crate::serial_println!( + "[signal] Process {} ({}) killed (core dump) by signal {} ({})", process.id.as_u64(), + process.name, sig, signal_name(sig) ); diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs index 3e4f3f40..d3cffc27 100644 --- a/kernel/src/syscall/graphics.rs +++ b/kernel/src/syscall/graphics.rs @@ -163,6 +163,10 @@ pub enum FbDrawOp { DrawLine = 5, /// Flush the framebuffer (for double-buffering) Flush = 6, + /// Submit a VirGL GPU-rendered frame (balls array + background color) + VirglSubmitFrame = 7, + /// Batch flush multiple dirty rects with one DSB barrier + FlushBatch = 8, } /// Draw command structure passed from userspace. @@ -549,40 +553,58 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult { let row_bytes = mmap_info.width * mmap_info.bpp; let x_byte_offset = mmap_info.x_offset * mmap_info.bpp; - // For GOP mode with double buffer, copy user → BAR0 directly - // (single copy) instead of user → shadow → BAR0 (double copy). - // The NC memory mapping makes BAR0 writes fast via write-combining. - // Also update the shadow buffer so terminal reads stay consistent. 
+ // When a dirty rect is specified, only copy the dirty columns + // instead of the full mmap width. For per-ball flushes this + // reduces the copy from ~3.4KB/row to ~336 bytes/row. + let (user_col_offset, shadow_col_offset, copy_row_bytes) = if has_rect { + let col_start = (cmd.p1.max(0) as usize).min(mmap_info.width); + let col_end = (cmd.p1.max(0) as usize + cmd.p3 as usize).min(mmap_info.width); + ( + col_start * mmap_info.bpp, + x_byte_offset + col_start * mmap_info.bpp, + (col_end - col_start) * mmap_info.bpp, + ) + } else { + (0, x_byte_offset, row_bytes) + }; + if crate::graphics::arm64_fb::is_gop_active() { + // GOP synchronous path: copy mmap → BAR0 directly with + // partial column copy. Each per-ball flush writes only + // ~27KB to BAR0 instead of the full bounding box (~3.7MB). + // Also update shadow buffer for consistency with terminal text. + // + // VirtIO DMA (PCI_FRAMEBUFFER → TRANSFER_TO_HOST_2D) was + // benchmarked and is slower: 5-7 FPS per-ball, 4-8 FPS + // full-pane, vs 12 FPS with direct BAR0 MMIO. 
if let Some(gop_buf) = crate::graphics::arm64_fb::gop_framebuffer() { for y in y_start..y_end { - let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride; - let target_row_offset = y * fb_stride_bytes + x_byte_offset; - - if target_row_offset + row_bytes <= gop_buf.len() { + let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride + user_col_offset; + let target_row_offset = y * fb_stride_bytes + shadow_col_offset; + if target_row_offset + copy_row_bytes <= gop_buf.len() { unsafe { core::ptr::copy_nonoverlapping( user_row_ptr as *const u8, gop_buf[target_row_offset..].as_mut_ptr(), - row_bytes, + copy_row_bytes, ); } } } - // Also update shadow buffer to keep it consistent - if let Some(db) = fb_guard.double_buffer_mut() { - let shadow = db.buffer_mut(); - for y in y_start..y_end { - let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride; - let target_row_offset = y * fb_stride_bytes + x_byte_offset; - if target_row_offset + row_bytes <= shadow.len() { - unsafe { - core::ptr::copy_nonoverlapping( - user_row_ptr as *const u8, - shadow[target_row_offset..].as_mut_ptr(), - row_bytes, - ); - } + } + // Update shadow buffer so terminal reads stay consistent + if let Some(db) = fb_guard.double_buffer_mut() { + let shadow = db.buffer_mut(); + for y in y_start..y_end { + let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride + user_col_offset; + let target_row_offset = y * fb_stride_bytes + shadow_col_offset; + if target_row_offset + copy_row_bytes <= shadow.len() { + unsafe { + core::ptr::copy_nonoverlapping( + user_row_ptr as *const u8, + shadow[target_row_offset..].as_mut_ptr(), + copy_row_bytes, + ); } } } @@ -607,21 +629,158 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult { } } - // Drop SHELL_FRAMEBUFFER lock BEFORE GPU flush to avoid holding - // both SHELL_FRAMEBUFFER + GPU_LOCK simultaneously. 
The pixel - // copy is done; the render thread can now access the framebuffer - // for terminal text while we submit GPU commands. + // Drop SHELL_FRAMEBUFFER lock before GPU flush drop(fb_guard); - // Synchronous GPU flush — submit resource_flush (or transfer_to_host + - // resource_flush for non-GOP) directly in the syscall. This eliminates - // scheduling latency: bounce's frame is displayed immediately rather - // than waiting for the render thread (5ms+ due to timer tick). + // Synchronous GPU flush — for GOP this is a DSB barrier ensuring + // BAR0 writes are visible to the display controller. For VirtIO + // this submits transfer_to_host + resource_flush. if let Some((fx, fy, fw, fh)) = flush_rect { let _ = crate::graphics::arm64_fb::flush_dirty_rect(fx, fy, fw, fh); } } } + 7 => { + // VirglSubmitFrame: GPU-rendered frame via VirGL + // p1:p2 = pointer to VirglFrameDesc (low:high 32-bit halves) + // color = background color (packed 0x00RRGGBB) + #[cfg(target_arch = "aarch64")] + { + // Drop FB lock — we don't need the software framebuffer for GPU rendering + drop(fb_guard); + + // Reconstruct 64-bit pointer from two i32 halves. + // Cast through u32 first to avoid sign extension. 
+ let desc_ptr = (cmd.p1 as u32 as u64) | ((cmd.p2 as u32 as u64) << 32);
+ if desc_ptr == 0 || desc_ptr >= USER_SPACE_MAX {
+ return SyscallResult::Err(super::ErrorCode::Fault as u64);
+ }
+
+ // Read ball count (first u32 at desc_ptr)
+ let ball_count = unsafe { core::ptr::read(desc_ptr as *const u32) } as usize;
+ if ball_count > 16 {
+ return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64);
+ }
+
+ // Read ball array starting at desc_ptr + 8 (skip count + padding)
+ let balls_ptr = (desc_ptr + 8) as *const crate::drivers::virtio::gpu_pci::VirglBall;
+ let balls_end = desc_ptr + 8 + (ball_count as u64) * core::mem::size_of::<crate::drivers::virtio::gpu_pci::VirglBall>() as u64;
+ if balls_end > USER_SPACE_MAX {
+ return SyscallResult::Err(super::ErrorCode::Fault as u64);
+ }
+
+ let balls = unsafe { core::slice::from_raw_parts(balls_ptr, ball_count) };
+
+ let bg_r = ((cmd.color >> 16) & 0xFF) as f32 / 255.0;
+ let bg_g = ((cmd.color >> 8) & 0xFF) as f32 / 255.0;
+ let bg_b = (cmd.color & 0xFF) as f32 / 255.0;
+
+ match crate::drivers::virtio::gpu_pci::virgl_render_frame(balls, bg_r, bg_g, bg_b) {
+ Ok(()) => {}
+ Err(e) => {
+ crate::serial_println!("[virgl-syscall] render_frame FAILED: {}", e);
+ return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64);
+ }
+ }
+ }
+ #[cfg(not(target_arch = "aarch64"))]
+ {
+ drop(fb_guard);
+ return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64);
+ }
+ }
+ 8 => {
+ // FlushBatch: batch flush multiple dirty rects with one DSB barrier.
+ // p1:p2 = 64-bit pointer to FlushRect array [(x, y, w, h); ...]
+ // p3 = count of rects (max 16)
+ // Copies each rect from mmap → BAR0, then ONE dsb sy.
+ // Saves 12+ syscall round-trips and DSB barriers per frame.
+ #[cfg(target_arch = "aarch64")] + { + FB_FLUSH_COUNT.fetch_add(1, core::sync::atomic::Ordering::Relaxed); + + let rects_ptr = (cmd.p1 as u32 as u64) | ((cmd.p2 as u32 as u64) << 32); + let count = (cmd.p3 as u32).min(16) as usize; + + // Drop FB lock immediately — batch flush only needs mmap_info + BAR0 + drop(fb_guard); + + if count == 0 { + return SyscallResult::Ok(0); + } + + if rects_ptr == 0 || rects_ptr >= USER_SPACE_MAX { + return SyscallResult::Err(super::ErrorCode::Fault as u64); + } + let rects_end = rects_ptr.saturating_add((count as u64) * 16); + if rects_end > USER_SPACE_MAX { + return SyscallResult::Err(super::ErrorCode::Fault as u64); + } + + #[repr(C)] + #[derive(Clone, Copy)] + struct FlushRect { x: i32, y: i32, w: i32, h: i32 } + + let rects = unsafe { + core::slice::from_raw_parts(rects_ptr as *const FlushRect, count) + }; + + let fb_mmap_info = fb_mmap_info_pre; + + if let Some(mmap_info) = fb_mmap_info { + if crate::graphics::arm64_fb::is_gop_active() { + // Use lock-free FbInfoCache for stride (no FB lock needed) + let fb_stride_bytes = crate::graphics::arm64_fb::FB_INFO_CACHE.get() + .map(|c| c.stride * c.bytes_per_pixel) + .unwrap_or(0); + + if fb_stride_bytes > 0 { + if let Some(gop_buf) = crate::graphics::arm64_fb::gop_framebuffer() { + let x_byte_offset = mmap_info.x_offset * mmap_info.bpp; + + for rect in rects { + if rect.w <= 0 || rect.h <= 0 { continue; } + + let col_start = (rect.x.max(0) as usize).min(mmap_info.width); + let col_end = (rect.x.max(0) as usize + rect.w as usize).min(mmap_info.width); + let y_start = (rect.y.max(0) as usize).min(mmap_info.height); + let y_end = (rect.y.max(0) as usize + rect.h as usize).min(mmap_info.height); + + let user_col_byte = col_start * mmap_info.bpp; + let target_col_byte = x_byte_offset + col_start * mmap_info.bpp; + let copy_row_bytes = (col_end - col_start) * mmap_info.bpp; + + if copy_row_bytes == 0 { continue; } + + for y in y_start..y_end { + let user_row_ptr = (mmap_info.user_addr as 
usize) + + y * mmap_info.user_stride + user_col_byte; + let target_row_offset = y * fb_stride_bytes + target_col_byte; + if target_row_offset + copy_row_bytes <= gop_buf.len() { + unsafe { + core::ptr::copy_nonoverlapping( + user_row_ptr as *const u8, + gop_buf[target_row_offset..].as_mut_ptr(), + copy_row_bytes, + ); + } + } + } + } + } + + // ONE DSB for all BAR0 writes + unsafe { core::arch::asm!("dsb sy", options(nostack, preserves_flags)); } + } + } + } + } + #[cfg(not(target_arch = "aarch64"))] + { + drop(fb_guard); + return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64); + } + } _ => { return SyscallResult::Err(super::ErrorCode::InvalidArgument as u64); } diff --git a/libs/libbreenix/src/graphics.rs b/libs/libbreenix/src/graphics.rs index 0ecbc143..4e00bcc4 100644 --- a/libs/libbreenix/src/graphics.rs +++ b/libs/libbreenix/src/graphics.rs @@ -103,6 +103,29 @@ pub mod draw_op { pub const DRAW_LINE: u32 = 5; /// Flush the framebuffer (for double-buffering) pub const FLUSH: u32 = 6; + /// Submit a VirGL GPU-rendered frame + pub const VIRGL_SUBMIT_FRAME: u32 = 7; + /// Batch flush multiple dirty rects with one DSB barrier + pub const FLUSH_BATCH: u32 = 8; +} + +/// Ball descriptor for VirGL GPU rendering. +/// Must match kernel's VirglBall in drivers/virtio/gpu_pci.rs. +#[repr(C)] +#[derive(Clone, Copy, Default)] +pub struct VirglBall { + pub x: f32, + pub y: f32, + pub radius: f32, + pub color: [f32; 4], +} + +/// Frame descriptor passed to the VirglSubmitFrame syscall. +#[repr(C)] +pub struct VirglFrameDesc { + pub ball_count: u32, + pub _pad: u32, + pub balls: [VirglBall; 16], } /// Pack RGB color into u32 @@ -238,6 +261,62 @@ pub fn fb_flush_rect(x: i32, y: i32, w: i32, h: i32) -> Result<(), Error> { fbdraw(&cmd) } +/// A dirty rectangle for batch flushing. 
+#[repr(C)] +#[derive(Clone, Copy)] +pub struct FlushRect { + pub x: i32, + pub y: i32, + pub w: i32, + pub h: i32, +} + +/// Batch flush multiple dirty rectangles with a single syscall and DSB barrier. +/// +/// Instead of calling `fb_flush_rect()` N times (N syscalls, N DSB barriers), +/// this sends all dirty rects at once: 1 syscall, 1 DSB. Each rect is copied +/// from the mmap buffer to BAR0 in sequence. +pub fn fb_flush_rects(rects: &[FlushRect]) -> Result<(), Error> { + if rects.is_empty() { + return Ok(()); + } + let rects_ptr = rects.as_ptr() as u64; + let cmd = FbDrawCmd { + op: draw_op::FLUSH_BATCH, + p1: rects_ptr as i32, + p2: (rects_ptr >> 32) as i32, + p3: rects.len() as i32, + p4: 0, + color: 0, + }; + fbdraw(&cmd) +} + +/// Submit a VirGL GPU-rendered frame. +/// +/// Sends ball positions/colors to the kernel, which renders them via the host +/// GPU and DMA-copies the result to display memory. Zero guest CPU pixel writes. +pub fn virgl_submit_frame(balls: &[VirglBall], bg_color: u32) -> Result<(), Error> { + let mut desc = VirglFrameDesc { + ball_count: balls.len().min(16) as u32, + _pad: 0, + balls: [VirglBall::default(); 16], + }; + for (i, ball) in balls.iter().take(16).enumerate() { + desc.balls[i] = *ball; + } + let desc_ptr = &desc as *const VirglFrameDesc as u64; + let cmd = FbDrawCmd { + op: draw_op::VIRGL_SUBMIT_FRAME, + p1: desc_ptr as i32, // low 32 bits + p2: (desc_ptr >> 32) as i32, // high 32 bits + p3: 0, + p4: 0, + color: bg_color, + }; + fbdraw(&cmd) +} + /// Get the current mouse cursor position. 
///
/// # Returns
diff --git a/scripts/parallels/gl_bench.c b/scripts/parallels/gl_bench.c
new file mode 100644
index 00000000..ff8c8ef3
--- /dev/null
+++ b/scripts/parallels/gl_bench.c
@@ -0,0 +1,423 @@
+/*
+ * gl_bench.c — Minimal OpenGL ES 2.0 benchmark using GBM + EGL + GLES2
+ *
+ * Renders filled circles (similar to the Breenix bounce demo) on the GPU
+ * using VirGL, measures FPS, and optionally reads back pixels to verify
+ * rendering actually works.
+ *
+ * This runs headless (no window system) using GBM + EGL, which is exactly
+ * how a kernel's VirGL implementation would work.
+ *
+ * Build: gcc -O2 -o gl_bench gl_bench.c -lEGL -lGLESv2 -lgbm -lm
+ * Run: ./gl_bench [frames] (default: 300 frames)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <fcntl.h>
+#include <time.h>
+#include <errno.h>
+
+#include <gbm.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <GLES2/gl2.h>
+
+/* ------------------------------------------------------------------ */
+/* Timing helpers */
+/* ------------------------------------------------------------------ */
+
+static uint64_t now_ns(void) {
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
+}
+
+/* ------------------------------------------------------------------ */
+/* Shader source */
+/* ------------------------------------------------------------------ */
+
+static const char *vert_src =
+ "attribute vec2 a_pos;\n"
+ "attribute vec4 a_color;\n"
+ "varying vec4 v_color;\n"
+ "uniform vec2 u_resolution;\n"
+ "void main() {\n"
+ " vec2 clip = (a_pos / u_resolution) * 2.0 - 1.0;\n"
+ " clip.y = -clip.y;\n"
+ " gl_Position = vec4(clip, 0.0, 1.0);\n"
+ " v_color = a_color;\n"
+ "}\n";
+
+static const char *frag_src =
+ "precision mediump float;\n"
+ "varying vec4 v_color;\n"
+ "void main() {\n"
+ " gl_FragColor = v_color;\n"
+ "}\n";
+
+/* ------------------------------------------------------------------ */
+/* Ball definition */
+/*
------------------------------------------------------------------ */ + +#define NUM_BALLS 12 +#define CIRCLE_SEGMENTS 20 +#define WIDTH 864 +#define HEIGHT 1080 + +struct Ball { + float x, y; /* position in pixels */ + float vx, vy; /* velocity in pixels/frame */ + float radius; + float r, g, b; + float mass; +}; + +static struct Ball balls[NUM_BALLS] = { + { 100, 100, 11.0, 8.0, 38, 1.0, 0.2, 0.2, 38 }, + { 300, 200, -10.0, 7.0, 33, 0.2, 1.0, 0.2, 33 }, + { 200, 400, 9.0, -9.5, 42, 0.2, 0.2, 1.0, 42 }, + { 400, 300, -8.5, -8.0, 28, 1.0, 1.0, 0.2, 28 }, + { 150, 300, 10.5, 6.0, 24, 1.0, 0.2, 1.0, 24 }, + { 350, 150, -9.0, 7.5, 26, 0.2, 1.0, 1.0, 26 }, + { 450, 500, 8.0, -7.0, 35, 1.0, 0.6, 0.2, 35 }, + { 250, 550, -7.5, 8.5, 30, 0.6, 0.2, 1.0, 30 }, + { 500, 100, 9.5, 9.5, 22, 0.8, 0.8, 0.8, 22 }, + { 120, 500, -11.0, -6.5, 20, 1.0, 0.4, 0.4, 20 }, + { 380, 450, 7.0, 9.0, 32, 0.4, 1.0, 0.4, 32 }, + { 520, 350, -8.0, -8.5, 27, 0.4, 0.6, 1.0, 27 }, +}; + +/* ------------------------------------------------------------------ */ +/* Physics (identical to Breenix bounce demo) */ +/* ------------------------------------------------------------------ */ + +static void ball_step(struct Ball *b) { + b->x += b->vx; + b->y += b->vy; +} + +static void ball_bounce(struct Ball *b) { + if (b->x - b->radius < 0) { b->x = b->radius; b->vx = -b->vx; } + if (b->x + b->radius >= WIDTH) { b->x = WIDTH - b->radius - 1; b->vx = -b->vx; } + if (b->y - b->radius < 0) { b->y = b->radius; b->vy = -b->vy; } + if (b->y + b->radius >= HEIGHT){ b->y = HEIGHT - b->radius - 1; b->vy = -b->vy; } +} + +static void check_collision(struct Ball *a, struct Ball *b) { + float dx = b->x - a->x; + float dy = b->y - a->y; + float touch = a->radius + b->radius; + float dist_sq = dx*dx + dy*dy; + if (dist_sq >= touch*touch || dist_sq == 0) return; + + float dist = sqrtf(dist_sq); + if (dist == 0) { a->x -= 1; b->x += 1; return; } + + float nx = dx / dist; + float ny = dy / dist; + float v1n = a->vx*nx + 
a->vy*ny; + float v2n = b->vx*nx + b->vy*ny; + if (v1n <= v2n) return; + + float m1 = a->mass, m2 = b->mass, mt = m1 + m2; + float v1n_new = ((m1-m2)*v1n + 2*m2*v2n) / mt; + float v2n_new = ((m2-m1)*v2n + 2*m1*v1n) / mt; + float dv1 = v1n_new - v1n; + float dv2 = v2n_new - v2n; + a->vx += dv1*nx; a->vy += dv1*ny; + b->vx += dv2*nx; b->vy += dv2*ny; + + float overlap = touch - dist + 0.5f; + float push1 = overlap * m2 / mt; + float push2 = overlap * m1 / mt; + a->x -= push1*nx; a->y -= push1*ny; + b->x += push2*nx; b->y += push2*ny; +} + +/* ------------------------------------------------------------------ */ +/* GL helpers */ +/* ------------------------------------------------------------------ */ + +static GLuint compile_shader(GLenum type, const char *src) { + GLuint s = glCreateShader(type); + glShaderSource(s, 1, &src, NULL); + glCompileShader(s); + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + if (!ok) { + char buf[512]; + glGetShaderInfoLog(s, sizeof(buf), NULL, buf); + fprintf(stderr, "Shader compile error: %s\n", buf); + exit(1); + } + return s; +} + +/* Build a triangle fan for a circle: center + N+1 edge vertices */ +static int build_circle_vertices(float *verts, float cx, float cy, float r, + float cr, float cg, float cb) { + int n = 0; + /* Center vertex: x, y, r, g, b, a */ + verts[n++] = cx; verts[n++] = cy; + verts[n++] = cr; verts[n++] = cg; verts[n++] = cb; verts[n++] = 1.0f; + + for (int i = 0; i <= CIRCLE_SEGMENTS; i++) { + float angle = (float)i / CIRCLE_SEGMENTS * 2.0f * M_PI; + verts[n++] = cx + r * cosf(angle); + verts[n++] = cy + r * sinf(angle); + verts[n++] = cr; verts[n++] = cg; verts[n++] = cb; verts[n++] = 1.0f; + } + return (CIRCLE_SEGMENTS + 2); /* vertex count */ +} + +/* ------------------------------------------------------------------ */ +/* Main */ +/* ------------------------------------------------------------------ */ + +int main(int argc, char *argv[]) { + int total_frames = 300; + if (argc > 1) total_frames = 
atoi(argv[1]); + if (total_frames <= 0) total_frames = 300; + + printf("=== GL Bench: %d frames, %dx%d, %d balls ===\n", + total_frames, WIDTH, HEIGHT, NUM_BALLS); + + /* ---- Open DRM device ---- */ + int drm_fd = -1; + const char *cards[] = {"/dev/dri/renderD128", "/dev/dri/card0", "/dev/dri/card1", NULL}; + for (int i = 0; cards[i]; i++) { + drm_fd = open(cards[i], O_RDWR); + if (drm_fd >= 0) { + printf("Opened DRM device: %s\n", cards[i]); + break; + } + } + if (drm_fd < 0) { + fprintf(stderr, "Failed to open any DRM device: %s\n", strerror(errno)); + return 1; + } + + /* ---- GBM device ---- */ + struct gbm_device *gbm = gbm_create_device(drm_fd); + if (!gbm) { + fprintf(stderr, "Failed to create GBM device\n"); + return 1; + } + printf("GBM device created\n"); + + /* ---- EGL setup ---- */ + /* Use eglGetPlatformDisplay (EGL 1.5) with GBM platform */ + EGLDisplay dpy = eglGetPlatformDisplay(EGL_PLATFORM_GBM_MESA, gbm, NULL); + if (dpy == EGL_NO_DISPLAY) { + /* Fallback to legacy eglGetDisplay */ + dpy = eglGetDisplay((EGLNativeDisplayType)gbm); + } + if (dpy == EGL_NO_DISPLAY) { + fprintf(stderr, "Failed to get EGL display\n"); + return 1; + } + + EGLint major, minor; + if (!eglInitialize(dpy, &major, &minor)) { + fprintf(stderr, "eglInitialize failed: 0x%x\n", eglGetError()); + return 1; + } + printf("EGL %d.%d initialized\n", major, minor); + + const char *egl_vendor = eglQueryString(dpy, EGL_VENDOR); + const char *egl_version = eglQueryString(dpy, EGL_VERSION); + printf("EGL vendor: %s\n", egl_vendor ? egl_vendor : "unknown"); + printf("EGL version: %s\n", egl_version ? 
egl_version : "unknown"); + + eglBindAPI(EGL_OPENGL_ES_API); + + EGLint cfg_attribs[] = { + EGL_SURFACE_TYPE, EGL_WINDOW_BIT, + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL_RED_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_BLUE_SIZE, 8, + EGL_ALPHA_SIZE, 8, + EGL_NONE + }; + EGLConfig config; + EGLint num_configs; + if (!eglChooseConfig(dpy, cfg_attribs, &config, 1, &num_configs) || num_configs == 0) { + fprintf(stderr, "eglChooseConfig failed\n"); + return 1; + } + + EGLint ctx_attribs[] = { EGL_CONTEXT_CLIENT_VERSION, 2, EGL_NONE }; + EGLContext ctx = eglCreateContext(dpy, config, EGL_NO_CONTEXT, ctx_attribs); + if (ctx == EGL_NO_CONTEXT) { + fprintf(stderr, "eglCreateContext failed: 0x%x\n", eglGetError()); + return 1; + } + + /* Create GBM surface for offscreen rendering */ + struct gbm_surface *gbm_surf = gbm_surface_create(gbm, WIDTH, HEIGHT, + GBM_FORMAT_ARGB8888, + GBM_BO_USE_RENDERING); + if (!gbm_surf) { + fprintf(stderr, "gbm_surface_create failed\n"); + return 1; + } + + EGLSurface egl_surf = eglCreateWindowSurface(dpy, config, + (EGLNativeWindowType)gbm_surf, NULL); + if (egl_surf == EGL_NO_SURFACE) { + fprintf(stderr, "eglCreateWindowSurface failed: 0x%x\n", eglGetError()); + return 1; + } + + if (!eglMakeCurrent(dpy, egl_surf, egl_surf, ctx)) { + fprintf(stderr, "eglMakeCurrent failed: 0x%x\n", eglGetError()); + return 1; + } + + printf("GL_RENDERER: %s\n", glGetString(GL_RENDERER)); + printf("GL_VENDOR: %s\n", glGetString(GL_VENDOR)); + printf("GL_VERSION: %s\n", glGetString(GL_VERSION)); + + /* ---- Compile shaders ---- */ + GLuint vs = compile_shader(GL_VERTEX_SHADER, vert_src); + GLuint fs = compile_shader(GL_FRAGMENT_SHADER, frag_src); + GLuint prog = glCreateProgram(); + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glBindAttribLocation(prog, 0, "a_pos"); + glBindAttribLocation(prog, 1, "a_color"); + glLinkProgram(prog); + GLint link_ok; + glGetProgramiv(prog, GL_LINK_STATUS, &link_ok); + if (!link_ok) { + char buf[512]; + 
glGetProgramInfoLog(prog, sizeof(buf), NULL, buf); + fprintf(stderr, "Link error: %s\n", buf); + return 1; + } + glUseProgram(prog); + GLint u_res = glGetUniformLocation(prog, "u_resolution"); + glUniform2f(u_res, (float)WIDTH, (float)HEIGHT); + + glViewport(0, 0, WIDTH, HEIGHT); + glDisable(GL_DEPTH_TEST); + + printf("Shaders compiled, rendering %d frames...\n\n", total_frames); + + /* ---- Vertex buffer for circles ---- */ + /* Max vertices: NUM_BALLS * (CIRCLE_SEGMENTS + 2) * 6 floats */ + float *verts = malloc(NUM_BALLS * (CIRCLE_SEGMENTS + 2) * 6 * sizeof(float)); + if (!verts) { perror("malloc"); return 1; } + + GLuint vbo; + glGenBuffers(1, &vbo); + + /* ---- Render loop ---- */ + uint64_t t_start = now_ns(); + uint64_t t_last_print = t_start; + int frames_since_print = 0; + + for (int frame = 0; frame < total_frames; frame++) { + /* Physics */ + for (int s = 0; s < 16; s++) { + for (int i = 0; i < NUM_BALLS; i++) { + balls[i].x += balls[i].vx / 16.0f; + balls[i].y += balls[i].vy / 16.0f; + } + for (int i = 0; i < NUM_BALLS; i++) ball_bounce(&balls[i]); + for (int i = 0; i < NUM_BALLS; i++) + for (int j = i+1; j < NUM_BALLS; j++) + check_collision(&balls[i], &balls[j]); + } + + /* Clear */ + glClearColor(15.0f/255.0f, 15.0f/255.0f, 30.0f/255.0f, 1.0f); + glClear(GL_COLOR_BUFFER_BIT); + + /* Draw each ball as a triangle fan */ + int total_verts_offset = 0; + for (int i = 0; i < NUM_BALLS; i++) { + int nv = build_circle_vertices( + verts + total_verts_offset * 6, + balls[i].x, balls[i].y, balls[i].radius, + balls[i].r, balls[i].g, balls[i].b); + + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, nv * 6 * sizeof(float), + verts + total_verts_offset * 6, GL_DYNAMIC_DRAW); + glEnableVertexAttribArray(0); + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)0); + glEnableVertexAttribArray(1); + glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)(2*sizeof(float))); + glDrawArrays(GL_TRIANGLE_FAN, 0, nv); 
+ } + + /* Swap / present */ + eglSwapBuffers(dpy, egl_surf); + + /* Release the GBM buffer so we can lock the next one */ + struct gbm_bo *bo = gbm_surface_lock_front_buffer(gbm_surf); + if (bo) gbm_surface_release_buffer(gbm_surf, bo); + + frames_since_print++; + + /* Print FPS every 16 frames */ + if (frames_since_print >= 16) { + uint64_t now = now_ns(); + uint64_t elapsed = now - t_last_print; + if (elapsed > 0) { + double fps = (double)frames_since_print * 1e9 / (double)elapsed; + double ms = (double)elapsed / (double)frames_since_print / 1e6; + printf("[frame %4d] FPS: %.1f (%.2f ms/frame)\n", frame, fps, ms); + } + frames_since_print = 0; + t_last_print = now; + } + + /* Readback a few pixels on first frame to verify rendering */ + if (frame == 0) { + unsigned char pixel[4]; + /* Read center of screen — should be background color */ + glReadPixels(WIDTH/2, HEIGHT/2, 1, 1, GL_RGBA, GL_UNSIGNED_BYTE, pixel); + printf(" Pixel at center: RGBA(%d, %d, %d, %d) — expect ~(15, 15, 30, 255)\n", + pixel[0], pixel[1], pixel[2], pixel[3]); + + /* Read where first ball should be */ + int bx = (int)balls[0].x; + int by = HEIGHT - (int)balls[0].y; /* GL flips Y */ + if (bx >= 0 && bx < WIDTH && by >= 0 && by < HEIGHT) { + glReadPixels(bx, by, 1, 1, GL_RGBA, GL_UNSIGNED_BYTE, pixel); + printf(" Pixel at ball[0] (%d,%d): RGBA(%d, %d, %d, %d) — expect red-ish\n", + bx, by, pixel[0], pixel[1], pixel[2], pixel[3]); + } + } + } + + uint64_t t_end = now_ns(); + double total_secs = (double)(t_end - t_start) / 1e9; + double avg_fps = (double)total_frames / total_secs; + + printf("\n=== Results ===\n"); + printf("Total frames: %d\n", total_frames); + printf("Total time: %.2f s\n", total_secs); + printf("Average FPS: %.1f\n", avg_fps); + printf("Avg ms/frame: %.2f\n", total_secs * 1000.0 / total_frames); + + /* Cleanup */ + glDeleteBuffers(1, &vbo); + glDeleteProgram(prog); + free(verts); + eglDestroySurface(dpy, egl_surf); + gbm_surface_destroy(gbm_surf); + eglDestroyContext(dpy, 
ctx); + eglTerminate(dpy); + gbm_device_destroy(gbm); + close(drm_fd); + + return 0; +} diff --git a/scripts/parallels/gl_display.c b/scripts/parallels/gl_display.c new file mode 100644 index 00000000..a71807db --- /dev/null +++ b/scripts/parallels/gl_display.c @@ -0,0 +1,470 @@ +/* + * gl_display.c — VirGL rendering to the physical display via DRM/KMS + EGL + * + * This is the critical test: renders circles on the GPU via VirGL and + * presents them to the physical display using DRM page flipping. + * This is what Breenix needs to replicate. + * + * Build: gcc -O2 -o gl_display gl_display.c -lEGL -lGLESv2 -lgbm -ldrm -lm + * Run: ./gl_display [frames] (default: 120) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* Timing */ +/* ------------------------------------------------------------------ */ + +static uint64_t now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; +} + +/* ------------------------------------------------------------------ */ +/* Shaders */ +/* ------------------------------------------------------------------ */ + +static const char *vert_src = + "attribute vec2 a_pos;\n" + "attribute vec4 a_color;\n" + "varying vec4 v_color;\n" + "uniform vec2 u_resolution;\n" + "void main() {\n" + " vec2 clip = (a_pos / u_resolution) * 2.0 - 1.0;\n" + " clip.y = -clip.y;\n" + " gl_Position = vec4(clip, 0.0, 1.0);\n" + " v_color = a_color;\n" + "}\n"; + +static const char *frag_src = + "precision mediump float;\n" + "varying vec4 v_color;\n" + "void main() {\n" + " gl_FragColor = v_color;\n" + "}\n"; + +/* ------------------------------------------------------------------ */ +/* Balls */ +/* ------------------------------------------------------------------ */ + +#define 
NUM_BALLS 12 +#define CIRCLE_SEGMENTS 20 + +struct Ball { + float x, y, vx, vy, radius, r, g, b, mass; +}; + +static struct Ball balls[NUM_BALLS]; +static int fb_width, fb_height; + +static void init_balls(int w, int h) { + fb_width = w; + fb_height = h; + struct Ball defaults[NUM_BALLS] = { + { 100, 100, 11.0, 8.0, 38, 1.0, 0.2, 0.2, 38 }, + { 300, 200, -10.0, 7.0, 33, 0.2, 1.0, 0.2, 33 }, + { 200, 400, 9.0, -9.5, 42, 0.2, 0.2, 1.0, 42 }, + { 400, 300, -8.5, -8.0, 28, 1.0, 1.0, 0.2, 28 }, + { 150, 300, 10.5, 6.0, 24, 1.0, 0.2, 1.0, 24 }, + { 350, 150, -9.0, 7.5, 26, 0.2, 1.0, 1.0, 26 }, + { 450, 500, 8.0, -7.0, 35, 1.0, 0.6, 0.2, 35 }, + { 250, 550, -7.5, 8.5, 30, 0.6, 0.2, 1.0, 30 }, + { 500, 100, 9.5, 9.5, 22, 0.8, 0.8, 0.8, 22 }, + { 120, 500, -11.0, -6.5, 20, 1.0, 0.4, 0.4, 20 }, + { 380, 450, 7.0, 9.0, 32, 0.4, 1.0, 0.4, 32 }, + { 520, 350, -8.0, -8.5, 27, 0.4, 0.6, 1.0, 27 }, + }; + memcpy(balls, defaults, sizeof(defaults)); +} + +static void ball_bounce(struct Ball *b) { + if (b->x - b->radius < 0) { b->x = b->radius; b->vx = -b->vx; } + if (b->x + b->radius >= fb_width) { b->x = fb_width - b->radius - 1; b->vx = -b->vx; } + if (b->y - b->radius < 0) { b->y = b->radius; b->vy = -b->vy; } + if (b->y + b->radius >= fb_height) { b->y = fb_height - b->radius - 1; b->vy = -b->vy; } +} + +static void check_collision(struct Ball *a, struct Ball *b) { + float dx = b->x - a->x, dy = b->y - a->y; + float touch = a->radius + b->radius; + float dist_sq = dx*dx + dy*dy; + if (dist_sq >= touch*touch || dist_sq == 0) return; + float dist = sqrtf(dist_sq); + if (dist == 0) { a->x -= 1; b->x += 1; return; } + float nx = dx/dist, ny = dy/dist; + float v1n = a->vx*nx + a->vy*ny; + float v2n = b->vx*nx + b->vy*ny; + if (v1n <= v2n) return; + float m1 = a->mass, m2 = b->mass, mt = m1+m2; + float v1n_new = ((m1-m2)*v1n + 2*m2*v2n)/mt; + float v2n_new = ((m2-m1)*v2n + 2*m1*v1n)/mt; + a->vx += (v1n_new-v1n)*nx; a->vy += (v1n_new-v1n)*ny; + b->vx += (v2n_new-v2n)*nx; b->vy += 
(v2n_new-v2n)*ny; + float overlap = touch - dist + 0.5f; + a->x -= overlap*m2/mt*nx; a->y -= overlap*m2/mt*ny; + b->x += overlap*m1/mt*nx; b->y += overlap*m1/mt*ny; +} + +static void physics_step(void) { + for (int s = 0; s < 16; s++) { + for (int i = 0; i < NUM_BALLS; i++) { + balls[i].x += balls[i].vx/16.0f; + balls[i].y += balls[i].vy/16.0f; + } + for (int i = 0; i < NUM_BALLS; i++) ball_bounce(&balls[i]); + for (int i = 0; i < NUM_BALLS; i++) + for (int j = i+1; j < NUM_BALLS; j++) + check_collision(&balls[i], &balls[j]); + } +} + +/* ------------------------------------------------------------------ */ +/* GL helpers */ +/* ------------------------------------------------------------------ */ + +static GLuint compile_shader(GLenum type, const char *src) { + GLuint s = glCreateShader(type); + glShaderSource(s, 1, &src, NULL); + glCompileShader(s); + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + if (!ok) { + char buf[512]; + glGetShaderInfoLog(s, sizeof(buf), NULL, buf); + fprintf(stderr, "Shader error: %s\n", buf); + exit(1); + } + return s; +} + +static int build_circle_verts(float *v, float cx, float cy, float r, + float cr, float cg, float cb) { + int n = 0; + v[n++] = cx; v[n++] = cy; + v[n++] = cr; v[n++] = cg; v[n++] = cb; v[n++] = 1.0f; + for (int i = 0; i <= CIRCLE_SEGMENTS; i++) { + float angle = (float)i / CIRCLE_SEGMENTS * 2.0f * M_PI; + v[n++] = cx + r * cosf(angle); + v[n++] = cy + r * sinf(angle); + v[n++] = cr; v[n++] = cg; v[n++] = cb; v[n++] = 1.0f; + } + return CIRCLE_SEGMENTS + 2; +} + +/* ------------------------------------------------------------------ */ +/* DRM + GBM + EGL display setup */ +/* ------------------------------------------------------------------ */ + +struct drm_state { + int fd; + drmModeConnector *connector; + drmModeEncoder *encoder; + drmModeCrtc *saved_crtc; + uint32_t crtc_id; + drmModeModeInfo mode; +}; + +static int find_drm_display(struct drm_state *drm) { + const char *cards[] = {"/dev/dri/card0", 
"/dev/dri/card1", NULL}; + for (int i = 0; cards[i]; i++) { + drm->fd = open(cards[i], O_RDWR | O_CLOEXEC); + if (drm->fd < 0) { + fprintf(stderr, " Cannot open %s: %s\n", cards[i], strerror(errno)); + continue; + } + + /* Need master for modesetting */ + drmSetMaster(drm->fd); + + drmModeRes *res = drmModeGetResources(drm->fd); + if (!res) { + fprintf(stderr, " %s: drmModeGetResources failed: %s\n", cards[i], strerror(errno)); + close(drm->fd); + continue; + } + fprintf(stderr, " %s: %d connectors, %d crtcs, %d encoders\n", + cards[i], res->count_connectors, res->count_crtcs, res->count_encoders); + + /* Find connected connector */ + for (int c = 0; c < res->count_connectors; c++) { + drm->connector = drmModeGetConnector(drm->fd, res->connectors[c]); + if (!drm->connector) continue; + if (drm->connector->connection == DRM_MODE_CONNECTED && + drm->connector->count_modes > 0) { + printf("Found connector %d: %s, %dx%d\n", + drm->connector->connector_id, + drm->connector->count_modes > 0 ? 
"has modes" : "no modes", + drm->connector->modes[0].hdisplay, + drm->connector->modes[0].vdisplay); + drm->mode = drm->connector->modes[0]; + break; + } + drmModeFreeConnector(drm->connector); + drm->connector = NULL; + } + + if (!drm->connector) { + fprintf(stderr, " %s: no connected connector found\n", cards[i]); + drmModeFreeResources(res); + close(drm->fd); + continue; + } + + /* Find encoder + CRTC */ + drm->encoder = drmModeGetEncoder(drm->fd, drm->connector->encoder_id); + if (!drm->encoder) { + /* Try first encoder */ + for (int e = 0; e < res->count_encoders; e++) { + drm->encoder = drmModeGetEncoder(drm->fd, res->encoders[e]); + if (drm->encoder) break; + } + } + if (!drm->encoder) { + fprintf(stderr, "No encoder found\n"); + drmModeFreeResources(res); + continue; + } + + drm->crtc_id = drm->encoder->crtc_id; + if (!drm->crtc_id && res->count_crtcs > 0) + drm->crtc_id = res->crtcs[0]; + + drm->saved_crtc = drmModeGetCrtc(drm->fd, drm->crtc_id); + + printf("Using DRM device: %s\n", cards[i]); + printf("Display: %dx%d @ %dHz\n", + drm->mode.hdisplay, drm->mode.vdisplay, drm->mode.vrefresh); + + drmModeFreeResources(res); + return 0; + } + return -1; +} + +/* ------------------------------------------------------------------ */ +/* Main */ +/* ------------------------------------------------------------------ */ + +int main(int argc, char *argv[]) { + int total_frames = 120; + if (argc > 1) total_frames = atoi(argv[1]); + if (total_frames <= 0) total_frames = 120; + + printf("=== GL Display: %d frames, DRM/KMS page-flip ===\n", total_frames); + + /* ---- Find DRM display ---- */ + struct drm_state drm = {0}; + if (find_drm_display(&drm) < 0) { + fprintf(stderr, "No DRM display found\n"); + return 1; + } + + int width = drm.mode.hdisplay; + int height = drm.mode.vdisplay; + init_balls(width, height); + + /* ---- GBM device ---- */ + struct gbm_device *gbm = gbm_create_device(drm.fd); + if (!gbm) { fprintf(stderr, "GBM failed\n"); return 1; } + + /* ---- GBM 
surface ---- */ + struct gbm_surface *gbm_surf = gbm_surface_create(gbm, width, height, + GBM_FORMAT_XRGB8888, + GBM_BO_USE_SCANOUT | GBM_BO_USE_RENDERING); + if (!gbm_surf) { fprintf(stderr, "GBM surface failed\n"); return 1; } + + /* ---- EGL ---- */ + EGLDisplay dpy = eglGetPlatformDisplay(EGL_PLATFORM_GBM_MESA, gbm, NULL); + if (dpy == EGL_NO_DISPLAY) + dpy = eglGetDisplay((EGLNativeDisplayType)gbm); + if (dpy == EGL_NO_DISPLAY) { fprintf(stderr, "No EGL display\n"); return 1; } + + EGLint major, minor; + eglInitialize(dpy, &major, &minor); + eglBindAPI(EGL_OPENGL_ES_API); + + /* Enumerate ALL EGL configs and find one matching XRGB8888. + * eglChooseConfig returns ARGB configs by default, which cause + * drmModeSetCrtc EINVAL on Parallels — XRGB8888 is required. */ + EGLint total_configs = 0; + eglGetConfigs(dpy, NULL, 0, &total_configs); + EGLConfig *all_configs = malloc(total_configs * sizeof(EGLConfig)); + eglGetConfigs(dpy, all_configs, total_configs, &total_configs); + printf("Scanning %d EGL configs for XRGB8888 match...\n", total_configs); + + EGLConfig config = NULL; + for (int i = 0; i < total_configs; i++) { + EGLint native_visual, render_type, surf_type, alpha_size; + eglGetConfigAttrib(dpy, all_configs[i], EGL_NATIVE_VISUAL_ID, &native_visual); + eglGetConfigAttrib(dpy, all_configs[i], EGL_RENDERABLE_TYPE, &render_type); + eglGetConfigAttrib(dpy, all_configs[i], EGL_SURFACE_TYPE, &surf_type); + eglGetConfigAttrib(dpy, all_configs[i], EGL_ALPHA_SIZE, &alpha_size); + if (native_visual == (int)GBM_FORMAT_XRGB8888 && + (render_type & EGL_OPENGL_ES2_BIT) && + (surf_type & EGL_WINDOW_BIT)) { + config = all_configs[i]; + printf(" Found XRGB8888 config #%d (alpha=%d)\n", i, alpha_size); + break; + } + } + free(all_configs); + if (!config) { fprintf(stderr, "No XRGB8888 EGL config found\n"); return 1; } + + EGLint ctx_attrs[] = { EGL_CONTEXT_CLIENT_VERSION, 2, EGL_NONE }; + EGLContext ctx = eglCreateContext(dpy, config, EGL_NO_CONTEXT, ctx_attrs); + if (ctx 
== EGL_NO_CONTEXT) { fprintf(stderr, "eglCreateContext failed: 0x%x\n", eglGetError()); return 1; } + + EGLSurface egl_surf = eglCreateWindowSurface(dpy, config, + (EGLNativeWindowType)gbm_surf, NULL); + if (egl_surf == EGL_NO_SURFACE) { fprintf(stderr, "eglCreateWindowSurface failed: 0x%x\n", eglGetError()); return 1; } + + eglMakeCurrent(dpy, egl_surf, egl_surf, ctx); + + printf("GL_RENDERER: %s\n", glGetString(GL_RENDERER)); + printf("GL_VERSION: %s\n", glGetString(GL_VERSION)); + + /* ---- Compile shaders ---- */ + GLuint vs = compile_shader(GL_VERTEX_SHADER, vert_src); + GLuint fs = compile_shader(GL_FRAGMENT_SHADER, frag_src); + GLuint prog = glCreateProgram(); + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glBindAttribLocation(prog, 0, "a_pos"); + glBindAttribLocation(prog, 1, "a_color"); + glLinkProgram(prog); + glUseProgram(prog); + glUniform2f(glGetUniformLocation(prog, "u_resolution"), (float)width, (float)height); + glViewport(0, 0, width, height); + glDisable(GL_DEPTH_TEST); + + float *verts = malloc(NUM_BALLS * (CIRCLE_SEGMENTS + 2) * 6 * sizeof(float)); + GLuint vbo; + glGenBuffers(1, &vbo); + + printf("Setup complete. 
Rendering %d frames to display...\n\n", total_frames); + + /* ---- Render loop with DRM page flipping ---- */ + uint64_t t_start = now_ns(); + uint64_t t_last_print = t_start; + int frames_since_print = 0; + struct gbm_bo *prev_bo = NULL; + uint32_t prev_fb_id = 0; + + for (int frame = 0; frame < total_frames; frame++) { + physics_step(); + + /* Clear + draw */ + glClearColor(15.0f/255.0f, 15.0f/255.0f, 30.0f/255.0f, 1.0f); + glClear(GL_COLOR_BUFFER_BIT); + + for (int i = 0; i < NUM_BALLS; i++) { + int nv = build_circle_verts(verts, balls[i].x, balls[i].y, + balls[i].radius, balls[i].r, balls[i].g, balls[i].b); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, nv * 6 * sizeof(float), verts, GL_DYNAMIC_DRAW); + glEnableVertexAttribArray(0); + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)0); + glEnableVertexAttribArray(1); + glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 6*sizeof(float), (void*)(2*sizeof(float))); + glDrawArrays(GL_TRIANGLE_FAN, 0, nv); + } + + /* eglSwapBuffers triggers the GPU render */ + eglSwapBuffers(dpy, egl_surf); + + /* Get the front buffer GBM BO */ + struct gbm_bo *bo = gbm_surface_lock_front_buffer(gbm_surf); + if (!bo) { + fprintf(stderr, "Failed to lock front buffer\n"); + continue; + } + + uint32_t handle = gbm_bo_get_handle(bo).u32; + uint32_t stride = gbm_bo_get_stride(bo); + uint32_t fb_id = 0; + + /* Create DRM framebuffer from the GBM BO */ + int ret = drmModeAddFB(drm.fd, width, height, 24, 32, stride, handle, &fb_id); + if (ret) { + fprintf(stderr, "drmModeAddFB failed: %s\n", strerror(errno)); + gbm_surface_release_buffer(gbm_surf, bo); + continue; + } + + /* Set this buffer as the CRTC scanout (blocking page flip) */ + ret = drmModeSetCrtc(drm.fd, drm.crtc_id, fb_id, 0, 0, + &drm.connector->connector_id, 1, &drm.mode); + if (ret) { + fprintf(stderr, "drmModeSetCrtc failed: %s (frame %d)\n", strerror(errno), frame); + } + + /* Release previous buffer */ + if (prev_bo) { + 
drmModeRmFB(drm.fd, prev_fb_id); + gbm_surface_release_buffer(gbm_surf, prev_bo); + } + prev_bo = bo; + prev_fb_id = fb_id; + + frames_since_print++; + if (frames_since_print >= 16) { + uint64_t now = now_ns(); + uint64_t elapsed = now - t_last_print; + if (elapsed > 0) { + double fps = (double)frames_since_print * 1e9 / (double)elapsed; + double ms = (double)elapsed / (double)frames_since_print / 1e6; + printf("[frame %4d] FPS: %.1f (%.2f ms/frame)\n", frame, fps, ms); + } + frames_since_print = 0; + t_last_print = now; + } + } + + uint64_t t_end = now_ns(); + double total_secs = (double)(t_end - t_start) / 1e9; + printf("\n=== Results ===\n"); + printf("Total frames: %d\n", total_frames); + printf("Total time: %.2f s\n", total_secs); + printf("Average FPS: %.1f\n", (double)total_frames / total_secs); + + /* Restore original display */ + if (drm.saved_crtc) { + drmModeSetCrtc(drm.fd, drm.saved_crtc->crtc_id, drm.saved_crtc->buffer_id, + drm.saved_crtc->x, drm.saved_crtc->y, + &drm.connector->connector_id, 1, &drm.saved_crtc->mode); + drmModeFreeCrtc(drm.saved_crtc); + } + + /* Cleanup */ + if (prev_bo) { + drmModeRmFB(drm.fd, prev_fb_id); + gbm_surface_release_buffer(gbm_surf, prev_bo); + } + glDeleteBuffers(1, &vbo); + free(verts); + eglDestroySurface(dpy, egl_surf); + gbm_surface_destroy(gbm_surf); + eglDestroyContext(dpy, ctx); + eglTerminate(dpy); + gbm_device_destroy(gbm); + close(drm.fd); + + return 0; +} diff --git a/scripts/parallels/vm-type.sh b/scripts/parallels/vm-type.sh new file mode 100755 index 00000000..3ad98cbe --- /dev/null +++ b/scripts/parallels/vm-type.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Type a string into a Parallels VM console using keyboard scancodes. 
+# Usage: ./vm-type.sh +# Special strings: ENTER, SPACE, TAB, BACKSPACE + +VM="$1" +shift +TEXT="$*" + +if [ -z "$VM" ] || [ -z "$TEXT" ]; then + echo "Usage: $0 " + exit 1 +fi + +send_key() { + local sc="$1" + prlctl send-key-event "$VM" --scancode "$sc" --event press --delay 30 2>/dev/null + prlctl send-key-event "$VM" --scancode "$sc" --event release 2>/dev/null +} + +send_shift_key() { + local sc="$1" + # Press shift + prlctl send-key-event "$VM" --scancode 42 --event press 2>/dev/null + sleep 0.05 + prlctl send-key-event "$VM" --scancode "$sc" --event press --delay 30 2>/dev/null + prlctl send-key-event "$VM" --scancode "$sc" --event release 2>/dev/null + # Release shift + prlctl send-key-event "$VM" --scancode 42 --event release 2>/dev/null +} + +# Handle special words +if [ "$TEXT" = "ENTER" ]; then + send_key 28 + exit 0 +fi + +# Type each character +for (( i=0; i<${#TEXT}; i++ )); do + c="${TEXT:$i:1}" + case "$c" in + a) send_key 30 ;; b) send_key 48 ;; c) send_key 46 ;; d) send_key 32 ;; + e) send_key 18 ;; f) send_key 33 ;; g) send_key 34 ;; h) send_key 35 ;; + i) send_key 23 ;; j) send_key 36 ;; k) send_key 37 ;; l) send_key 38 ;; + m) send_key 50 ;; n) send_key 49 ;; o) send_key 24 ;; p) send_key 25 ;; + q) send_key 16 ;; r) send_key 19 ;; s) send_key 31 ;; t) send_key 20 ;; + u) send_key 22 ;; v) send_key 47 ;; w) send_key 17 ;; x) send_key 45 ;; + y) send_key 21 ;; z) send_key 44 ;; + A) send_shift_key 30 ;; B) send_shift_key 48 ;; C) send_shift_key 46 ;; + D) send_shift_key 32 ;; E) send_shift_key 18 ;; F) send_shift_key 33 ;; + G) send_shift_key 34 ;; H) send_shift_key 35 ;; I) send_shift_key 23 ;; + J) send_shift_key 36 ;; K) send_shift_key 37 ;; L) send_shift_key 38 ;; + M) send_shift_key 50 ;; N) send_shift_key 49 ;; O) send_shift_key 24 ;; + P) send_shift_key 25 ;; Q) send_shift_key 16 ;; R) send_shift_key 19 ;; + S) send_shift_key 31 ;; T) send_shift_key 20 ;; U) send_shift_key 22 ;; + V) send_shift_key 47 ;; W) send_shift_key 17 ;; X) 
send_shift_key 45 ;; + Y) send_shift_key 21 ;; Z) send_shift_key 44 ;; + 0) send_key 11 ;; 1) send_key 2 ;; 2) send_key 3 ;; 3) send_key 4 ;; + 4) send_key 5 ;; 5) send_key 6 ;; 6) send_key 7 ;; 7) send_key 8 ;; + 8) send_key 9 ;; 9) send_key 10 ;; + ' ') send_key 57 ;; # space + '-') send_key 12 ;; + '=') send_key 13 ;; + '[') send_key 26 ;; + ']') send_key 27 ;; + '\\') send_key 43 ;; + ';') send_key 39 ;; + "'") send_key 40 ;; + '`') send_key 41 ;; + ',') send_key 51 ;; + '.') send_key 52 ;; + '/') send_key 53 ;; + '!') send_shift_key 2 ;; + '@') send_shift_key 3 ;; + '#') send_shift_key 4 ;; + '$') send_shift_key 5 ;; + '%') send_shift_key 6 ;; + '^') send_shift_key 7 ;; + '&') send_shift_key 8 ;; + '*') send_shift_key 9 ;; + '(') send_shift_key 10 ;; + ')') send_shift_key 11 ;; + '_') send_shift_key 12 ;; + '+') send_shift_key 13 ;; + '{') send_shift_key 26 ;; + '}') send_shift_key 27 ;; + '|') send_shift_key 43 ;; + ':') send_shift_key 39 ;; + '"') send_shift_key 40 ;; + '~') send_shift_key 41 ;; + '<') send_shift_key 51 ;; + '>') send_shift_key 52 ;; + '?') send_shift_key 53 ;; + *) echo "Warning: unmapped char '$c'" >&2 ;; + esac + sleep 0.02 +done diff --git a/userspace/programs/src/bounce.rs b/userspace/programs/src/bounce.rs index cb9408e9..eecdaf87 100644 --- a/userspace/programs/src/bounce.rs +++ b/userspace/programs/src/bounce.rs @@ -1,14 +1,18 @@ //! Bouncing balls with collision detection demo for Breenix (std version) //! -//! Uses mmap'd framebuffer for zero-syscall drawing via libgfx. All pixel -//! writes go directly to a userspace buffer; only flush (1 syscall/frame) -//! copies the dirty region to VRAM. +//! Two rendering paths: +//! - **VirGL GPU** (preferred): All rendering on host GPU via VirGL 3D pipeline. +//! Guest sends ~1KB of draw commands, host renders, DMA copies to BAR0. +//! Expected: 60+ FPS. +//! - **mmap fallback**: Software rendering to mmap'd framebuffer with per-ball flush. +//! Guest CPU writes ~340KB to BAR0 per frame. 
Achieves ~12 FPS on Parallels. //! //! Created for Gus! use std::process; use libbreenix::graphics; +use libbreenix::graphics::{FlushRect, VirglBall}; use libbreenix::time; use libgfx::color::Color; @@ -131,6 +135,9 @@ impl FpsCounter { if elapsed > 0 { self.display_fps = (self.frame_count as u64 * 1_000_000_000 / elapsed) as u32; } + // Log FPS to serial so we can verify from the log + println!("[bounce] FPS: {} ({}ms/frame)", self.display_fps, + elapsed / (self.frame_count as u64 * 1_000_000)); self.frame_count = 0; self.last_time_ns = now; } @@ -163,15 +170,105 @@ impl FpsCounter { // --------------------------------------------------------------------------- fn main() { - println!("Bounce demo starting (for Gus!)"); + // Unique boot ID from monotonic clock — different every boot, proves we're + // running the latest binary (check this value in serial logs). + let boot_id = clock_monotonic_ns(); + println!("Bounce demo starting (for Gus!) [boot_id={:016x}]", boot_id); let info = match graphics::fbinfo() { Ok(info) => info, Err(_e) => { println!("Error: Could not get framebuffer info"); process::exit(1); } }; - let width = info.left_pane_width() as i32; let height = info.height as i32; + + // VirGL uses full viewport (GPU renders everything), mmap uses left pane only + let virgl_width = info.width as i32; + let mmap_width = info.left_pane_width() as i32; + + // 12 balls, fast velocities. Sub-stepping catches edge collisions. 
+ let mut balls = [ + Ball::new(100, 100, 1100, 800, 38, Color::rgb(255, 50, 50)), // Red + Ball::new(300, 200, -1000, 700, 33, Color::rgb( 50, 255, 50)), // Green + Ball::new(200, 400, 900, -950, 42, Color::rgb( 50, 50, 255)), // Blue + Ball::new(400, 300, -850, -800, 28, Color::rgb(255, 255, 50)), // Yellow + Ball::new(150, 300, 1050, 600, 24, Color::rgb(255, 50, 255)), // Magenta + Ball::new(350, 150, -900, 750, 26, Color::rgb( 50, 255, 255)), // Cyan + Ball::new(450, 500, 800, -700, 35, Color::rgb(255, 150, 50)), // Orange + Ball::new(250, 550, -750, 850, 30, Color::rgb(150, 50, 255)), // Purple + Ball::new(500, 100, 950, 950, 22, Color::rgb(200, 200, 200)), // White + Ball::new(120, 500, -1100, -650, 20, Color::rgb(255, 100, 100)), // Salmon + Ball::new(380, 450, 700, 900, 32, Color::rgb(100, 255, 100)), // Lime + Ball::new(520, 350, -800, -850, 27, Color::rgb(100, 150, 255)), // Sky + ]; + + let bg = Color::rgb(15, 15, 30); + let bg_packed = graphics::rgb(15, 15, 30); + + // Try VirGL GPU rendering first. If the first frame succeeds, use GPU path. + let virgl_balls = build_virgl_balls(&balls); + let use_virgl = graphics::virgl_submit_frame(&virgl_balls[..balls.len()], bg_packed).is_ok(); + + if use_virgl { + println!("Starting VirGL GPU-rendered demo (12 balls, {}x{}) [boot_id={:016x}]", + virgl_width, height, boot_id); + run_virgl_loop(&mut balls, virgl_width, height, bg_packed); + } else { + println!("VirGL unavailable, falling back to mmap rendering [boot_id={:016x}]", boot_id); + run_mmap_loop(&mut balls, mmap_width, height, &info, bg); + } +} + +/// Convert Ball array to VirglBall descriptors for GPU rendering. 
+fn build_virgl_balls(balls: &[Ball]) -> [VirglBall; 12] { + let mut vb = [VirglBall::default(); 12]; + for (i, ball) in balls.iter().enumerate().take(12) { + let c = ball.color; + vb[i] = VirglBall { + x: ball.px() as f32, + y: ball.py() as f32, + radius: ball.radius as f32, + color: [ + c.r as f32 / 255.0, + c.g as f32 / 255.0, + c.b as f32 / 255.0, + 1.0, + ], + }; + } + vb +} + +/// VirGL GPU rendering loop — all rendering on host GPU, zero guest pixel writes. +fn run_virgl_loop(balls: &mut [Ball; 12], width: i32, height: i32, bg_packed: u32) { + const SUBSTEPS: i32 = 16; + let mut fps = FpsCounter::new(); + + loop { + // Sub-step physics + for _ in 0..SUBSTEPS { + for ball in balls.iter_mut() { ball.step(SUBSTEPS); } + for ball in balls.iter_mut() { ball.bounce_walls(width, height); } + for i in 0..balls.len() { + for j in (i + 1)..balls.len() { + let (left, right) = balls.split_at_mut(j); + check_collision(&mut left[i], &mut right[0]); + } + } + } + + // Build VirGL ball descriptors from current positions + let vb = build_virgl_balls(balls); + + // Submit to GPU — one syscall renders everything + let _ = graphics::virgl_submit_frame(&vb[..balls.len()], bg_packed); + + fps.tick(); + } +} + +/// Mmap software rendering loop — fallback when VirGL is unavailable. +fn run_mmap_loop(balls: &mut [Ball; 12], width: i32, height: i32, info: &graphics::FbInfo, bg: Color) { let bpp = info.bytes_per_pixel as usize; let fb_ptr = match graphics::fb_mmap() { @@ -190,31 +287,18 @@ fn main() { ) }; - println!("Starting collision demo (12 balls, mmap mode)..."); - - let bg = Color::rgb(15, 15, 30); - - // 12 balls, fast velocities. Sub-stepping catches edge collisions. 
- let mut balls = [ - Ball::new(100, 100, 1100, 800, 38, Color::rgb(255, 50, 50)), // Red - Ball::new(300, 200, -1000, 700, 33, Color::rgb( 50, 255, 50)), // Green - Ball::new(200, 400, 900, -950, 42, Color::rgb( 50, 50, 255)), // Blue - Ball::new(400, 300, -850, -800, 28, Color::rgb(255, 255, 50)), // Yellow - Ball::new(150, 300, 1050, 600, 24, Color::rgb(255, 50, 255)), // Magenta - Ball::new(350, 150, -900, 750, 26, Color::rgb( 50, 255, 255)), // Cyan - Ball::new(450, 500, 800, -700, 35, Color::rgb(255, 150, 50)), // Orange - Ball::new(250, 550, -750, 850, 30, Color::rgb(150, 50, 255)), // Purple - Ball::new(500, 100, 950, 950, 22, Color::rgb(200, 200, 200)), // White - Ball::new(120, 500, -1100, -650, 20, Color::rgb(255, 100, 100)), // Salmon - Ball::new(380, 450, 700, 900, 32, Color::rgb(100, 255, 100)), // Lime - Ball::new(520, 350, -800, -850, 27, Color::rgb(100, 150, 255)), // Sky - ]; + println!("Starting collision demo (12 balls, {}x{}, mmap, batch flush)", width, height); // With velocities ~1000 (10 px/frame), 16 sub-steps = ~0.6 px per step. const SUBSTEPS: i32 = 16; let mut fps = FpsCounter::new(); + // Track previous frame ball positions for per-ball flushing. 
+ let mut prev: [(i32, i32, i32); 12] = [(0, 0, 0); 12]; + let mut first_frame = true; + const PAD: i32 = 2; + loop { // Sub-step physics for _ in 0..SUBSTEPS { @@ -232,21 +316,68 @@ fn main() { } } - // Draw — libgfx tracks dirty rects automatically - fb.clear(bg); - for ball in balls.iter() { - ball.draw(&mut fb); - } - fps.tick(); - fps.draw(&mut fb); - - // Flush only the dirty region - if let Some(dirty) = fb.take_dirty() { - let _ = graphics::fb_flush_rect(dirty.x, dirty.y, dirty.w, dirty.h); + if first_frame { + // First frame: full clear + single flush + fb.clear(bg); + for ball in balls.iter() { ball.draw(&mut fb); } + for (i, ball) in balls.iter().enumerate() { + prev[i] = (ball.px(), ball.py(), ball.radius); + } + fps.tick(); + fps.draw(&mut fb); + if let Some(dirty) = fb.take_dirty() { + let _ = graphics::fb_flush_rect(dirty.x, dirty.y, dirty.w, dirty.h); + } + first_frame = false; } else { - let _ = graphics::fb_flush(); + // Phase 1: Erase all previous ball positions + for &(px, py, r) in prev.iter() { + if r > 0 { + shapes::fill_rect(&mut fb, + (px - r - PAD).max(0), (py - r - PAD).max(0), + (r + PAD) * 2 + 1, (r + PAD) * 2 + 1, bg); + } + } + let fps_y = (height - 40).max(0); + shapes::fill_rect(&mut fb, 0, fps_y, 340, 40, bg); + + // Phase 2: Draw all new ball positions + for ball in balls.iter() { ball.draw(&mut fb); } + fps.tick(); + fps.draw(&mut fb); + + // Discard accumulated dirty rect — we use batch flush below + let _ = fb.take_dirty(); + + // Phase 3: Batch flush — all dirty rects in ONE syscall, ONE DSB barrier. + // Saves 12 syscall round-trips + 12 DSB stalls vs per-ball flushing. 
+ let mut flush_rects = [FlushRect { x: 0, y: 0, w: 0, h: 0 }; 13]; + let mut rect_count = 0usize; + for (i, ball) in balls.iter().enumerate() { + let (opx, opy, or) = prev[i]; + let npx = ball.px(); + let npy = ball.py(); + let nr = ball.radius; + let x1 = (opx - or - PAD).min(npx - nr - PAD).max(0); + let y1 = (opy - or - PAD).min(npy - nr - PAD).max(0); + let x2 = (opx + or + PAD + 1).max(npx + nr + PAD + 1).min(width); + let y2 = (opy + or + PAD + 1).max(npy + nr + PAD + 1).min(height); + if x2 > x1 && y2 > y1 { + flush_rects[rect_count] = FlushRect { x: x1, y: y1, w: x2 - x1, h: y2 - y1 }; + rect_count += 1; + } + } + // FPS region + flush_rects[rect_count] = FlushRect { x: 0, y: fps_y, w: 340, h: 40 }; + rect_count += 1; + let _ = graphics::fb_flush_rects(&flush_rects[..rect_count]); + + // Save new positions for next frame's erase + for (i, ball) in balls.iter().enumerate() { + prev[i] = (ball.px(), ball.py(), ball.radius); + } } - let _ = time::sleep_ms(16); // ~60 FPS target + let _ = time::sleep_ms(1); // Yield CPU briefly without wasting frame time } }