diff --git a/examples/playground/gpu_playground.html b/examples/playground/gpu_playground.html
new file mode 100644
index 0000000..e98b420
--- /dev/null
+++ b/examples/playground/gpu_playground.html
@@ -0,0 +1,445 @@
+
+
+
+
+ EigenScript GPU Playground
+
+
+
+
+
+
+
+ GPU
+ WASM
+ EigenScript v0.4-gpu
+ Checking WebGPU...
+
+
+
+
+
+
+
+
+
GPU Time:--
+
CPU Time:--
+
Speedup:--
+
+
+
+
+
+
+
+
+
diff --git a/src/eigenscript/compiler/codegen/wgsl_backend.py b/src/eigenscript/compiler/codegen/wgsl_backend.py
new file mode 100644
index 0000000..81139e4
--- /dev/null
+++ b/src/eigenscript/compiler/codegen/wgsl_backend.py
@@ -0,0 +1,835 @@
+"""
+WGSL Backend for EigenScript GPU Compute.
+
+Generates WebGPU Shading Language (WGSL) compute shaders from EigenScript AST nodes.
+This enables GPU-accelerated matrix operations in the web playground.
+
+Architecture:
+ AST
+ ├── [CPU path] → llvm_backend.py → LLVM IR → native/WASM
+ └── [GPU path] → wgsl_backend.py → WGSL → WebGPU compute shaders
+
+GPU-lite EigenValue Model:
+ On GPU: Only track value + gradient (8 bytes per value)
+ On CPU: Full geometric tracking (value, gradient, stability, history)
+ Sync: Observation triggers device→host transfer + full promotion
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Tuple
+from enum import Enum
+
+from eigenscript.parser.ast_builder import (
+ ASTNode,
+ Literal,
+ Identifier,
+ BinaryOp,
+ UnaryOp,
+ Assignment,
+ FunctionDef,
+ Return,
+ Conditional,
+ Loop,
+ Relation,
+ Interrogative,
+ ListLiteral,
+ Index,
+ GPUBlock,
+)
+
+
+class WGSLType(Enum):
+ """WGSL data types."""
+
+ F32 = "f32"
+ I32 = "i32"
+ U32 = "u32"
+    VEC2F = "vec2<f32>"
+    VEC3F = "vec3<f32>"
+    VEC4F = "vec4<f32>"
+    MAT4X4F = "mat4x4<f32>"
+    ARRAY_F32 = "array<f32>"
+ EIGEN_LITE = "EigenLite" # GPU-lite EigenValue struct
+
+
+@dataclass
+class WGSLBuffer:
+ """Represents a WebGPU buffer binding."""
+
+ name: str
+ binding: int
+ group: int
+ type: WGSLType
+ access: str # "read", "read_write", "write"
+ size: Optional[int] = None # For arrays
+
+
+@dataclass
+class WGSLKernel:
+ """Represents a generated WGSL compute kernel."""
+
+ name: str
+ source: str
+ workgroup_size: Tuple[int, int, int]
+ buffers: List[WGSLBuffer]
+ entry_point: str = "main"
+
+
+class WGSLCodeGenerator:
+ """
+ Generates WGSL compute shaders from EigenScript GPU blocks.
+
+ This generator produces WebGPU-compatible compute shaders that can be
+ executed in the browser via the WebGPU API.
+
+ Example usage:
+ generator = WGSLCodeGenerator()
+ kernel = generator.generate_matmul(M, N, K)
+ # Returns WGSLKernel with shader code and buffer bindings
+ """
+
+ def __init__(self):
+ self.binding_counter = 0
+ self.temp_counter = 0
+ self.kernels: List[WGSLKernel] = []
+
+ # Track variables in current GPU block
+ self.gpu_vars: Dict[str, WGSLType] = {}
+
+ # Standard workgroup sizes (optimized for common GPUs)
+ self.default_workgroup_size = (8, 8, 1) # 64 threads
+ self.matmul_workgroup_size = (16, 16, 1) # 256 threads for matmul
+
+ def _next_binding(self) -> int:
+ """Get next available binding number."""
+ binding = self.binding_counter
+ self.binding_counter += 1
+ return binding
+
+ def _next_temp(self) -> str:
+ """Generate unique temporary variable name."""
+ name = f"_t{self.temp_counter}"
+ self.temp_counter += 1
+ return name
+
+ def _reset_bindings(self):
+ """Reset binding counter for new kernel."""
+ self.binding_counter = 0
+
+ def generate_eigen_lite_struct(self) -> str:
+ """
+ Generate the GPU-lite EigenValue struct definition.
+
+ This is a minimal version of EigenValue for GPU computation:
+ - value: Current scalar value
+ - gradient: Rate of change (for 'why' queries)
+
+ Full geometric tracking (stability, history) remains on CPU.
+ """
+ return """
+// GPU-lite EigenValue: minimal tracking for GPU computation
+// Full geometric state syncs to host on observation
+struct EigenLite {
+ value: f32,
+ gradient: f32,
+}
+"""
+
+ def generate_matmul_kernel(
+ self, m: int, n: int, k: int, tile_size: int = 16
+ ) -> WGSLKernel:
+ """
+ Generate a tiled matrix multiplication kernel.
+
+ Computes C = A @ B where:
+ A is (M x K)
+ B is (K x N)
+ C is (M x N)
+
+ Uses tiled algorithm with shared workgroup memory for efficiency.
+
+ Args:
+ m: Number of rows in A and C
+ n: Number of columns in B and C
+ k: Number of columns in A / rows in B
+ tile_size: Size of tiles for blocking (default 16)
+
+ Returns:
+ WGSLKernel with shader code and buffer bindings
+ """
+ self._reset_bindings()
+
+ buffers = [
+ WGSLBuffer(
+ "matrix_a", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", m * k
+ ),
+ WGSLBuffer(
+ "matrix_b", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", k * n
+ ),
+ WGSLBuffer(
+ "matrix_c",
+ self._next_binding(),
+ 0,
+ WGSLType.ARRAY_F32,
+ "read_write",
+ m * n,
+ ),
+ WGSLBuffer(
+ "dimensions", self._next_binding(), 0, WGSLType.VEC3F, "read"
+ ), # M, N, K
+ ]
+
+ source = f"""
+// Matrix Multiplication Kernel: C = A @ B
+// A: ({m} x {k}), B: ({k} x {n}), C: ({m} x {n})
+// Tile size: {tile_size}x{tile_size}
+
+@group(0) @binding(0) var<storage, read> matrix_a: array<f32>;
+@group(0) @binding(1) var<storage, read> matrix_b: array<f32>;
+@group(0) @binding(2) var<storage, read_write> matrix_c: array<f32>;
+@group(0) @binding(3) var<uniform> dimensions: vec3<u32>; // M, N, K
+
+// Shared memory tiles for blocking
+var<workgroup> tile_a: array<f32, {tile_size * tile_size}>;
+var<workgroup> tile_b: array<f32, {tile_size * tile_size}>;
+
+@compute @workgroup_size({tile_size}, {tile_size}, 1)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) workgroup_id: vec3<u32>
+) {{
+ let M = dimensions.x;
+ let N = dimensions.y;
+ let K = dimensions.z;
+
+ let row = global_id.y;
+ let col = global_id.x;
+
+ let local_row = local_id.y;
+ let local_col = local_id.x;
+
+ var sum: f32 = 0.0;
+
+ // Number of tiles needed to cover K dimension
+ let num_tiles = (K + {tile_size}u - 1u) / {tile_size}u;
+
+ for (var t: u32 = 0u; t < num_tiles; t = t + 1u) {{
+ // Load tile from A into shared memory
+ let a_row = row;
+ let a_col = t * {tile_size}u + local_col;
+ if (a_row < M && a_col < K) {{
+ tile_a[local_row * {tile_size}u + local_col] = matrix_a[a_row * K + a_col];
+ }} else {{
+ tile_a[local_row * {tile_size}u + local_col] = 0.0;
+ }}
+
+ // Load tile from B into shared memory
+ let b_row = t * {tile_size}u + local_row;
+ let b_col = col;
+ if (b_row < K && b_col < N) {{
+ tile_b[local_row * {tile_size}u + local_col] = matrix_b[b_row * N + b_col];
+ }} else {{
+ tile_b[local_row * {tile_size}u + local_col] = 0.0;
+ }}
+
+ // Synchronize to ensure tile is loaded
+ workgroupBarrier();
+
+ // Compute partial dot product for this tile
+ for (var i: u32 = 0u; i < {tile_size}u; i = i + 1u) {{
+ sum = sum + tile_a[local_row * {tile_size}u + i] * tile_b[i * {tile_size}u + local_col];
+ }}
+
+ // Synchronize before loading next tile
+ workgroupBarrier();
+ }}
+
+ // Write result
+ if (row < M && col < N) {{
+ matrix_c[row * N + col] = sum;
+ }}
+}}
+"""
+
+ return WGSLKernel(
+ name="matmul",
+ source=source,
+ workgroup_size=(tile_size, tile_size, 1),
+ buffers=buffers,
+ entry_point="main",
+ )
+
+ def generate_dot_kernel(self, n: int) -> WGSLKernel:
+ """
+ Generate a dot product kernel with parallel reduction.
+
+ Computes: result = sum(a[i] * b[i]) for i in 0..n
+
+ Uses workgroup reduction for efficiency.
+
+ Args:
+ n: Length of input vectors
+
+ Returns:
+ WGSLKernel with shader code and buffer bindings
+ """
+ self._reset_bindings()
+
+ workgroup_size = 256
+
+ buffers = [
+ WGSLBuffer(
+ "vector_a", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", n
+ ),
+ WGSLBuffer(
+ "vector_b", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", n
+ ),
+ WGSLBuffer("result", self._next_binding(), 0, WGSLType.F32, "read_write"),
+ WGSLBuffer("length", self._next_binding(), 0, WGSLType.U32, "read"),
+ ]
+
+ source = f"""
+// Dot Product Kernel with Parallel Reduction
+// result = sum(a[i] * b[i])
+
+@group(0) @binding(0) var<storage, read> vector_a: array<f32>;
+@group(0) @binding(1) var<storage, read> vector_b: array<f32>;
+@group(0) @binding(2) var<storage, read_write> result: atomic<u32>; // Use atomic for reduction
+@group(0) @binding(3) var<uniform> length: u32;
+
+var<workgroup> shared_data: array<f32, {workgroup_size}>;
+
+@compute @workgroup_size({workgroup_size}, 1, 1)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) workgroup_id: vec3<u32>
+) {{
+ let tid = local_id.x;
+ let gid = global_id.x;
+
+ // Each thread computes one product
+ var product: f32 = 0.0;
+ if (gid < length) {{
+ product = vector_a[gid] * vector_b[gid];
+ }}
+ shared_data[tid] = product;
+
+ workgroupBarrier();
+
+ // Parallel reduction in shared memory
+ for (var stride: u32 = {workgroup_size // 2}u; stride > 0u; stride = stride >> 1u) {{
+ if (tid < stride) {{
+ shared_data[tid] = shared_data[tid] + shared_data[tid + stride];
+ }}
+ workgroupBarrier();
+ }}
+
+ // Thread 0 writes result (using atomics for multi-workgroup accumulation)
+ if (tid == 0u) {{
+        // NOTE(review): atomicAdd on raw float bit patterns is NOT a float sum;
+        let bits = bitcast<u32>(shared_data[0]); // correct only for a single workgroup — needs a CAS loop or per-workgroup partials
+        atomicAdd(&result, bits);
+ }}
+}}
+"""
+
+ return WGSLKernel(
+ name="dot",
+ source=source,
+ workgroup_size=(workgroup_size, 1, 1),
+ buffers=buffers,
+ entry_point="main",
+ )
+
+ def generate_norm_kernel(self, n: int) -> WGSLKernel:
+ """
+ Generate a vector norm kernel (L2 norm / Euclidean norm).
+
+ Computes: result = sqrt(sum(x[i]^2))
+
+ Args:
+ n: Length of input vector
+
+ Returns:
+ WGSLKernel with shader code and buffer bindings
+ """
+ self._reset_bindings()
+
+ workgroup_size = 256
+
+ buffers = [
+ WGSLBuffer(
+ "vector", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", n
+ ),
+ WGSLBuffer("result", self._next_binding(), 0, WGSLType.F32, "read_write"),
+ WGSLBuffer("length", self._next_binding(), 0, WGSLType.U32, "read"),
+ ]
+
+ source = f"""
+// Vector Norm Kernel (L2/Euclidean Norm)
+// result = sqrt(sum(x[i]^2))
+
+@group(0) @binding(0) var<storage, read> vector: array<f32>;
+@group(0) @binding(1) var<storage, read_write> result: array<f32, 2>; // [0] = sum, [1] = final norm
+@group(0) @binding(2) var<uniform> length: u32;
+
+var<workgroup> shared_data: array<f32, {workgroup_size}>;
+
+@compute @workgroup_size({workgroup_size}, 1, 1)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {{
+ let tid = local_id.x;
+ let gid = global_id.x;
+
+ // Each thread computes squared value
+ var squared: f32 = 0.0;
+ if (gid < length) {{
+ let val = vector[gid];
+ squared = val * val;
+ }}
+ shared_data[tid] = squared;
+
+ workgroupBarrier();
+
+ // Parallel reduction
+ for (var stride: u32 = {workgroup_size // 2}u; stride > 0u; stride = stride >> 1u) {{
+ if (tid < stride) {{
+ shared_data[tid] = shared_data[tid] + shared_data[tid + stride];
+ }}
+ workgroupBarrier();
+ }}
+
+ // Thread 0 accumulates partial sum
+ if (tid == 0u) {{
+ result[0] = result[0] + shared_data[0];
+ }}
+}}
+
+// Second pass kernel to compute sqrt
+@compute @workgroup_size(1, 1, 1)
+fn finalize() {{
+ result[1] = sqrt(result[0]);
+}}
+"""
+
+ return WGSLKernel(
+ name="norm",
+ source=source,
+ workgroup_size=(workgroup_size, 1, 1),
+ buffers=buffers,
+ entry_point="main",
+ )
+
+ def generate_elementwise_kernel(self, operation: str, n: int) -> WGSLKernel:
+ """
+ Generate an element-wise operation kernel.
+
+ Supports: add, sub, mul, div, scale
+
+ Args:
+ operation: One of "add", "sub", "mul", "div", "scale"
+ n: Length of vectors
+
+ Returns:
+ WGSLKernel with shader code and buffer bindings
+ """
+ self._reset_bindings()
+
+ workgroup_size = 256
+
+ # Determine operation and buffers
+ if operation == "scale":
+ buffers = [
+ WGSLBuffer(
+ "input", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", n
+ ),
+ WGSLBuffer("scalar", self._next_binding(), 0, WGSLType.F32, "read"),
+ WGSLBuffer(
+ "output",
+ self._next_binding(),
+ 0,
+ WGSLType.ARRAY_F32,
+ "read_write",
+ n,
+ ),
+ WGSLBuffer("length", self._next_binding(), 0, WGSLType.U32, "read"),
+ ]
+ op_code = "input[gid] * scalar"
+ bindings = """
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<uniform> scalar: f32;
+@group(0) @binding(2) var<storage, read_write> output: array<f32>;
+@group(0) @binding(3) var<uniform> length: u32;
+"""
+ else:
+ buffers = [
+ WGSLBuffer(
+ "input_a", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", n
+ ),
+ WGSLBuffer(
+ "input_b", self._next_binding(), 0, WGSLType.ARRAY_F32, "read", n
+ ),
+ WGSLBuffer(
+ "output",
+ self._next_binding(),
+ 0,
+ WGSLType.ARRAY_F32,
+ "read_write",
+ n,
+ ),
+ WGSLBuffer("length", self._next_binding(), 0, WGSLType.U32, "read"),
+ ]
+
+ op_map = {
+ "add": "input_a[gid] + input_b[gid]",
+ "sub": "input_a[gid] - input_b[gid]",
+ "mul": "input_a[gid] * input_b[gid]",
+ "div": "input_a[gid] / input_b[gid]",
+ }
+ op_code = op_map.get(operation, "input_a[gid] + input_b[gid]")
+
+ bindings = """
+@group(0) @binding(0) var<storage, read> input_a: array<f32>;
+@group(0) @binding(1) var<storage, read> input_b: array<f32>;
+@group(0) @binding(2) var<storage, read_write> output: array<f32>;
+@group(0) @binding(3) var<uniform> length: u32;
+"""
+
+ source = f"""
+// Element-wise {operation.upper()} Kernel
+{bindings}
+
+@compute @workgroup_size({workgroup_size}, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+ let gid = global_id.x;
+
+ if (gid < length) {{
+ output[gid] = {op_code};
+ }}
+}}
+"""
+
+ return WGSLKernel(
+ name=f"elementwise_{operation}",
+ source=source,
+ workgroup_size=(workgroup_size, 1, 1),
+ buffers=buffers,
+ entry_point="main",
+ )
+
+ def generate_from_gpu_block(self, node: GPUBlock) -> List[WGSLKernel]:
+ """
+ Generate WGSL kernels from a GPU block AST node.
+
+ Analyzes the GPU block to identify:
+ 1. Matrix operations that benefit from GPU
+ 2. Element-wise operations
+ 3. Reduction operations
+
+ Returns a list of kernels that implement the block.
+
+ Args:
+ node: GPUBlock AST node
+
+ Returns:
+ List of WGSLKernel objects
+ """
+ kernels = []
+
+ for stmt in node.body:
+ kernel = self._analyze_statement(stmt)
+ if kernel:
+ kernels.append(kernel)
+
+ return kernels
+
+ def _analyze_statement(self, stmt: ASTNode) -> Optional[WGSLKernel]:
+ """
+ Analyze a statement to determine if it can be GPU-accelerated.
+
+ Identifies patterns like:
+ - matmul of [A, B]
+ - norm of x
+ - x + y (element-wise)
+ """
+ if isinstance(stmt, Assignment):
+ return self._analyze_expression(stmt.expression)
+ elif isinstance(stmt, Relation):
+ return self._analyze_relation(stmt)
+ return None
+
+ def _analyze_expression(self, expr: ASTNode) -> Optional[WGSLKernel]:
+ """Analyze expression for GPU-acceleratable operations."""
+ if isinstance(expr, Relation):
+ return self._analyze_relation(expr)
+ elif isinstance(expr, BinaryOp):
+ return self._analyze_binary_op(expr)
+ return None
+
+ def _analyze_relation(self, rel: Relation) -> Optional[WGSLKernel]:
+ """
+ Analyze a Relation (OF operator) for GPU operations.
+
+ Patterns:
+ - matmul of [A, B] → matmul kernel
+ - norm of x → norm kernel
+ - dot of [a, b] → dot kernel
+ """
+ if isinstance(rel.left, Identifier):
+ func_name = rel.left.name.lower()
+
+ if func_name == "matmul":
+ # For now, generate placeholder with dynamic sizing
+ return self.generate_matmul_kernel(256, 256, 256)
+ elif func_name == "norm":
+ return self.generate_norm_kernel(1024)
+ elif func_name == "dot":
+ return self.generate_dot_kernel(1024)
+
+ return None
+
+ def _analyze_binary_op(self, op: BinaryOp) -> Optional[WGSLKernel]:
+ """
+ Analyze binary operation for element-wise GPU kernel.
+
+ Patterns:
+ - a + b → elementwise_add
+ - a * b → elementwise_mul (if both are arrays)
+ """
+ op_map = {
+ "+": "add",
+ "-": "sub",
+ "*": "mul",
+ "/": "div",
+ }
+
+ if op.operator in op_map:
+ return self.generate_elementwise_kernel(op_map[op.operator], 1024)
+
+ return None
+
+
+def generate_webgpu_runtime() -> str:
+ """
+ Generate the JavaScript/TypeScript WebGPU runtime code.
+
+ This provides the bridge between EigenScript and WebGPU:
+ - Buffer management
+ - Kernel dispatch
+ - Host↔Device transfers
+ - GPU-lite EigenValue synchronization
+ """
+ return """
+/**
+ * EigenScript WebGPU Runtime
+ *
+ * Provides GPU acceleration for EigenScript matrix operations.
+ * Implements the GPU-lite EigenValue model for efficient geometric tracking.
+ */
+
+class EigenGPU {
+ constructor() {
+ this.device = null;
+ this.adapter = null;
+ this.pipelines = new Map();
+ this.buffers = new Map();
+ }
+
+ async initialize() {
+ if (!navigator.gpu) {
+ throw new Error("WebGPU not supported in this browser");
+ }
+
+ this.adapter = await navigator.gpu.requestAdapter();
+ if (!this.adapter) {
+ throw new Error("Failed to get GPU adapter");
+ }
+
+ this.device = await this.adapter.requestDevice();
+        console.log("EigenGPU initialized:", this.adapter.info?.description ?? "GPU adapter");
+ return this;
+ }
+
+ /**
+ * Create a GPU buffer from CPU data.
+     * @param {Float32Array|Uint32Array} data - CPU data to upload
+ * @param {string} usage - Buffer usage ("storage", "uniform", "read", "write")
+ * @returns {GPUBuffer} - Created buffer
+ */
+ createBuffer(data, usage = "storage") {
+ const usageFlags = {
+ storage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
+ uniform: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
+ read: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
+ write: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+ };
+
+ const buffer = this.device.createBuffer({
+ size: data.byteLength,
+ usage: usageFlags[usage] || usageFlags.storage,
+ mappedAtCreation: true,
+ });
+
+        new data.constructor(buffer.getMappedRange()).set(data);
+ buffer.unmap();
+
+ return buffer;
+ }
+
+ /**
+ * Read data from GPU buffer back to CPU.
+ * @param {GPUBuffer} buffer - GPU buffer to read
+ * @param {number} size - Size in bytes
+     * @returns {Promise<Float32Array>} - CPU data
+ */
+ async readBuffer(buffer, size) {
+ const stagingBuffer = this.device.createBuffer({
+ size: size,
+ usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
+ });
+
+ const commandEncoder = this.device.createCommandEncoder();
+ commandEncoder.copyBufferToBuffer(buffer, 0, stagingBuffer, 0, size);
+ this.device.queue.submit([commandEncoder.finish()]);
+
+ await stagingBuffer.mapAsync(GPUMapMode.READ);
+ const result = new Float32Array(stagingBuffer.getMappedRange().slice(0));
+ stagingBuffer.unmap();
+ stagingBuffer.destroy();
+
+ return result;
+ }
+
+ /**
+ * Compile and cache a compute pipeline from WGSL source.
+ * @param {string} name - Pipeline name for caching
+ * @param {string} wgslSource - WGSL shader source
+ * @returns {GPUComputePipeline} - Compiled pipeline
+ */
+ async createPipeline(name, wgslSource) {
+ if (this.pipelines.has(name)) {
+ return this.pipelines.get(name);
+ }
+
+ const shaderModule = this.device.createShaderModule({
+ code: wgslSource,
+ });
+
+ const pipeline = await this.device.createComputePipelineAsync({
+ layout: "auto",
+ compute: {
+ module: shaderModule,
+ entryPoint: "main",
+ },
+ });
+
+ this.pipelines.set(name, pipeline);
+ return pipeline;
+ }
+
+ /**
+ * Dispatch a compute kernel.
+ * @param {string} pipelineName - Name of cached pipeline
+ * @param {Array} buffers - Buffers to bind
+ * @param {Array} workgroups - [x, y, z] workgroup counts
+ */
+ async dispatch(pipelineName, buffers, workgroups) {
+ const pipeline = this.pipelines.get(pipelineName);
+ if (!pipeline) {
+ throw new Error(`Pipeline "${pipelineName}" not found`);
+ }
+
+ // Create bind group
+ const bindGroupEntries = buffers.map((buffer, index) => ({
+ binding: index,
+ resource: { buffer },
+ }));
+
+ const bindGroup = this.device.createBindGroup({
+ layout: pipeline.getBindGroupLayout(0),
+ entries: bindGroupEntries,
+ });
+
+ // Encode and submit
+ const commandEncoder = this.device.createCommandEncoder();
+ const passEncoder = commandEncoder.beginComputePass();
+ passEncoder.setPipeline(pipeline);
+ passEncoder.setBindGroup(0, bindGroup);
+ passEncoder.dispatchWorkgroups(...workgroups);
+ passEncoder.end();
+
+ this.device.queue.submit([commandEncoder.finish()]);
+ await this.device.queue.onSubmittedWorkDone();
+ }
+
+ /**
+ * Matrix multiplication: C = A @ B
+ * @param {Float32Array} a - Matrix A (M x K), row-major
+ * @param {Float32Array} b - Matrix B (K x N), row-major
+ * @param {number} m - Rows of A
+ * @param {number} n - Columns of B
+ * @param {number} k - Columns of A / Rows of B
+     * @returns {Promise<Float32Array>} - Result matrix C (M x N)
+ */
+ async matmul(a, b, m, n, k) {
+ const bufferA = this.createBuffer(a, "read");
+ const bufferB = this.createBuffer(b, "read");
+ const bufferC = this.createBuffer(new Float32Array(m * n), "storage");
+ const bufferDims = this.createBuffer(new Uint32Array([m, n, k]), "uniform");
+
+ // Dispatch with appropriate workgroup counts
+ const tileSize = 16;
+ const workgroupsX = Math.ceil(n / tileSize);
+ const workgroupsY = Math.ceil(m / tileSize);
+
+ await this.dispatch("matmul", [bufferA, bufferB, bufferC, bufferDims], [workgroupsX, workgroupsY, 1]);
+
+ const result = await this.readBuffer(bufferC, m * n * 4);
+
+ // Cleanup
+ bufferA.destroy();
+ bufferB.destroy();
+ bufferC.destroy();
+ bufferDims.destroy();
+
+ return result;
+ }
+
+ /**
+ * GPU-lite EigenValue: Sync gradient from GPU to full CPU tracking.
+ * Called when an observation (why, how, etc.) is made on a GPU value.
+ * @param {Float32Array} gpuData - [value, gradient] from GPU
+ * @returns {Object} - Full EigenValue with stability, history, etc.
+ */
+ promoteToFullEigenValue(gpuData) {
+ return {
+ value: gpuData[0],
+ gradient: gpuData[1],
+ stability: 1.0, // Assume stable on first observation
+ iteration: 1,
+ history: [gpuData[0]],
+ prev_value: gpuData[0],
+ prev_gradient: 0.0,
+ };
+ }
+
+ destroy() {
+ // Cleanup all buffers
+ for (const buffer of this.buffers.values()) {
+ buffer.destroy();
+ }
+ this.buffers.clear();
+ this.pipelines.clear();
+ }
+}
+
+// Export for module usage
+if (typeof module !== "undefined") {
+ module.exports = { EigenGPU };
+}
+"""
diff --git a/src/eigenscript/lexer/tokenizer.py b/src/eigenscript/lexer/tokenizer.py
index 882c347..95881e3 100644
--- a/src/eigenscript/lexer/tokenizer.py
+++ b/src/eigenscript/lexer/tokenizer.py
@@ -33,6 +33,7 @@ class TokenType(Enum):
IMPORT = "IMPORT"
FROM = "FROM"
STRUCT = "STRUCT" # User-defined types for self-hosting
+ GPU = "GPU" # GPU compute block keyword
# Interrogatives (geometric projection operators)
WHO = "WHO"
@@ -144,6 +145,7 @@ class Tokenizer:
"import": TokenType.IMPORT,
"from": TokenType.FROM,
"struct": TokenType.STRUCT,
+ "gpu": TokenType.GPU,
# Interrogatives
"who": TokenType.WHO,
"what": TokenType.WHAT,
diff --git a/src/eigenscript/parser/ast_builder.py b/src/eigenscript/parser/ast_builder.py
index dee8fb3..de6dd8e 100644
--- a/src/eigenscript/parser/ast_builder.py
+++ b/src/eigenscript/parser/ast_builder.py
@@ -474,6 +474,30 @@ def __repr__(self) -> str:
return f"StructLiteral({self.struct_name!r}, {len(self.values)} values)"
+@dataclass
+class GPUBlock(ASTNode):
+ """
+ Represents a GPU compute block.
+
+ Groups operations that should be executed on the GPU,
+ enabling batch processing and minimizing host↔device transfers.
+
+ Semantic: GPU blocks use "GPU-lite" EigenValues that only track
+ value + gradient on device. Full geometric tracking (stability,
+ history) is synced to host on observation.
+
+ Example:
+ gpu:
+ result is matmul of [A, B]
+ normalized is result / norm of result
+ """
+
+ body: List[ASTNode]
+
+ def __repr__(self) -> str:
+ return f"GPUBlock(body={len(self.body)})"
+
+
@dataclass
class Program(ASTNode):
"""
@@ -619,6 +643,10 @@ def parse_statement(self) -> Optional[ASTNode]:
if token.type == TokenType.STRUCT:
return self.parse_struct_def()
+ # GPU - GPU compute block
+ if token.type == TokenType.GPU:
+ return self.parse_gpu_block()
+
# IF - conditional
if token.type == TokenType.IF:
return self.parse_conditional()
@@ -972,6 +1000,32 @@ def parse_struct_def(self) -> StructDef:
return StructDef(name, fields)
+ def parse_gpu_block(self) -> GPUBlock:
+ """
+ Parse a GPU compute block.
+
+ Grammar: GPU COLON NEWLINE INDENT statement+ DEDENT
+
+ Example:
+ gpu:
+ result is matmul of [A, B]
+ normalized is result / norm of result
+ """
+ # Consume GPU
+ self.expect(TokenType.GPU)
+
+ # Expect colon
+ self.expect(TokenType.COLON)
+
+ # Consume all newlines (handles comment-only lines)
+ while self.current_token() and self.current_token().type == TokenType.NEWLINE:
+ self.advance()
+
+ # Parse block
+ body = self.parse_block()
+
+ return GPUBlock(body)
+
def parse_conditional(self) -> Conditional:
"""
Parse an IF statement.
diff --git a/tests/test_wgsl_backend.py b/tests/test_wgsl_backend.py
new file mode 100644
index 0000000..6414575
--- /dev/null
+++ b/tests/test_wgsl_backend.py
@@ -0,0 +1,274 @@
+"""
+Tests for the WGSL Backend (GPU support).
+
+These tests verify that the WGSL code generator produces valid
+compute shader code for matrix operations.
+"""
+
+import pytest
+from eigenscript.compiler.codegen.wgsl_backend import (
+ WGSLCodeGenerator,
+ WGSLKernel,
+ WGSLBuffer,
+ WGSLType,
+ generate_webgpu_runtime,
+)
+from eigenscript.parser.ast_builder import GPUBlock, Assignment, Identifier, Relation
+from eigenscript.lexer import Tokenizer
+from eigenscript.parser import Parser
+
+
+class TestWGSLCodeGenerator:
+ """Test the WGSL code generator."""
+
+ def test_generator_initialization(self):
+ """Test generator initializes correctly."""
+ gen = WGSLCodeGenerator()
+ assert gen.binding_counter == 0
+ assert gen.temp_counter == 0
+ assert len(gen.kernels) == 0
+
+ def test_eigen_lite_struct_generation(self):
+ """Test GPU-lite EigenValue struct is generated correctly."""
+ gen = WGSLCodeGenerator()
+ struct = gen.generate_eigen_lite_struct()
+
+ assert "struct EigenLite" in struct
+ assert "value: f32" in struct
+ assert "gradient: f32" in struct
+ # Should NOT contain full geometric tracking fields
+ assert "stability" not in struct
+ assert "history" not in struct
+
+ def test_matmul_kernel_generation(self):
+ """Test matrix multiplication kernel generation."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_matmul_kernel(256, 256, 256)
+
+ assert isinstance(kernel, WGSLKernel)
+ assert kernel.name == "matmul"
+ assert kernel.entry_point == "main"
+
+ # Check workgroup size
+ assert kernel.workgroup_size == (16, 16, 1)
+
+ # Check buffers
+ assert len(kernel.buffers) == 4 # A, B, C, dimensions
+ buffer_names = [b.name for b in kernel.buffers]
+ assert "matrix_a" in buffer_names
+ assert "matrix_b" in buffer_names
+ assert "matrix_c" in buffer_names
+ assert "dimensions" in buffer_names
+
+ # Check WGSL source
+ assert "@compute" in kernel.source
+ assert "@workgroup_size" in kernel.source
+ assert "workgroupBarrier" in kernel.source # Tiled algorithm uses barriers
+ assert "var" in kernel.source # Uses shared memory
+
+ def test_dot_kernel_generation(self):
+ """Test dot product kernel generation."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_dot_kernel(1024)
+
+ assert kernel.name == "dot"
+ assert len(kernel.buffers) == 4 # vector_a, vector_b, result, length
+
+ # Check for parallel reduction
+ assert "workgroupBarrier" in kernel.source
+ assert "var" in kernel.source
+
+ def test_norm_kernel_generation(self):
+ """Test vector norm kernel generation."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_norm_kernel(1024)
+
+ assert kernel.name == "norm"
+
+ # Should compute squared sum then sqrt
+ assert "sqrt" in kernel.source
+ assert "var" in kernel.source
+
+ def test_elementwise_add_kernel(self):
+ """Test element-wise addition kernel."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_elementwise_kernel("add", 1024)
+
+ assert kernel.name == "elementwise_add"
+ assert "input_a[gid] + input_b[gid]" in kernel.source
+
+ def test_elementwise_mul_kernel(self):
+ """Test element-wise multiplication kernel."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_elementwise_kernel("mul", 1024)
+
+ assert kernel.name == "elementwise_mul"
+ assert "input_a[gid] * input_b[gid]" in kernel.source
+
+ def test_elementwise_scale_kernel(self):
+ """Test scalar multiplication kernel."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_elementwise_kernel("scale", 1024)
+
+ assert kernel.name == "elementwise_scale"
+ assert "input[gid] * scalar" in kernel.source
+        assert "scalar: f32" in kernel.source
+
+ def test_binding_counter_increments(self):
+ """Test that binding numbers increment correctly."""
+ gen = WGSLCodeGenerator()
+
+ kernel1 = gen.generate_matmul_kernel(64, 64, 64)
+ bindings1 = [b.binding for b in kernel1.buffers]
+ assert bindings1 == [0, 1, 2, 3]
+
+ # After reset, should start from 0 again
+ kernel2 = gen.generate_dot_kernel(128)
+ bindings2 = [b.binding for b in kernel2.buffers]
+ assert bindings2 == [0, 1, 2, 3]
+
+
+class TestGPUBlockParsing:
+ """Test parsing of GPU blocks."""
+
+ def test_parse_simple_gpu_block(self):
+ """Test parsing a simple GPU block."""
+ source = """
+gpu:
+ result is matmul of [A, B]
+"""
+ tokenizer = Tokenizer(source)
+ tokens = tokenizer.tokenize()
+ parser = Parser(tokens)
+ ast = parser.parse()
+
+ assert len(ast.statements) == 1
+ gpu_block = ast.statements[0]
+ assert isinstance(gpu_block, GPUBlock)
+ assert len(gpu_block.body) == 1
+
+ def test_parse_multi_statement_gpu_block(self):
+ """Test parsing GPU block with multiple statements."""
+ source = """
+gpu:
+ C is matmul of [A, B]
+ n is norm of C
+ result is C / n
+"""
+ tokenizer = Tokenizer(source)
+ tokens = tokenizer.tokenize()
+ parser = Parser(tokens)
+ ast = parser.parse()
+
+ assert len(ast.statements) == 1
+ gpu_block = ast.statements[0]
+ assert isinstance(gpu_block, GPUBlock)
+ assert len(gpu_block.body) == 3
+
+ def test_gpu_block_in_program(self):
+ """Test GPU block mixed with regular code."""
+ source = """
+A is [[1, 2], [3, 4]]
+B is [[1, 0], [0, 1]]
+
+gpu:
+ C is matmul of [A, B]
+
+print of C
+"""
+ tokenizer = Tokenizer(source)
+ tokens = tokenizer.tokenize()
+ parser = Parser(tokens)
+ ast = parser.parse()
+
+ # Should have: 2 assignments, 1 GPU block, 1 expression
+ assert len(ast.statements) == 4
+ assert isinstance(ast.statements[2], GPUBlock)
+
+
+class TestWebGPURuntime:
+ """Test the JavaScript runtime generation."""
+
+ def test_runtime_generation(self):
+ """Test that WebGPU runtime JS is generated."""
+ runtime = generate_webgpu_runtime()
+
+ assert "class EigenGPU" in runtime
+ assert "async initialize()" in runtime
+ assert "createBuffer" in runtime
+ assert "readBuffer" in runtime
+ assert "dispatch" in runtime
+ assert "matmul" in runtime
+
+ def test_runtime_has_gpu_lite_promotion(self):
+ """Test that runtime includes GPU-lite to full EigenValue promotion."""
+ runtime = generate_webgpu_runtime()
+
+ assert "promoteToFullEigenValue" in runtime
+ assert "gradient" in runtime
+ assert "stability" in runtime
+
+
+class TestWGSLBufferTypes:
+ """Test buffer type handling."""
+
+ def test_buffer_types(self):
+ """Test various buffer types are handled correctly."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_matmul_kernel(64, 64, 64)
+
+ buffer_types = {b.name: b.type for b in kernel.buffers}
+ assert buffer_types["matrix_a"] == WGSLType.ARRAY_F32
+ assert buffer_types["matrix_b"] == WGSLType.ARRAY_F32
+ assert buffer_types["matrix_c"] == WGSLType.ARRAY_F32
+ assert buffer_types["dimensions"] == WGSLType.VEC3F
+
+ def test_buffer_access_modes(self):
+ """Test buffer access modes are set correctly."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_matmul_kernel(64, 64, 64)
+
+ buffer_access = {b.name: b.access for b in kernel.buffers}
+ assert buffer_access["matrix_a"] == "read"
+ assert buffer_access["matrix_b"] == "read"
+ assert buffer_access["matrix_c"] == "read_write"
+ assert buffer_access["dimensions"] == "read"
+
+
+class TestWGSLValidSyntax:
+ """Test that generated WGSL is syntactically valid."""
+
+ def test_matmul_has_required_decorators(self):
+ """Test matmul kernel has required WGSL decorators."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_matmul_kernel(64, 64, 64)
+
+ # Required entry point decorators
+ assert "@compute" in kernel.source
+ assert "@workgroup_size" in kernel.source
+
+ # Required binding decorators
+ assert "@group(0)" in kernel.source
+ assert "@binding(0)" in kernel.source
+ assert "@binding(1)" in kernel.source
+ assert "@binding(2)" in kernel.source
+ assert "@binding(3)" in kernel.source
+
+ # Required builtins
+ assert "@builtin(global_invocation_id)" in kernel.source
+ assert "@builtin(local_invocation_id)" in kernel.source
+
+ def test_no_invalid_wgsl_syntax(self):
+ """Test generated WGSL doesn't contain common syntax errors."""
+ gen = WGSLCodeGenerator()
+ kernel = gen.generate_matmul_kernel(64, 64, 64)
+
+ # Should not have HLSL/GLSL-style syntax
+ assert "float" not in kernel.source # Should use f32
+ assert "int " not in kernel.source # Should use i32 or u32
+ assert "__device__" not in kernel.source # Not CUDA
+ assert "#include" not in kernel.source # Not C
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])