diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index f7cd85f9758..551c35afb8c 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -39,6 +39,7 @@ set(WEBGPU_SRCS runtime/ops/select_as_symint/SelectAsSymint.cpp runtime/ops/quantized_linear/QuantizedLinear.cpp runtime/ops/mul/BinaryOp.cpp + runtime/ops/sigmoid/Sigmoid.cpp runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp runtime/ops/rope/RotaryEmbedding.cpp runtime/ops/prepack/Prepack.cpp diff --git a/backends/webgpu/README.md b/backends/webgpu/README.md index 8c3928c5f49..10d9531abe7 100644 --- a/backends/webgpu/README.md +++ b/backends/webgpu/README.md @@ -2,7 +2,7 @@ Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [Dawn](https://dawn.googlesource.com/dawn), whose Tint compiler is the reference WGSL implementation (Metal on macOS, Vulkan on Linux/Windows). -> **Status: Prototype, under active development.** The backend runs the core of transformer inference today — `add`, `rms_norm`, fused scaled-dot-product attention with KV cache, and 4-bit weight-only quantized linear — plus quantized embedding, rotary embedding, and constant prepacking. See [Progress](#progress) for shipped milestones. +> **Status: Prototype, under active development.** The backend runs the core of transformer inference today — `add`, `mul`, `sigmoid`, `rms_norm`, fused scaled-dot-product attention with KV cache, and 4-bit weight-only quantized linear — plus quantized embedding, rotary embedding, and constant prepacking. See [Progress](#progress) for shipped milestones. ## Progress @@ -20,14 +20,7 @@ Milestones landed on `main`: | 2026-06 | Added the attention core of transformer inference — fused scaled-dot-product attention (`sdpa_with_kv_cache`) with an `update_cache` operator for autoregressive decode | [#20086](https://github.com/pytorch/executorch/pull/20086), [#20087](https://github.com/pytorch/executorch/pull/20087) | | 2026-06 | Added on-GPU kernel timing via WebGPU timestamp queries, for true GPU-side profiling | [#20201](https://github.com/pytorch/executorch/pull/20201) | | 2026-06 | Added the dominant compute in quantized LLMs — 4-bit weight-only quantized linear (`linear_q4gsw`), a dequantize-and-matmul kernel | [#20226](https://github.com/pytorch/executorch/pull/20226), [#20227](https://github.com/pytorch/executorch/pull/20227) | - -In review: - -| Milestone | Pull Request | -|---|---| -| Adds 4-bit quantized embedding (`embedding_q4gsw`) | [#20263](https://github.com/pytorch/executorch/pull/20263) | -| Adds rotary position embedding / RoPE (`apply_rotary_emb`) | [#20264](https://github.com/pytorch/executorch/pull/20264) | -| Adds constant prepacking (`prepack`) for end-to-end model weight handling | [#20265](https://github.com/pytorch/executorch/pull/20265) | +| 2026-06 | Added token embedding, rotary position embedding, and constant prepacking for end-to-end model weight handling | [#20414](https://github.com/pytorch/executorch/pull/20414) | ## Architecture @@ -61,14 +54,17 @@ Key design choices: | Operator | WGSL Shader | Notes | |---|---|---| | `aten.add.Tensor` | `binary_add.wgsl` | Element-wise with alpha: `out = in1 + alpha * in2` | +| `aten.mul.Tensor` | `binary_mul.wgsl` | Element-wise multiply with broadcasting | +| `aten.sigmoid.default` | `sigmoid.wgsl` | Element-wise sigmoid activation | | `et_vk.rms_norm.default` | `rms_norm.wgsl` | Root-mean-square normalization | | `sdpa_with_kv_cache.default` | `sdpa_compute_attn_weights.wgsl`, `sdpa_softmax.wgsl`, `sdpa_compute_out.wgsl` | Fused scaled-dot-product attention (QK / softmax / AV) with KV cache | | `llama.update_cache.default` | `update_cache.wgsl` | In-place KV cache update for autoregressive decode | | `et_vk.linear_q4gsw.default` | `q4gsw_linear.wgsl` | 4-bit weight-only quantized linear (dequantize + matmul) | +| `et_vk.embedding_q4gsw.default` | `embedding_q4gsw.wgsl` | 4-bit groupwise-symmetric quantized embedding | +| `et_vk.apply_rotary_emb.default` | `rotary_embedding.wgsl` | Interleaved rotary positional embedding | +| `et_vk.prepack.default` | N/A | Constant materialization into GPU buffers | -**In review:** quantized embedding (`embedding_q4gsw`), rotary embedding (`apply_rotary_emb`), and constant prepacking (`prepack`). - -**Planned:** `mul`, `sigmoid`, shape ops (`view`, `permute`, `slice`, `select`, `cat`, `squeeze`/`unsqueeze`), and `index` — the remaining ops needed for end-to-end Llama 3.2 1B. +**Planned:** shape ops (`view`, `permute`, `slice`, `select`, `cat`, `squeeze`/`unsqueeze`) and `index` — the remaining ops needed for end-to-end Llama 3.2 1B. ## Quick Start diff --git a/backends/webgpu/TODO.md b/backends/webgpu/TODO.md index 02259d5c3eb..9e9d7a8ca97 100644 --- a/backends/webgpu/TODO.md +++ b/backends/webgpu/TODO.md @@ -1,7 +1,10 @@ # WebGPU Backend — TODO ## Current State (Prototype) -- Single op: `aten.add.Tensor` (fp32, buffer storage) +- Runtime support for transformer-oriented fp32 and LLM custom ops, including + `aten.add.Tensor`, `aten.mul.Tensor`, `aten.sigmoid.default`, + `et_vk.rms_norm.default`, + fused SDPA with KV cache, 4-bit quantized linear/embedding, RoPE, and prepack. - No Python AOT code — directly consumes Vulkan delegate (.pte exported via VulkanPartitioner) - Reuses Vulkan FlatBuffer format (VH00 header + VK00 payload) - Registers as `"VulkanBackend"` at runtime — mutually exclusive with Vulkan backend at link time @@ -30,7 +33,7 @@ element-wise ops (add→relu→mul→clamp) at compile time. Embed via the exist `shaders: [VkBytes]` field in schema.fbs. ## Next Steps -1. **More ops**: sub, mul, relu, linear (matmul), softmax, layer_norm +1. **More ops**: sub, relu, linear (matmul), softmax, layer_norm, shape ops 2. **fp16 support**: Feature-detect `shader-f16`, fallback to fp32 3. **Buffer pooling**: Reuse GPU buffers to avoid OOM at scale 4. **Pipeline caching**: Cache compiled pipelines across runs diff --git a/backends/webgpu/runtime/ops/sigmoid/Sigmoid.cpp b/backends/webgpu/runtime/ops/sigmoid/Sigmoid.cpp new file mode 100644 index 00000000000..781648a53a4 --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/Sigmoid.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +void sigmoid_impl(WebGPUGraph& graph, const std::vector& args) { + // aten.sigmoid.default args: [in, out] + const int in_id = args.at(0); + const int out_id = args.at(1); + + WGPUDevice device = graph.device(); + + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + + if (in_tensor.dims != out_tensor.dims) { + throw std::runtime_error("sigmoid: input and output shapes must match"); + } + + TensorMeta out_meta; + fill_tensor_meta(out_tensor, &out_meta); + + if (out_tensor.nbytes != + static_cast(out_meta.numel) * sizeof(float) || + in_tensor.nbytes != static_cast(out_meta.numel) * sizeof(float)) { + throw std::runtime_error("sigmoid: non-fp32 operand (nbytes != numel * 4)"); + } + + uint32_t wg_size = + utils::clamp_workgroup_size(device, kSigmoidWorkgroupSizeX); + uint32_t workgroup_count = utils::compute_1d_workgroup_count( + device, out_meta.numel, wg_size, "sigmoid"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + WGPUBuffer out_meta_buf = + utils::make_uniform(device, &out_meta, sizeof(TensorMeta)); + graph.add_uniform_buffer_bytes(sizeof(TensorMeta)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kSigmoidWGSL, WGPU_STRLEN}; + + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + WGPUBindGroupLayoutEntry entries[3] = {}; + + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 3; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[3] = {}; + + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_meta_buf; + bg_entries[2].size = sizeof(TensorMeta); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 3; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our ref; the bind group keeps the uniform alive until release. + wgpuBufferRelease(out_meta_buf); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.sigmoid.default, sigmoid_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl new file mode 100644 index 00000000000..3a1ca0c4d7a --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl @@ -0,0 +1,21 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= out_meta.numel) { + return; + } + output[idx] = 1.0 / (1.0 + exp(-input[idx])); +} diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h new file mode 100644 index 00000000000..a69926cef2e --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from sigmoid.wgsl - DO NOT EDIT. +// wgsl-sha256: 73a26ddce78d1cbd6cbb0c586791b338153cea9af13790dc1400516128a4c278 +inline constexpr const char* kSigmoidWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= out_meta.numel) { + return; + } + output[idx] = 1.0 / (1.0 + exp(-input[idx])); +} +)"; + +inline constexpr uint32_t kSigmoidWorkgroupSizeX = 64; +inline constexpr uint32_t kSigmoidWorkgroupSizeY = 1; +inline constexpr uint32_t kSigmoidWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS index 9008f32cd2c..0d204f7702f 100644 --- a/backends/webgpu/test/TARGETS +++ b/backends/webgpu/test/TARGETS @@ -17,6 +17,32 @@ python_unittest( ], ) +python_unittest( + name = "test_mul", + srcs = [ + "ops/mul/test_mul.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan/partitioner:vulkan_partitioner", + "//executorch/backends/vulkan:vulkan_preprocess", + "//executorch/exir:lib", + ], +) + +python_unittest( + name = "test_sigmoid", + srcs = [ + "ops/sigmoid/test_sigmoid.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan/partitioner:vulkan_partitioner", + "//executorch/backends/vulkan:vulkan_preprocess", + "//executorch/exir:lib", + ], +) + runtime.python_library( name = "tester", srcs = ["tester.py"], diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index febdbd507a8..c5e0d83fa33 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -16,6 +16,7 @@ from executorch.backends.webgpu.test.op_tests.test_suite import ( Case, InputSpec, + M, M1, M2, register_op_test, @@ -30,12 +31,25 @@ AddModule, AddSelfModule, ) +from executorch.backends.webgpu.test.ops.mul.test_mul import ( + _mul_offset_range, + _mul_range, + MulChainedModule, + MulModule, + MulSelfModule, +) from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import ( _CASES, _linspace_weight, _ramp, RmsNormModule, ) +from executorch.backends.webgpu.test.ops.sigmoid.test_sigmoid import ( + _sigmoid_range, + _sigmoid_wide_range, + SigmoidChainedModule, + SigmoidModule, +) # rms_norm coverage is exactly the 15 cases the native test covered. RMS_NORM_CASES = _CASES @@ -49,6 +63,21 @@ def _add_factory(variant: str = "regular") -> torch.nn.Module: }[variant]() +def _sigmoid_factory(variant: str = "regular") -> torch.nn.Module: + return { + "regular": SigmoidModule, + "chained": SigmoidChainedModule, + }[variant]() + + +def _mul_factory(variant: str = "regular") -> torch.nn.Module: + return { + "regular": MulModule, + "self": MulSelfModule, + "chained": MulChainedModule, + }[variant]() + + @register_op_test("add") def _add_suite() -> WebGPUTestSuite: # Same-shape numeric coverage only: broadcast adds stay export-smoke in @@ -83,6 +112,110 @@ def _add_suite() -> WebGPUTestSuite: ) +@register_op_test("mul") +def _mul_suite() -> WebGPUTestSuite: + return WebGPUTestSuite( + module_factory=_mul_factory, + cases=[ + Case( + name="regular_2d", + construct={"variant": "regular"}, + inputs=( + InputSpec(shape=(M1, M2), gen=_mul_range), + InputSpec(shape=(M1, M2), gen=_mul_offset_range), + ), + ), + Case( + name="regular_3d", + construct={"variant": "regular"}, + inputs=( + InputSpec(shape=(S, S1, S2), gen=_mul_range), + InputSpec(shape=(S, S1, S2), gen=_mul_offset_range), + ), + ), + Case( + name="regular_4d", + construct={"variant": "regular"}, + inputs=( + InputSpec(shape=(XS, S, S1, S2), gen=_mul_range), + InputSpec(shape=(XS, S, S1, S2), gen=_mul_offset_range), + ), + ), + Case( + name="broadcast_last_dim", + construct={"variant": "regular"}, + inputs=( + InputSpec(shape=(M1, M2), gen=_mul_range), + InputSpec(shape=(M1, 1), gen=_mul_offset_range), + ), + ), + Case( + name="broadcast_first_dim", + construct={"variant": "regular"}, + inputs=( + InputSpec(shape=(M1, M2), gen=_mul_range), + InputSpec(shape=(1, M2), gen=_mul_offset_range), + ), + ), + Case( + name="broadcast_4d_mixed", + construct={"variant": "regular"}, + inputs=( + InputSpec(shape=(XS, S, S1, S2), gen=_mul_range), + InputSpec(shape=(1, S, 1, S2), gen=_mul_offset_range), + ), + ), + Case( + name="self", + construct={"variant": "self"}, + inputs=(InputSpec(shape=(M1, M2), gen=_mul_range),), + ), + Case( + name="chained", + construct={"variant": "chained"}, + inputs=( + InputSpec(shape=(M1, M2), gen=_mul_range), + InputSpec(shape=(M1, M2), gen=_mul_offset_range), + ), + ), + ], + ) + + +@register_op_test("sigmoid") +def _sigmoid_suite() -> WebGPUTestSuite: + return WebGPUTestSuite( + module_factory=_sigmoid_factory, + cases=[ + Case( + name="regular_1d", + construct={"variant": "regular"}, + inputs=(InputSpec(shape=(M,), gen=_sigmoid_range),), + ), + Case( + name="regular_2d", + construct={"variant": "regular"}, + inputs=(InputSpec(shape=(M1, M2), gen=_sigmoid_range),), + ), + Case( + name="regular_4d", + construct={"variant": "regular"}, + inputs=(InputSpec(shape=(XS, S, S1, S2), gen=_sigmoid_range),), + ), + Case( + name="wide_range", + construct={"variant": "regular"}, + inputs=(InputSpec(shape=(M1, M2), gen=_sigmoid_wide_range),), + ), + Case( + name="chained", + construct={"variant": "chained"}, + inputs=(InputSpec(shape=(M1, M2), gen=_sigmoid_range),), + ), + ], + ) + + def _rms_norm_factory(hidden: int, eps: float, weight_fn) -> torch.nn.Module: model = RmsNormModule(hidden, eps=eps) with torch.no_grad(): diff --git a/backends/webgpu/test/op_tests/generate_op_tests.py b/backends/webgpu/test/op_tests/generate_op_tests.py index 66e7e55bafc..9bc654f0f41 100644 --- a/backends/webgpu/test/op_tests/generate_op_tests.py +++ b/backends/webgpu/test/op_tests/generate_op_tests.py @@ -8,7 +8,7 @@ Per case: export the module to `.pte`, write its inputs + torch golden as raw little-endian fp32, and emit `manifest.json` for the C++ gtest driver to consume. -Run: `python -m ...generate_op_tests --output [--ops add,rms_norm]`. +Run: `python -m ...generate_op_tests --output [--ops add,mul,sigmoid,rms_norm]`. """ from __future__ import annotations @@ -28,6 +28,7 @@ op_test_registry, WebGPUTestSuite, ) +from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS from executorch.exir import to_edge_transform_and_lower @@ -58,7 +59,8 @@ def export_case(suite: WebGPUTestSuite, case) -> tuple[torch.nn.Module, tuple, o inputs = tuple(_materialize(s) for s in case.inputs) ep = torch.export.export(module, inputs) prog = to_edge_transform_and_lower( - ep, partitioner=[VulkanPartitioner()] + ep, + partitioner=[VulkanPartitioner(operator_allowlist=WEBGPU_SUPPORTED_OPS)], ).to_executorch() return module, inputs, prog diff --git a/backends/webgpu/test/op_tests/test_generator.py b/backends/webgpu/test/op_tests/test_generator.py index deae6ecd4a6..6794d08aafc 100644 --- a/backends/webgpu/test/op_tests/test_generator.py +++ b/backends/webgpu/test/op_tests/test_generator.py @@ -19,6 +19,12 @@ def _add_regular_case(): return suite, case +def _mul_broadcast_case(): + suite = op_test_registry["mul"] + case = next(c for c in suite.cases if c.name == "broadcast_last_dim") + return suite, case + + def test_export_case_has_delegate(): suite, case = _add_regular_case() _module, _inputs, prog = g.export_case(suite, case) @@ -50,6 +56,17 @@ def test_generate_case_writes_artifacts(tmp_path): assert entry["golden"]["output_index"] == 0 +def test_generate_mul_broadcast_case_writes_artifacts(tmp_path): + suite, case = _mul_broadcast_case() + entry = g.generate_case("mul", suite, case, str(tmp_path)) + assert (tmp_path / entry["pte"]).exists() + assert len(entry["inputs"]) == 2 + assert entry["inputs"][0]["shape"] == [37, 41] + assert entry["inputs"][1]["shape"] == [37, 1] + assert entry["golden"]["shape"] == [37, 41] + assert (tmp_path / entry["golden"]["path"]).exists() + + def test_generate_manifest(tmp_path): g.generate(str(tmp_path), ops=["add"]) manifest = tmp_path / "manifest.json" @@ -77,7 +94,7 @@ def test_generate_manifest(tmp_path): def test_every_case_delegates(): # Contract: every registered case must lower to a VulkanBackend delegate. An op that # silently CPU-falls-back would otherwise produce a misleading golden-equals-golden pass. - for op in ("add", "rms_norm"): + for op in ("add", "mul", "sigmoid", "rms_norm"): suite = op_test_registry[op] for case in suite.cases: _module, _inputs, prog = g.export_case(suite, case) diff --git a/backends/webgpu/test/op_tests/test_schema.py b/backends/webgpu/test/op_tests/test_schema.py index bcc03a40fd9..61a21ddaefd 100644 --- a/backends/webgpu/test/op_tests/test_schema.py +++ b/backends/webgpu/test/op_tests/test_schema.py @@ -15,6 +15,8 @@ WebGPUTestSuite, XL, ) +from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS +from executorch.exir.dialects._ops import ops as exir_ops def test_decorator_registers(): @@ -36,11 +38,13 @@ def _dummy(): assert suite.atol == 1e-3 and isinstance(XL, int) -def test_add_rms_norm_registered(): +def test_add_mul_sigmoid_rms_norm_registered(): from executorch.backends.webgpu.test.op_tests import cases # registers - assert {"add", "rms_norm"} <= set(op_test_registry) + assert {"add", "mul", "sigmoid", "rms_norm"} <= set(op_test_registry) assert len(op_test_registry["add"].cases) >= 3 # regular/self/scalar/chained + assert len(op_test_registry["mul"].cases) >= 6 # regular/broadcast/self/chained + assert len(op_test_registry["sigmoid"].cases) >= 4 # regular/range/chained # Exact parity, no hardcoded literal (real _CASES == 15; import so it can't drift): assert len(op_test_registry["rms_norm"].cases) == len(cases.RMS_NORM_CASES) # weight is a construction param, NOT a forward input: @@ -59,6 +63,25 @@ def test_add_cases_are_same_shape_no_scalar(): assert len(shapes) == 1, f"{c.name}: not same-shape (broadcast?): {shapes}" +def test_mul_cases_cover_broadcast_without_scalar(): + from executorch.backends.webgpu.test.op_tests import cases # noqa: F401 registers + + saw_broadcast = False + for c in op_test_registry["mul"].cases: + shapes = [tuple(s if isinstance(s, tuple) else s.shape) for s in c.inputs] + if len(shapes) == 2 and shapes[0] != shapes[1]: + saw_broadcast = True + assert c.construct.get("variant") != "scalar", f"{c.name}: scalar mul can't run" + assert saw_broadcast + + +def test_webgpu_supported_ops_include_registered_elementwise_ops(): + assert exir_ops.edge.aten.add.Tensor in WEBGPU_SUPPORTED_OPS + assert exir_ops.edge.aten.mul.Tensor in WEBGPU_SUPPORTED_OPS + assert exir_ops.edge.aten.sigmoid.default in WEBGPU_SUPPORTED_OPS + assert exir_ops.edge.et_vk.rms_norm.default in WEBGPU_SUPPORTED_OPS + + def test_case_required_heavy_golden_fn_defaults(): c = Case(inputs=((M, M),)) assert c.required is True and c.heavy is False and c.golden_fn is None @@ -74,7 +97,9 @@ def test_heavy_forces_not_required(): def test_golden_dtype_default(): from executorch.backends.webgpu.test.op_tests import cases # noqa: F401 registers - # fp64 oracle is the default; the two landed compute ops keep it. (Per-op golden_dtype + # fp64 oracle is the default; these compute ops keep it. (Per-op golden_dtype # for gather/copy ops is asserted in each op's own tests-diff.) assert op_test_registry["add"].golden_dtype == "float64" + assert op_test_registry["mul"].golden_dtype == "float64" + assert op_test_registry["sigmoid"].golden_dtype == "float64" assert op_test_registry["rms_norm"].golden_dtype == "float64" diff --git a/backends/webgpu/test/ops/mul/__init__.py b/backends/webgpu/test/ops/mul/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backends/webgpu/test/ops/mul/test_mul.py b/backends/webgpu/test/ops/mul/test_mul.py new file mode 100644 index 00000000000..5fda5a18faa --- /dev/null +++ b/backends/webgpu/test/ops/mul/test_mul.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + + +class MulModule(torch.nn.Module): + def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + return a * b + + +class MulSelfModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * x + + +class MulChainedModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + z = x * y + z = z * x + z = z * y + return z + + +class TestMul(unittest.TestCase): + """fp32 torch.mul export tests via VulkanPartitioner.""" + + def _export_and_check(self, model, example_inputs) -> None: + ep = torch.export.export(model, example_inputs) + et_program = to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + found_vulkan = False + for plan in et_program.executorch_program.execution_plan: + for delegate in plan.delegates: + if delegate.id == "VulkanBackend": + found_vulkan = True + break + self.assertTrue(found_vulkan, "Expected VulkanBackend delegate in .pte") + self.assertGreater(len(et_program.buffer), 100) + + def test_mul_2d(self) -> None: + self._export_and_check(MulModule(), (torch.randn(4, 4), torch.randn(4, 4))) + + def test_mul_3d(self) -> None: + self._export_and_check( + MulModule(), (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) + ) + + def test_mul_4d(self) -> None: + self._export_and_check( + MulModule(), (torch.randn(1, 2, 3, 4), torch.randn(1, 2, 3, 4)) + ) + + def test_mul_broadcast_last_dim(self) -> None: + self._export_and_check(MulModule(), (torch.randn(4, 4), torch.randn(4, 1))) + + def test_mul_broadcast_first_dim(self) -> None: + self._export_and_check(MulModule(), (torch.randn(4, 4), torch.randn(1, 4))) + + def test_mul_self(self) -> None: + self._export_and_check(MulSelfModule(), (torch.randn(4, 4),)) + + def test_mul_chained(self) -> None: + self._export_and_check( + MulChainedModule(), (torch.randn(4, 4), torch.randn(4, 4)) + ) + + +def _mul_range(shape) -> torch.Tensor: + n = 1 + for d in shape: + n *= d + return torch.linspace(-3.0, 3.0, n, dtype=torch.float32).reshape(shape) + + +def _mul_offset_range(shape) -> torch.Tensor: + n = 1 + for d in shape: + n *= d + return torch.linspace(0.25, 2.25, n, dtype=torch.float32).reshape(shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/webgpu/test/ops/sigmoid/__init__.py b/backends/webgpu/test/ops/sigmoid/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/webgpu/test/ops/sigmoid/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/sigmoid/test_sigmoid.py b/backends/webgpu/test/ops/sigmoid/test_sigmoid.py new file mode 100644 index 00000000000..90b42f60955 --- /dev/null +++ b/backends/webgpu/test/ops/sigmoid/test_sigmoid.py @@ -0,0 +1,70 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + + +class SigmoidModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.sigmoid(x) + + +class SigmoidChainedModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.sigmoid(torch.sigmoid(x)) + + +class TestSigmoid(unittest.TestCase): + """fp32 torch.sigmoid export tests via VulkanPartitioner.""" + + def _export_and_check(self, model, example_inputs) -> None: + ep = torch.export.export(model, example_inputs) + et_program = to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + found_vulkan = False + for plan in et_program.executorch_program.execution_plan: + for delegate in plan.delegates: + if delegate.id == "VulkanBackend": + found_vulkan = True + break + self.assertTrue(found_vulkan, "Expected VulkanBackend delegate in .pte") + self.assertGreater(len(et_program.buffer), 100) + + def test_sigmoid_1d(self) -> None: + self._export_and_check(SigmoidModule(), (torch.randn(17),)) + + def test_sigmoid_2d(self) -> None: + self._export_and_check(SigmoidModule(), (torch.randn(4, 4),)) + + def test_sigmoid_4d(self) -> None: + self._export_and_check(SigmoidModule(), (torch.randn(1, 2, 3, 4),)) + + def test_sigmoid_chained(self) -> None: + self._export_and_check(SigmoidChainedModule(), (torch.randn(4, 4),)) + + +def _sigmoid_range(shape) -> torch.Tensor: + n = 1 + for d in shape: + n *= d + return torch.linspace(-8.0, 8.0, n, dtype=torch.float32).reshape(shape) + + +def _sigmoid_wide_range(shape) -> torch.Tensor: + n = 1 + for d in shape: + n *= d + return torch.linspace(-20.0, 20.0, n, dtype=torch.float32).reshape(shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index 2e67df442e6..125c72af208 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -20,6 +20,8 @@ # Edge ops the WebGPU runtime implements; restricts the Vulkan partitioner. WEBGPU_SUPPORTED_OPS = [ exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.sigmoid.default, exir_ops.edge.et_vk.rms_norm.default, ]