From 017e1981a1fa96698c197a1e3054c8f19d71f402 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Mon, 4 May 2026 19:42:28 -0700
Subject: [PATCH 1/6] WebGPU: add memory aliasing for intermediate tensor
 buffers

The export pipeline already runs a greedy memory planning pass that
assigns mem_obj_id to tensors with non-overlapping lifetimes, but the
WebGPU runtime was ignoring it and allocating a dedicated WGPUBuffer
per tensor.

Read mem_obj_id from the flatbuffer during graph build. Tensors sharing
the same mem_obj_id now share a single WGPUBuffer sized to the largest
user. Constants and tensors without a mem_obj_id still get dedicated
buffers.

Adds a chained-add native test (z=x+y; z=z+x; z=z+y) that verifies both
correctness and that memory aliasing produces savings (~20% for this
model).

Co-authored with Claude.
---
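Reviewer note (after the fold, so git-am drops it): a quick sanity check
on the ~20% figure. Each 1024x1024 fp32 tensor is 4 MiB. The three-add
chain touches five tensors (x, y, and the chain results z1, z2, z3), so
one dedicated WGPUBuffer per tensor costs 5 * 4 MiB = 20 MiB. z1 is dead
as soon as z2 is produced, so z1 and z3 can occupy the same memory object
while z2 takes a second one; that removes one 4 MiB allocation,
20 MiB -> 16 MiB, which is the quoted ~20%. The exact split depends on
how the planner treats graph inputs and outputs, but the magnitude
checks out.

The export-side pass itself is not shown in this series. As a rough
illustration of the idea it implements (greedy reuse of memory objects
across non-overlapping lifetimes), here is a sketch; the names and the
first-fit policy are assumptions for illustration, not the actual
ExecuTorch pass:

  #include <cstddef>
  #include <vector>

  struct Lifetime {
    int first_use; // node index where the tensor is produced
    int last_use;  // node index of its last consumer
    size_t nbytes;
  };

  // Tensors must be ordered by first_use (i.e. topological order).
  // Returns one mem_obj_id per tensor; an object is reused whenever its
  // previous occupant's lifetime ended before the new tensor is produced.
  std::vector<int> assign_mem_obj_ids(const std::vector<Lifetime>& lts) {
    std::vector<int> ids(lts.size(), -1);
    std::vector<int> free_at; // per object: node index when it frees up
    for (size_t t = 0; t < lts.size(); t++) {
      int chosen = -1;
      for (size_t o = 0; o < free_at.size(); o++) {
        if (free_at[o] < lts[t].first_use) {
          chosen = static_cast<int>(o);
          break;
        }
      }
      if (chosen < 0) {
        chosen = static_cast<int>(free_at.size());
        free_at.push_back(0);
      }
      free_at[chosen] = lts[t].last_use;
      ids[t] = chosen;
    }
    return ids;
  }

The runtime half, which is what this patch adds, then only needs the
largest nbytes per object, which is exactly what shared_buffer_sizes_
tracks below.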
 backends/webgpu/runtime/WebGPUBackend.cpp   |  8 ++
 backends/webgpu/runtime/WebGPUBackend.h     |  3 +
 backends/webgpu/runtime/WebGPUGraph.cpp     | 91 +++++++++++++++-----
 backends/webgpu/runtime/WebGPUGraph.h       | 11 ++-
 backends/webgpu/test/ops/add/test_add.py    | 13 +++
 backends/webgpu/test/test_webgpu_native.cpp | 95 +++++++++++++++++++++
 6 files changed, 198 insertions(+), 23 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp
index 5321c20aaa4..3132671a6b0 100644
--- a/backends/webgpu/runtime/WebGPUBackend.cpp
+++ b/backends/webgpu/runtime/WebGPUBackend.cpp
@@ -38,6 +38,12 @@ using executorch::runtime::register_backend;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 
+static WebGPUMemoryStats s_last_memory_stats;
+
+WebGPUMemoryStats get_last_memory_stats() {
+  return s_last_memory_stats;
+}
+
 bool WebGPUBackend::is_available() const {
   return true;
 }
@@ -83,6 +89,8 @@ Result<DelegateHandle*> WebGPUBackend::init(
     return Error::DelegateInvalidCompatibility;
   }
 
+  s_last_memory_stats = graph->memory_stats();
+
   processed->Free();
 
   return graph;
diff --git a/backends/webgpu/runtime/WebGPUBackend.h b/backends/webgpu/runtime/WebGPUBackend.h
index 9c20a3d53be..e82b8dcca84 100644
--- a/backends/webgpu/runtime/WebGPUBackend.h
+++ b/backends/webgpu/runtime/WebGPUBackend.h
@@ -8,12 +8,15 @@
 
 #pragma once
 
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
 #include <executorch/runtime/backend/interface.h>
 
 namespace executorch {
 namespace backends {
 namespace webgpu {
 
+WebGPUMemoryStats get_last_memory_stats();
+
 class WebGPUBackend final : public ::executorch::runtime::BackendInterface {
  public:
  ~WebGPUBackend() override = default;
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index f0e4c7959c0..a320a57b610 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -50,9 +50,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
 WebGPUGraph::WebGPUGraph() = default;
 
 WebGPUGraph::~WebGPUGraph() {
-  for (auto& t : tensors_) {
-    if (t.buffer) {
-      wgpuBufferRelease(t.buffer);
+  for (size_t i = 0; i < tensors_.size(); i++) {
+    if (tensors_[i].buffer &&
+        (i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) {
+      wgpuBufferRelease(tensors_[i].buffer);
+    }
+  }
+  for (auto& buf : shared_buffers_) {
+    if (buf) {
+      wgpuBufferRelease(buf);
     }
   }
   for (auto& buf : output_staging_buffers_) {
@@ -94,6 +100,7 @@ void WebGPUGraph::build(
   const int num_vals = values ? values->size() : 0;
   value_types_.resize(num_vals, ValueType::Null);
   tensors_.resize(num_vals);
+  tensor_mem_obj_ids_.resize(num_vals, -1);
   ints_.resize(num_vals, 0);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
@@ -121,27 +128,39 @@
       }
       tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());
 
-      // Create GPU buffer
-      WGPUBufferDescriptor buf_desc = {};
-      buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
-      buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-          WGPUBufferUsage_CopySrc;
-      buf_desc.mappedAtCreation = false;
-      tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-      // Upload constant data if this tensor has a constant_id
       int constant_id = vk_tensor->constant_id();
-      if (constant_id >= 0 && constant_data) {
-        const auto* constants = graph->constants();
-        if (constants && constant_id < static_cast<int>(constants->size())) {
-          const auto* vk_bytes = constants->Get(constant_id);
-          // Only upload from embedded bytes (not named data map)
-          if (vk_bytes->offset() != UINT64_MAX) {
-            const uint8_t* src = constant_data + vk_bytes->offset();
-            wgpuQueueWriteBuffer(
-                queue_, tensor.buffer, 0, src, tensor.nbytes);
+      int mem_obj_id = vk_tensor->mem_obj_id();
+      tensor_mem_obj_ids_[i] = mem_obj_id;
+
+      if (constant_id >= 0 || mem_obj_id < 0) {
+        // Dedicated buffer: constants or tensors that don't share memory
+        WGPUBufferDescriptor buf_desc = {};
+        buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
+        buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+            WGPUBufferUsage_CopySrc;
+        buf_desc.mappedAtCreation = false;
+        tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+        if (constant_id >= 0 && constant_data) {
+          const auto* constants = graph->constants();
+          if (constants &&
+              constant_id < static_cast<int>(constants->size())) {
+            const auto* vk_bytes = constants->Get(constant_id);
+            if (vk_bytes->offset() != UINT64_MAX) {
+              const uint8_t* src = constant_data + vk_bytes->offset();
+              wgpuQueueWriteBuffer(
+                  queue_, tensor.buffer, 0, src, tensor.nbytes);
+            }
           }
         }
+      } else {
+        // Shared buffer: track required size, defer allocation to pass 2
+        size_t id = static_cast<size_t>(mem_obj_id);
+        if (id >= shared_buffer_sizes_.size()) {
+          shared_buffer_sizes_.resize(id + 1, 0);
+        }
+        shared_buffer_sizes_[id] =
+            std::max(shared_buffer_sizes_[id], tensor.nbytes);
       }
       break;
     }
@@ -166,6 +185,24 @@
     }
   }
 
+  // Allocate shared buffers and assign to tensors
+  shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
+  for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
+    WGPUBufferDescriptor buf_desc = {};
+    buf_desc.size =
+        shared_buffer_sizes_[id] > 0 ? shared_buffer_sizes_[id] : 4;
+    buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+        WGPUBufferUsage_CopySrc;
+    buf_desc.mappedAtCreation = false;
+    shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc);
+  }
+  for (int i = 0; i < num_vals; i++) {
+    int mid = tensor_mem_obj_ids_[i];
+    if (mid >= 0) {
+      tensors_[i].buffer = shared_buffers_[mid];
+    }
+  }
+
   // Phase 2: Record input and output IDs
   const auto* fb_input_ids = graph->input_ids();
   if (fb_input_ids) {
@@ -315,10 +352,20 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   WebGPUMemoryStats stats;
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
-      stats.tensor_buffer_bytes += tensors_[i].nbytes;
       stats.num_tensors++;
+      if (i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0) {
+        // Shared tensor — actual allocation tracked via shared_buffer_sizes_
+      } else {
+        stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
+      }
     }
   }
+  for (size_t s : shared_buffer_sizes_) {
+    stats.shared_buffer_bytes += s;
+  }
+  stats.num_shared_objects = static_cast<int>(shared_buffers_.size());
+  stats.tensor_buffer_bytes =
+      stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes;
   for (size_t i = 0; i < output_ids_.size(); i++) {
     stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes;
   }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 2d6996e9219..d68d33267ab 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -32,13 +32,17 @@ struct WebGPUDispatch {
 };
 
 struct WebGPUMemoryStats {
   size_t tensor_buffer_bytes = 0;
+  size_t shared_buffer_bytes = 0;
+  int num_shared_objects = 0;
+  size_t unshared_tensor_buffer_bytes = 0;
   size_t staging_buffer_bytes = 0;
   size_t uniform_buffer_bytes = 0;
   int num_tensors = 0;
   int num_dispatches = 0;
 
   size_t total_bytes() const {
-    return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes;
+    return shared_buffer_bytes + unshared_tensor_buffer_bytes +
+        staging_buffer_bytes + uniform_buffer_bytes;
   }
 };
@@ -134,6 +138,11 @@ class WebGPUGraph {
   std::vector<int> input_ids_;
   std::vector<int> output_ids_;
 
+  // Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer.
+  std::vector<int> tensor_mem_obj_ids_;
+  std::vector<WGPUBuffer> shared_buffers_;
+  std::vector<size_t> shared_buffer_sizes_;
+
   // Staging buffers for reading back outputs (MapRead | CopyDst).
   std::vector<WGPUBuffer> output_staging_buffers_;
 
diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
index f4b33ced76d..8829dfe704e 100644
--- a/backends/webgpu/test/ops/add/test_add.py
+++ b/backends/webgpu/test/ops/add/test_add.py
@@ -97,5 +97,18 @@ def export_add_model(output_path: str) -> None:
     print(f"Exported {output_path}")
 
 
+def export_chained_add_model(output_path: str) -> None:
+    """Export a chained add model (z=x+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
+    model = AddChainedModule()
+    example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
+    ep = torch.export.export(model, example_inputs)
+    et_program = to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+    with open(output_path, "wb") as f:
+        f.write(et_program.buffer)
+    print(f"Exported {output_path}")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index c60695e11c9..bf9c44a858a 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/webgpu/runtime/WebGPUBackend.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
 #include
@@ -75,6 +76,91 @@ static bool test_single_add(const std::string& model_path) {
   return true;
 }
 
+static bool test_chained_add_memory(const std::string& model_path) {
+  printf("\n--- Test: chained add memory aliasing (1024x1024) ---\n");
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  auto stats = get_last_memory_stats();
+  printf("Memory stats after build:\n");
+  printf("  num_tensors: %d\n", stats.num_tensors);
+  printf("  num_shared_objects: %d\n", stats.num_shared_objects);
+  printf("  shared_buffer_bytes: %zu\n", stats.shared_buffer_bytes);
+  printf(
+      "  unshared_tensor_buffer_bytes: %zu\n",
+      stats.unshared_tensor_buffer_bytes);
+  printf("  tensor_buffer_bytes: %zu\n", stats.tensor_buffer_bytes);
+  printf("  total_bytes: %zu\n", stats.total_bytes());
+
+  constexpr int dim = 1024;
+  constexpr int size = dim * dim;
+
+  std::vector<float> x_data(size);
+  std::vector<float> y_data(size);
+  for (int i = 0; i < size; i++) {
+    x_data[i] = static_cast<float>(i % 100) * 0.01f;
+    y_data[i] = static_cast<float>(i % 50) * 0.02f;
+  }
+
+  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
+  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
+
+  auto result = module.forward({EValue(x), EValue(y)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+
+  // z = x+y; z = z+x = 2x+y; z = z+y = 2x+2y
+  const auto& out_tensor = outputs[0].toTensor();
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_error = 0.0f;
+  for (int i = 0; i < size; i++) {
+    float expected = 2.0f * x_data[i] + 2.0f * y_data[i];
+    float error = std::abs(out_data[i] - expected);
+    max_error = std::max(max_error, error);
+  }
+
+  printf("Max error: %e\n", max_error);
+  if (max_error > 1e-3f) {
+    printf("FAIL: max error exceeds tolerance 1e-3\n");
+    return false;
+  }
+
+  if (stats.num_shared_objects > 0) {
+    printf(
+        "PASS: memory aliasing is active (%d shared objects)\n",
+        stats.num_shared_objects);
+  } else {
+    printf(
+        "INFO: no shared objects (memory aliasing not used by this model)\n");
+  }
+
+  size_t naive_bytes =
+      static_cast<size_t>(stats.num_tensors) * dim * dim * sizeof(float);
+  printf("Naive tensor bytes: %zu\n", naive_bytes);
+  printf("Actual tensor bytes: %zu\n", stats.tensor_buffer_bytes);
+  if (stats.num_shared_objects > 0 && stats.tensor_buffer_bytes < naive_bytes) {
+    printf("PASS: memory savings from aliasing confirmed\n");
+  }
+
+  printf("PASS: chained add memory test\n");
+  return true;
+}
+
 int main(int argc, char** argv) {
   std::string model_path = "webgpu_add_test.pte";
   if (argc > 1) {
@@ -84,6 +170,11 @@
     model_path = env;
   }
 
+  std::string chained_model_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
+    chained_model_path = env;
+  }
+
   WebGPUContext ctx;
   try {
     ctx = create_webgpu_context();
@@ -97,6 +188,10 @@
 
   bool ok = test_single_add(model_path);
 
+  if (!chained_model_path.empty()) {
+    ok = test_chained_add_memory(chained_model_path) && ok;
+  }
+
   set_default_webgpu_context(nullptr);
   destroy_webgpu_context(ctx);

From 1e6143d05b2da55c4150671797ee12bee0958750 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:27:44 -0700
Subject: [PATCH 2/6] WebGPU: replace zero-size buffer fallbacks with asserts

Replace the silent `nbytes > 0 ? nbytes : 4` fallback pattern with
ET_CHECK_MSG assertions. If a zero-byte tensor reaches buffer creation,
we want to know immediately rather than silently creating a dummy
4-byte buffer that masks the issue.

Co-authored with Claude.
---
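Reviewer note (after the fold, so git-am drops it): ET_CHECK_MSG takes
printf-style format arguments, so the message could also pinpoint the
offending value. A possible variant of the first assertion, assuming the
loop index i is still in scope at that point (illustrative only; the
patch below keeps the plain message):

  ET_CHECK_MSG(
      tensor.nbytes > 0,
      "Tensor at value index %d has zero bytes",
      i);

Either way the failure is now an immediate abort instead of a silent
4-byte dummy allocation, which is the point of the change.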
 backends/webgpu/runtime/WebGPUGraph.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index a320a57b610..6af585d3514 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include <executorch/runtime/platform/assert.h>
+
 #include
 #include
@@ -135,7 +137,8 @@
       if (constant_id >= 0 || mem_obj_id < 0) {
         // Dedicated buffer: constants or tensors that don't share memory
         WGPUBufferDescriptor buf_desc = {};
-        buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
+        ET_CHECK_MSG(tensor.nbytes > 0, "Tensor has zero bytes");
+        buf_desc.size = tensor.nbytes;
         buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
             WGPUBufferUsage_CopySrc;
         buf_desc.mappedAtCreation = false;
@@ -189,8 +192,8 @@
   shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
   for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
     WGPUBufferDescriptor buf_desc = {};
-    buf_desc.size =
-        shared_buffer_sizes_[id] > 0 ? shared_buffer_sizes_[id] : 4;
+    ET_CHECK_MSG(shared_buffer_sizes_[id] > 0, "Shared buffer has zero bytes");
+    buf_desc.size = shared_buffer_sizes_[id];
     buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
         WGPUBufferUsage_CopySrc;
     buf_desc.mappedAtCreation = false;
@@ -218,7 +221,8 @@
 
       // Create staging buffer for output readback
       WGPUBufferDescriptor staging_desc = {};
-      staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4;
+      ET_CHECK_MSG(tensors_[oid].nbytes > 0, "Output tensor has zero bytes");
+      staging_desc.size = tensors_[oid].nbytes;
       staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
       staging_desc.mappedAtCreation = false;
       output_staging_buffers_.push_back(

From 8f787922e0a8f03b1237c7c5f0a8cc536924a261 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:30:34 -0700
Subject: [PATCH 3/6] WebGPU: clean up empty if-branch in memory_stats()

Invert the condition to eliminate the empty if-body with a comment.

Co-authored with Claude.
---
 backends/webgpu/runtime/WebGPUGraph.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 6af585d3514..8d567828608 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -357,9 +357,10 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
       stats.num_tensors++;
-      if (i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0) {
-        // Shared tensor — actual allocation tracked via shared_buffer_sizes_
-      } else {
+      // Shared tensors are tracked via shared_buffer_sizes_
+      bool is_shared =
+          i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
+      if (!is_shared) {
         stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
       }
     }

From d4333e3c67ebf80d8e890568996a5b874245172e Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:32:27 -0700
Subject: [PATCH 4/6] WebGPU: add chained-add model to test script

Export and run the chained-add memory aliasing test in
test_build_webgpu.sh so it runs automatically instead of requiring a
manual WEBGPU_TEST_CHAINED_MODEL env var.

Co-authored with Claude.
---
 backends/webgpu/test/test_build_webgpu.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 684926cb181..a42b2304ee7 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
-echo "=== Step 2: Export test model ==="
+echo "=== Step 2: Export test models ==="
 PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
 export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
 "
 
 # ── Step 3: Native build + test (wgpu-native) ────────────────────────────────
@@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
 
 echo "=== Step 4: Run native test ==="
 WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
 echo "=== Done ==="

From 08b37d85386766964e91b60eb34656f1b5b85a1f Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:33:42 -0700
Subject: [PATCH 5/6] WebGPU: extend chained add test to 5 ops for better
 aliasing coverage

Longer chain produces more intermediates, giving the memory planner
more opportunity to alias buffers. Expected output: 3x + 3y.

Co-authored with Claude.
---
 backends/webgpu/test/ops/add/test_add.py    | 4 +++-
 backends/webgpu/test/test_webgpu_native.cpp | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
index 8829dfe704e..e8da644a1f9 100644
--- a/backends/webgpu/test/ops/add/test_add.py
+++ b/backends/webgpu/test/ops/add/test_add.py
@@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         z = x + y
         z = z + x
         z = z + y
+        z = z + x
+        z = z + y
         return z
 
 
@@ -98,7 +100,7 @@ def export_add_model(output_path: str) -> None:
 
 
 def export_chained_add_model(output_path: str) -> None:
-    """Export a chained add model (z=x+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
+    """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
     model = AddChainedModule()
     example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
     ep = torch.export.export(model, example_inputs)
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index bf9c44a858a..ebb1beb83a6 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -123,13 +123,13 @@ static bool test_chained_add_memory(const std::string& model_path) {
     return false;
   }
 
-  // z = x+y; z = z+x = 2x+y; z = z+y = 2x+2y
+  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
   const auto& out_tensor = outputs[0].toTensor();
   const float* out_data = out_tensor.const_data_ptr<float>();
 
   float max_error = 0.0f;
   for (int i = 0; i < size; i++) {
-    float expected = 2.0f * x_data[i] + 2.0f * y_data[i];
+    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
     float error = std::abs(out_data[i] - expected);
     max_error = std::max(max_error, error);
   }
From 55ceea4469f0c9cc3595a2206e4244a552576d07 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 12:20:01 -0700
Subject: [PATCH 6/6] WebGPU: fix constant tensor buffer leak and make
 aliasing test strict

Fix: if a constant tensor has mem_obj_id >= 0, force it to -1 so the
dedicated buffer path and the destructor stay consistent. Previously
the buffer would leak and get overwritten by the shared buffer pass.

Also make the chained-add test actually fail when aliasing is absent
instead of just printing informational messages.

Co-authored with Claude.
---
 backends/webgpu/runtime/WebGPUGraph.cpp     |  5 +++--
 backends/webgpu/test/test_webgpu_native.cpp | 19 ++++++++++---------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 8d567828608..ac110225a2e 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -132,10 +132,10 @@ void WebGPUGraph::build(
 
       int constant_id = vk_tensor->constant_id();
       int mem_obj_id = vk_tensor->mem_obj_id();
-      tensor_mem_obj_ids_[i] = mem_obj_id;
 
+      // Constants always get dedicated buffers regardless of mem_obj_id
       if (constant_id >= 0 || mem_obj_id < 0) {
-        // Dedicated buffer: constants or tensors that don't share memory
+        tensor_mem_obj_ids_[i] = -1;
         WGPUBufferDescriptor buf_desc = {};
         ET_CHECK_MSG(tensor.nbytes > 0, "Tensor has zero bytes");
         buf_desc.size = tensor.nbytes;
@@ -158,6 +158,7 @@
       } else {
         // Shared buffer: track required size, defer allocation to pass 2
+        tensor_mem_obj_ids_[i] = mem_obj_id;
         size_t id = static_cast<size_t>(mem_obj_id);
         if (id >= shared_buffer_sizes_.size()) {
           shared_buffer_sizes_.resize(id + 1, 0);
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index ebb1beb83a6..1028e64a26e 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -140,22 +140,23 @@ static bool test_chained_add_memory(const std::string& model_path) {
     return false;
   }
 
-  if (stats.num_shared_objects > 0) {
-    printf(
-        "PASS: memory aliasing is active (%d shared objects)\n",
-        stats.num_shared_objects);
-  } else {
-    printf(
-        "INFO: no shared objects (memory aliasing not used by this model)\n");
+  if (stats.num_shared_objects <= 0) {
+    printf("FAIL: expected shared objects but got none\n");
+    return false;
   }
+  printf(
+      "PASS: memory aliasing is active (%d shared objects)\n",
+      stats.num_shared_objects);
 
   size_t naive_bytes =
       static_cast<size_t>(stats.num_tensors) * dim * dim * sizeof(float);
   printf("Naive tensor bytes: %zu\n", naive_bytes);
   printf("Actual tensor bytes: %zu\n", stats.tensor_buffer_bytes);
-  if (stats.num_shared_objects > 0 && stats.tensor_buffer_bytes < naive_bytes) {
-    printf("PASS: memory savings from aliasing confirmed\n");
+  if (stats.tensor_buffer_bytes >= naive_bytes) {
+    printf("FAIL: expected memory savings but actual >= naive\n");
+    return false;
   }
+  printf("PASS: memory savings from aliasing confirmed\n");
 
   printf("PASS: chained add memory test\n");
   return true;
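Reviewer note on the series: the naive_bytes baseline in the strict test
assumes every counted tensor is a dim x dim fp32 tensor. That holds for
this model, but the comparison would be misleading for graphs with mixed
shapes. A shape-agnostic baseline could be accumulated in memory_stats()
itself, for example (sketch, assuming a new naive_tensor_buffer_bytes
field were added to WebGPUMemoryStats):

  // Inside the per-tensor loop of WebGPUGraph::memory_stats(): every
  // tensor contributes its full size to the no-aliasing baseline,
  // whether or not it ends up in a shared object.
  stats.naive_tensor_buffer_bytes += tensors_[i].nbytes;

The test could then assert that tensor_buffer_bytes is strictly less
than naive_tensor_buffer_bytes instead of recomputing the baseline from
the model's known shape.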