diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp index 5321c20aaa4..3132671a6b0 100644 --- a/backends/webgpu/runtime/WebGPUBackend.cpp +++ b/backends/webgpu/runtime/WebGPUBackend.cpp @@ -38,6 +38,12 @@ using executorch::runtime::register_backend; using executorch::runtime::Result; using executorch::runtime::Span; +static WebGPUMemoryStats s_last_memory_stats; + +WebGPUMemoryStats get_last_memory_stats() { + return s_last_memory_stats; +} + bool WebGPUBackend::is_available() const { return true; } @@ -83,6 +89,8 @@ Result<DelegateHandle*> WebGPUBackend::init( return Error::DelegateInvalidCompatibility; } + s_last_memory_stats = graph->memory_stats(); + processed->Free(); return graph; diff --git a/backends/webgpu/runtime/WebGPUBackend.h b/backends/webgpu/runtime/WebGPUBackend.h index 9c20a3d53be..e82b8dcca84 100644 --- a/backends/webgpu/runtime/WebGPUBackend.h +++ b/backends/webgpu/runtime/WebGPUBackend.h @@ -8,12 +8,15 @@ #pragma once +#include <executorch/backends/webgpu/runtime/WebGPUGraph.h> #include namespace executorch { namespace backends { namespace webgpu { +WebGPUMemoryStats get_last_memory_stats(); + class WebGPUBackend final : public ::executorch::runtime::BackendInterface { public: ~WebGPUBackend() override = default; diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index f0e4c7959c0..ac110225a2e 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -14,6 +14,8 @@ #include #include +#include <algorithm> + #include #include @@ -50,9 +52,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) { WebGPUGraph::WebGPUGraph() = default; WebGPUGraph::~WebGPUGraph() { - for (auto& t : tensors_) { - if (t.buffer) { - wgpuBufferRelease(t.buffer); + for (size_t i = 0; i < tensors_.size(); i++) { + if (tensors_[i].buffer && + (i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) { + wgpuBufferRelease(tensors_[i].buffer); + } + } + for (auto& buf : shared_buffers_) { + if (buf) { + 
wgpuBufferRelease(buf); } } for (auto& buf : output_staging_buffers_) { @@ -94,6 +102,7 @@ void WebGPUGraph::build( const int num_vals = values ? values->size() : 0; value_types_.resize(num_vals, ValueType::Null); tensors_.resize(num_vals); + tensor_mem_obj_ids_.resize(num_vals, -1); ints_.resize(num_vals, 0); doubles_.resize(num_vals, 0.0); bools_.resize(num_vals, false); @@ -121,27 +130,41 @@ } tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype()); - // Create GPU buffer - WGPUBufferDescriptor buf_desc = {}; - buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4; - buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | - WGPUBufferUsage_CopySrc; - buf_desc.mappedAtCreation = false; - tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); - - // Upload constant data if this tensor has a constant_id int constant_id = vk_tensor->constant_id(); - if (constant_id >= 0 && constant_data) { - const auto* constants = graph->constants(); - if (constants && constant_id < static_cast<int>(constants->size())) { - const auto* vk_bytes = constants->Get(constant_id); - // Only upload from embedded bytes (not named data map) - if (vk_bytes->offset() != UINT64_MAX) { - const uint8_t* src = constant_data + vk_bytes->offset(); - wgpuQueueWriteBuffer( - queue_, tensor.buffer, 0, src, tensor.nbytes); + int mem_obj_id = vk_tensor->mem_obj_id(); + + // Constants always get dedicated buffers regardless of mem_obj_id + if (constant_id >= 0 || mem_obj_id < 0) { + tensor_mem_obj_ids_[i] = -1; + WGPUBufferDescriptor buf_desc = {}; + ET_CHECK_MSG(tensor.nbytes > 0, "Tensor has zero bytes"); + buf_desc.size = tensor.nbytes; + buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); + + if (constant_id >= 0 && constant_data) { + const auto* constants = graph->constants(); + if (constants && + constant_id < 
static_cast<int>(constants->size())) { + const auto* vk_bytes = constants->Get(constant_id); + if (vk_bytes->offset() != UINT64_MAX) { + const uint8_t* src = constant_data + vk_bytes->offset(); + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, src, tensor.nbytes); + } } } + } else { + // Shared buffer: track required size, defer allocation to pass 2 + tensor_mem_obj_ids_[i] = mem_obj_id; + size_t id = static_cast<size_t>(mem_obj_id); + if (id >= shared_buffer_sizes_.size()) { + shared_buffer_sizes_.resize(id + 1, 0); + } + shared_buffer_sizes_[id] = + std::max(shared_buffer_sizes_[id], tensor.nbytes); } break; } @@ -166,6 +189,24 @@ } } + // Allocate shared buffers and assign to tensors + shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr); + for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) { + WGPUBufferDescriptor buf_desc = {}; + ET_CHECK_MSG(shared_buffer_sizes_[id] > 0, "Shared buffer has zero bytes"); + buf_desc.size = shared_buffer_sizes_[id]; + buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc); + } + for (int i = 0; i < num_vals; i++) { + int mid = tensor_mem_obj_ids_[i]; + if (mid >= 0) { + tensors_[i].buffer = shared_buffers_[mid]; + } + } + // Phase 2: Record input and output IDs const auto* fb_input_ids = graph->input_ids(); if (fb_input_ids) { @@ -181,7 +222,8 @@ // Create staging buffer for output readback WGPUBufferDescriptor staging_desc = {}; - staging_desc.size = tensors_[oid].nbytes > 0 ? 
tensors_[oid].nbytes : 4; + ET_CHECK_MSG(tensors_[oid].nbytes > 0, "Output tensor has zero bytes"); + staging_desc.size = tensors_[oid].nbytes; staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; staging_desc.mappedAtCreation = false; output_staging_buffers_.push_back( @@ -315,10 +357,21 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const { WebGPUMemoryStats stats; for (size_t i = 0; i < value_types_.size(); i++) { if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) { - stats.tensor_buffer_bytes += tensors_[i].nbytes; stats.num_tensors++; + // Shared tensors are tracked via shared_buffer_sizes_ + bool is_shared = + i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0; + if (!is_shared) { + stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes; + } } } + for (size_t s : shared_buffer_sizes_) { + stats.shared_buffer_bytes += s; + } + stats.num_shared_objects = static_cast<int>(shared_buffers_.size()); + stats.tensor_buffer_bytes = + stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes; for (size_t i = 0; i < output_ids_.size(); i++) { stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes; } diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 2d6996e9219..d68d33267ab 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -32,13 +32,17 @@ struct WebGPUDispatch { struct WebGPUMemoryStats { size_t tensor_buffer_bytes = 0; + size_t shared_buffer_bytes = 0; + int num_shared_objects = 0; + size_t unshared_tensor_buffer_bytes = 0; size_t staging_buffer_bytes = 0; size_t uniform_buffer_bytes = 0; int num_tensors = 0; int num_dispatches = 0; size_t total_bytes() const { - return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes; + return shared_buffer_bytes + unshared_tensor_buffer_bytes + + staging_buffer_bytes + uniform_buffer_bytes; } }; @@ -134,6 +138,11 @@ class WebGPUGraph { std::vector<int> input_ids_; 
std::vector<int> output_ids_; + // Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer. + std::vector<int> tensor_mem_obj_ids_; + std::vector<WGPUBuffer> shared_buffers_; + std::vector<size_t> shared_buffer_sizes_; + // Staging buffers for reading back outputs (MapRead | CopyDst). std::vector<WGPUBuffer> output_staging_buffers_; diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py index f4b33ced76d..e8da644a1f9 100644 --- a/backends/webgpu/test/ops/add/test_add.py +++ b/backends/webgpu/test/ops/add/test_add.py @@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: z = x + y z = z + x z = z + y + z = z + x + z = z + y return z @@ -97,5 +99,18 @@ def export_add_model(output_path: str) -> None: print(f"Exported {output_path}") +def export_chained_add_model(output_path: str) -> None: + """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing.""" + model = AddChainedModule() + example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024)) + ep = torch.export.export(model, example_inputs) + et_program = to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + with open(output_path, "wb") as f: + f.write(et_program.buffer) + print(f"Exported {output_path}") + + if __name__ == "__main__": unittest.main() diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh index 684926cb181..a42b2304ee7 100755 --- a/backends/webgpu/test/test_build_webgpu.sh +++ b/backends/webgpu/test/test_build_webgpu.sh @@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v # ── Step 2: Export .pte model ───────────────────────────────────────────────── -echo "=== Step 2: Export test model ===" +echo "=== Step 2: Export test models ===" PTE_MODEL="/tmp/webgpu_add_test.pte" +PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" cd "${EXECUTORCH_ROOT}" $PYTHON_EXECUTABLE -c " -from 
executorch.backends.webgpu.test.ops.add.test_add import export_add_model +from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model export_add_model('${PTE_MODEL}') +export_chained_add_model('${PTE_CHAINED_MODEL}') " # ── Step 3: Native build + test (wgpu-native) ──────────────────────────────── @@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC} echo "=== Step 4: Run native test ===" WEBGPU_TEST_MODEL="${PTE_MODEL}" \ +WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test" echo "=== Done ===" diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index c60695e11c9..1028e64a26e 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include <executorch/backends/webgpu/runtime/WebGPUBackend.h> #include #include #include @@ -75,6 +76,92 @@ static bool test_single_add(const std::string& model_path) { return true; } +static bool test_chained_add_memory(const std::string& model_path) { + printf("\n--- Test: chained add memory aliasing (1024x1024) ---\n"); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + auto stats = get_last_memory_stats(); + printf("Memory stats after build:\n"); + printf(" num_tensors: %d\n", stats.num_tensors); + printf(" num_shared_objects: %d\n", stats.num_shared_objects); + printf(" shared_buffer_bytes: %zu\n", stats.shared_buffer_bytes); + printf( + " unshared_tensor_buffer_bytes: %zu\n", + stats.unshared_tensor_buffer_bytes); + printf(" tensor_buffer_bytes: %zu\n", stats.tensor_buffer_bytes); + printf(" total_bytes: %zu\n", stats.total_bytes()); + + constexpr int dim = 1024; + constexpr int size = 
dim * dim; + + std::vector<float> x_data(size); + std::vector<float> y_data(size); + for (int i = 0; i < size; i++) { + x_data[i] = static_cast<float>(i % 100) * 0.01f; + y_data[i] = static_cast<float>(i % 50) * 0.02f; + } + + auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data)); + auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data)); + + auto result = module.forward({EValue(x), EValue(y)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + + // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y + const auto& out_tensor = outputs[0].toTensor(); + const float* out_data = out_tensor.const_data_ptr<float>(); + + float max_error = 0.0f; + for (int i = 0; i < size; i++) { + float expected = 3.0f * x_data[i] + 3.0f * y_data[i]; + float error = std::abs(out_data[i] - expected); + max_error = std::max(max_error, error); + } + + printf("Max error: %e\n", max_error); + if (max_error > 1e-3f) { + printf("FAIL: max error exceeds tolerance 1e-3\n"); + return false; + } + + if (stats.num_shared_objects <= 0) { + printf("FAIL: expected shared objects but got none\n"); + return false; + } + printf( + "PASS: memory aliasing is active (%d shared objects)\n", + stats.num_shared_objects); + + size_t naive_bytes = + static_cast<size_t>(stats.num_tensors) * dim * dim * sizeof(float); + printf("Naive tensor bytes: %zu\n", naive_bytes); + printf("Actual tensor bytes: %zu\n", stats.tensor_buffer_bytes); + if (stats.tensor_buffer_bytes >= naive_bytes) { + printf("FAIL: expected memory savings but actual >= naive\n"); + return false; + } + printf("PASS: memory savings from aliasing confirmed\n"); + + printf("PASS: chained add memory test\n"); + return true; +} + int main(int argc, char** argv) { std::string model_path = "webgpu_add_test.pte"; if (argc > 1) { @@ -84,6 +171,11 @@ 
model_path = env; } + std::string chained_model_path; + if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) { + chained_model_path = env; + } + WebGPUContext ctx; try { ctx = create_webgpu_context(); @@ -97,6 +189,10 @@ int main(int argc, char** argv) { bool ok = test_single_add(model_path); + if (!chained_model_path.empty()) { + ok = test_chained_add_memory(chained_model_path) && ok; + } + set_default_webgpu_context(nullptr); destroy_webgpu_context(ctx);