From 017e1981a1fa96698c197a1e3054c8f19d71f402 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Mon, 4 May 2026 19:42:28 -0700
Subject: [PATCH 1/6] WebGPU: add memory aliasing for intermediate tensor
 buffers

The export pipeline already runs a greedy memory planning pass that
assigns mem_obj_id to tensors with non-overlapping lifetimes, but the
WebGPU runtime was ignoring it and allocating a dedicated WGPUBuffer
per tensor.

Read mem_obj_id from the flatbuffer during graph build. Tensors sharing
the same mem_obj_id now share a single WGPUBuffer sized to the largest
user. Constants and tensors without a mem_obj_id still get dedicated
buffers.

Adds a chained-add native test (z=x+y; z=z+x; z=z+y) that verifies both
correctness and that memory aliasing produces savings (~20% for this
model).

Co-authored with Claude.
---
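Reviewer note (after the fold, so git-am drops it): a quick sanity check
on the ~20% figure. Each 1024x1024 fp32 tensor is 4 MiB. The three-add
chain touches five tensors (x, y, and the chain results z1, z2, z3), so
one dedicated WGPUBuffer per tensor costs 5 * 4 MiB = 20 MiB. z1 is dead
as soon as z2 is produced, so z1 and z3 can occupy the same memory object
while z2 takes a second one; that removes one 4 MiB allocation,
20 MiB -> 16 MiB, which is the quoted ~20%. The exact split depends on
how the planner treats graph inputs and outputs, but the magnitude
checks out.

The export-side pass itself is not shown in this series. As a rough
illustration of the idea it implements (greedy reuse of memory objects
across non-overlapping lifetimes), here is a sketch; the names and the
first-fit policy are assumptions for illustration, not the actual
ExecuTorch pass:

  #include <cstddef>
  #include <vector>

  struct Lifetime {
    int first_use; // node index where the tensor is produced
    int last_use;  // node index of its last consumer
    size_t nbytes;
  };

  // Tensors must be ordered by first_use (i.e. topological order).
  // Returns one mem_obj_id per tensor; an object is reused whenever its
  // previous occupant's lifetime ended before the new tensor is produced.
  std::vector<int> assign_mem_obj_ids(const std::vector<Lifetime>& lts) {
    std::vector<int> ids(lts.size(), -1);
    std::vector<int> free_at; // per object: node index when it frees up
    for (size_t t = 0; t < lts.size(); t++) {
      int chosen = -1;
      for (size_t o = 0; o < free_at.size(); o++) {
        if (free_at[o] < lts[t].first_use) {
          chosen = static_cast<int>(o);
          break;
        }
      }
      if (chosen < 0) {
        chosen = static_cast<int>(free_at.size());
        free_at.push_back(0);
      }
      free_at[chosen] = lts[t].last_use;
      ids[t] = chosen;
    }
    return ids;
  }

The runtime half, which is what this patch adds, then only needs the
largest nbytes per object, which is exactly what shared_buffer_sizes_
tracks below.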
 backends/webgpu/runtime/WebGPUBackend.cpp   |  8 ++
 backends/webgpu/runtime/WebGPUBackend.h     |  3 +
 backends/webgpu/runtime/WebGPUGraph.cpp     | 91 +++++++++++++++-----
 backends/webgpu/runtime/WebGPUGraph.h       | 11 ++-
 backends/webgpu/test/ops/add/test_add.py    | 13 +++
 backends/webgpu/test/test_webgpu_native.cpp | 95 +++++++++++++++++++++
 6 files changed, 198 insertions(+), 23 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp
index 5321c20aaa4..3132671a6b0 100644
--- a/backends/webgpu/runtime/WebGPUBackend.cpp
+++ b/backends/webgpu/runtime/WebGPUBackend.cpp
@@ -38,6 +38,12 @@ using executorch::runtime::register_backend;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 
+static WebGPUMemoryStats s_last_memory_stats;
+
+WebGPUMemoryStats get_last_memory_stats() {
+  return s_last_memory_stats;
+}
+
 bool WebGPUBackend::is_available() const {
   return true;
 }
@@ -83,6 +89,8 @@ Result<DelegateHandle*> WebGPUBackend::init(
     return Error::DelegateInvalidCompatibility;
   }
 
+  s_last_memory_stats = graph->memory_stats();
+
   processed->Free();
 
   return graph;
diff --git a/backends/webgpu/runtime/WebGPUBackend.h b/backends/webgpu/runtime/WebGPUBackend.h
index 9c20a3d53be..e82b8dcca84 100644
--- a/backends/webgpu/runtime/WebGPUBackend.h
+++ b/backends/webgpu/runtime/WebGPUBackend.h
@@ -8,12 +8,15 @@
 
 #pragma once
 
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
 #include <executorch/runtime/backend/interface.h>
 
 namespace executorch {
 namespace backends {
 namespace webgpu {
 
+WebGPUMemoryStats get_last_memory_stats();
+
 class WebGPUBackend final : public ::executorch::runtime::BackendInterface {
  public:
  ~WebGPUBackend() override = default;
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index f0e4c7959c0..a320a57b610 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -50,9 +50,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
 WebGPUGraph::WebGPUGraph() = default;
 
 WebGPUGraph::~WebGPUGraph() {
-  for (auto& t : tensors_) {
-    if (t.buffer) {
-      wgpuBufferRelease(t.buffer);
+  for (size_t i = 0; i < tensors_.size(); i++) {
+    if (tensors_[i].buffer &&
+        (i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) {
+      wgpuBufferRelease(tensors_[i].buffer);
+    }
+  }
+  for (auto& buf : shared_buffers_) {
+    if (buf) {
+      wgpuBufferRelease(buf);
     }
   }
   for (auto& buf : output_staging_buffers_) {
@@ -94,6 +100,7 @@ void WebGPUGraph::build(
   const int num_vals = values ? values->size() : 0;
   value_types_.resize(num_vals, ValueType::Null);
   tensors_.resize(num_vals);
+  tensor_mem_obj_ids_.resize(num_vals, -1);
   ints_.resize(num_vals, 0);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
@@ -121,27 +128,39 @@
       }
       tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());
 
-      // Create GPU buffer
-      WGPUBufferDescriptor buf_desc = {};
-      buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
-      buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-          WGPUBufferUsage_CopySrc;
-      buf_desc.mappedAtCreation = false;
-      tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-      // Upload constant data if this tensor has a constant_id
       int constant_id = vk_tensor->constant_id();
-      if (constant_id >= 0 && constant_data) {
-        const auto* constants = graph->constants();
-        if (constants && constant_id < static_cast<int>(constants->size())) {
-          const auto* vk_bytes = constants->Get(constant_id);
-          // Only upload from embedded bytes (not named data map)
-          if (vk_bytes->offset() != UINT64_MAX) {
-            const uint8_t* src = constant_data + vk_bytes->offset();
-            wgpuQueueWriteBuffer(
-                queue_, tensor.buffer, 0, src, tensor.nbytes);
+      int mem_obj_id = vk_tensor->mem_obj_id();
+      tensor_mem_obj_ids_[i] = mem_obj_id;
+
+      if (constant_id >= 0 || mem_obj_id < 0) {
+        // Dedicated buffer: constants or tensors that don't share memory
+        WGPUBufferDescriptor buf_desc = {};
+        buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
+        buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+            WGPUBufferUsage_CopySrc;
+        buf_desc.mappedAtCreation = false;
+        tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+        if (constant_id >= 0 && constant_data) {
+          const auto* constants = graph->constants();
+          if (constants &&
+              constant_id < static_cast<int>(constants->size())) {
+            const auto* vk_bytes = constants->Get(constant_id);
+            if (vk_bytes->offset() != UINT64_MAX) {
+              const uint8_t* src = constant_data + vk_bytes->offset();
+              wgpuQueueWriteBuffer(
+                  queue_, tensor.buffer, 0, src, tensor.nbytes);
+            }
           }
         }
+      } else {
+        // Shared buffer: track required size, defer allocation to pass 2
+        size_t id = static_cast<size_t>(mem_obj_id);
+        if (id >= shared_buffer_sizes_.size()) {
+          shared_buffer_sizes_.resize(id + 1, 0);
+        }
+        shared_buffer_sizes_[id] =
+            std::max(shared_buffer_sizes_[id], tensor.nbytes);
       }
       break;
     }
@@ -166,6 +185,24 @@
     }
   }
 
+  // Allocate shared buffers and assign to tensors
+  shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
+  for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
+    WGPUBufferDescriptor buf_desc = {};
+    buf_desc.size =
+        shared_buffer_sizes_[id] > 0 ? shared_buffer_sizes_[id] : 4;
+    buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+        WGPUBufferUsage_CopySrc;
+    buf_desc.mappedAtCreation = false;
+    shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc);
+  }
+  for (int i = 0; i < num_vals; i++) {
+    int mid = tensor_mem_obj_ids_[i];
+    if (mid >= 0) {
+      tensors_[i].buffer = shared_buffers_[mid];
+    }
+  }
+
   // Phase 2: Record input and output IDs
   const auto* fb_input_ids = graph->input_ids();
   if (fb_input_ids) {
@@ -315,10 +352,20 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   WebGPUMemoryStats stats;
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
-      stats.tensor_buffer_bytes += tensors_[i].nbytes;
       stats.num_tensors++;
+      if (i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0) {
+        // Shared tensor — actual allocation tracked via shared_buffer_sizes_
+      } else {
+        stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
+      }
     }
   }
+  for (size_t s : shared_buffer_sizes_) {
+    stats.shared_buffer_bytes += s;
+  }
+  stats.num_shared_objects = static_cast<int>(shared_buffers_.size());
+  stats.tensor_buffer_bytes =
+      stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes;
   for (size_t i = 0; i < output_ids_.size(); i++) {
     stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes;
   }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 2d6996e9219..d68d33267ab 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -32,13 +32,17 @@ struct WebGPUDispatch {
 };
 
 struct WebGPUMemoryStats {
   size_t tensor_buffer_bytes = 0;
+  size_t shared_buffer_bytes = 0;
+  int num_shared_objects = 0;
+  size_t unshared_tensor_buffer_bytes = 0;
   size_t staging_buffer_bytes = 0;
   size_t uniform_buffer_bytes = 0;
   int num_tensors = 0;
   int num_dispatches = 0;
 
   size_t total_bytes() const {
-    return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes;
+    return shared_buffer_bytes + unshared_tensor_buffer_bytes +
+        staging_buffer_bytes + uniform_buffer_bytes;
   }
 };
@@ -134,6 +138,11 @@ class WebGPUGraph {
   std::vector<int> input_ids_;
   std::vector<int> output_ids_;
 
+  // Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer.
+  std::vector<int> tensor_mem_obj_ids_;
+  std::vector<WGPUBuffer> shared_buffers_;
+  std::vector<size_t> shared_buffer_sizes_;
+
   // Staging buffers for reading back outputs (MapRead | CopyDst).
   std::vector<WGPUBuffer> output_staging_buffers_;
 
diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
index f4b33ced76d..8829dfe704e 100644
--- a/backends/webgpu/test/ops/add/test_add.py
+++ b/backends/webgpu/test/ops/add/test_add.py
@@ -97,5 +97,18 @@ def export_add_model(output_path: str) -> None:
     print(f"Exported {output_path}")
 
 
+def export_chained_add_model(output_path: str) -> None:
+    """Export a chained add model (z=x+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
+    model = AddChainedModule()
+    example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
+    ep = torch.export.export(model, example_inputs)
+    et_program = to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+    with open(output_path, "wb") as f:
+        f.write(et_program.buffer)
+    print(f"Exported {output_path}")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index c60695e11c9..bf9c44a858a 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/webgpu/runtime/WebGPUBackend.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
 #include
@@ -75,6 +76,91 @@ static bool test_single_add(const std::string& model_path) {
   return true;
 }
 
+static bool test_chained_add_memory(const std::string& model_path) {
+  printf("\n--- Test: chained add memory aliasing (1024x1024) ---\n");
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  auto stats = get_last_memory_stats();
+  printf("Memory stats after build:\n");
+  printf("  num_tensors: %d\n", stats.num_tensors);
+  printf("  num_shared_objects: %d\n", stats.num_shared_objects);
+  printf("  shared_buffer_bytes: %zu\n", stats.shared_buffer_bytes);
+  printf(
+      "  unshared_tensor_buffer_bytes: %zu\n",
+      stats.unshared_tensor_buffer_bytes);
+  printf("  tensor_buffer_bytes: %zu\n", stats.tensor_buffer_bytes);
+  printf("  total_bytes: %zu\n", stats.total_bytes());
+
+  constexpr int dim = 1024;
+  constexpr int size = dim * dim;
+
+  std::vector<float> x_data(size);
+  std::vector<float> y_data(size);
+  for (int i = 0; i < size; i++) {
+    x_data[i] = static_cast<float>(i % 100) * 0.01f;
+    y_data[i] = static_cast<float>(i % 50) * 0.02f;
+  }
+
+  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
+  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
+
+  auto result = module.forward({EValue(x), EValue(y)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+
+  // z = x+y; z = z+x = 2x+y; z = z+y = 2x+2y
+  const auto& out_tensor = outputs[0].toTensor();
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_error = 0.0f;
+  for (int i = 0; i < size; i++) {
+    float expected = 2.0f * x_data[i] + 2.0f * y_data[i];
+    float error = std::abs(out_data[i] - expected);
+    max_error = std::max(max_error, error);
+  }
+
+  printf("Max error: %e\n", max_error);
+  if (max_error > 1e-3f) {
+    printf("FAIL: max error exceeds tolerance 1e-3\n");
+    return false;
+  }
+
+  if (stats.num_shared_objects > 0) {
+    printf(
+        "PASS: memory aliasing is active (%d shared objects)\n",
+        stats.num_shared_objects);
+  } else {
+    printf(
+        "INFO: no shared objects (memory aliasing not used by this model)\n");
+  }
+
+  size_t naive_bytes =
+      static_cast<size_t>(stats.num_tensors) * dim * dim * sizeof(float);
+  printf("Naive tensor bytes: %zu\n", naive_bytes);
+  printf("Actual tensor bytes: %zu\n", stats.tensor_buffer_bytes);
+  if (stats.num_shared_objects > 0 && stats.tensor_buffer_bytes < naive_bytes) {
+    printf("PASS: memory savings from aliasing confirmed\n");
+  }
+
+  printf("PASS: chained add memory test\n");
+  return true;
+}
+
 int main(int argc, char** argv) {
   std::string model_path = "webgpu_add_test.pte";
   if (argc > 1) {
@@ -84,6 +170,11 @@
     model_path = env;
   }
 
+  std::string chained_model_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
+    chained_model_path = env;
+  }
+
   WebGPUContext ctx;
   try {
     ctx = create_webgpu_context();
@@ -97,6 +188,10 @@
 
   bool ok = test_single_add(model_path);
 
+  if (!chained_model_path.empty()) {
+    ok = test_chained_add_memory(chained_model_path) && ok;
+  }
+
   set_default_webgpu_context(nullptr);
   destroy_webgpu_context(ctx);

From 1e6143d05b2da55c4150671797ee12bee0958750 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:27:44 -0700
Subject: [PATCH 2/6] WebGPU: replace zero-size buffer fallbacks with asserts

Replace the silent `nbytes > 0 ? nbytes : 4` fallback pattern with
ET_CHECK_MSG assertions. If a zero-byte tensor reaches buffer creation,
we want to know immediately rather than silently creating a dummy
4-byte buffer that masks the issue.

Co-authored with Claude.
---
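Reviewer note (after the fold, so git-am drops it): ET_CHECK_MSG takes
printf-style format arguments, so the message could also pinpoint the
offending value. A possible variant of the first assertion, assuming the
loop index i is still in scope at that point (illustrative only; the
patch below keeps the plain message):

  ET_CHECK_MSG(
      tensor.nbytes > 0,
      "Tensor at value index %d has zero bytes",
      i);

Either way the failure is now an immediate abort instead of a silent
4-byte dummy allocation, which is the point of the change.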
 backends/webgpu/runtime/WebGPUGraph.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index a320a57b610..6af585d3514 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include <executorch/runtime/platform/assert.h>
+
 #include
 #include
@@ -135,7 +137,8 @@
       if (constant_id >= 0 || mem_obj_id < 0) {
         // Dedicated buffer: constants or tensors that don't share memory
         WGPUBufferDescriptor buf_desc = {};
-        buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
+        ET_CHECK_MSG(tensor.nbytes > 0, "Tensor has zero bytes");
+        buf_desc.size = tensor.nbytes;
         buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
             WGPUBufferUsage_CopySrc;
         buf_desc.mappedAtCreation = false;
@@ -189,8 +192,8 @@
   shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
   for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
     WGPUBufferDescriptor buf_desc = {};
-    buf_desc.size =
-        shared_buffer_sizes_[id] > 0 ? shared_buffer_sizes_[id] : 4;
+    ET_CHECK_MSG(shared_buffer_sizes_[id] > 0, "Shared buffer has zero bytes");
+    buf_desc.size = shared_buffer_sizes_[id];
     buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
         WGPUBufferUsage_CopySrc;
     buf_desc.mappedAtCreation = false;
@@ -218,7 +221,8 @@
 
       // Create staging buffer for output readback
       WGPUBufferDescriptor staging_desc = {};
-      staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4;
+      ET_CHECK_MSG(tensors_[oid].nbytes > 0, "Output tensor has zero bytes");
+      staging_desc.size = tensors_[oid].nbytes;
       staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
       staging_desc.mappedAtCreation = false;
       output_staging_buffers_.push_back(

From 8f787922e0a8f03b1237c7c5f0a8cc536924a261 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:30:34 -0700
Subject: [PATCH 3/6] WebGPU: clean up empty if-branch in memory_stats()

Invert the condition to eliminate the empty if-body with a comment.

Co-authored with Claude.
---
 backends/webgpu/runtime/WebGPUGraph.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 6af585d3514..8d567828608 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -357,9 +357,10 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
       stats.num_tensors++;
-      if (i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0) {
-        // Shared tensor — actual allocation tracked via shared_buffer_sizes_
-      } else {
+      // Shared tensors are tracked via shared_buffer_sizes_
+      bool is_shared =
+          i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
+      if (!is_shared) {
         stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
       }
     }

From d4333e3c67ebf80d8e890568996a5b874245172e Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:32:27 -0700
Subject: [PATCH 4/6] WebGPU: add chained-add model to test script

Export and run the chained-add memory aliasing test in
test_build_webgpu.sh so it runs automatically instead of requiring a
manual WEBGPU_TEST_CHAINED_MODEL env var.

Co-authored with Claude.
---
 backends/webgpu/test/test_build_webgpu.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 684926cb181..a42b2304ee7 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
-echo "=== Step 2: Export test model ==="
+echo "=== Step 2: Export test models ==="
 PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
 export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
 "
 
 # ── Step 3: Native build + test (wgpu-native) ────────────────────────────────
@@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
 
 echo "=== Step 4: Run native test ==="
 WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
 echo "=== Done ==="

From 08b37d85386766964e91b60eb34656f1b5b85a1f Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 08:33:42 -0700
Subject: [PATCH 5/6] WebGPU: extend chained add test to 5 ops for better
 aliasing coverage

Longer chain produces more intermediates, giving the memory planner
more opportunity to alias buffers. Expected output: 3x + 3y.

Co-authored with Claude.
---
 backends/webgpu/test/ops/add/test_add.py    | 4 +++-
 backends/webgpu/test/test_webgpu_native.cpp | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
index 8829dfe704e..e8da644a1f9 100644
--- a/backends/webgpu/test/ops/add/test_add.py
+++ b/backends/webgpu/test/ops/add/test_add.py
@@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         z = x + y
         z = z + x
         z = z + y
+        z = z + x
+        z = z + y
         return z
 
 
@@ -98,7 +100,7 @@ def export_add_model(output_path: str) -> None:
 
 
 def export_chained_add_model(output_path: str) -> None:
-    """Export a chained add model (z=x+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
+    """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
     model = AddChainedModule()
     example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
     ep = torch.export.export(model, example_inputs)
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index bf9c44a858a..ebb1beb83a6 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -123,13 +123,13 @@ static bool test_chained_add_memory(const std::string& model_path) {
     return false;
   }
 
-  // z = x+y; z = z+x = 2x+y; z = z+y = 2x+2y
+  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
   const auto& out_tensor = outputs[0].toTensor();
   const float* out_data = out_tensor.const_data_ptr<float>();
 
   float max_error = 0.0f;
   for (int i = 0; i < size; i++) {
-    float expected = 2.0f * x_data[i] + 2.0f * y_data[i];
+    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
     float error = std::abs(out_data[i] - expected);
     max_error = std::max(max_error, error);
   }
From 55ceea4469f0c9cc3595a2206e4244a552576d07 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Tue, 5 May 2026 12:20:01 -0700
Subject: [PATCH 6/6] WebGPU: fix constant tensor buffer leak and make
 aliasing test strict

Fix: if a constant tensor has mem_obj_id >= 0, force it to -1 so the
dedicated buffer path and the destructor stay consistent. Previously
the buffer would leak and get overwritten by the shared buffer pass.

Also make the chained-add test actually fail when aliasing is absent
instead of just printing informational messages.

Co-authored with Claude.
---
 backends/webgpu/runtime/WebGPUGraph.cpp     |  5 +++--
 backends/webgpu/test/test_webgpu_native.cpp | 19 ++++++++++---------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 8d567828608..ac110225a2e 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -132,10 +132,10 @@ void WebGPUGraph::build(
 
       int constant_id = vk_tensor->constant_id();
       int mem_obj_id = vk_tensor->mem_obj_id();
-      tensor_mem_obj_ids_[i] = mem_obj_id;
 
+      // Constants always get dedicated buffers regardless of mem_obj_id
       if (constant_id >= 0 || mem_obj_id < 0) {
-        // Dedicated buffer: constants or tensors that don't share memory
+        tensor_mem_obj_ids_[i] = -1;
         WGPUBufferDescriptor buf_desc = {};
         ET_CHECK_MSG(tensor.nbytes > 0, "Tensor has zero bytes");
         buf_desc.size = tensor.nbytes;
@@ -158,6 +158,7 @@
       } else {
         // Shared buffer: track required size, defer allocation to pass 2
+        tensor_mem_obj_ids_[i] = mem_obj_id;
         size_t id = static_cast<size_t>(mem_obj_id);
         if (id >= shared_buffer_sizes_.size()) {
           shared_buffer_sizes_.resize(id + 1, 0);
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index ebb1beb83a6..1028e64a26e 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -140,22 +140,23 @@ static bool test_chained_add_memory(const std::string& model_path) {
     return false;
   }
 
-  if (stats.num_shared_objects > 0) {
-    printf(
-        "PASS: memory aliasing is active (%d shared objects)\n",
-        stats.num_shared_objects);
-  } else {
-    printf(
-        "INFO: no shared objects (memory aliasing not used by this model)\n");
+  if (stats.num_shared_objects <= 0) {
+    printf("FAIL: expected shared objects but got none\n");
+    return false;
   }
+  printf(
+      "PASS: memory aliasing is active (%d shared objects)\n",
+      stats.num_shared_objects);
 
   size_t naive_bytes =
       static_cast<size_t>(stats.num_tensors) * dim * dim * sizeof(float);
   printf("Naive tensor bytes: %zu\n", naive_bytes);
   printf("Actual tensor bytes: %zu\n", stats.tensor_buffer_bytes);
-  if (stats.num_shared_objects > 0 && stats.tensor_buffer_bytes < naive_bytes) {
-    printf("PASS: memory savings from aliasing confirmed\n");
+  if (stats.tensor_buffer_bytes >= naive_bytes) {
+    printf("FAIL: expected memory savings but actual >= naive\n");
+    return false;
   }
+  printf("PASS: memory savings from aliasing confirmed\n");
 
   printf("PASS: chained add memory test\n");
   return true;
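Reviewer note on the series: the naive_bytes baseline in the strict test
assumes every counted tensor is a dim x dim fp32 tensor. That holds for
this model, but the comparison would be misleading for graphs with mixed
shapes. A shape-agnostic baseline could be accumulated in memory_stats()
itself, for example (sketch, assuming a new naive_tensor_buffer_bytes
field were added to WebGPUMemoryStats):

  // Inside the per-tensor loop of WebGPUGraph::memory_stats(): every
  // tensor contributes its full size to the no-aliasing baseline,
  // whether or not it ends up in a shared object.
  stats.naive_tensor_buffer_bytes += tensors_[i].nbytes;

The test could then assert that tensor_buffer_bytes is strictly less
than naive_tensor_buffer_bytes instead of recomputing the baseline from
the model's known shape.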