From 3372b05dc9f0ea6308fe73f7aa1c6233a00fd9db Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Tue, 26 May 2026 22:52:30 +0200
Subject: [PATCH 1/6] perf: cache lexicographic chunk coords in sharding codec

The subchunk_write_order feature (#3826) regressed sharded write
performance: _encode_partial_single rebuilt the full per-shard chunk
coordinate grid on every write via
`np.array(list(_subchunk_order_iter(..., "lexicographic")))`, and
`to_dict_vectorized` rebuilt a tuple key per row with `tuple(coords.ravel())`.
For a single-chunk write into a shard with tens of thousands of chunks this
roughly doubled write time (~22ms -> ~40ms on test_sharded_morton_write_single_chunk,
matching the -44% CodSpeed regression).

Add cached `_lexicographic_order` (array) and `_lexicographic_order_keys`
(tuples) helpers in indexing.py, mirroring `_morton_order`/`_morton_order_keys`,
and pass the cached keys into `to_dict_vectorized` instead of deriving them
row-by-row. This restores write throughput to the pre-#3826 baseline while
preserving identical chunk ordering (verified equal to np.ndindex across
shapes including 0-d and empty).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 changes/4001.misc.md        |  4 ++++
 src/zarr/codecs/sharding.py | 24 ++++++++++++++++++------
 src/zarr/core/indexing.py   | 27 +++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 changes/4001.misc.md

diff --git a/changes/4001.misc.md b/changes/4001.misc.md
new file mode 100644
index 0000000000..e90f16a9e8
--- /dev/null
+++ b/changes/4001.misc.md
@@ -0,0 +1,4 @@
+Restore sharding write performance for shards with many chunks. The
+`subchunk_write_order` feature inadvertently rebuilt the per-shard chunk
+coordinate grid (up to tens of thousands of tuples) on every partial write;
+these coordinates are now cached, restoring throughput to its previous level.
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index 33c8602ecb..535c682053 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -46,8 +46,11 @@
     BasicIndexer,
     ChunkProjection,
     SelectorTuple,
+    _lexicographic_order,
+    _lexicographic_order_keys,
     c_order_iter,
     get_indexer,
+    lexicographic_order_iter,
     morton_order_iter,
 )
 from zarr.core.metadata.v3 import (
@@ -266,6 +269,7 @@ def __iter__(self) -> Iterator[tuple[int, ...]]:
     def to_dict_vectorized(
         self,
         chunk_coords_array: npt.NDArray[np.integer[Any]],
+        chunk_coords_keys: tuple[tuple[int, ...], ...],
     ) -> dict[tuple[int, ...], Buffer | None]:
         """Build a dict of chunk coordinates to buffers using vectorized lookup.
 
@@ -273,6 +277,11 @@ def to_dict_vectorized(
         ----------
         chunk_coords_array : ndarray of shape (n_chunks, n_dims)
             Array of chunk coordinates for vectorized index lookup.
+        chunk_coords_keys : tuple of coordinate tuples
+            The same coordinates as `chunk_coords_array`, in the same order, as
+            plain tuples for use as dict keys. Passed in (rather than derived
+            row-by-row from the array) so the cached value can be reused instead
+            of rebuilding 35k tuples on every write.
 
         Returns
         -------
@@ -281,11 +290,11 @@ def to_dict_vectorized(
         starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)
 
         result: dict[tuple[int, ...], Buffer | None] = {}
-        for i, coords in enumerate(chunk_coords_array):
+        for i, coords in enumerate(chunk_coords_keys):
             if valid[i]:
-                result[tuple(coords.ravel())] = self.buf[int(starts[i]) : int(ends[i])]
+                result[coords] = self.buf[int(starts[i]) : int(ends[i])]
             else:
-                result[tuple(coords.ravel())] = None
+                result[coords] = None
 
         return result
 
@@ -533,7 +542,7 @@ def _subchunk_order_iter(
             case "morton":
                 subchunk_iter = morton_order_iter(chunks_per_shard)
             case "lexicographic":
-                subchunk_iter = np.ndindex(chunks_per_shard)
+                subchunk_iter = lexicographic_order_iter(chunks_per_shard)
             case "colexicographic":
                 subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1]))
             case "unordered":
@@ -612,9 +621,12 @@ async def _encode_partial_single(
                 chunks_per_shard=chunks_per_shard,
             )
             shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
-            # Use vectorized lookup for better performance
+            # Use vectorized lookup for better performance. The lexicographic
+            # coordinate array and keys are cached, so neither is rebuilt on
+            # every write.
             shard_dict = shard_reader.to_dict_vectorized(
-                np.array(list(self._subchunk_order_iter(chunks_per_shard, "lexicographic")))
+                _lexicographic_order(chunks_per_shard),
+                _lexicographic_order_keys(chunks_per_shard),
             )
 
         await self.codec_pipeline.write(
diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index cb81164209..ab658a4924 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1584,6 +1584,33 @@ def morton_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]
     return iter(_morton_order_keys(tuple(chunk_shape)))
 
 
+@lru_cache(maxsize=16)
+def _lexicographic_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]:
+    # Lexicographic (C-order) coordinates, computed vectorized and cached so that
+    # the sharding codec's per-shard chunk grid is not rebuilt on every call.
+    # Equivalent to `np.array(list(np.ndindex(chunk_shape)))` but without the
+    # Python-level iteration over every coordinate.
+    n_dims = len(chunk_shape)
+    if n_dims == 0:
+        # A 0-d shard holds a single chunk addressed by the empty coordinate, so
+        # the coordinate array has one row and zero columns. np.indices(()) cannot
+        # express this, so build it directly. Matches list(np.ndindex(())) == [()].
+        order = np.empty((1, 0), dtype=np.intp)
+    else:
+        order = np.indices(chunk_shape, dtype=np.intp).reshape(n_dims, -1).T
+    order.flags.writeable = False
+    return order
+
+
+@lru_cache(maxsize=16)
+def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
+    return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape))
+
+
+def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
+    return iter(_lexicographic_order_keys(tuple(chunk_shape)))
+
+
 def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
     return itertools.product(*(range(x) for x in chunks_per_shard))
 

From 6e02c8f833fd64aaf106c84b540324e9aefc71b1 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Tue, 2 Jun 2026 15:26:58 +0200
Subject: [PATCH 2/6] refactor(sharding): derive coords inside
 to_dict_vectorized
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback: `_ShardReader.to_dict_vectorized` took the
lexicographic coordinate array and key tuples as parameters, even though
the reader already knows its own `chunks_per_shard` and both structures
are `lru_cache`d. Thread nothing in — fetch them inside the method via
`_lexicographic_order`/`_lexicographic_order_keys`. Same cache, so no
perf change; the call site collapses to `to_dict_vectorized()`.

Add a unit test covering the method directly across 0-d, 1-d, and 2-d
shard grids: present chunks map to their stored bytes, empty chunks to
None, and every lexicographic coordinate appears as a key.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/zarr/codecs/sharding.py        | 28 +++++++++--------------
 tests/test_codecs/test_sharding.py | 36 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index 535c682053..bcc783f386 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -266,27 +266,22 @@ def __len__(self) -> int:
     def __iter__(self) -> Iterator[tuple[int, ...]]:
         return c_order_iter(self.index.chunks_per_shard)
 
-    def to_dict_vectorized(
-        self,
-        chunk_coords_array: npt.NDArray[np.integer[Any]],
-        chunk_coords_keys: tuple[tuple[int, ...], ...],
-    ) -> dict[tuple[int, ...], Buffer | None]:
+    def to_dict_vectorized(self) -> dict[tuple[int, ...], Buffer | None]:
         """Build a dict of chunk coordinates to buffers using vectorized lookup.
 
-        Parameters
-        ----------
-        chunk_coords_array : ndarray of shape (n_chunks, n_dims)
-            Array of chunk coordinates for vectorized index lookup.
-        chunk_coords_keys : tuple of coordinate tuples
-            The same coordinates as `chunk_coords_array`, in the same order, as
-            plain tuples for use as dict keys. Passed in (rather than derived
-            row-by-row from the array) so the cached value can be reused instead
-            of rebuilding 35k tuples on every write.
+        The full per-shard chunk coordinate grid (both the array used for the
+        vectorized index lookup and the plain tuples used as dict keys) is
+        cached on `chunks_per_shard`, so neither is rebuilt on every call. For a
+        shard with tens of thousands of chunks this avoids reconstructing that
+        many tuples on every partial write.
 
         Returns
         -------
         dict mapping chunk coordinate tuples to Buffer or None
         """
+        chunks_per_shard = self.index.chunks_per_shard
+        chunk_coords_array = _lexicographic_order(chunks_per_shard)
+        chunk_coords_keys = _lexicographic_order_keys(chunks_per_shard)
         starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)
 
         result: dict[tuple[int, ...], Buffer | None] = {}
@@ -624,10 +619,7 @@ async def _encode_partial_single(
             # Use vectorized lookup for better performance. The lexicographic
             # coordinate array and keys are cached, so neither is rebuilt on
             # every write.
-            shard_dict = shard_reader.to_dict_vectorized(
-                _lexicographic_order(chunks_per_shard),
-                _lexicographic_order_keys(chunks_per_shard),
-            )
+            shard_dict = shard_reader.to_dict_vectorized()
 
         await self.codec_pipeline.write(
             [
diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py
index 74e4a7e0d5..811de031e8 100644
--- a/tests/test_codecs/test_sharding.py
+++ b/tests/test_codecs/test_sharding.py
@@ -992,3 +992,39 @@ def test_shard_index_get_chunk_slices_vectorized(chunks_per_shard: tuple[int, ..
     assert starts[0] == 10
     assert ends[0] == 14
     np.testing.assert_array_equal(starts[~expected_valid], MAX_UINT_64)
+
+
+@pytest.mark.parametrize("chunks_per_shard", [(), (3,), (2, 3)])
+def test_shard_reader_to_dict_vectorized(chunks_per_shard: tuple[int, ...]) -> None:
+    """to_dict_vectorized derives its own coords and maps present chunks to buffers, empty to None.
+
+    The reader is given the full per-shard chunk grid implicitly (it reads
+    ``chunks_per_shard`` off its own index), so the result must contain every
+    lexicographic coordinate as a key, with the stored bytes for present chunks
+    and ``None`` for empty ones.
+    """
+    all_coords = list(c_order_iter(chunks_per_shard))
+    # Lay two chunks back-to-back in the buffer; leave the rest (if any) empty.
+    payload = b"abcdXY"
+    index = _ShardIndex.create_empty(chunks_per_shard)
+    index.set_chunk_slice(all_coords[0], slice(0, 4))
+    present = {all_coords[0]: payload[0:4]}
+    if len(all_coords) > 1:
+        index.set_chunk_slice(all_coords[1], slice(4, 6))
+        present[all_coords[1]] = payload[4:6]
+
+    reader = _ShardReader()
+    reader.index = index
+    reader.buf = default_buffer_prototype().buffer.from_bytes(payload)
+
+    result = reader.to_dict_vectorized()
+
+    # Every lexicographic coordinate is present as a key, in order.
+    assert list(result.keys()) == all_coords
+    for coords in all_coords:
+        buf = result[coords]
+        if coords in present:
+            assert buf is not None
+            assert buf.to_bytes() == present[coords]
+        else:
+            assert buf is None

From 51d0f050ceea4e2839b39271efbc14b8d38e116f Mon Sep 17 00:00:00 2001
From: Davis Bennett <davis.v.bennett@gmail.com>
Date: Tue, 2 Jun 2026 17:26:54 +0200
Subject: [PATCH 3/6] Update src/zarr/core/indexing.py

Co-authored-by: Ilan Gold <ilanbassgold@gmail.com>
---
 src/zarr/core/indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index ab658a4924..1e3ad0375f 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1608,7 +1608,7 @@ def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int,
 
 
 def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
-    return iter(_lexicographic_order_keys(tuple(chunk_shape)))
+    return iter(_lexicographic_order_keys(chunk_shape))
 
 
 def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]:

From f79a3a2039bc8f4c21308f318b6d1119abc7d458 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Tue, 2 Jun 2026 18:01:35 +0200
Subject: [PATCH 4/6] refactor(sharding): drop redundant
 lexicographic_order_iter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback from @ilan-gold and @chuckwondo on the
`lexicographic_order_iter` helper.

`lexicographic_order_iter` returned a *lazy* iterator over an
*eagerly-built, cached* tuple (`_lexicographic_order_keys`), which
chuckwondo rightly flagged as confusing — and its output is byte-for-byte
identical to the pre-existing, genuinely-lazy `c_order_iter`
(verified across 0-d, empty, and N-d shapes). So the name promised
laziness the implementation didn't provide, over a sequence we could
already produce.

Remove the wrapper and use the cached `_lexicographic_order_keys` tuple
directly at the two `dict.fromkeys` call sites and in
`_subchunk_order_iter`. This keeps the eager/cached coordinate tuples —
which is the actual optimization: `dict.fromkeys` over the cached tuple
is ~1.4x faster than over lazy `c_order_iter` at 32^3 (≈900us vs ≈1300us),
because the cache amortizes tuple construction across repeated writes to
same-shaped shards. Switching to `c_order_iter` would have reintroduced
that cost, so it is deliberately not used here.

Also drop the now-dead `tuple()` wrap in `morton_order_iter` (its
argument is typed `tuple[int, ...]` and every caller passes one), per
ilan-gold.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/zarr/codecs/sharding.py | 10 ++++++----
 src/zarr/core/indexing.py   |  6 +-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index bcc783f386..d8048f2c26 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -50,7 +50,6 @@
     _lexicographic_order_keys,
     c_order_iter,
     get_indexer,
-    lexicographic_order_iter,
     morton_order_iter,
 )
 from zarr.core.metadata.v3 import (
@@ -537,7 +536,10 @@ def _subchunk_order_iter(
             case "morton":
                 subchunk_iter = morton_order_iter(chunks_per_shard)
             case "lexicographic":
-                subchunk_iter = lexicographic_order_iter(chunks_per_shard)
+                # _lexicographic_order_keys returns the cached tuple of coords in
+                # C order; iterate it directly rather than via a wrapper that
+                # only hid the (already materialized) tuple behind iter().
+                subchunk_iter = iter(_lexicographic_order_keys(chunks_per_shard))
             case "colexicographic":
                 subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1]))
             case "unordered":
@@ -565,7 +567,7 @@ async def _encode_single(
                 chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape),
             )
         )
-        shard_builder = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard, "lexicographic"))
+        shard_builder = dict.fromkeys(_lexicographic_order_keys(chunks_per_shard))
 
         await self.codec_pipeline.write(
             [
@@ -608,7 +610,7 @@ async def _encode_partial_single(
         )
 
         if self._is_complete_shard_write(indexer, chunks_per_shard):
-            shard_dict = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard, "lexicographic"))
+            shard_dict = dict.fromkeys(_lexicographic_order_keys(chunks_per_shard))
         else:
             shard_reader = await self._load_full_shard_maybe(
                 byte_getter=byte_setter,
diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index ab658a4924..42ab962801 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1581,7 +1581,7 @@ def _morton_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], .
 
 
 def morton_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
-    return iter(_morton_order_keys(tuple(chunk_shape)))
+    return iter(_morton_order_keys(chunk_shape))
 
 
 @lru_cache(maxsize=16)
@@ -1607,10 +1607,6 @@ def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int,
     return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape))
 
 
-def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
-    return iter(_lexicographic_order_keys(tuple(chunk_shape)))
-
-
 def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
     return itertools.product(*(range(x) for x in chunks_per_shard))
 

From 500104fc2a42248f3271180d37a2775bd55f1d36 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Tue, 2 Jun 2026 18:13:23 +0200
Subject: [PATCH 5/6] refactor(indexing): prefer lexicographic_order_iter,
 soft-deprecate c_order_iter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`c_order_iter` names a memory layout ("C order") rather than what the
iterator actually yields. Reintroduce `lexicographic_order_iter` as the
clearer name for the same row-major coordinate sequence, and make
`c_order_iter` a thin alias that delegates to it, with a docstring note
steering new code to the preferred name. No runtime warning — these are
internal helpers.

`lexicographic_order_iter` keeps the eager/cached implementation
(iter over the lru_cached `_lexicographic_order_keys` tuple), which is
~1.4x faster than the old lazy `itertools.product` on the
`dict.fromkeys` shard-write path and is the optimization this branch
exists to deliver. The alias therefore changes `c_order_iter` from lazy
to eager/cached; all in-repo callers (_ShardReader.__iter__,
_is_total_shard, _subchunk_order_iter, and two tests) are migrated to
`lexicographic_order_iter`, so nothing in-tree relies on the old laziness.

Output is unchanged: lexicographic_order_iter, the c_order_iter alias,
and np.ndindex all agree across 0-d, empty, and N-d shapes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/zarr/codecs/sharding.py        | 12 +++++-------
 src/zarr/core/indexing.py          | 15 ++++++++++++++-
 tests/test_codecs/test_sharding.py |  6 +++---
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index d8048f2c26..e88c4038e0 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -48,8 +48,8 @@
     SelectorTuple,
     _lexicographic_order,
     _lexicographic_order_keys,
-    c_order_iter,
     get_indexer,
+    lexicographic_order_iter,
     morton_order_iter,
 )
 from zarr.core.metadata.v3 import (
@@ -263,7 +263,7 @@ def __len__(self) -> int:
         return int(self.index.offsets_and_lengths.size / 2)
 
     def __iter__(self) -> Iterator[tuple[int, ...]]:
-        return c_order_iter(self.index.chunks_per_shard)
+        return lexicographic_order_iter(self.index.chunks_per_shard)
 
     def to_dict_vectorized(self) -> dict[tuple[int, ...], Buffer | None]:
         """Build a dict of chunk coordinates to buffers using vectorized lookup.
@@ -536,10 +536,7 @@ def _subchunk_order_iter(
             case "morton":
                 subchunk_iter = morton_order_iter(chunks_per_shard)
             case "lexicographic":
-                # _lexicographic_order_keys returns the cached tuple of coords in
-                # C order; iterate it directly rather than via a wrapper that
-                # only hid the (already materialized) tuple behind iter().
-                subchunk_iter = iter(_lexicographic_order_keys(chunks_per_shard))
+                subchunk_iter = lexicographic_order_iter(chunks_per_shard)
             case "colexicographic":
                 subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1]))
             case "unordered":
@@ -692,7 +689,8 @@ def _is_total_shard(
         self, all_chunk_coords: set[tuple[int, ...]], chunks_per_shard: tuple[int, ...]
     ) -> bool:
         return len(all_chunk_coords) == product(chunks_per_shard) and all(
-            chunk_coords in all_chunk_coords for chunk_coords in c_order_iter(chunks_per_shard)
+            chunk_coords in all_chunk_coords
+            for chunk_coords in lexicographic_order_iter(chunks_per_shard)
         )
 
     def _is_complete_shard_write(
diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 42ab962801..63f5f76861 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1607,8 +1607,21 @@ def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int,
     return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape))
 
 
+def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
+    # Iterate the chunk grid in lexicographic (row-major / C) order. The full
+    # coordinate tuple is built once and cached on `chunk_shape` (see
+    # _lexicographic_order_keys), so repeated calls for the same shape — e.g. one
+    # per shard write — reuse it rather than rebuilding tens of thousands of
+    # tuples each time.
+    return iter(_lexicographic_order_keys(chunk_shape))
+
+
 def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
-    return itertools.product(*(range(x) for x in chunks_per_shard))
+    # Soft-deprecated alias for `lexicographic_order_iter`. "C order" names the
+    # memory layout rather than the ordering; prefer `lexicographic_order_iter`,
+    # which describes what the iterator yields. Kept for backwards compatibility;
+    # new code should call `lexicographic_order_iter` directly.
+    return lexicographic_order_iter(chunks_per_shard)
 
 
 def get_indexer(
diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py
index 811de031e8..d811d82979 100644
--- a/tests/test_codecs/test_sharding.py
+++ b/tests/test_codecs/test_sharding.py
@@ -21,7 +21,7 @@
 )
 from zarr.codecs.sharding import MAX_UINT_64, SubchunkWriteOrder, _ShardIndex, _ShardReader
 from zarr.core.buffer import NDArrayLike, default_buffer_prototype
-from zarr.core.indexing import c_order_iter
+from zarr.core.indexing import lexicographic_order_iter
 from zarr.storage import MemoryStore, StorePath, ZipStore
 
 from ..conftest import ArrayRequest
@@ -978,7 +978,7 @@ def test_shard_index_get_chunk_slices_vectorized(chunks_per_shard: tuple[int, ..
     """get_chunk_slices_vectorized works uniformly across chunk grid ranks, including 0-D."""
     index = _ShardIndex.create_empty(chunks_per_shard)
     # Write the first chunk; leave the rest (if any) empty.
-    all_coords = list(c_order_iter(chunks_per_shard))
+    all_coords = list(lexicographic_order_iter(chunks_per_shard))
     index.set_chunk_slice(all_coords[0], slice(10, 14))
 
     coords_array = np.array(all_coords, dtype=np.uint64).reshape(
@@ -1003,7 +1003,7 @@ def test_shard_reader_to_dict_vectorized(chunks_per_shard: tuple[int, ...]) -> N
     lexicographic coordinate as a key, with the stored bytes for present chunks
     and ``None`` for empty ones.
     """
-    all_coords = list(c_order_iter(chunks_per_shard))
+    all_coords = list(lexicographic_order_iter(chunks_per_shard))
     # Lay two chunks back-to-back in the buffer; leave the rest (if any) empty.
     payload = b"abcdXY"
     index = _ShardIndex.create_empty(chunks_per_shard)

From f6d91e610ae293953421605c74215a8eb57b0a84 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Tue, 2 Jun 2026 20:13:52 +0200
Subject: [PATCH 6/6] refactor(indexing): make lexicographic_order_iter the
 lazy primitive
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per review from @mkitti: invert the relationship between the lazy
iterator and the eagerly-collected tuple. `lexicographic_order_iter` is
now a genuine lazy generator over the chunk-grid coordinates, and
`_lexicographic_order_keys` collects it into a cached tuple — the eager
version is "collect the lazy one", not the other way around.

Previously lexicographic_order_iter returned iter() over the cached
tuple, so any consumer that only needed a prefix still paid to
materialize the entire grid. _is_total_shard does exactly that — an
early-exit `all(coord in set for coord in ...)` — and on a cold cache
for a 32^3 shard whose first coordinate is absent this dropped from
~15.8ms to ~24us (the lazy generator builds one coordinate and bails).

The hot path is unchanged: the two dict.fromkeys sites consume the full
grid and use the cached `_lexicographic_order_keys` tuple directly
(~0.9ms at 32^3), so the regression fix this branch delivers is intact.
This also resolves @chuckwondo's point — the iterator is now actually
lazy rather than a thin wrapper over eager data.

Co-authored-by: Mark Kittisopikul <mkitti@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/zarr/core/indexing.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 63f5f76861..9af7bbb6d7 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1602,18 +1602,22 @@ def _lexicographic_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]:
     return order
 
 
-@lru_cache(maxsize=16)
-def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
-    return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape))
+def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
+    # Lazily yield the chunk grid coordinates in lexicographic (row-major / C)
+    # order. This is the primitive: callers that only need a prefix (e.g. an
+    # early-exit `all(...)`) pay only for the coordinates they consume, instead
+    # of materializing the whole grid. Callers that need every coordinate
+    # repeatedly for the same shape should use `_lexicographic_order_keys`, which
+    # caches the fully collected tuple.
+    return (tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape))
 
 
-def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
-    # Iterate the chunk grid in lexicographic (row-major / C) order. The full
-    # coordinate tuple is built once and cached on `chunk_shape` (see
-    # _lexicographic_order_keys), so repeated calls for the same shape — e.g. one
-    # per shard write — reuse it rather than rebuilding tens of thousands of
-    # tuples each time.
-    return iter(_lexicographic_order_keys(chunk_shape))
+@lru_cache(maxsize=16)
+def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
+    # Eagerly collect `lexicographic_order_iter` into a tuple, cached per shape so
+    # the per-shard chunk grid is built once and reused across writes (the hot
+    # path feeds this straight into `dict.fromkeys`).
+    return tuple(lexicographic_order_iter(chunk_shape))
 
 
 def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]: