From 3372b05dc9f0ea6308fe73f7aa1c6233a00fd9db Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 26 May 2026 22:52:30 +0200 Subject: [PATCH 1/6] perf: cache lexicographic chunk coords in sharding codec The subchunk_write_order feature (#3826) regressed sharded write performance: _encode_partial_single rebuilt the full per-shard chunk coordinate grid on every write via `np.array(list(_subchunk_order_iter(..., "lexicographic")))`, and `to_dict_vectorized` rebuilt a tuple key per row with `tuple(coords.ravel())`. For a single-chunk write into a shard with tens of thousands of chunks this roughly doubled write time (~22ms -> ~40ms on test_sharded_morton_write_single_chunk, matching the -44% CodSpeed regression). Add cached `_lexicographic_order` (array) and `_lexicographic_order_keys` (tuples) helpers in indexing.py, mirroring `_morton_order`/`_morton_order_keys`, and pass the cached keys into `to_dict_vectorized` instead of deriving them row-by-row. This restores write throughput to the pre-#3826 baseline while preserving identical chunk ordering (verified equal to np.ndindex across shapes including 0-d and empty). Co-Authored-By: Claude Opus 4.7 (1M context) --- changes/4001.misc.md | 4 ++++ src/zarr/codecs/sharding.py | 24 ++++++++++++++++++------ src/zarr/core/indexing.py | 27 +++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 changes/4001.misc.md diff --git a/changes/4001.misc.md b/changes/4001.misc.md new file mode 100644 index 0000000000..e90f16a9e8 --- /dev/null +++ b/changes/4001.misc.md @@ -0,0 +1,4 @@ +Restore sharding write performance for shards with many chunks. The +`subchunk_write_order` feature inadvertently rebuilt the per-shard chunk +coordinate grid (up to tens of thousands of tuples) on every partial write; +these coordinates are now cached, restoring throughput to its previous level. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 33c8602ecb..535c682053 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -46,8 +46,11 @@ BasicIndexer, ChunkProjection, SelectorTuple, + _lexicographic_order, + _lexicographic_order_keys, c_order_iter, get_indexer, + lexicographic_order_iter, morton_order_iter, ) from zarr.core.metadata.v3 import ( @@ -266,6 +269,7 @@ def __iter__(self) -> Iterator[tuple[int, ...]]: def to_dict_vectorized( self, chunk_coords_array: npt.NDArray[np.integer[Any]], + chunk_coords_keys: tuple[tuple[int, ...], ...], ) -> dict[tuple[int, ...], Buffer | None]: """Build a dict of chunk coordinates to buffers using vectorized lookup. @@ -273,6 +277,11 @@ def to_dict_vectorized( ---------- chunk_coords_array : ndarray of shape (n_chunks, n_dims) Array of chunk coordinates for vectorized index lookup. + chunk_coords_keys : tuple of coordinate tuples + The same coordinates as `chunk_coords_array`, in the same order, as + plain tuples for use as dict keys. Passed in (rather than derived + row-by-row from the array) so the cached value can be reused instead + of rebuilding 35k tuples on every write. Returns ------- @@ -281,11 +290,11 @@ def to_dict_vectorized( starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array) result: dict[tuple[int, ...], Buffer | None] = {} - for i, coords in enumerate(chunk_coords_array): + for i, coords in enumerate(chunk_coords_keys): if valid[i]: - result[tuple(coords.ravel())] = self.buf[int(starts[i]) : int(ends[i])] + result[coords] = self.buf[int(starts[i]) : int(ends[i])] else: - result[tuple(coords.ravel())] = None + result[coords] = None return result @@ -533,7 +542,7 @@ def _subchunk_order_iter( case "morton": subchunk_iter = morton_order_iter(chunks_per_shard) case "lexicographic": - subchunk_iter = np.ndindex(chunks_per_shard) + subchunk_iter = lexicographic_order_iter(chunks_per_shard) case "colexicographic": subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1])) case "unordered": @@ -612,9 +621,12 @@ async def _encode_partial_single( chunks_per_shard=chunks_per_shard, ) shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard) - # Use vectorized lookup for better performance + # Use vectorized lookup for better performance. The lexicographic + # coordinate array and keys are cached, so neither is rebuilt on + # every write. shard_dict = shard_reader.to_dict_vectorized( - np.array(list(self._subchunk_order_iter(chunks_per_shard, "lexicographic"))) + _lexicographic_order(chunks_per_shard), + _lexicographic_order_keys(chunks_per_shard), ) await self.codec_pipeline.write( diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index cb81164209..ab658a4924 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1584,6 +1584,33 @@ def morton_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]] return iter(_morton_order_keys(tuple(chunk_shape))) +@lru_cache(maxsize=16) +def _lexicographic_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]: + # Lexicographic (C-order) coordinates, computed vectorized and cached so that + # the sharding codec's per-shard chunk grid is not rebuilt on every call. + # Equivalent to `np.array(list(np.ndindex(chunk_shape)))` but without the + # Python-level iteration over every coordinate. + n_dims = len(chunk_shape) + if n_dims == 0: + # A 0-d shard holds a single chunk addressed by the empty coordinate, so + # the coordinate array has one row and zero columns. np.indices(()) cannot + # express this, so build it directly. Matches list(np.ndindex(())) == [()]. + order = np.empty((1, 0), dtype=np.intp) + else: + order = np.indices(chunk_shape, dtype=np.intp).reshape(n_dims, -1).T + order.flags.writeable = False + return order + + +@lru_cache(maxsize=16) +def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: + return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape)) + + +def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: + return iter(_lexicographic_order_keys(tuple(chunk_shape))) + + def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]: return itertools.product(*(range(x) for x in chunks_per_shard)) From 6e02c8f833fd64aaf106c84b540324e9aefc71b1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 2 Jun 2026 15:26:58 +0200 Subject: [PATCH 2/6] refactor(sharding): derive coords inside to_dict_vectorized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback: `_ShardReader.to_dict_vectorized` took the lexicographic coordinate array and key tuples as parameters, even though the reader already knows its own `chunks_per_shard` and both structures are `lru_cache`d. Thread nothing in — fetch them inside the method via `_lexicographic_order`/`_lexicographic_order_keys`. Same cache, so no perf change; the call site collapses to `to_dict_vectorized()`. Add a unit test covering the method directly across 0-d, 1-d, and 2-d shard grids: present chunks map to their stored bytes, empty chunks to None, and every lexicographic coordinate appears as a key. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/codecs/sharding.py | 28 +++++++++-------------- tests/test_codecs/test_sharding.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 535c682053..bcc783f386 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -266,27 +266,22 @@ def __len__(self) -> int: def __iter__(self) -> Iterator[tuple[int, ...]]: return c_order_iter(self.index.chunks_per_shard) - def to_dict_vectorized( - self, - chunk_coords_array: npt.NDArray[np.integer[Any]], - chunk_coords_keys: tuple[tuple[int, ...], ...], - ) -> dict[tuple[int, ...], Buffer | None]: + def to_dict_vectorized(self) -> dict[tuple[int, ...], Buffer | None]: """Build a dict of chunk coordinates to buffers using vectorized lookup. - Parameters - ---------- - chunk_coords_array : ndarray of shape (n_chunks, n_dims) - Array of chunk coordinates for vectorized index lookup. - chunk_coords_keys : tuple of coordinate tuples - The same coordinates as `chunk_coords_array`, in the same order, as - plain tuples for use as dict keys. Passed in (rather than derived - row-by-row from the array) so the cached value can be reused instead - of rebuilding 35k tuples on every write. + The full per-shard chunk coordinate grid (both the array used for the + vectorized index lookup and the plain tuples used as dict keys) is + cached on `chunks_per_shard`, so neither is rebuilt on every call. For a + shard with tens of thousands of chunks this avoids reconstructing that + many tuples on every partial write. Returns ------- dict mapping chunk coordinate tuples to Buffer or None """ + chunks_per_shard = self.index.chunks_per_shard + chunk_coords_array = _lexicographic_order(chunks_per_shard) + chunk_coords_keys = _lexicographic_order_keys(chunks_per_shard) starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array) result: dict[tuple[int, ...], Buffer | None] = {} @@ -624,10 +619,7 @@ async def _encode_partial_single( # Use vectorized lookup for better performance. The lexicographic # coordinate array and keys are cached, so neither is rebuilt on # every write. - shard_dict = shard_reader.to_dict_vectorized( - _lexicographic_order(chunks_per_shard), - _lexicographic_order_keys(chunks_per_shard), - ) + shard_dict = shard_reader.to_dict_vectorized() await self.codec_pipeline.write( [ diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 74e4a7e0d5..811de031e8 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -992,3 +992,39 @@ def test_shard_index_get_chunk_slices_vectorized(chunks_per_shard: tuple[int, .. assert starts[0] == 10 assert ends[0] == 14 np.testing.assert_array_equal(starts[~expected_valid], MAX_UINT_64) + + +@pytest.mark.parametrize("chunks_per_shard", [(), (3,), (2, 3)]) +def test_shard_reader_to_dict_vectorized(chunks_per_shard: tuple[int, ...]) -> None: + """to_dict_vectorized derives its own coords and maps present chunks to buffers, empty to None. + + The reader is given the full per-shard chunk grid implicitly (it reads + ``chunks_per_shard`` off its own index), so the result must contain every + lexicographic coordinate as a key, with the stored bytes for present chunks + and ``None`` for empty ones. + """ + all_coords = list(c_order_iter(chunks_per_shard)) + # Lay two chunks back-to-back in the buffer; leave the rest (if any) empty. + payload = b"abcdXY" + index = _ShardIndex.create_empty(chunks_per_shard) + index.set_chunk_slice(all_coords[0], slice(0, 4)) + present = {all_coords[0]: payload[0:4]} + if len(all_coords) > 1: + index.set_chunk_slice(all_coords[1], slice(4, 6)) + present[all_coords[1]] = payload[4:6] + + reader = _ShardReader() + reader.index = index + reader.buf = default_buffer_prototype().buffer.from_bytes(payload) + + result = reader.to_dict_vectorized() + + # Every lexicographic coordinate is present as a key, in order. + assert list(result.keys()) == all_coords + for coords in all_coords: + buf = result[coords] + if coords in present: + assert buf is not None + assert buf.to_bytes() == present[coords] + else: + assert buf is None From 51d0f050ceea4e2839b39271efbc14b8d38e116f Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 2 Jun 2026 17:26:54 +0200 Subject: [PATCH 3/6] Update src/zarr/core/indexing.py Co-authored-by: Ilan Gold --- src/zarr/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index ab658a4924..1e3ad0375f 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1608,7 +1608,7 @@ def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return iter(_lexicographic_order_keys(tuple(chunk_shape))) + return iter(_lexicographic_order_keys(chunk_shape)) def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]: From f79a3a2039bc8f4c21308f318b6d1119abc7d458 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 2 Jun 2026 18:01:35 +0200 Subject: [PATCH 4/6] refactor(sharding): drop redundant lexicographic_order_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback from @ilan-gold and @chuckwondo on the `lexicographic_order_iter` helper. `lexicographic_order_iter` returned a *lazy* iterator over an *eagerly-built, cached* tuple (`_lexicographic_order_keys`), which chuckwondo rightly flagged as confusing — and its output is byte-for-byte identical to the pre-existing, genuinely-lazy `c_order_iter` (verified across 0-d, empty, and N-d shapes). So the name promised laziness the implementation didn't provide, over a sequence we could already produce. Remove the wrapper and use the cached `_lexicographic_order_keys` tuple directly at the two `dict.fromkeys` call sites and in `_subchunk_order_iter`. This keeps the eager/cached coordinate tuples — which is the actual optimization: `dict.fromkeys` over the cached tuple is ~1.4x faster than over lazy `c_order_iter` at 32^3 (≈900us vs ≈1300us), because the cache amortizes tuple construction across repeated writes to same-shaped shards. Switching to `c_order_iter` would have reintroduced that cost, so it is deliberately not used here. Also drop the now-dead `tuple()` wrap in `morton_order_iter` (its argument is typed `tuple[int, ...]` and every caller passes one), per ilan-gold. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/codecs/sharding.py | 10 ++++++---- src/zarr/core/indexing.py | 6 +----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index bcc783f386..d8048f2c26 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -50,7 +50,6 @@ _lexicographic_order_keys, c_order_iter, get_indexer, - lexicographic_order_iter, morton_order_iter, ) from zarr.core.metadata.v3 import ( @@ -537,7 +536,10 @@ def _subchunk_order_iter( case "morton": subchunk_iter = morton_order_iter(chunks_per_shard) case "lexicographic": - subchunk_iter = lexicographic_order_iter(chunks_per_shard) + # _lexicographic_order_keys returns the cached tuple of coords in + # C order; iterate it directly rather than via a wrapper that + # only hid the (already materialized) tuple behind iter(). + subchunk_iter = iter(_lexicographic_order_keys(chunks_per_shard)) case "colexicographic": subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1])) case "unordered": @@ -565,7 +567,7 @@ async def _encode_single( chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), ) ) - shard_builder = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard, "lexicographic")) + shard_builder = dict.fromkeys(_lexicographic_order_keys(chunks_per_shard)) await self.codec_pipeline.write( [ @@ -608,7 +610,7 @@ async def _encode_partial_single( ) if self._is_complete_shard_write(indexer, chunks_per_shard): - shard_dict = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard, "lexicographic")) + shard_dict = dict.fromkeys(_lexicographic_order_keys(chunks_per_shard)) else: shard_reader = await self._load_full_shard_maybe( byte_getter=byte_setter, diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index ab658a4924..42ab962801 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1581,7 +1581,7 @@ def _morton_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], . def morton_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return iter(_morton_order_keys(tuple(chunk_shape))) + return iter(_morton_order_keys(chunk_shape)) @lru_cache(maxsize=16) @@ -1607,10 +1607,6 @@ def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape)) -def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return iter(_lexicographic_order_keys(tuple(chunk_shape))) - - def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]: return itertools.product(*(range(x) for x in chunks_per_shard)) From 500104fc2a42248f3271180d37a2775bd55f1d36 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 2 Jun 2026 18:13:23 +0200 Subject: [PATCH 5/6] refactor(indexing): prefer lexicographic_order_iter, soft-deprecate c_order_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `c_order_iter` names a memory layout ("C order") rather than what the iterator actually yields. Reintroduce `lexicographic_order_iter` as the clearer name for the same row-major coordinate sequence, and make `c_order_iter` a thin alias that delegates to it, with a docstring note steering new code to the preferred name. No runtime warning — these are internal helpers. `lexicographic_order_iter` keeps the eager/cached implementation (iter over the lru_cached `_lexicographic_order_keys` tuple), which is ~1.4x faster than the old lazy `itertools.product` on the `dict.fromkeys` shard-write path and is the optimization this branch exists to deliver. The alias therefore changes `c_order_iter` from lazy to eager/cached; all in-repo callers (_ShardReader.__iter__, _is_total_shard, _subchunk_order_iter, and two tests) are migrated to `lexicographic_order_iter`, so nothing in-tree relies on the old laziness. Output is unchanged: lexicographic_order_iter, the c_order_iter alias, and np.ndindex all agree across 0-d, empty, and N-d shapes. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/codecs/sharding.py | 12 +++++------- src/zarr/core/indexing.py | 15 ++++++++++++++- tests/test_codecs/test_sharding.py | 6 +++--- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index d8048f2c26..e88c4038e0 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -48,8 +48,8 @@ SelectorTuple, _lexicographic_order, _lexicographic_order_keys, - c_order_iter, get_indexer, + lexicographic_order_iter, morton_order_iter, ) from zarr.core.metadata.v3 import ( @@ -263,7 +263,7 @@ def __len__(self) -> int: return int(self.index.offsets_and_lengths.size / 2) def __iter__(self) -> Iterator[tuple[int, ...]]: - return c_order_iter(self.index.chunks_per_shard) + return lexicographic_order_iter(self.index.chunks_per_shard) def to_dict_vectorized(self) -> dict[tuple[int, ...], Buffer | None]: """Build a dict of chunk coordinates to buffers using vectorized lookup. @@ -536,10 +536,7 @@ def _subchunk_order_iter( case "morton": subchunk_iter = morton_order_iter(chunks_per_shard) case "lexicographic": - # _lexicographic_order_keys returns the cached tuple of coords in - # C order; iterate it directly rather than via a wrapper that - # only hid the (already materialized) tuple behind iter(). - subchunk_iter = iter(_lexicographic_order_keys(chunks_per_shard)) + subchunk_iter = lexicographic_order_iter(chunks_per_shard) case "colexicographic": subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1])) case "unordered": @@ -692,7 +689,8 @@ def _is_total_shard( self, all_chunk_coords: set[tuple[int, ...]], chunks_per_shard: tuple[int, ...] ) -> bool: return len(all_chunk_coords) == product(chunks_per_shard) and all( - chunk_coords in all_chunk_coords for chunk_coords in c_order_iter(chunks_per_shard) + chunk_coords in all_chunk_coords + for chunk_coords in lexicographic_order_iter(chunks_per_shard) ) def _is_complete_shard_write( diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 42ab962801..63f5f76861 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1607,8 +1607,21 @@ def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape)) +def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: + # Iterate the chunk grid in lexicographic (row-major / C) order. The full + # coordinate tuple is built once and cached on `chunk_shape` (see + # _lexicographic_order_keys), so repeated calls for the same shape — e.g. one + # per shard write — reuse it rather than rebuilding tens of thousands of + # tuples each time. + return iter(_lexicographic_order_keys(chunk_shape)) + + def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - return itertools.product(*(range(x) for x in chunks_per_shard)) + # Soft-deprecated alias for `lexicographic_order_iter`. "C order" names the + # memory layout rather than the ordering; prefer `lexicographic_order_iter`, + # which describes what the iterator yields. Kept for backwards compatibility; + # new code should call `lexicographic_order_iter` directly. + return lexicographic_order_iter(chunks_per_shard) def get_indexer( diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 811de031e8..d811d82979 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -21,7 +21,7 @@ ) from zarr.codecs.sharding import MAX_UINT_64, SubchunkWriteOrder, _ShardIndex, _ShardReader from zarr.core.buffer import NDArrayLike, default_buffer_prototype -from zarr.core.indexing import c_order_iter +from zarr.core.indexing import lexicographic_order_iter from zarr.storage import MemoryStore, StorePath, ZipStore from ..conftest import ArrayRequest @@ -978,7 +978,7 @@ def test_shard_index_get_chunk_slices_vectorized(chunks_per_shard: tuple[int, .. """get_chunk_slices_vectorized works uniformly across chunk grid ranks, including 0-D.""" index = _ShardIndex.create_empty(chunks_per_shard) # Write the first chunk; leave the rest (if any) empty. - all_coords = list(c_order_iter(chunks_per_shard)) + all_coords = list(lexicographic_order_iter(chunks_per_shard)) index.set_chunk_slice(all_coords[0], slice(10, 14)) coords_array = np.array(all_coords, dtype=np.uint64).reshape( @@ -1003,7 +1003,7 @@ def test_shard_reader_to_dict_vectorized(chunks_per_shard: tuple[int, ...]) -> N lexicographic coordinate as a key, with the stored bytes for present chunks and ``None`` for empty ones. """ - all_coords = list(c_order_iter(chunks_per_shard)) + all_coords = list(lexicographic_order_iter(chunks_per_shard)) # Lay two chunks back-to-back in the buffer; leave the rest (if any) empty. payload = b"abcdXY" index = _ShardIndex.create_empty(chunks_per_shard) From f6d91e610ae293953421605c74215a8eb57b0a84 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 2 Jun 2026 20:13:52 +0200 Subject: [PATCH 6/6] refactor(indexing): make lexicographic_order_iter the lazy primitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review from @mkitti: invert the relationship between the lazy iterator and the eagerly-collected tuple. `lexicographic_order_iter` is now a genuine lazy generator over the chunk-grid coordinates, and `_lexicographic_order_keys` collects it into a cached tuple — the eager version is "collect the lazy one", not the other way around. Previously lexicographic_order_iter returned iter() over the cached tuple, so any consumer that only needed a prefix still paid to materialize the entire grid. _is_total_shard does exactly that — an early-exit `all(coord in set for coord in ...)` — and on a cold cache for a 32^3 shard whose first coordinate is absent this dropped from ~15.8ms to ~24us (the lazy generator builds one coordinate and bails). The hot path is unchanged: the two dict.fromkeys sites consume the full grid and use the cached `_lexicographic_order_keys` tuple directly (~0.9ms at 32^3), so the regression fix this branch delivers is intact. This also resolves @chuckwondo's point — the iterator is now actually lazy rather than a thin wrapper over eager data. Co-authored-by: Mark Kittisopikul Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/core/indexing.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 63f5f76861..9af7bbb6d7 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1602,18 +1602,22 @@ def _lexicographic_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]: return order -@lru_cache(maxsize=16) -def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: - return tuple(tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape)) +def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: + # Lazily yield the chunk grid coordinates in lexicographic (row-major / C) + # order. This is the primitive: callers that only need a prefix (e.g. an + # early-exit `all(...)`) pay only for the coordinates they consume, instead + # of materializing the whole grid. Callers that need every coordinate + # repeatedly for the same shape should use `_lexicographic_order_keys`, which + # caches the fully collected tuple. + return (tuple(int(x) for x in row) for row in _lexicographic_order(chunk_shape)) -def lexicographic_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]: - # Iterate the chunk grid in lexicographic (row-major / C) order. The full - # coordinate tuple is built once and cached on `chunk_shape` (see - # _lexicographic_order_keys), so repeated calls for the same shape — e.g. one - # per shard write — reuse it rather than rebuilding tens of thousands of - # tuples each time. - return iter(_lexicographic_order_keys(chunk_shape)) +@lru_cache(maxsize=16) +def _lexicographic_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: + # Eagerly collect `lexicographic_order_iter` into a tuple, cached per shape so + # the per-shard chunk grid is built once and reused across writes (the hot + # path feeds this straight into `dict.fromkeys`). + return tuple(lexicographic_order_iter(chunk_shape)) def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]: