Merge branch 'perf/prepared-write-v2' of github.com:d-v-b/zarr-python into perf/prepared-write-v2-bench

d-v-b · d-v-b · commit 8330cde91e4e · 2026-04-09T17:35:11.000+02:00
diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
@@ -763,6 +763,13 @@ class ChunkLayout:
     def is_sharded(self) -> bool:
         return False
 
+    def needed_coords(self, chunk_selection: SelectorTuple) -> set[tuple[int, ...]] | None:
+        """Compute which inner chunk coordinates overlap a selection.
+
+        Returns ``None`` (meaning "fetch everything") for trivial single-chunk layouts.
+        """
+        return None
+
     def unpack_blob(self, blob: Buffer) -> dict[tuple[int, ...], Buffer | None]:
         raise NotImplementedError
 
@@ -771,18 +778,31 @@ def pack_blob(
     ) -> Buffer | None:
         raise NotImplementedError
 
-    async def fetch_full_shard(
-        self, byte_getter: Any
+    async def fetch(
+        self,
+        byte_getter: Any,
+        needed_coords: set[tuple[int, ...]] | None = None,
     ) -> dict[tuple[int, ...], Buffer | None] | None:
-        """Fetch all inner chunk buffers. IO phase.
+        """Fetch inner chunk buffers from the store. IO phase.
 
-        For non-sharded, fetches the full blob. For sharded, fetches the
-        index and then the needed inner chunks via byte-range reads.
+        Parameters
+        ----------
+        byte_getter
+            The store path to read from.
+        needed_coords
+            The set of inner chunk coordinates to fetch. ``None`` means all.
+
+        Returns
+        -------
+        A mapping from inner chunk coordinates to their raw bytes, or
+        ``None`` if the blob/shard does not exist in the store.
         """
         raise NotImplementedError
 
-    def fetch_full_shard_sync(
-        self, byte_getter: Any
+    def fetch_sync(
+        self,
+        byte_getter: Any,
+        needed_coords: set[tuple[int, ...]] | None = None,
     ) -> dict[tuple[int, ...], Buffer | None] | None:
         raise NotImplementedError
 
@@ -806,8 +826,10 @@ def pack_blob(
         key = (0,) * len(self.chunks_per_shard)
         return chunk_dict.get(key)
 
-    async def fetch_full_shard(
-        self, byte_getter: Any
+    async def fetch(
+        self,
+        byte_getter: Any,
+        needed_coords: set[tuple[int, ...]] | None = None,
     ) -> dict[tuple[int, ...], Buffer | None] | None:
         from zarr.core.buffer import default_buffer_prototype
 
@@ -816,8 +838,10 @@ async def fetch_full_shard(
             return None
         return self.unpack_blob(blob)
 
-    def fetch_full_shard_sync(
-        self, byte_getter: Any
+    def fetch_sync(
+        self,
+        byte_getter: Any,
+        needed_coords: set[tuple[int, ...]] | None = None,
     ) -> dict[tuple[int, ...], Buffer | None] | None:
         from zarr.core.buffer import default_buffer_prototype
 
@@ -843,6 +867,19 @@ class ShardedChunkLayout(ChunkLayout):
 
     chunk_shape: tuple[int, ...]
     inner_chunk_shape: tuple[int, ...]
+
+    def needed_coords(self, chunk_selection: SelectorTuple) -> set[tuple[int, ...]] | None:
+        """Compute which inner chunks overlap the selection."""
+        from zarr.core.chunk_grids import ChunkGrid as _ChunkGrid
+        from zarr.core.indexing import get_indexer
+
+        indexer = get_indexer(
+            chunk_selection,
+            shape=self.chunk_shape,
+            chunk_grid=_ChunkGrid.from_sizes(self.chunk_shape, self.inner_chunk_shape),
+        )
+        return {coords for coords, *_ in indexer}
+
     chunks_per_shard: tuple[int, ...]
     inner_transform: ChunkTransform
     _index_transform: ChunkTransform
@@ -919,24 +956,36 @@ def pack_blob(
 
         return template.combine(buffers)
 
-    async def fetch_full_shard(
-        self, byte_getter: Any
+    async def fetch(
+        self,
+        byte_getter: Any,
+        needed_coords: set[tuple[int, ...]] | None = None,
     ) -> dict[tuple[int, ...], Buffer | None] | None:
-        """Fetch shard index + all inner chunks via byte-range reads."""
+        """Fetch shard index + inner chunks via byte-range reads.
+
+        If ``needed_coords`` is ``None``, fetches all inner chunks.
+        Otherwise fetches only the specified coordinates.
+        """
         index = await self._fetch_index(byte_getter)
         if index is None:
             return None
-        all_coords = set(np.ndindex(self.chunks_per_shard))
-        return await self._fetch_chunks(byte_getter, index, all_coords)
+        coords = (
+            needed_coords if needed_coords is not None else set(np.ndindex(self.chunks_per_shard))
+        )
+        return await self._fetch_chunks(byte_getter, index, coords)
 
-    def fetch_full_shard_sync(
-        self, byte_getter: Any
+    def fetch_sync(
+        self,
+        byte_getter: Any,
+        needed_coords: set[tuple[int, ...]] | None = None,
     ) -> dict[tuple[int, ...], Buffer | None] | None:
         index = self._fetch_index_sync(byte_getter)
         if index is None:
             return None
-        all_coords = set(np.ndindex(self.chunks_per_shard))
-        return self._fetch_chunks_sync(byte_getter, index, all_coords)
+        coords = (
+            needed_coords if needed_coords is not None else set(np.ndindex(self.chunks_per_shard))
+        )
+        return self._fetch_chunks_sync(byte_getter, index, coords)
 
     async def _fetch_index(self, byte_getter: Any) -> Any:
         from zarr.abc.store import RangeByteRequest, SuffixByteRequest
@@ -1512,14 +1561,16 @@ async def _fetch_and_decode(
         self,
         byte_getter: Any,
         chunk_spec: ArraySpec,
+        chunk_selection: SelectorTuple,
         layout: ChunkLayout,
     ) -> NDBuffer | None:
-        """IO + compute: fetch all inner chunk buffers, then decode into chunk-shaped array.
+        """IO + compute: fetch inner chunk buffers, then decode into chunk-shaped array.
 
-        1. IO: ``layout.fetch_full_shard`` fetches the blob or byte-ranges
+        1. IO: ``layout.fetch`` fetches only the inner chunks that overlap the selection
         2. Compute: decode each inner chunk and assemble into chunk-shaped output
         """
-        chunk_dict = await layout.fetch_full_shard(byte_getter)
+        needed = layout.needed_coords(chunk_selection)
+        chunk_dict = await layout.fetch(byte_getter, needed_coords=needed)
         if chunk_dict is None:
             return None
         return self._decode_shard(chunk_dict, chunk_spec, layout)
@@ -1538,7 +1589,10 @@ async def read(
             # Sharded: use selective byte-range reads per shard
             decoded: list[NDBuffer | None] = list(
                 await concurrent_map(
-                    [(bg, cs, self._get_layout(cs)) for bg, cs, *_ in batch],
+                    [
+                        (bg, cs, chunk_sel, self._get_layout(cs))
+                        for bg, cs, chunk_sel, _, _ in batch
+                    ],
                     self._fetch_and_decode,
                     config.get("async.concurrency"),
                 )
@@ -1634,10 +1688,12 @@ def _fetch_and_decode_sync(
         self,
         byte_getter: Any,
         chunk_spec: ArraySpec,
+        chunk_selection: SelectorTuple,
         layout: ChunkLayout,
     ) -> NDBuffer | None:
-        """Sync IO + compute: fetch all inner chunk buffers, then decode."""
-        chunk_dict = layout.fetch_full_shard_sync(byte_getter)
+        """Sync IO + compute: fetch inner chunk buffers, then decode."""
+        needed = layout.needed_coords(chunk_selection)
+        chunk_dict = layout.fetch_sync(byte_getter, needed_coords=needed)
         if chunk_dict is None:
             return None
         return self._decode_shard(chunk_dict, chunk_spec, layout)
@@ -1657,7 +1713,8 @@ def read_sync(
         if self.layout is not None and self.layout.is_sharded:
             # Sharded: selective byte-range reads per shard
             decoded: list[NDBuffer | None] = [
-                self._fetch_and_decode_sync(bg, cs, self._get_layout(cs)) for bg, cs, *_ in batch
+                self._fetch_and_decode_sync(bg, cs, chunk_sel, self._get_layout(cs))
+                for bg, cs, chunk_sel, _, _ in batch
             ]
         else:
             # Non-sharded: fetch full blobs, decode (optionally threaded)