Skip to content

Commit a75604a

Browse files
committed
feat: define PreparedWrite and SupportsChunkPacking data structures
`PreparedWrite` models a set of per-chunk changes that would be applied to a stored chunk. `SupportsChunkPacking` is a protocol for array -> bytes codecs that can use `PreparedWrite` objects to update an existing chunk.
1 parent ca9bd3e commit a75604a

4 files changed

Lines changed: 266 additions & 9 deletions

File tree

src/zarr/abc/codec.py

Lines changed: 147 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from abc import abstractmethod
44
from collections.abc import Mapping
5+
from dataclasses import dataclass
56
from typing import TYPE_CHECKING, Literal, Protocol, TypeGuard, runtime_checkable
67

78
from typing_extensions import ReadOnly, TypedDict
@@ -13,13 +14,13 @@
1314

1415
if TYPE_CHECKING:
1516
from collections.abc import Awaitable, Callable, Iterable
16-
from typing import Self
17+
from typing import Any, Self
1718

1819
from zarr.abc.store import ByteGetter, ByteSetter, Store
1920
from zarr.core.array_spec import ArraySpec
2021
from zarr.core.chunk_grids import ChunkGrid
2122
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
22-
from zarr.core.indexing import SelectorTuple
23+
from zarr.core.indexing import ChunkProjection, SelectorTuple
2324
from zarr.core.metadata import ArrayMetadata
2425

2526
__all__ = [
@@ -33,6 +34,9 @@
3334
"CodecOutput",
3435
"CodecPipeline",
3536
"GetResult",
37+
"PreparedWrite",
38+
"SupportsChunkCodec",
39+
"SupportsChunkPacking",
3640
"SupportsSyncCodec",
3741
]
3842

@@ -82,6 +86,116 @@ def _decode_sync(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI: ...
8286
def _encode_sync(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: ...
8387

8488

89+
class SupportsChunkCodec(Protocol):
    """Structural interface for synchronous whole-chunk encode/decode.

    Any object that exposes an ``array_spec`` together with
    ``decode_chunk`` / ``encode_chunk`` satisfies this protocol;
    ``ChunkTransform`` is one such implementation.
    """

    # Spec (shape, dtype, prototype, ...) of the chunks this chain handles.
    array_spec: ArraySpec

    def decode_chunk(self, chunk_bytes: Buffer) -> NDBuffer: ...

    def encode_chunk(self, chunk_array: NDBuffer) -> Buffer | None: ...
102+
class SupportsChunkPacking(Protocol):
    """Protocol for codecs that map between storage blobs and inner chunks
    and that own the prepare/finalize IO lifecycle around them.

    `BytesCodec` and `ShardingCodec` implement this protocol. It lets the
    pipeline keep IO (prepare/finalize) apart from compute (decode, merge,
    re-encode) so that the compute phase can run on a thread pool.

    Lifecycle:

    1. **Prepare** — read the existing blob from the store (for a partial
       write) and unpack it into per-inner-chunk buffers, producing a
       `PreparedWrite`.
    2. **Compute** — walk `PreparedWrite.indexer`; for each projection,
       decode the matching inner chunk, merge the new data, re-encode,
       and write the result back into `PreparedWrite.chunk_dict`.
    3. **Finalize** — repack `chunk_dict` into a single blob and store it.
    """

    @property
    def inner_codec_chain(self) -> SupportsChunkCodec | None:
        """Codec chain for inner chunks; `None` means "use the pipeline's"."""
        ...

    def unpack_chunks(
        self,
        raw: Buffer | None,
        chunk_spec: ArraySpec,
    ) -> dict[tuple[int, ...], Buffer | None]:
        """Split a storage blob into encoded buffers keyed by inner-chunk coords."""
        ...

    def pack_chunks(
        self,
        chunk_dict: dict[tuple[int, ...], Buffer | None],
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Combine per-inner-chunk encoded buffers into one storage blob."""
        ...

    def prepare_read_sync(
        self,
        byte_getter: Any,
        chunk_selection: SelectorTuple,
        codec_chain: SupportsChunkCodec,
    ) -> NDBuffer | None:
        """Synchronously fetch + decode a chunk and return the selected region."""
        ...

    def prepare_write_sync(
        self,
        byte_setter: Any,
        codec_chain: SupportsChunkCodec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        replace: bool,
    ) -> PreparedWrite:
        """Synchronously stage a write: fetch existing bytes if needed, unpack."""
        ...

    def finalize_write_sync(
        self,
        prepared: PreparedWrite,
        chunk_spec: ArraySpec,
        byte_setter: Any,
    ) -> None:
        """Synchronously pack the staged chunk data and write it to the store."""
        ...

    async def prepare_read(
        self,
        byte_getter: Any,
        chunk_selection: SelectorTuple,
        codec_chain: SupportsChunkCodec,
    ) -> NDBuffer | None:
        """Async variant of `prepare_read_sync`."""
        ...

    async def prepare_write(
        self,
        byte_setter: Any,
        codec_chain: SupportsChunkCodec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        replace: bool,
    ) -> PreparedWrite:
        """Async variant of `prepare_write_sync`."""
        ...

    async def finalize_write(
        self,
        prepared: PreparedWrite,
        chunk_spec: ArraySpec,
        byte_setter: Any,
    ) -> None:
        """Async variant of `finalize_write_sync`."""
        ...
198+
85199
class BaseCodec[CI: CodecInput, CO: CodecOutput](Metadata):
86200
"""Generic base class for codecs.
87201
@@ -207,6 +321,37 @@ class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]):
207321
"""Base class for array-to-array codecs."""
208322

209323

324+
@dataclass
class PreparedWrite:
    """State carried from the read/prepare phase to the write/finalize phase.

    Produced by `prepare_write_sync` / `prepare_write` and consumed by
    `finalize_write_sync` / `finalize_write`. In between, the compute
    phase walks `indexer`, decodes the matching entry of `chunk_dict`,
    merges the new data, re-encodes it, and stores the result back into
    `chunk_dict`.

    Attributes
    ----------
    chunk_dict : dict[tuple[int, ...], Buffer | None]
        Encoded bytes per inner chunk, keyed by inner-chunk coordinates.
        A regular (non-sharded) array uses the single key ``(0,)``; a
        sharded array holds one entry per inner chunk of the shard,
        including chunks not being modified (those pass through
        unchanged). A ``None`` value records that the chunk was absent
        on disk.
    indexer : list[ChunkProjection]
        The inner chunks to modify — a subset of `chunk_dict`'s keys;
        untouched chunks are not listed. Each projection's
        ``chunk_coords`` names a key of `chunk_dict`,
        ``chunk_selection`` the region inside that inner chunk, and
        ``out_selection`` the matching region of the source value array.
    """

    chunk_dict: dict[tuple[int, ...], Buffer | None]
    indexer: list[ChunkProjection]
353+
354+
210355
class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]):
211356
"""Base class for array-to-bytes codecs."""
212357

src/zarr/codecs/bytes.py

Lines changed: 114 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@
55
from enum import Enum
66
from typing import TYPE_CHECKING
77

8-
from zarr.abc.codec import ArrayBytesCodec
8+
from zarr.abc.codec import ArrayBytesCodec, PreparedWrite, SupportsChunkCodec
99
from zarr.core.buffer import Buffer, NDBuffer
1010
from zarr.core.common import JSON, parse_enum, parse_named_configuration
1111
from zarr.core.dtype.common import HasEndianness
1212

1313
if TYPE_CHECKING:
14-
from typing import Self
14+
from typing import Any, Self
1515

1616
from zarr.core.array_spec import ArraySpec
17+
from zarr.core.indexing import SelectorTuple
1718

1819

1920
class Endian(Enum):
@@ -125,3 +126,114 @@ async def _encode_single(
125126

126127
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
127128
return input_byte_length
129+
130+
# -- SupportsChunkPacking --
131+
132+
@property
def inner_codec_chain(self) -> SupportsChunkCodec | None:
    """Always ``None``: a plain bytes codec has no inner chunk layout,
    so the pipeline falls back to its own codec chain."""
    return None
136+
137+
def unpack_chunks(
    self,
    raw: Buffer | None,
    chunk_spec: ArraySpec,
) -> dict[tuple[int, ...], Buffer | None]:
    """Expose the blob as a one-entry chunk dict.

    A non-sharded chunk *is* its storage blob, so the mapping holds a
    single entry at coordinates ``(0,)``; ``raw is None`` (chunk absent
    on disk) passes through unchanged. ``chunk_spec`` is unused here —
    it exists for protocol compatibility with sharding codecs.
    """
    single_key: tuple[int, ...] = (0,)
    return {single_key: raw}
144+
145+
def pack_chunks(
    self,
    chunk_dict: dict[tuple[int, ...], Buffer | None],
    chunk_spec: ArraySpec,
) -> Buffer | None:
    """Collapse the one-entry chunk dict back into a storage blob.

    Returns whatever is stored at ``(0,)``; a missing entry or an
    explicit ``None`` both yield ``None`` (nothing to store).
    """
    return chunk_dict.get((0,))
152+
153+
def prepare_read_sync(
    self,
    byte_getter: Any,
    chunk_selection: SelectorTuple,
    codec_chain: SupportsChunkCodec,
) -> NDBuffer | None:
    """Synchronously fetch and decode the chunk, then slice out the selection.

    Returns ``None`` when the chunk does not exist in the store.
    """
    encoded = byte_getter.get_sync(prototype=codec_chain.array_spec.prototype)
    if encoded is not None:
        return codec_chain.decode_chunk(encoded)[chunk_selection]
    return None
165+
166+
def prepare_write_sync(
    self,
    byte_setter: Any,
    codec_chain: SupportsChunkCodec,
    chunk_selection: SelectorTuple,
    out_selection: SelectorTuple,
    replace: bool,
) -> PreparedWrite:
    """Stage a synchronous write of this (single) chunk.

    When ``replace`` is false, the existing bytes are fetched so that
    the compute phase can merge new data into them; when true, the
    fetch is skipped and the chunk is overwritten wholesale.
    """
    from zarr.core.indexing import ChunkProjection

    prior: Buffer | None = (
        None
        if replace
        else byte_setter.get_sync(prototype=codec_chain.array_spec.prototype)
    )
    projection = ChunkProjection((0,), chunk_selection, out_selection, replace)  # type: ignore[arg-type]
    return PreparedWrite(
        chunk_dict=self.unpack_chunks(prior, codec_chain.array_spec),
        indexer=[projection],
    )
183+
184+
def finalize_write_sync(
    self,
    prepared: PreparedWrite,
    chunk_spec: ArraySpec,
    byte_setter: Any,
) -> None:
    """Pack the staged chunk data and persist it.

    An empty pack result (``None``) means the chunk should not exist,
    so the stored object is deleted instead of written.
    """
    packed = self.pack_chunks(prepared.chunk_dict, chunk_spec)
    if packed is not None:
        byte_setter.set_sync(packed)
    else:
        byte_setter.delete_sync()
196+
197+
async def prepare_read(
    self,
    byte_getter: Any,
    chunk_selection: SelectorTuple,
    codec_chain: SupportsChunkCodec,
) -> NDBuffer | None:
    """Async counterpart of `prepare_read_sync`: fetch, decode, slice.

    Returns ``None`` when the chunk does not exist in the store.
    """
    encoded = await byte_getter.get(prototype=codec_chain.array_spec.prototype)
    if encoded is not None:
        return codec_chain.decode_chunk(encoded)[chunk_selection]
    return None
209+
210+
async def prepare_write(
    self,
    byte_setter: Any,
    codec_chain: SupportsChunkCodec,
    chunk_selection: SelectorTuple,
    out_selection: SelectorTuple,
    replace: bool,
) -> PreparedWrite:
    """Async counterpart of `prepare_write_sync`.

    When ``replace`` is false, the existing bytes are fetched so that
    the compute phase can merge new data into them; when true, the
    fetch is skipped and the chunk is overwritten wholesale.
    """
    from zarr.core.indexing import ChunkProjection

    prior: Buffer | None = None
    if not replace:
        prior = await byte_setter.get(prototype=codec_chain.array_spec.prototype)
    projection = ChunkProjection((0,), chunk_selection, out_selection, replace)  # type: ignore[arg-type]
    return PreparedWrite(
        chunk_dict=self.unpack_chunks(prior, codec_chain.array_spec),
        indexer=[projection],
    )
227+
228+
async def finalize_write(
    self,
    prepared: PreparedWrite,
    chunk_spec: ArraySpec,
    byte_setter: Any,
) -> None:
    """Async counterpart of `finalize_write_sync`: pack, then write or delete.

    An empty pack result (``None``) means the chunk should not exist,
    so the stored object is deleted instead of written.
    """
    packed = self.pack_chunks(prepared.chunk_dict, chunk_spec)
    if packed is not None:
        await byte_setter.set(packed)
    else:
        await byte_setter.delete()

src/zarr/core/codec_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def __post_init__(self) -> None:
118118
bb_sync.append(bb_codec)
119119
self._bb_codecs = tuple(bb_sync)
120120

121-
def decode(
121+
def decode_chunk(
122122
self,
123123
chunk_bytes: Buffer,
124124
) -> NDBuffer:
@@ -137,7 +137,7 @@ def decode(
137137

138138
return chunk_array
139139

140-
def encode(
140+
def encode_chunk(
141141
self,
142142
chunk_array: NDBuffer,
143143
) -> Buffer | None:

tests/test_sync_codec_pipeline.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,9 @@ def test_encode_decode_roundtrip(
9999
chain = ChunkTransform(codecs=codecs, array_spec=spec)
100100
nd_buf = _make_nd_buffer(arr)
101101

102-
encoded = chain.encode(nd_buf)
102+
encoded = chain.encode_chunk(nd_buf)
103103
assert encoded is not None
104-
decoded = chain.decode(encoded)
104+
decoded = chain.decode_chunk(encoded)
105105
np.testing.assert_array_equal(arr, decoded.as_numpy_array())
106106

107107

@@ -142,4 +142,4 @@ def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer
142142
)
143143
arr = np.arange(12, dtype="float64").reshape(3, 4)
144144
nd_buf = _make_nd_buffer(arr)
145-
assert chain.encode(nd_buf) is None
145+
assert chain.encode_chunk(nd_buf) is None

0 commit comments

Comments
 (0)