|
2 | 2 |
|
3 | 3 | from abc import abstractmethod |
4 | 4 | from collections.abc import Mapping |
| 5 | +from dataclasses import dataclass |
5 | 6 | from typing import TYPE_CHECKING, Literal, Protocol, TypeGuard, runtime_checkable |
6 | 7 |
|
7 | 8 | from typing_extensions import ReadOnly, TypedDict |
|
13 | 14 |
|
14 | 15 | if TYPE_CHECKING: |
15 | 16 | from collections.abc import Awaitable, Callable, Iterable |
16 | | - from typing import Self |
| 17 | + from typing import Any, Self |
17 | 18 |
|
18 | 19 | from zarr.abc.store import ByteGetter, ByteSetter, Store |
19 | 20 | from zarr.core.array_spec import ArraySpec |
20 | 21 | from zarr.core.chunk_grids import ChunkGrid |
21 | 22 | from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType |
22 | | - from zarr.core.indexing import SelectorTuple |
| 23 | + from zarr.core.indexing import ChunkProjection, SelectorTuple |
23 | 24 | from zarr.core.metadata import ArrayMetadata |
24 | 25 |
|
25 | 26 | __all__ = [ |
|
33 | 34 | "CodecOutput", |
34 | 35 | "CodecPipeline", |
35 | 36 | "GetResult", |
| 37 | + "PreparedWrite", |
| 38 | + "SupportsChunkCodec", |
| 39 | + "SupportsChunkPacking", |
36 | 40 | "SupportsSyncCodec", |
37 | 41 | ] |
38 | 42 |
|
@@ -82,6 +86,116 @@ def _decode_sync(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI: ... |
82 | 86 | def _encode_sync(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: ... |
83 | 87 |
|
84 | 88 |
|
class SupportsChunkCodec(Protocol):
    """Structural type for a synchronous whole-chunk decoder/encoder.

    Any object exposing this interface can act as the codec chain the
    pipeline applies to individual chunks; `ChunkTransform` satisfies
    this protocol.
    """

    # Spec of the chunks this codec chain operates on.
    array_spec: ArraySpec

    def decode_chunk(self, chunk_bytes: Buffer) -> NDBuffer:
        """Decode one encoded chunk into an in-memory array buffer."""
        ...

    def encode_chunk(self, chunk_array: NDBuffer) -> Buffer | None:
        """Encode one in-memory array buffer, or return ``None``."""
        ...
| 101 | + |
class SupportsChunkPacking(Protocol):
    """Protocol for codecs that can pack/unpack inner chunks into a storage blob
    and manage the prepare/finalize IO lifecycle.

    `BytesCodec` and `ShardingCodec` implement this protocol. The pipeline
    uses it to separate IO (prepare/finalize) from compute (encode/decode),
    enabling the compute phase to run in a thread pool.

    The lifecycle is:

    1. **Prepare**: fetch existing bytes from the store (if partial write),
       unpack into per-inner-chunk buffers -> `PreparedWrite`
    2. **Compute**: iterate `PreparedWrite.indexer`, decode each inner chunk,
       merge new data, re-encode, update `PreparedWrite.chunk_dict`
    3. **Finalize**: pack `chunk_dict` back into a blob and write to store
    """

    @property
    def inner_codec_chain(self) -> SupportsChunkCodec | None:
        """The codec chain for inner chunks, or `None` to use the pipeline's."""
        ...

    def unpack_chunks(
        self,
        raw: Buffer | None,
        chunk_spec: ArraySpec,
    ) -> dict[tuple[int, ...], Buffer | None]:
        """Unpack a storage blob into per-inner-chunk encoded buffers.

        ``raw`` is the blob read from the store; ``None`` indicates the key
        was absent.
        """
        ...

    def pack_chunks(
        self,
        chunk_dict: dict[tuple[int, ...], Buffer | None],
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Pack per-inner-chunk encoded buffers into a single storage blob."""
        ...

    # NOTE: the byte_getter/byte_setter parameters are annotated with the
    # store abstractions (imported above under TYPE_CHECKING) rather than
    # `Any`, so implementations get checked against the real IO interface.
    def prepare_read_sync(
        self,
        byte_getter: ByteGetter,
        chunk_selection: SelectorTuple,
        codec_chain: SupportsChunkCodec,
    ) -> NDBuffer | None:
        """Fetch and decode a chunk synchronously, returning the selected region."""
        ...

    def prepare_write_sync(
        self,
        byte_setter: ByteSetter,
        codec_chain: SupportsChunkCodec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        replace: bool,
    ) -> PreparedWrite:
        """Prepare a synchronous write: fetch existing data if needed, unpack."""
        ...

    def finalize_write_sync(
        self,
        prepared: PreparedWrite,
        chunk_spec: ArraySpec,
        byte_setter: ByteSetter,
    ) -> None:
        """Pack the prepared chunk data and write it to the store."""
        ...

    async def prepare_read(
        self,
        byte_getter: ByteGetter,
        chunk_selection: SelectorTuple,
        codec_chain: SupportsChunkCodec,
    ) -> NDBuffer | None:
        """Async variant of `prepare_read_sync`."""
        ...

    async def prepare_write(
        self,
        byte_setter: ByteSetter,
        codec_chain: SupportsChunkCodec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        replace: bool,
    ) -> PreparedWrite:
        """Async variant of `prepare_write_sync`."""
        ...

    async def finalize_write(
        self,
        prepared: PreparedWrite,
        chunk_spec: ArraySpec,
        byte_setter: ByteSetter,
    ) -> None:
        """Async variant of `finalize_write_sync`."""
        ...
| 197 | + |
| 198 | + |
85 | 199 | class BaseCodec[CI: CodecInput, CO: CodecOutput](Metadata): |
86 | 200 | """Generic base class for codecs. |
87 | 201 |
|
@@ -207,6 +321,37 @@ class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): |
207 | 321 | """Base class for array-to-array codecs.""" |
208 | 322 |
|
209 | 323 |
|
@dataclass
class PreparedWrite:
    """State carried between reading existing chunk data and writing new data.

    Produced by `prepare_write_sync` / `prepare_write` and consumed by
    `finalize_write_sync` / `finalize_write`. The compute phase runs in
    between: for each entry of `indexer`, decode the matching buffer in
    `chunk_dict`, merge the new data, re-encode, and write the result back
    into `chunk_dict`.

    Attributes
    ----------
    chunk_dict : dict[tuple[int, ...], Buffer | None]
        Encoded bytes for each inner chunk, keyed by chunk coordinates.
        A regular array holds a single entry `{(0,): <bytes>}`; a sharded
        array holds one entry per inner chunk of the shard, including
        chunks that are not modified (those pass through unchanged).
        A `None` value means the chunk did not exist on disk.
    indexer : list[ChunkProjection]
        The inner chunks to modify — a subset of `chunk_dict`'s keys
        (untouched chunks are not listed). Each entry's `chunk_coords`
        names a key of `chunk_dict`; `chunk_selection` picks the region
        within that inner chunk and `out_selection` picks the matching
        region of the source value array.
    """

    chunk_dict: dict[tuple[int, ...], Buffer | None]
    indexer: list[ChunkProjection]
| 353 | + |
| 354 | + |
210 | 355 | class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]): |
211 | 356 | """Base class for array-to-bytes codecs.""" |
212 | 357 |
|
|
0 commit comments