diff --git a/bench/ctable/bench_nested_parquet_roundtrip.py b/bench/ctable/bench_nested_parquet_roundtrip.py new file mode 100644 index 00000000..33b8030b --- /dev/null +++ b/bench/ctable/bench_nested_parquet_roundtrip.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +from __future__ import annotations + +import argparse +import os +import shutil +import tempfile +import time +from pathlib import Path + +import pyarrow as pa +import pyarrow.parquet as pq + +import blosc2 + + +def _dir_size(path: Path) -> int: + total = 0 + for root, _, files in os.walk(path): + for f in files: + total += (Path(root) / f).stat().st_size + return total + + +def main() -> None: + p = argparse.ArgumentParser(description="Benchmark CTable nested Parquet roundtrip") + p.add_argument("parquet", help="Input Parquet file") + p.add_argument("--rows", type=int, default=0, help="Sample first N rows (0 = full file)") + p.add_argument("--keep", action="store_true", help="Keep temporary outputs") + args = p.parse_args() + + src = Path(args.parquet) + if not src.exists(): + raise FileNotFoundError(src) + + workdir = Path(tempfile.mkdtemp(prefix="b2-nested-bench-")) + sample_path = workdir / "sample.parquet" + out_b2d = workdir / "out.b2d" + out_parquet = workdir / "out.parquet" + + try: + input_path = src + if args.rows > 0: + pf = pq.ParquetFile(src) + batch = next(pf.iter_batches(batch_size=args.rows)) + table = pa.Table.from_batches([batch], schema=pf.schema_arrow) + pq.write_table(table, sample_path) + input_path = sample_path + + t0 = time.perf_counter() + t = blosc2.CTable.from_parquet(str(input_path)) + t1 = time.perf_counter() + + t.save(str(out_b2d), overwrite=True) + t2 = time.perf_counter() + + t.to_parquet(str(out_parquet)) + t3 = time.perf_counter() + + print("=== CTable nested Parquet roundtrip benchmark ===") + print(f"input: {input_path}") + print(f"rows: {t.nrows}") + print(f"columns: {len(t.col_names)}") + print(f"from_parquet (s): {t1 - t0:.3f}") + print(f"save b2d (s): {t2 - 
t1:.3f}") + print(f"to_parquet (s): {t3 - t2:.3f}") + print(f"input bytes: {input_path.stat().st_size}") + print(f"output parquet: {out_parquet.stat().st_size}") + print(f"output b2d bytes: {_dir_size(out_b2d)}") + print(f"workdir: {workdir}") + + if not args.keep: + shutil.rmtree(workdir) + except Exception: + if not args.keep: + shutil.rmtree(workdir, ignore_errors=True) + raise + + +if __name__ == "__main__": + main() diff --git a/bench/ctable/where-nulls.py b/bench/ctable/where-nulls.py new file mode 100644 index 00000000..5b47285d --- /dev/null +++ b/bench/ctable/where-nulls.py @@ -0,0 +1,118 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Create a persistent nullable CTable for where() benchmarks. + +Usage: + python bench/ctable/where-nulls.py table.b2d + python bench/ctable/where-nulls.py table.b2z +""" + +from __future__ import annotations + +import argparse +from dataclasses import dataclass +from pathlib import Path +from time import perf_counter + +import numpy as np + +import blosc2 + +NROWS = 500_000_000 +NULL_VALUE = 500 +RNG_SEED = 42 + + +@dataclass +class Row: + nrow: int = blosc2.field(blosc2.int64(ge=0)) + col1: int = blosc2.field(blosc2.int64(ge=0, le=1000, null_value=NULL_VALUE), default=None) + col2: int = blosc2.field(blosc2.int64(ge=0, le=1000, null_value=NULL_VALUE), default=None) + + +DTYPE = np.dtype( + [ + ("nrow", np.int64), + ("col1", np.int64), + ("col2", np.int64), + ] +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "urlpath", + help="Output table path. 
Use a .b2d directory or a .b2z file extension.", + ) + return parser.parse_args() + + +def check_urlpath(urlpath: str) -> str: + suffix = Path(urlpath).suffix + if suffix not in {".b2d", ".b2z"}: + raise SystemExit("urlpath must end in .b2d (directory-backed) or .b2z (zip-backed)") + return suffix[1:] + + +def make_nullable_column(rng: np.random.Generator) -> np.ndarray: + # Normal distribution centered at 500, with practically all values in [0, 1000]. + return np.rint(rng.normal(loc=500, scale=50, size=NROWS)).clip(0, 1000).astype(np.int64) + + +def make_data() -> np.ndarray: + rng = np.random.default_rng(RNG_SEED) + data = np.empty(NROWS, dtype=DTYPE) + data["nrow"] = np.arange(NROWS, dtype=np.int64) + data["col1"] = make_nullable_column(rng) + data["col2"] = make_nullable_column(rng) + return data + + +def fmt_bytes(nbytes: int) -> str: + for unit in ("B", "KiB", "MiB", "GiB"): + if abs(nbytes) < 1024 or unit == "GiB": + return f"{nbytes:.2f} {unit}" if unit != "B" else f"{nbytes} {unit}" + nbytes /= 1024 + return f"{nbytes:.2f} GiB" + + +def main() -> None: + args = parse_args() + format_name = check_urlpath(args.urlpath) + + t0 = perf_counter() + data = make_data() + nulls_col1 = int(np.count_nonzero(data["col1"] == NULL_VALUE)) + nulls_col2 = int(np.count_nonzero(data["col2"] == NULL_VALUE)) + + table = blosc2.CTable(Row, urlpath=args.urlpath, mode="w", expected_size=NROWS, validate=False) + table.extend(data, validate=False) + elapsed = perf_counter() - t0 + + print("CTable nullable where() benchmark data created") + print("=" * 52) + print(f"urlpath: {args.urlpath}") + print(f"format: {format_name}") + print(f"rows: {len(table):,}") + print(f"columns: {', '.join(table.col_names)}") + print(f"null sentinel: {NULL_VALUE}") + print(f"col1 nulls: {nulls_col1:,}") + print(f"col2 nulls: {nulls_col2:,}") + print(f"uncompressed: {fmt_bytes(table.nbytes)}") + print(f"compressed: {fmt_bytes(table.cbytes)}") + print(f"compression: {table.cratio:.2f}x") + 
print(f"creation time: {elapsed:.3f} s") + print() + print(table) + + table.close() + + +if __name__ == "__main__": + main() diff --git a/bench/large-dict-store.py b/bench/large-dict-store.py new file mode 100644 index 00000000..c1524544 --- /dev/null +++ b/bench/large-dict-store.py @@ -0,0 +1,137 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### +import os +import time +import numpy as np +import blosc2 +from blosc2 import DictStore +from memory_profiler import memory_usage + +def make_arrays(n, min_size, max_size, dtype="f8"): + sizes = np.linspace(min_size, max_size, n).astype(int) + #arrays = [blosc2.arange(size, dtype=dtype) for size in sizes] + arrays = [blosc2.linspace(0, 1, size, dtype=dtype) for size in sizes] + #arrays = [np.random.randint(0, 100, size=size, dtype=dtype) for size in sizes] + # Calculate uncompressed size + uncompressed_size = sum(arr.nbytes for arr in arrays) + print(f"Uncompressed data size: {uncompressed_size / 1e9:.2f} GB") + return arrays, sizes, uncompressed_size + +def get_file_size(filepath): + """Get file size in MB.""" + if os.path.exists(filepath): + return os.path.getsize(filepath) / 2**20 + return 0 + +def check_arrays(tree_path, arrays, prefix="node"): + print("Checking stored arrays...") + tree = DictStore(tree_path, mode="r") + for i, arr in enumerate(arrays): + stored_arr = tree[f"/{prefix}{i}"][:] + if not np.allclose(arr, stored_arr): + raise ValueError(f"Array mismatch at {prefix}{i}") + +def run_embed_tree(arrays, threshold, tree_path, uncompressed_size, check=False): + def embed_process(): + tree = DictStore(tree_path, mode="w", threshold=threshold) + for i, arr in enumerate(arrays): + tree[f"/node{i}"] = arr + 
tree.close() + + t0 = time.time() + mem_usage = memory_usage((embed_process, ()), interval=0.1) + t1 = time.time() + peak_mem = max(mem_usage) - min(mem_usage) + file_size = get_file_size(tree_path) + compression_ratio = uncompressed_size / (file_size * 2**20) if file_size > 0 else 0 + print(f"[Embed] Time: {t1-t0:.2f}s, Memory: {peak_mem:.2f} MB, File size: {file_size:.2f} MB," + f" Compression: {compression_ratio:.1f}x") + + if check: + check_arrays(tree_path, arrays, prefix="node") + + return t1-t0, peak_mem, file_size + +def run_external_tree(arrays, threshold, tree_path, arr_prefix, uncompressed_size, check=False): + def external_process(): + tree = DictStore(tree_path, mode="w", threshold=threshold) + for i, arr in enumerate(arrays): + arr_path = f"{arr_prefix}_node{i}.b2nd" + arr_b2 = blosc2.asarray(arr, urlpath=arr_path, mode="w") + tree[f"/node{i}"] = arr_b2 + tree.close() + + t0 = time.time() + mem_usage = memory_usage((external_process, ()), interval=0.1) + t1 = time.time() + peak_mem = max(mem_usage) - min(mem_usage) + file_size = get_file_size(tree_path) + total_external_size = sum(get_file_size(f"{arr_prefix}_node{i}.b2nd") for i in range(len(arrays))) + total_size_mb = (file_size + total_external_size) + compression_ratio = uncompressed_size / (total_size_mb * 2**20) if total_size_mb > 0 else 0 + print(f"[External] Time: {t1-t0:.2f}s, Memory: {peak_mem:.2f} MB, DictStore file size: {file_size:.2f} MB," + f" External files size: {total_external_size:.2f} MB, Total: {total_size_mb:.2f} MB," + f" Compression: {compression_ratio:.1f}x") + + if check: + check_arrays(tree_path, arrays, prefix="node") + + return t1-t0, peak_mem, file_size, total_external_size + +def cleanup_files(tree_path, arr_prefix, n): + if os.path.exists(tree_path): + os.remove(tree_path) + for i in range(n): + arr_path = f"{arr_prefix}_node{i}.b2nd" + if os.path.exists(arr_path): + os.remove(arr_path) + +if __name__ == "__main__": + N = 10 + min_size = int(1e6) # 1 MB + max_size = 
int(1e8) # 100 MB + threshold = 2**23 # 8 MB threshold before using external arrays + print(f"Creating {N} arrays with sizes ranging from {min_size / 1e6:.2f} to {max_size / 1e6:.2f} MB...") + arrays, sizes, uncompressed_size = make_arrays(N, min_size, max_size) + + print("Benchmarking DictStore with embed arrays...") + tree_path_embed = "large_dict_store_embed.b2z" + t_embed, mem_embed, file_size_embed = run_embed_tree(arrays, None, tree_path_embed, uncompressed_size) + + print("Benchmarking DictStore with external arrays with threshold...") + tree_path_external = "large_dict_store_external_threshold.b2z" + arr_prefix = "large_external" + t_t_external, mem_t_external, file_t_size_external, external_t_size = ( + run_external_tree(arrays, threshold, tree_path_external, arr_prefix, uncompressed_size)) + + print("Benchmarking DictStore with external arrays with no threshold...") + tree_path_external_noth = "large_dict_store_external_nothreshold.b2z" + arr_prefix = "large_external_noth" + t_external, mem_external, file_size_external, external_size = ( + run_external_tree(arrays, None, tree_path_external_noth, arr_prefix, uncompressed_size)) + + print("\nSummary:") + print(f"Embed arrays: Time = {t_embed:.2f}s, Memory = {mem_embed:.2f} MB," + f" File size = {file_size_embed:.2f} MB") + print(f"External arrays (th: {threshold / 2**20:.2f} MB): Time = {t_t_external:.2f}s, Memory = {mem_t_external:.2f} MB," + f" DictStore file size = {file_t_size_external:.2f} MB, External files size = {external_t_size:.2f} MB") + print(f"External arrays: Time = {t_external:.2f}s, Memory = {mem_external:.2f} MB," + f" DictStore file size = {file_size_external:.2f} MB, External files size = {external_size:.2f} MB") + + speedup = t_embed / t_external if t_external > 0 else float('inf') + mem_ratio = mem_embed / mem_external if mem_external > 0 else float('inf') + file_ratio = file_size_embed / file_size_external if file_size_external > 0 else float('inf') + storage_ratio = file_size_embed / 
(file_size_external) + print(f"Time ratio (embed/external): {speedup:.2f}x") + print(f"Memory ratio (embed/external): {mem_ratio:.2f}x") + print(f"File size ratio (embed/external tree): {file_ratio:.2f}x") + print(f"Storage efficiency (embed vs total external): {storage_ratio:.2f}x") + + # cleanup_files(tree_path_embed, arr_prefix, N) + # cleanup_files(tree_path_external, arr_prefix, N) + # cleanup_files(tree_path_external_noth, arr_prefix_noth, N) diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst index 35b44312..12e99ea0 100644 --- a/doc/reference/ctable.rst +++ b/doc/reference/ctable.rst @@ -75,6 +75,19 @@ Construction .. automethod:: CTable.from_csv +Parquet interoperability +------------------------ + +Parquet import/export is intended as logical data interchange between Parquet +and Blosc2 CTable, not as exact preservation of Parquet's physical layout. For +example, Parquet files whose top-level schema is an unnamed ``list>`` +may be imported as a regular CTable whose rows are the list elements and whose +nested scalar fields are exposed as ordinary dotted columns. Exporting such a +table writes a valid logical Parquet table, but does not attempt to reconstruct +the original unnamed root-list grouping, row groups, encoding choices, or file +metadata exactly. + + Null policy ----------- @@ -615,12 +628,129 @@ to a typed representation. They are not used as an implicit fallback during Parquet import; unsupported Arrow/Parquet types still raise unless explicitly imported through :meth:`CTable.from_arrow` with ``object_fallback=True``. +Nested fields +------------- + +CTable supports first-class **nested struct schemas** by physically flattening +struct leaves into independent compressed columns. This keeps analytics fast +(each leaf is an ordinary :class:`~blosc2.NDArray`), while preserving the +logical nested row shape on read. 
+ +**Automatic flattening from Arrow / Parquet** + +When :meth:`CTable.from_arrow` or :meth:`CTable.from_parquet` encounters a +top-level ``struct<…>`` field, it recursively flattens every scalar leaf into a +dotted column name and stores each leaf as its own physical column:: + + import pyarrow as pa + import blosc2 + + trip_type = pa.struct([ + ("begin", pa.struct([("lon", pa.float64()), ("lat", pa.float64())])), + ("end", pa.struct([("lon", pa.float64()), ("lat", pa.float64())])), + ]) + schema = pa.schema([pa.field("trip", trip_type), + pa.field("fare", pa.float64())]) + batch = pa.record_batch( + [pa.array([{"begin": {"lon": -87.6, "lat": 41.8}, + "end": {"lon": -87.7, "lat": 41.9}}], + type=trip_type), + pa.array([12.5])], + schema=schema, + ) + + t = blosc2.CTable.from_arrow(schema, [batch]) + # t.col_names → ['trip.begin.lon', 'trip.begin.lat', + # 'trip.end.lon', 'trip.end.lat', 'fare'] + +**Column access** + +Nested leaves are accessed with their dotted logical name or via chained +attribute proxies:: + + t["trip.begin.lon"].mean() # Column object (fast path) + t.trip.begin.lon.max() # attribute proxy, same column + +A literal ``.``, ``/``, or ``\\`` inside an Arrow field name is escaped with a +backslash in the logical column name. For example, path segments +``("trip.info", "begin/point", "lon.deg")`` become:: + + t[r"trip\.info.begin\/point.lon\.deg"] + +Such leaves are stored with percent-encoded path segments under ``_cols``; the +example above is stored at ``_cols/trip%2Einfo/begin%2Fpoint/lon%2Edeg``. 
+ +**Filtering and expressions** + +Dotted names work everywhere a flat column name would:: + + t.where("trip.begin.lon > -87.7 and fare > 10") + t.where(t.trip.begin.lon > -87.7) + +**Select / projection** + +A struct prefix expands to all descendant leaves:: + + t.select(["trip.begin"]) # → columns trip.begin.lon, trip.begin.lat + t.select(["trip"]) # → all four trip.* leaves + +**Indexes and aggregates** + +Scalar leaf columns support all the same operations as flat columns:: + + t.create_index(col_name="trip.begin.lon") + t.where("trip.begin.lon > -87.7").nrows # uses the index + +**Row reconstruction** + +Single-row access reconstructs the original nested dict shape:: + + row = t[0] + row.trip # → {"begin": {"lon": ..., "lat": ...}, "end": {...}} + row.fare # → 12.5 + +**Inserting nested rows** + +:meth:`CTable.append` and :meth:`CTable.extend` accept either the flat dotted +form or the original nested dict / list-of-dicts shape:: + + # flat dotted keys + t.append({"trip.begin.lon": -87.6, "trip.begin.lat": 41.8, + "trip.end.lon": -87.7, "trip.end.lat": 41.9, "fare": 12.5}) + + # original nested dict (auto-flattened) + t.append({"trip": {"begin": {"lon": -87.6, "lat": 41.8}, + "end": {"lon": -87.7, "lat": 41.9}}, + "fare": 12.5}) + + # extend with a list of nested dicts + t.extend([ + {"trip": {"begin": {"lon": -87.6, "lat": 41.8}, + "end": {"lon": -87.7, "lat": 41.9}}, "fare": 12.5}, + {"trip": {"begin": {"lon": -87.5, "lat": 41.7}, + "end": {"lon": -87.8, "lat": 41.6}}, "fare": 8.0}, + ]) + +**Physical storage layout** + +Leaf columns are stored under a hierarchical path in the backing container: +``/_cols/trip/begin/lon``, ``/_cols/trip/begin/lat``, etc. Intermediate nodes +are namespaces only; no data is stored at non-leaf levels. 
+ +**Arrow / Parquet round-trip** + +:meth:`CTable.to_parquet` and :meth:`CTable.to_arrow` reconstruct the original +nested Arrow schema from the stored metadata, so round-trips are lossless:: + + t.to_parquet("out.parquet") # Arrow schema has top-level "trip" struct + Struct columns -------------- Struct columns are declared with :func:`blosc2.struct` and store one dictionary (or ``None`` when nullable) per row in batched variable-length storage. They are -also used when importing top-level Arrow/Parquet ``struct<...>`` columns:: +also used when importing top-level Arrow/Parquet ``struct<...>`` columns when +**not** using the nested-leaf flattening path described above:: from dataclasses import dataclass import blosc2 as b2 diff --git a/plans/ctable-dictionary-type.md b/plans/ctable-dictionary-type.md new file mode 100644 index 00000000..348cdc7b --- /dev/null +++ b/plans/ctable-dictionary-type.md @@ -0,0 +1,688 @@ +# Plan: CTable dictionary/categorical column type + +## Motivation + +Real-world Parquet files frequently contain Arrow dictionary-encoded columns, especially repeated string +columns. Arrow represents these as: + +```text +dictionary +``` + +Today, `CTable.from_arrow()` does not support Arrow dictionary types directly. The compatibility fallback is +to decode dictionaries to plain strings before import, but this loses the compact representation and prevents +fast integer-code indexing. + +Add a CTable dictionary column type with Arrow-like semantics: + +```python +blosc2.dictionary( + index_type=blosc2.int32(), value_type=blosc2.vlstring(), ordered=False +) +``` + +For v1, keep the implementation intentionally narrow and optimized for the common Parquet case: string +categories represented by signed 32-bit codes. + +## Goals for v1 + +- Add a public dictionary column spec. +- Support dictionary columns in CTable schemas and persistent metadata. +- Store dictionary columns as stable integer codes plus a dictionary of unique string values. 
+- Import Arrow/Parquet dictionary-encoded string columns without decoding to full strings. +- Export CTable dictionary columns back to Arrow dictionary arrays. +- Allow decoded reads by default while exposing codes and dictionary values for advanced users. +- Enable equality/membership filtering to operate on integer codes. +- Make dictionary columns indexable by indexing their codes. +- Ensure the real-world `~/Downloads/chicago-taxi.parquet` dataset can round-trip to/from Blosc2 format. + +## Non-goals for v1 + +- General value types beyond `vlstring`. +- General index types beyond internal `int32`. +- Nested dictionary columns inside list/struct fields. +- Dictionary compaction/removal of unused categories. +- Ordered comparisons (`<`, `>`, sorting) beyond storing the `ordered` flag. +- Per-chunk or per-batch dictionaries. +- Schema-less/object fallback support for dictionaries. + +## Public API + +### Column spec + +Add: + +```python +blosc2.dictionary( + index_type=blosc2.int32(), + value_type=blosc2.vlstring(), + ordered=False, + nullable=True, +) +``` + +For v1: + +- `index_type` must be `blosc2.int32()`. +- `value_type` must be `blosc2.vlstring()`. +- `ordered` is persisted and exported to Arrow, but ordered comparisons are not implemented initially. +- `nullable=True` means row slots may be null. Nulls are represented internally by code `-1`. +- `nullable=False` rejects null slots during writes/import. + +Consider an alias later: + +```python +blosc2.categorical(...) +``` + +but implement only `dictionary` first to match Arrow terminology. 
+ +### Example schema usage + +```python +from dataclasses import dataclass +import blosc2 + + +@dataclass +class Trip: + vendor: str = blosc2.field( + blosc2.dictionary(index_type=blosc2.int32(), value_type=blosc2.vlstring()) + ) + fare: float = blosc2.field(blosc2.float64()) +``` + +### Column access + +Default reads should return decoded values: + +```python +ct["vendor"][:] # ["Uber", "Lyft", None, "Uber"] +ct["vendor"][0] # "Uber" +``` + +Expose internals explicitly: + +```python +ct["vendor"].codes[:] # np.ndarray(dtype=int32), e.g. [0, 1, -1, 0] +ct["vendor"].dictionary[:] # ["Uber", "Lyft"] +``` + +Use `.dictionary` as the preferred public name for unique values because it matches Arrow terminology and the +`blosc2.dictionary(...)` spec name. A pandas-friendly `.categories` alias can be considered later, but should +not be part of the v1 API unless it falls out naturally. + +Useful methods/properties: + +```python +col.codes # fixed-width NDArray-like codes storage +col.dictionary # varlen string array of unique values +col.encode(values) # values -> int32 codes, extending dictionary if allowed +col.decode(codes) # codes -> values +col.value_to_code(value) # single value lookup; KeyError if absent +col.code_to_value(code) # single code lookup +``` + +For v1, keep mutation methods minimal and internal if needed. Public `.codes` and `.dictionary` are enough +for inspection and debugging. + +Logical slice reads should follow existing `vlstring` behavior and return Python lists, not NumPy object arrays: + +```python +ct["vendor"][:] # ["Uber", "Lyft", None, "Uber"] +``` + +## Semantics + +### Logical model + +A dictionary column is logically: + +```text +row slot -> int32 code -> dictionary value +``` + +Example: + +```text +codes: [0, 1, 0, -1] +dictionary: ["Uber", "Lyft"] +decoded: ["Uber", "Lyft", "Uber", None] +``` + +### Nulls + +Use reserved code `-1` for null row slots. + +Rationale: + +- `int32` codes give a simple, compact null representation. 
+- Code comparisons and indexes can include null slots naturally. +- This avoids a separate validity bitmap for v1 dictionary columns. + +Rules: + +- Valid category codes are `0 <= code < len(dictionary)`. +- `-1` means null slot. +- Codes `< -1` are invalid. +- If `nullable=False`, attempts to write/import null slots raise `ValueError`. +- Dictionary values themselves should not be null in v1. Null is represented only by slot code `-1`. + +### Dictionary growth + +Use an append-only global dictionary per column. + +- New string values append to the dictionary and receive the next code. +- Existing values reuse their existing code. +- Deleting table rows does not remove dictionary values. +- Updating a row to a new value may append a new dictionary value. +- Codes are stable for the life of the column. + +No automatic compaction in v1. A future explicit operation can be added: + +```python +ct["vendor"].compact_dictionary() +``` + +but this requires recoding all codes and rebuilding any indexes, so defer it. + +### Maximum cardinality + +Because v1 uses signed `int32` and reserves `-1` for null, the maximum number of categories is: + +```text +2_147_483_648 +``` + +Practically, memory/storage constraints will be hit earlier. If appending a new category would exceed +`np.iinfo(np.int32).max`, raise `OverflowError`. + +## Storage layout + +Represent a dictionary column as a logical column object wrapping two persisted components: + +```text +/ + _cols/ + vendor/ + codes # int32 NDArray, one code per row + dictionary # variable-length string storage, unique values +``` + +Exact on-disk naming should match existing table storage conventions, but the logical layout should be +column-local. Do not store dictionary values as a separate user-visible CTable column. + +### Codes storage + +- Fixed-width `int32` NDArray. +- Shape grows with table rows. +- Uses the normal column compression parameters. +- Indexes operate on this codes array. 
+ +### Dictionary value storage + +Use the existing variable-length scalar string machinery where possible: + +- `vlstring` values. +- Append-only. +- Stored under the dictionary column directory. +- Maintains insertion order as category order. + +### In-memory lookup cache + +Maintain an in-memory mapping for fast encoding: + +```python +_value_to_code: dict[str, int] +``` + +Build it lazily from persisted dictionary values when opening a table. Persist only dictionary values, not the +Python mapping. + +## Schema metadata + +Add a new spec kind, likely in `src/blosc2/schema.py`: + +```json +{ + "kind": "dictionary", + "index_type": {"kind": "int", "bits": 32, "signed": true, ...}, + "value_type": {"kind": "vlstring", ...}, + "ordered": false, + "nullable": true, + "null_code": -1 +} +``` + +The compiler should produce a `CompiledColumn` with: + +- logical type: dictionary; +- physical dtype for codes: `np.int32`; +- display width based on decoded strings, not codes where feasible. + +Schema validation should reject unsupported v1 combinations early: + +- non-`int32` index type; +- non-`vlstring` value type; +- null dictionary values; +- nullable policies incompatible with `-1` null code. + +## Core implementation tasks + +### 1. Add `DictionarySpec` + +Implement in schema/spec layer: + +- constructor helper `blosc2.dictionary(...)`; +- metadata serialization/deserialization; +- equality/repr/docs; +- validation of v1 constraints. + +Potential fields: + +```python +@dataclass(frozen=True) +class DictionarySpec(ColumnSpec): + index_type: IntSpec + value_type: VLStringSpec + ordered: bool = False + nullable: bool = True + null_code: int = -1 +``` + +### 2. 
Add dictionary column object + +Implement a column class, for example: + +```python +class DictionaryColumn: + codes: blosc2.NDArray + dictionary: _ScalarVarLenArray # or existing vlstring backing type +``` + +Required operations: + +- `__len__` +- `__getitem__` scalar/slice/list/boolean mask returning decoded values +- `__setitem__` scalar/slice/list values, encoding as needed +- `append` / `extend` for Arrow import and row appends +- `flush` if dictionary storage uses buffered batch machinery +- `close` if needed + +For v1, prioritize the operations used by CTable append/import/read paths. + +### 3. Extend table storage + +Add storage factory methods analogous to existing list/varlen methods: + +```python +storage.create_dictionary_column(name, spec, cparams=None, dparams=None) +storage.open_dictionary_column(name, spec, ...) +``` + +These create/open both physical components (`codes`, `dictionary`) under the logical column. + +### 4. Extend CTable schema compilation and column creation + +Update CTable creation paths to detect `DictionarySpec`: + +- schema compiler; +- `_create_columns` / equivalent new-table creation; +- `_create_arrow_import_columns`; +- open-from-storage path; +- row append/update paths; +- column widths/display. + +Dictionary columns should be logical `ct.col_names` entries just like ordinary columns. + +### 5. Decoded read/write behavior + +When assigning Python values: + +```python +ct.append({"vendor": "Uber"}) +ct["vendor"][3] = "Lyft" +ct["vendor"][4:6] = ["Uber", None] +``` + +Encoding behavior: + +- If value is `None`: code `-1` if nullable, otherwise raise. +- If value is `str` and exists: use existing code. +- If value is `str` and missing: append dictionary value, assign new code. +- If value is not `str`/`None`: raise `TypeError`. + +When assigning raw codes, require explicit codes API. Do not silently accept integers via logical column writes, +because integers could be real category values in future dictionary types. 
+ +## Arrow/Parquet interoperability + +### Import from Arrow + +Map Arrow dictionary columns as follows: + +```text +dictionary + -> blosc2.dictionary(index_type=blosc2.int32(), value_type=blosc2.vlstring(), ordered=X) +``` + +Accepted Arrow index types for v1: + +- signed integer indices: `int8`, `int16`, `int32`, `int64`; +- unsigned integer indices: `uint8`, `uint16`, `uint32`, `uint64`, provided all values fit in signed + `int32`; +- normalize internally to `int32`; +- reject if category count or any index value does not fit signed `int32`. + +Accepted Arrow value types for v1: + +- `string`, `large_string`, `utf8`, `large_utf8`; +- normalize internally to `vlstring`. + +Rejected for v1: + +- dictionary values of binary, numeric, struct, list, etc.; +- nested dictionary arrays inside list/struct; +- unsigned index arrays containing values that do not fit in signed `int32`. + +### Chunked Arrow arrays and dictionary unification + +Arrow chunked arrays and Parquet row groups may carry different dictionaries per chunk. CTable v1 should use +one global dictionary per column. + +Import algorithm: + +1. For each incoming Arrow dictionary array chunk: + - read its dictionary values; + - map chunk-local category values to global codes; + - translate chunk indices to global int32 codes; + - translate Arrow nulls to `-1`. +2. Append translated codes to the CTable codes storage. +3. Append new category values to the global dictionary as discovered. + +Preserve first-seen category order. This is deterministic for a given input stream and works well for append-only +semantics. + +If `ordered=True` and chunks have different dictionary orders, global first-seen order may not preserve the +semantic order. For v1: + +- preserve and export `ordered=True` only when the importer can verify all chunk dictionaries have the same + order for existing values; +- otherwise raise `ValueError`. 
Do not silently downgrade to `ordered=False`, because `ordered=True` carries + semantic meaning and silently changing it could make comparisons/sorts incorrect later. + +### Arrow schema inference + +Update `_arrow_type_to_spec()`: + +- recognize top-level Arrow dictionary type; +- return `DictionarySpec` for supported v1 string dictionaries; +- raise clear `TypeError` for unsupported dictionary variants. + +Do not decode dictionary type to plain string inside core `CTable.from_arrow()` when dictionary support is +available. The CLI can later expose a flag to force decoding if desired. + +### Arrow batch writing into CTable + +Update `_write_arrow_batch()`: + +- if compiled column is dictionary: + - accept Arrow dictionary arrays and use the unification algorithm; + - also optionally accept plain string arrays by encoding strings into the dictionary; + - reject unsupported types. + +This allows appending plain strings to an existing dictionary CTable column. + +### Export to Arrow + +When `iter_arrow_batches()` sees a dictionary column, emit Arrow dictionary arrays: + +```text +dictionary +``` + +Implementation approach: + +- Arrow dictionary values: `pa.array(dictionary_values, type=pa.string())` or `pa.large_string()`? Use `pa.string()` + for v1 unless a value exceeds Arrow string limits, then use `large_string()`. +- Indices: `pa.array(codes, type=pa.int32())`, with null mask for `codes == -1`. +- Construct `pa.DictionaryArray.from_arrays(indices, dictionary, ordered=spec.ordered)`. + +For slices/batches, reuse the full column dictionary rather than creating per-batch dictionaries. This preserves +stable codes and simplifies export. + +### Parquet CLI behavior + +Once core dictionary support exists: + +- Default CLI import should preserve supported Arrow dictionary string columns as dictionary CTable columns. 
+- Add an escape hatch: + +```bash +parquet-to-blosc2 --decode-dictionaries input.parquet output.b2d +``` + +or equivalent if users want plain `vlstring` columns. + +The default should favor preserving dictionary encoding because it is compact and closer to the original Arrow +schema. + +## Query and expression support + +### Equality + +For dictionary column `vendor`: + +```python +ct["vendor"] == "Uber" +``` + +should translate to: + +```python +ct["vendor"].codes == code_for("Uber") +``` + +If the value is absent from the dictionary, return an all-false boolean expression/selection without scanning. + +Null equality: + +```python +ct["vendor"] == None +``` + +maps to: + +```python +codes == -1 +``` + +Use whatever null comparison idiom is already preferred in CTable expressions; avoid encouraging `== None` in +user docs if there is an `is_null()` API. + +### Membership + +```python +ct["vendor"].isin(["Uber", "Lyft"]) +``` + +maps to code membership: + +```python +codes in [0, 1] +``` + +Values absent from the dictionary are ignored. If all requested values are absent, return all-false. + +### Ordered comparisons + +For v1: + +- If `ordered=False`, `<`, `<=`, `>`, `>=` should raise `TypeError` for dictionary columns. +- If `ordered=True`, still defer implementation unless it is trivial to map to code comparisons. Document that + ordered comparisons are not supported in v1 even though the flag is stored/exported. + +This avoids ambiguous semantics between dictionary order and lexical string order. + +## Indexing support + +Dictionary columns should be indexed by codes. + +### Index creation + +User API should remain logical: + +```python +ct.create_index("vendor") +``` + +Internally: + +- detect `vendor` is dictionary; +- create the physical index on `vendor.codes`; +- store public index metadata under the logical column name `vendor`; +- mark the index as dictionary-aware so query planning maps values to codes before using it. 
+ +The public API should hide the code-index detail. On disk, index files may include an explicit `codes` suffix, +for example `__index__.vendor.codes...`, to avoid ambiguity and make debugging easier. + +Avoid requiring users to write: + +```python +ct["vendor"].codes.create_index() +``` + +though exposing code-level indexes for debugging is fine. + +### Query planning with indexes + +For equality: + +1. Look up the queried string in the dictionary. +2. If present, query the integer index for that code. +3. If absent, return empty result immediately. + +For membership: + +1. Map present values to codes. +2. Query the integer index for those codes. +3. Ignore absent values. + +For nulls: + +- code `-1` can be included in the code index. +- `is_null()` queries use code `-1`. + +### Index maintenance + +Because dictionary values are append-only and codes are stable: + +- existing index entries do not need recoding when new categories are appended; +- appending rows updates the code index just like appending rows to an integer column; +- deleting rows follows existing CTable valid-row semantics; +- dictionary compaction, if added later, must invalidate/rebuild indexes. + +## Persistence and compatibility + +### Opening existing tables + +Existing tables do not contain dictionary specs, so no migration is needed. + +### Versioning + +Add a schema metadata version bump if the CTable schema format has one. Older versions of python-blosc2 will not +understand `kind: dictionary`; they should fail clearly when opening such tables. + +### Robustness checks on open + +When opening a persisted dictionary column: + +- validate codes dtype is int32; +- validate dictionary storage exists; +- validate dictionary values are strings and contain no null entries; +- optionally validate codes are `-1` or within dictionary bounds. Full validation may be expensive; provide a + debug/validation path rather than doing it unconditionally for huge tables. 
+
+## Testing plan
+
+### Unit tests for spec/schema
+
+- `blosc2.dictionary()` creates expected spec.
+- Unsupported `index_type` raises.
+- Unsupported `value_type` raises.
+- Metadata roundtrip preserves `ordered`, `nullable`, `null_code`.
+- Dataclass schema compilation supports dictionary fields.
+
+### CTable behavior tests
+
+- Create in-memory CTable with dictionary column.
+- Append strings and nulls.
+- Repeated strings reuse codes.
+- New strings append dictionary values.
+- Decoded scalar/slice reads work.
+- `.codes[:]` and `.dictionary[:]` expose expected internals.
+- `nullable=False` rejects nulls.
+- Invalid value types raise.
+- Persistent `.b2d`/`.b2z` tables reopen correctly.
+
+### Arrow import/export tests
+
+- Import `dictionary<int32, string>`.
+- Import `dictionary<int8, string>`.
+- Import `dictionary<int16, string>`.
+- Import `dictionary<int64, string>` when values fit int32.
+- Import unsigned dictionary indices when values fit signed int32.
+- Reject too-large signed/unsigned dictionary indices or category counts.
+- Import chunked arrays with different dictionaries and verify global unification.
+- Preserve nulls as `-1` internally and Arrow nulls on export.
+- Export emits Arrow dictionary type with int32 indices and string values.
+- Parquet roundtrip preserves logical values.
+
+### Query/index tests
+
+- Equality filter on present value returns matching rows.
+- Equality filter on absent value returns no rows without scanning if possible.
+- Membership filter works.
+- Null filter works.
+- `ct.create_index("dict_col")` builds code index.
+- Equality/membership use the code index.
+- Appending rows after index creation maintains index correctness.
+
+### CLI tests
+
+- `parquet-to-blosc2` imports dictionary string column as dictionary column.
+- Export produces Parquet/Arrow dictionary column.
+- Optional dictionary-decoding flag imports as `vlstring` instead.
+- Unsupported dictionary value type reports a clear error or decodes only if explicitly requested.
+- Real-world acceptance test: `~/Downloads/chicago-taxi.parquet` imports to Blosc2, exports back to Parquet, + and round-trip comparison succeeds for imported/exported columns. + +## Suggested implementation order + +1. Add `DictionarySpec` and public `blosc2.dictionary()` helper. +2. Implement dictionary column storage wrapper with codes + vlstring dictionary. +3. Integrate dictionary columns into CTable creation/open/read/write paths. +4. Add decoded reads and append/set encoding. +5. Add Arrow dictionary import with global dictionary unification. +6. Add Arrow export as `pa.DictionaryArray`. +7. Add equality/membership expression translation. +8. Add dictionary-aware index creation and query usage. +9. Add CLI preservation by default and optional decode flag. +10. Add docs/examples. + +## Resolved design decisions + +These decisions are part of the v1 plan: + +1. Expose `nullable` on the dictionary spec, defaulting to `True`. +2. Accept Arrow unsigned dictionary indices if all values fit in signed `int32`; normalize internally to + `int32`. +3. Raise for ordered Arrow dictionaries with incompatible/differing chunk dictionary order. Do not silently + downgrade to unordered. +4. Make the Parquet CLI preserve supported dictionary columns by default. Provide an opt-out flag such as + `--decode-dictionaries` for users who want plain `vlstring` columns. +5. Use `.dictionary` as the preferred public property for unique values. Consider `.categories` only as a + future alias. +6. Return Python lists for logical slice reads, following existing `vlstring` behavior. +7. Keep `ct.create_index("vendor")` logical and hide code-index details from the public API. On-disk index + artifacts may include a `codes` suffix for clarity. 
diff --git a/plans/ctable-nested-fields.md b/plans/ctable-nested-fields.md new file mode 100644 index 00000000..5a2778f6 --- /dev/null +++ b/plans/ctable-nested-fields.md @@ -0,0 +1,251 @@ +# CTable nested fields via physical leaf columns + +## Summary + +Add first-class support for nested schemas in `CTable` by **physically flattening leaf fields** into real persisted columns, while preserving logical nested structure for row I/O and Arrow/Parquet roundtrips. + +Key idea: + +- Logical path: `trip.begin.lon` +- Physical storage path in container: `/_cols/trip/begin/lon` +- Canonical root field name: `""` (empty string) +- Display alias for root (optional): `/` + +This keeps analytics/indexing fast (leaf = ordinary column), and matches `.b2d` / `.b2z` container layout naturally. + +**Status: core implementation complete.** All acceptance criteria are met. +Remaining work is captured in the [Future work](#future-work) section below. + +--- + +## Goals + +1. Support nested struct/list schemas without storing struct leaves as opaque varlen/object blobs. +2. Enable columnar analytics on scalar leaves using existing `CTable` machinery: + - filters (`where`) + - lazy expressions + - aggregates (`sum/min/max/mean/std`) + - indexes + - sorting/grouping paths already supported for scalar columns +3. Preserve nested logical row interface (dict/list reconstruction on read). +4. Keep backward compatibility for existing flat tables and existing nested-as-varlen tables. + +## Non-goals (phase 1) + +1. Full list-element relational semantics (`explode`, SQL-like unnests) for query planner. +2. Indexing directly on list-valued paths. +3. Breaking on-disk compatibility of existing tables. 
+ +--- + +## Proposed model + +## 1) Path model + +Define a canonical logical field-path type: + +- Root: `""` +- Path segments: `("trip", "begin", "lon")` +- Dotted display key: `trip.begin.lon` + +Add helpers: + +- `split_field_path(str) -> tuple[str, ...]` ✅ implemented (`ctable_storage.py`; backslash-escape aware) +- `join_field_path(tuple[str, ...]) -> str` ✅ implemented (`ctable_storage.py`; escapes literal `.`, `/`, and `\\`) +- escaping/unescaping for literal `.` and `/` in field names ✅ implemented for logical names via backslash escaping and for physical storage via percent-encoded path segments + +Recommendation: + +- Canonical internal identity: tuple segments +- Dotted names only as user syntax +- Physical storage path built from escaped segments ✅ literal `.`, `/`, `%`, and `\\` inside segments are percent-encoded + +## 2) Physical layout ✅ implemented + +Persist scalar leaves as standard column arrays under `_cols` hierarchy: + +- `/_cols/trip/begin/lon` +- `/_cols/trip/begin/lat` +- `/_cols/trip/begin/time` +- `/_cols/payment/fare` + +Intermediate nodes are namespaces only (no data arrays). + +For lists: + +- Keep existing `ListArray` physical representation for list leaves. ✅ +- For `list>`, phase 1 keeps list cell storage as list payload (no explode). ✅ + +## 3) Schema metadata ✅ implemented + +Extend schema serialization with nested mapping metadata, e.g.: + +- logical path -> physical column token/path ✅ (`schema.metadata["nested"]` dict) +- physical column -> storage path ✅ (`schema.metadata["nested"]["physical_to_storage"]`) +- root logical alias metadata when needed ✅ +- row reconstruction flag when nested Arrow structs were flattened ✅ + +Leaf spec details such as kind, dtype, nullability, and scalar/list/dictionary behavior remain in the standard schema column specs rather than being duplicated in `metadata["nested"]`. ✅ + +Keep `CompiledSchema.columns` as the ordered list of **physically stored leaf columns**. 
`CompiledSchema.columns_by_name` may additionally contain virtual logical aliases, such as top-level `StructSpec` entries used for Arrow/Parquet schema roundtrips; these aliases are not stored columns and do not appear in `CTable.col_names`. ✅ + +--- + +## API behavior + +## Column access ✅ implemented + +Allow both: + +- `t["trip.begin.lon"]` ✅ +- `t.trip.begin.lon` (via lightweight namespace proxy objects) ✅ (`_NestedColumnNamespace`; `_StructPathColumn` is used for struct-prefix virtual access such as `t["trip"]`) + +`Column` operations on scalar leaves behave exactly like current top-level scalar columns. ✅ + +## Row materialization ✅ implemented + +- `t[i]` reconstructs nested dict/list shape from leaves and list payload columns. ✅ +- Top-level unnamed field (`""`) is handled as root container. ✅ + +## Select/projection ✅ implemented + +`select([...])` accepts: + +- leaf paths (`"trip.begin.lon"`) ✅ +- struct prefix (`"trip.begin"`) that expands to descendant leaves ✅ + +## Expressions ✅ implemented + +`where("trip.begin.lon > -87.7 and payment.fare > 10")` supported by path rewriting to operand IDs or canonical flat leaf names. ✅ + +--- + +## Implementation plan + +## Phase 0 — design/compat scaffolding + +1. ✅ Path splitting/joining helpers (`_column_name_to_relpath` + inverse in schema metadata). +2. ✅ New schema metadata version (`schema.metadata["nested"]` with `version` key; backward-compatible read of old flat schemas). +3. ⚠️ Feature flag (internal) to enable nested physical layout for new tables — not a separate flag; nested layout is activated implicitly when the input schema contains struct fields. + +## Phase 1 — schema compilation flattening + +1. ✅ Schema compiler flattens nested structs into physical leaf columns (`schema_compiler.py`, `_flatten_arrow_struct_schema`). +2. ✅ Nested path mapping kept for reconstruction/export (`logical_to_physical`, `physical_to_storage`, optional root alias, and `reconstruct_rows` in nested metadata). 
Leaf type details remain in normal schema column specs. +3. ✅ Deterministic flat column keys — canonical dotted form used throughout. +4. ✅ Nullable propagation rules explicit (propagated from parent struct nullability). + +## Phase 2 — storage backend + +1. ✅ `ctable_storage` create/open accept hierarchical column paths. +2. ✅ Arrays stored in `/_cols///...` hierarchy. +3. ✅ Reopen logic uses stored schema column names and maps dotted names back to hierarchical `_cols/...` paths. +4. ~~Migration-safe fallback for legacy flat `_cols/` tables~~ — **skipped**: no code ever shipped writing dotted names as flat paths, so no migration is needed. + +## Phase 3 — read/write data paths + +1. ✅ `append`/`extend` flatten input nested dicts into leaf columns (`_flatten_nested_dict`, updated `_normalize_row_input` and `extend`). +2. ✅ `__getitem__(int)` and row iterators reconstruct nested rows (`_materialize_row`, `reconstruct_rows` flag). +3. ✅ Fast-path for already-flat rows preserved. + +## Phase 4 — column resolution and expression engine + +1. ✅ Column resolver from dotted path string → physical leaf column. +2. ✅ Attribute path proxy `t.trip.begin.lon` via `_StructPathColumn`. +3. ✅ Expression parsing includes nested leaves (`_where_expression_operands`). +4. ✅ List/object leaf expressions restricted appropriately in phase 1. + +## Phase 5 — indexes and analytics + +1. ✅ `create_index(col_name="trip.begin.lon")` works on scalar leaves. +2. ✅ Index catalog uses canonical dotted target path. +3. ✅ Aggregates (`mean`, `sum`, `min`, `max`, `std`) and `sort_by` work on resolved leaf NDArrays. + +## Phase 6 — Arrow/Parquet import/export + +1. ✅ Import: nested Arrow schema flattened into leaf storage + nested metadata (`from_arrow`, `_flatten_arrow_struct_*`). +2. ✅ Export: Arrow nested schema rebuilt from leaves (`to_arrow`, `to_parquet` reconstruct struct hierarchy). +3. ✅ Dictionary/timestamp/null semantics unchanged. + +## Phase 7 — docs/tests/perf + +1. 
Tests: + - ✅ Append/reopen/roundtrip for nested rows (`tests/ctable/test_nested_append.py`, `test_nested_access_storage.py`). + - ✅ `where`/`select`/`index`/`aggregate` on nested scalar leaves (covered in existing ctable test suite). + - ✅ Compatibility: legacy flat tables still pass all tests. + - ✅ Path parsing and escaping tests for literal `.` and `/` in nested Arrow field names (`tests/ctable/test_nested_access_storage.py`). +2. Docs: + - ✅ Nested path syntax, column access, filtering, Arrow/Parquet roundtrip (`doc/reference/ctable.rst`, "Nested fields" section). + - ✅ Method-level docstrings updated: `append`, `extend`, `__getitem__`, `where`, `select`, `rename_column`, `create_index`, `sort_by`, `from_arrow`, `from_parquet`. +3. Benchmarks: + - ✅ Nested leaf filter/index performance vs flat columns (`bench/ctable/bench_nested_filter_index.py`); overhead is negligible. + +--- + +## Compatibility and migration + +1. ✅ Existing tables remain readable/writable as-is. +2. ✅ Nested layout activated automatically when schema contains struct fields. +3. Optional utility later: migrate legacy nested-varlen columns to flattened-leaf layout (see Future work). + +--- + +## Acceptance criteria ✅ all met + +1. ✅ Can ingest taxi-like schema and persist leaves under hierarchical `_cols/...` paths. +2. ✅ `t["trip.begin.lon"].mean()` works and matches Arrow/Awkward reference. +3. ✅ `t.where("payment.fare > 20").nrows` works. +4. ✅ `t.create_index(col_name="trip.begin.time")` works for scalar leaf. +5. ✅ `t[i]` returns nested row shape equivalent to input schema. +6. ✅ Existing non-nested/legacy tables keep current behavior unchanged. + +--- + +## Future work + +### FW-1 — Field-name escaping for literal `.` and `/` + +**Status**: implemented. + +Logical nested paths use unescaped `.` as the separator. Literal `.`, `/`, and `\\` +inside a field-name segment are represented with backslash escaping in the logical +column name, e.g. 
Arrow path segments `("trip.info", "begin/point", "lon.deg")`
+become `trip\\.info.begin\\/point.lon\\.deg`.
+
+Physical storage percent-encodes structural characters inside each path segment before
+joining segments under `_cols`, e.g. the same leaf is stored at
+`_cols/trip%2Einfo/begin%2Fpoint/lon%2Edeg`.
+
+### FW-2 — List-struct analytics (explode / unnest)
+
+**Status**: deferred (non-goal for phase 1).
+
+`list<struct<...>>` fields are currently stored as opaque list-payload columns. Future
+work would:
+
+- Define an `explode` operation that creates a row-per-element view.
+- Enable `where` / `create_index` on paths inside list elements.
+- Design SQL-style unnest semantics.
+
+### FW-3 — Migration utility for legacy nested-varlen tables
+
+**Status**: deferred; likely unnecessary unless user demand appears.
+
+Because `CTable` is newly released, few if any production tables are expected to exist
+with top-level Arrow `struct<...>` columns imported as opaque `blosc2.struct` varlen
+columns. Existing tables remain readable as-is, but they will not automatically gain
+nested-leaf analytics.
+
+Recommended path: re-import the original Arrow/Parquet source with a python-blosc2
+version that supports nested-leaf flattening. This creates the new physical leaf layout
+and nested metadata directly.
+
+A future `CTable.migrate_nested_columns()` utility could still be considered if users
+have important legacy tables without access to the original source data. Such a utility
+would need to:
+
+- Detect columns whose schema spec is `struct` with a known logical type.
+- Re-import/materialize them as flattened leaf columns.
+- Update schema metadata and physical layout atomically.
+- Leave `list<struct<...>>` migration out of scope until list-struct analytics are
+  designed separately.
diff --git a/plans/ctable-separate-nested-cols.md b/plans/ctable-separate-nested-cols.md
new file mode 100644
index 00000000..edbcfd33
--- /dev/null
+++ b/plans/ctable-separate-nested-cols.md
@@ -0,0 +1,686 @@
+# CTable separate nested columns for list-struct data
+
+## Summary
+
+Extend CTable nested storage so Arrow/Parquet datasets that are physically stored
+as an unnamed top-level `list<struct<...>>` can be imported as a normal CTable
+whose rows are the **elements of that root list** and whose struct leaves are
+ordinary nested CTable columns.
+
+This is especially important for Awkward-style Parquet files such as Chicago
+taxi, whose top-level schema is effectively:
+
+```text
+"": list<struct<trip: struct<...>,
+ payment: struct<...>,
+ company: ...
+>>
+```
+
+For this case, the outer unnamed list is treated as a physical/chunking artifact
+of the Parquet encoding, not as a semantic CTable column. The imported table
+should look and behave like:
+
+```python
+ct["trip.begin.lon"]
+ct["payment.fare"]
+ct.where("payment.fare > 20")
+ct.nrows == total_number_of_root_list_elements
+```
+
+No user-facing `column_0` and no required `ct.explode()` for this root-list case.
+
+The mental model is:
+
+```text
+unnamed list<struct<...>> -> root record stream -> regular nested CTable rows
+```
+
+Named `list<struct<...>>` fields inside an otherwise normal parent table remain
+typed `ListArray` columns by default. Future `explode()` support can expose
+those named repeated fields as element-row views when parent/child analytics are
+needed.
+
+---
+
+## Relationship to existing nested-field work
+
+`plans/ctable-nested-fields.md` already covers:
+
+- logical dotted paths;
+- escaping literal `.`, `/`, and `\\`;
+- physical hierarchical `_cols/...` storage paths;
+- top-level `struct<...>` flattening into leaf columns;
+- nested row reconstruction for scalar struct leaves;
+- Arrow/Parquet schema roundtrips for top-level structs.
+ +This plan extends that machinery to the special and common case where the whole +Parquet file is an unnamed top-level `list>` record stream. + +--- + +## Goals + +1. Import a single unnamed top-level `list>` as a regular CTable row + stream, with the list elements becoming CTable rows. +2. Physically store scalar leaves of the element struct as separate CTable + columns, typically NDArrays or existing typed CTable column kinds. +3. Preserve nested logical field paths, e.g. `trip.begin.lon`, `payment.fare`. +4. Avoid `column_0` in the user-facing API for unnamed root-list datasets. +5. Keep named `list>` fields as typed `ListArray` columns by default. +6. Store enough provenance metadata to explain that an unnamed root list was + flattened, without requiring exact original Parquet row grouping roundtrip. +7. Make separated nested-column import the default for Parquet inputs that qualify, + with explicit opt-out for schema-fidelity workflows. + +## Non-goals for first implementation + +1. Exact reconstruction of the original Parquet row grouping for unnamed root + lists. +2. In-place migration of existing opaque `ListArray` list-struct columns. +3. Full `explode()` / SQL-style unnesting for named repeated fields. +4. Recursive flattening of nested lists inside element structs. +5. Making Awkward Array a dependency. + +--- + +## Core distinction: root record stream vs named repeated field + +### Case 1: single unnamed top-level `list>` + +Input schema: + +```text +"": list, payment: struct<...>, company: ...>> +``` + +Interpretation: + +- The unnamed top-level list is a physical container/chunking artifact. +- Its elements are the logical records. +- The element struct is the logical root schema. +- The imported CTable row count is the total number of list elements, not the + number of original Parquet rows. 
+ +User-facing result: + +```text +trip.sec +trip.begin.lon +trip.begin.lat +trip.begin.time +trip.end.lon +trip.end.lat +trip.end.time +trip.path # nested list inside element; kept as a ListArray initially +payment.fare +payment.tips +payment.total +payment.type +company +``` + +Example: + +```python +ct = blosc2.CTable.from_parquet("chicago-taxi.parquet", separate_nested_cols=True) +ct["trip.begin.lon"].mean() +ct.where("payment.fare > 20") +``` + +No `ct.explode()` is needed because `ct` is already in the element row space. + +### Case 2: named `list>` inside a parent table + +Input schema: + +```text +user_id: int64 +events: list> +``` + +Interpretation: + +- Parent rows are semantically meaningful. +- `user_id` has one value per parent row. +- `events` has one list per parent row. + +Default representation: + +```text +user_id: NDArray +events: ListArray(list(struct(...))) +``` + +This requires no separate parent-offset metadata for ordinary CTable use: + +```python +ct["user_id"] +ct["events"] +``` + +Offsets only become important if/when a future `ct.explode("events")` view is +implemented and needs to map event elements back to parent rows. + +--- + +## Proposed metadata + +For unnamed-root flattening, store provenance metadata. This is proposed shape, +not final schema: + +```json +{ + "nested": { + "version": 2, + "original_root": { + "kind": "unnamed_list_struct", + "field_name": "", + "preserve_grouping": false + } + } +} +``` + +Meaning: + +- `kind = "unnamed_list_struct"`: source had an unnamed top-level list of struct. +- `field_name = ""`: canonical Arrow root field name. +- `preserve_grouping = false`: original Parquet row/list grouping is not part of + the logical CTable model and is not guaranteed to roundtrip exactly. 
+ +Future optional metadata if exact grouping is requested: + +```json +{ + "original_root": { + "kind": "unnamed_list_struct", + "field_name": "", + "preserve_grouping": true, + "offsets": "_root._offsets", + "valid": "_root._valid" + } +} +``` + +But first implementation should not store original offsets by default. + +--- + +## Physical storage model for unnamed root list + +Given: + +```text +"": list, + path: list> + >, + payment: struct, + company: dictionary +>> +``` + +Store scalar struct leaves as ordinary CTable physical columns: + +```text +/_cols/trip/sec +/_cols/trip/begin/lon +/_cols/trip/begin/lat +/_cols/trip/begin/time +/_cols/payment/fare +/_cols/payment/tips +/_cols/payment/total +/_cols/company +``` + +Nested list fields inside the element struct remain typed list columns in phase +1: + +```text +/_cols/trip/path +``` + +where `trip.path` is a `ListArray` with one cell per logical trip row. + +All visible columns in the imported CTable have the same row count: + +```text +nrows == total number of elements in the unnamed root list +``` + +Leaf types may be: + +- fixed-width numeric/bool/timestamp NDArrays; +- dictionary columns; +- variable-length scalar columns (`vlstring`, `vlbytes`); +- typed `ListArray` columns for nested list fields; +- `ObjectArray` only as fallback for unsupported/heterogeneous data. + +--- + +## Named list-struct fields: ListArray vs ObjectArray + +For named `list>` fields, prefer typed `ListArray` by default: + +```text +events: ListArray(spec=list(struct({"time": timestamp(...), "amount": float64()}))) +``` + +Reasons: + +- Preserves Arrow logical type better than schema-less objects. +- Keeps field/type metadata available for future `explode()`. +- Roundtrips to Arrow/Parquet more naturally. +- Supports both `serializer="msgpack"` and `serializer="arrow"` tradeoffs. 
+ +Use `ObjectArray` only as fallback when: + +- the Arrow type is unsupported by typed `ListArray`; +- the list contents are heterogeneous; +- item schema cannot be represented by `ListSpec`; +- the user explicitly requests object fallback. + +--- + +## Import behavior + +### Phase A: default import with opt-out + +The feature started as opt-in, but is now enabled by default for +`CTable.from_parquet()` and `parquet-to-blosc2` when the Parquet schema qualifies +as a single unnamed root `list>`. The same `separate_nested_cols` +default also lets ordinary top-level Arrow/Parquet `struct<...>` fields follow +`CTable.from_arrow()` semantics and flatten recursively into dotted leaf columns +without changing row cardinality: + +```text +CTable.from_parquet(...) +parquet-to-blosc2 input.parquet output.b2d +``` + +Opt out when closer fidelity to the original Parquet row/schema shape is desired: + +```text +CTable.from_parquet(..., separate_nested_cols=False) +parquet-to-blosc2 ... --no-separate-nested-cols +``` + +`CTable.from_arrow(..., separate_nested_cols=True)` remains available for direct +Arrow inputs. Named list fields, including named `list>`, remain +typed `ListArray` columns by default. + +### Phase B: eligibility for root flattening + +Root flattening applies when: + +1. the Arrow schema has exactly one top-level field; +2. the top-level field name is `""` or is otherwise known to be the canonical + unnamed root; +3. the top-level field type is `list>` or `large_list>`. + +When all conditions hold, flatten `array.values` (the struct element array) into +CTable columns and use `len(array.values)` as the CTable row count. + +### Phase C: import algorithm for unnamed root + +1. Read Arrow list array/chunked array. +2. For each batch/chunk, access the flattened element struct array via + `list_array.values`. +3. Recursively flatten struct fields into leaf arrays. +4. Create/append CTable columns for each leaf. +5. 
For nested list fields inside the element struct, create/append typed + `ListArray` columns with one list cell per element row. +6. Avoid `to_pylist()` for scalar leaves whenever possible. +7. Store `original_root` provenance metadata. + +The original top-level list offsets do not need to be stored by default. + +--- + +## Row access and logical API + +For unnamed-root flattening, `ct[i]` returns a row representing one element of +the original root list: + +```python +row = ct[i] +row.trip["begin"]["lon"] +row.payment["fare"] +``` + +Column access is ordinary nested CTable access: + +```python +ct["trip.begin.lon"] +ct.trip.begin.lon +ct["payment.fare"] +``` + +Filtering and analytics operate directly: + +```python +ct.where("payment.fare > 20") +ct["trip.begin.lon"].mean() +ct.select(["trip.begin", "payment.fare"]) +``` + +No `column_0` and no required `explode()` for this case. + +--- + +## Arrow/Parquet export behavior + +Exact reproduction of the original unnamed `list>` Parquet row +layout is not a goal by default. Blosc2 and Parquet have different storage +models; import/export should preserve the logical data decently rather than +promise byte- or schema-shape-exact Parquet roundtrips. + +Default export may write the clean logical table: + +```text +trip: struct<...> +payment: struct<...> +company: ... +``` + +rather than wrapping rows back into an unnamed top-level `list>`. + +A future compatibility option could preserve and re-emit the original root-list +row grouping, but only if a concrete user need appears. If added, original +offsets/validity would need to be stored at import time. + +--- + +## Future `explode()` semantics for named repeated fields + +`explode()` remains useful for named list fields inside parent tables, but is not +required for unnamed-root record streams. 
+ +Example future API: + +```python +events = ct.explode("events") +events["time"] +events["amount"] +events["_parent"] # optional parent row index +events["_ordinal"] # optional position inside parent list +``` + +This is a logical view over a repeated field and changes row granularity from +parent rows to element rows. It may require offsets or a generated parent-index +array. This is deferred until after root record stream flattening is working. + +--- + +## Storage and CTable integration + +### TreeStore / nested CTable compatibility + +A CTable with separated nested columns must remain self-contained when stored as +an object/subtree inside a `TreeStore`, including compact `.b2z` stores. All +physical leaves, indexes, and metadata must live under the CTable root and be +addressed relative to that root: + +```text +/some_table/_meta +/some_table/_valid_rows +/some_table/_cols/trip/sec +/some_table/_cols/trip/begin/lon +/some_table/_cols/payment/fare +``` + +Opening `/some_table` as a regular CTable should reconstruct the same logical +schema and expose the same APIs (`ct[i]`, `ct.where(...)`, `to_arrow()`) without +requiring state outside the CTable subtree. Reopen logic should continue to rely +on the CTable schema/manifest rather than scanning arbitrary outer TreeStore +children. + +For `.b2z`, direct-offset/open behavior must work for all separated nested +leaves, just like current hierarchical `_cols/...` CTable leaves. + +### Schema representation + +Recommended for unnamed-root flattening: + +- `CompiledSchema.columns` contains the physical, user-visible element-row leaf + columns. +- `CTable.col_names` contains logical nested paths such as `trip.begin.lon` and + `payment.fare`. +- `metadata["nested"]["original_root"]` records that these columns came from an + unnamed top-level list of struct. +- There are no user-visible `_offsets` / `_valid` columns by default. 
+ +--- + +## Indexing + +For unnamed-root flattened tables, indexes work like normal CTable indexes: + +```python +ct.create_index("payment.fare") +ct.where("payment.fare > 20") +ct.create_index("trip.begin.time") +``` + +For named repeated fields, element-level indexes should be deferred until +`explode()` semantics are implemented. + +--- + +## Implementation phases + +### Phase 0 — design scaffolding + +- [x] Define `original_root` provenance metadata. +- [x] Add helpers to detect a single unnamed top-level `list>` schema. +- [x] Add helpers to flatten Arrow `ListArray.values` struct arrays into leaf arrays. + +### Phase 1 — unnamed-root record stream import + +- [x] Implement `separate_nested_cols=True` support for single unnamed top-level + `list>`; make it the default for `CTable.from_parquet()` and the CLI. +- [x] Import element struct leaves as normal nested CTable columns. +- [x] Keep nested list fields inside the element struct as typed `ListArray` columns. +- [x] Avoid `to_pylist()` for scalar leaves; fixed-width leaves use the Arrow → NumPy path. +- [x] Set `ct.nrows` to the total element count. +- [x] Store `original_root` provenance metadata. +- [x] Add `CTable.from_parquet(max_rows=...)`; for unnamed-root imports the limit + applies to flattened element rows. + +Acceptance tests: + +- [x] Simple unnamed `list>` imports to dotted CTable columns. +- [x] Chicago taxi-style sample imports without `column_0` via `CTable.from_parquet()` + and `parquet-to-blosc2`. +- [x] `CTable.from_parquet(..., max_rows=N)` limits ordinary rows and flattened + unnamed-root element rows. +- [x] `ct.where("payment.fare > 20")` works directly. +- [x] `ct["trip.begin.lon"].mean()` works directly. +- [x] Reopen persistent `.b2d` / `.b2z`. +- [x] `to_arrow()` emits a clean logical nested table. +- [x] CLI `--no-separate-nested-cols` preserves ordinary top-level structs as + singleton-list columns for closer schema fidelity. 
+- [x] CLI default `--separate-nested-cols` flattens ordinary top-level structs into
+  dotted columns consistently with `CTable.from_arrow()`.
+
+### Phase 2 — nested list children inside root elements
+
+- [x] Ensure fields like `trip.path: list<struct<...>>` become typed `ListArray`
+  columns with one cell per element row.
+- [x] Support `serializer="msgpack"` and `serializer="arrow"` for these list
+  columns.
+- [x] Add fast Arrow import path for Arrow-serialized list columns via
+  `ListArray.extend_arrow()`, avoiding Python object materialization.
+- [x] Make Arrow the default list serializer for Parquet imports in both
+  `CTable.from_parquet()` and `parquet-to-blosc2`; msgpack remains available for
+  read-time PyArrow independence.
+- [x] Add serializer-aware batching defaults for the CLI: Arrow uses the sampled
+  flattened Parquet-batch scale, while msgpack uses
+  `compute_chunks_blocks(estimated_nrows).blocks[0]` to avoid giant Python object
+  payloads.
+- [x] Expose `items_per_block` in `BatchArray.info` and `ListArray.info` so the
+  internal block-size heuristic is visible when tuning compression/random access.
+- [x] Retune `BatchArray._guess_blocksize()` cache-budget tiers so default
+  `clevel=5` uses `L2 / 2` instead of L1-sized blocks, improving compression for
+  Arrow IPC payloads while keeping blocks smaller than full-batch `clevel=6+`
+  behavior.
+- [ ] Add regression tests for `items_per_block` appearing in `.info` output.
+- [ ] Add compression/lookup microbenchmarks for Arrow `ListArray` block-size
+  tuning on Chicago taxi-style list-struct payloads.
+
+### Phase 3 — named repeated field explode (future)
+
+- [ ] Add `ct.explode("events")` for named list fields if needed.
+- [ ] Expose element leaf columns and optional `_parent`, `_ordinal`.
+- [ ] Support `where`, aggregates, and sorting on exploded scalar leaves.
+
+### Phase 4 — parent predicates (future)
+
+- [ ] Add `where_any()` and `where_all()` for named repeated fields if there is user
+  demand. 
+- [ ] Map element masks back to parent masks using offsets/parent-index arrays. + +### Phase 5 — recursive repeated groups (future) + +- [ ] Consider recursively flattening nested repeated fields inside element structs. +- [ ] Example: `trip.path.londiff` in Chicago taxi. +- [ ] This requires nested row-space semantics and should be designed separately. + +--- + +## Profiling and tuning notes + +Recent profiling on: + +```bash +parquet-to-blosc2 chicago-taxi.parquet chicago-taxi.b2d \ + --overwrite --separate-nested-cols --max-rows 200_000 +``` + +showed that the old msgpack list serializer spends most of its time in the +list-column conversion path: + +- `CTable._write_arrow_batch()` dominated the import path. +- Inside that function, `arrow_col.to_pylist()` for the nested list column took + about 88% of the function time for the profiled Chicago taxi import. +- Fixed-width scalar leaves were already using the Arrow → NumPy path via + `_arrow_column_to_numpy()`, so the main Python-object materialization issue was + the nested `ListArray` column, not all columns. + +Using Arrow serialization for nested list columns avoids this conversion. This +is now the default for Parquet imports; pass `--list-serializer msgpack` only when +read-time PyArrow independence is more important than import speed: + +```bash +parquet-to-blosc2 chicago-taxi.parquet chicago-taxi.b2d \ + --overwrite --separate-nested-cols --max-rows 200_000 +``` + +Observed result on the 200k-row sample: + +- msgpack list serializer: about 6.1 s import time, 12.5 MB output. +- arrow list serializer: about 0.6 s import time, 14.7 MB output. + +Arrow-serialized `ListArray`/`BatchArray` payloads are still compressed by Blosc2 +as serialized byte payloads, so `BatchArray` keeps `typesize=1` by default. +Experiments with this Chicago taxi `trip.path` payload showed `typesize=1` was +also the best choice empirically. + +The more important tuning parameter was internal `items_per_block`. 
The old
+`clevel=5` heuristic used an L1-sized budget and produced small blocks (for this
+case, around 804 items/block), which compressed poorly. Retuning the heuristic
+to use `L2 / 2` for `clevel` 4–6 produced much larger but still sub-batch blocks
+(for this case, around 51k items/block), improving the `trip.path` cratio from
+about 4.95 to about 12.0 with only a small copy-time increase.
+
+Current `BatchArray._guess_blocksize()` policy:
+
+- `clevel` 1–3: L1 data-cache budget.
+- `clevel` 4–6: half the L2 cache budget.
+- `clevel` 7–8: full L2 cache budget.
+- `clevel` 9: full batch.
+
+Open follow-ups:
+
+- Add tests around the new `.info` fields and block-size heuristic.
+- Benchmark random lookup latency versus compression ratio for different
+  `items_per_block` values on Arrow list-struct payloads.
+- Keep the read-time PyArrow requirement for Arrow-serialized list columns documented
+  in the `CTable.from_parquet()` docstring and CLI `--list-serializer` help.
+
+---
+
+## Resolved design decisions
+
+1. Use the name `separate_nested_cols` for this behavior/API surface. It better
+   describes the general physical goal: nested fields become separate physical
+   CTable columns where possible.
+2. For qualifying schemas, unnamed-root list flattening is automatic by default:
+   - exactly one top-level field;
+   - field name is the canonical unnamed root `""`;
+   - field type is `list<struct<...>>` or `large_list<struct<...>>`.
+
+   Rationale: for these files, the outer list is a physical Parquet encoding
+   artifact rather than a meaningful user column. Separating the element struct
+   leaves produces a more natural CTable, improves analytics, and should usually
+   improve compression for scalar leaves because each leaf is compressed with its
+   own dtype/codec path. Users can opt out with `separate_nested_cols=False` or
+   `--no-separate-nested-cols` when closer fidelity to the original Parquet schema
+   is desired.
+3. 
Store provenance metadata by default, but do not store original root offsets
+   by default. Exact original Parquet root grouping is considered a low-priority
+   compatibility feature, not part of the normal CTable/Parquet interchange contract.
+4. `to_parquet()` should emit a clean logical nested table by default, e.g.
+   `trip: struct<...>`, `payment: struct<...>`, `company: ...`, not a re-wrapped
+   unnamed `list` with arbitrary grouping.
+5. Do not silently fall back to `ObjectArray` for unsupported nested children.
+   Raise by default; use `object_fallback=True` for explicit ObjectArray fallback.
+
+---
+
+## Current status and remaining work
+
+The first milestone is implemented: unnamed-root record stream flattening for one
+top-level `list<struct<...>>` column supports:
+
+```python
+ct = blosc2.CTable.from_parquet(
+    "chicago-taxi.parquet",
+    separate_nested_cols=True,
+)
+
+ct["payment.fare"].mean()
+ct.where("payment.fare > 20")
+ct["trip.begin.lon"].mean()
+```
+
+This is now the default for `CTable.from_parquet()` and `parquet-to-blosc2` for
+qualifying unnamed-root `list<struct<...>>` Parquet files. Pass
+`separate_nested_cols=False` in the library API, or `--no-separate-nested-cols`
+in the CLI, when preserving the original Parquet row/schema shape is more
+important than the separated column layout.
+
+Implemented beyond the original first milestone:
+
+- ordinary top-level structs flatten into dotted columns by default in the CLI;
+- `parquet-to-blosc2 --progress` is opt-in and reports ETA for unnamed-root
+  imports;
+- unnamed-root CLI imports write one flattened Parquet batch at a time, capped by
+  `MAX_ELEMENT_WRITE_BATCH`;
+- CLI summary output distinguishes unnamed-root row flattening from general
+  nested-column separation and reports serializer-aware batching choices;
+- Arrow is the default list serializer for Parquet imports, with msgpack still
+  available explicitly;
+- Arrow/msgpack use different default BatchArray sizes to match their memory
+  behavior. 
+ +Remaining work: + +- `ct.explode()` and parent/element mapping for named repeated fields; +- recursive flattening of nested repeated fields such as `trip.path.londiff`; +- tests and benchmarks for `.info` block-size fields, `items_per_block` tuning, + compression ratio, and random lookup latency. diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index de9ed022..8a587c06 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -18,13 +18,7 @@ import numpy as np -_HAS_NUMBA = False -try: - import numba - - _HAS_NUMBA = True -except ImportError: - pass +_HAS_NUMBA = importlib.util.find_spec("numba") is not None # Do the platform check once at module level IS_WASM = platform.machine() == "wasm32" # IS_WASM = True # for testing (comment this line out for production) @@ -736,10 +730,12 @@ def _raise(exc): where, ) from .schema import ( + DictionarySpec, bool, bytes, complex64, complex128, + dictionary, field, float32, float64, @@ -785,6 +781,8 @@ def _raise(exc): "bytes", "complex64", "complex128", + "dictionary", + "DictionarySpec", "field", "float32", "float64", diff --git a/src/blosc2/batch_array.py b/src/blosc2/batch_array.py index 8992f161..72c435a9 100644 --- a/src/blosc2/batch_array.py +++ b/src/blosc2/batch_array.py @@ -49,6 +49,8 @@ def __init__(self, parent: BatchArray, nbatch: int, lazybatch: bytes) -> None: self._items: list[Any] | None = None self._cached_block_index: int | None = None self._cached_block: list[Any] | None = None + self._cached_block_column_index: int | None = None + self._cached_block_column = None self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazybatch) def _normalize_index(self, index: int) -> int: @@ -74,6 +76,17 @@ def _get_block(self, block_index: int) -> list[Any]: self._cached_block = block return block + def _get_block_item(self, block_index: int, item_index: int) -> Any: + if self._cached_block_index == block_index and self._cached_block is not None: + return 
self._cached_block[item_index] + if self._parent._serializer != "arrow": + return self._get_block(block_index)[item_index] + if self._cached_block_column_index != block_index or self._cached_block_column is None: + payload = self._parent.schunk.get_vlblock(self._nbatch, block_index) + self._cached_block_column = self._parent._deserialize_arrow_block_column(payload) + self._cached_block_column_index = block_index + return self._cached_block_column[item_index].as_py() + def __getitem__(self, index: int | slice) -> Any | list[Any]: if isinstance(index, slice): items = self._decode_items() @@ -87,9 +100,8 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: block_index, item_index = divmod(index, items_per_block) if block_index >= self._nblocks: raise IndexError("Batch index out of range") - block = self._get_block(block_index) try: - return block[item_index] + return self._get_block_item(block_index, item_index) except IndexError as exc: raise IndexError("Batch index out of range") from exc items = self._decode_items() @@ -593,11 +605,17 @@ def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: raise ValueError("BatchArray entries cannot be empty") clevel = self.cparams.clevel + # For serialized batch payloads, especially Arrow IPC, L1-sized blocks are often + # too small for codecs like Zstd to exploit cross-row redundancy. Use larger + # cache-budget tiers as clevel increases, while avoiding full L2 blocks at the + # default clevel to keep random access reasonably granular. 
if clevel == 9: return len(payload_sizes) - if 0 < clevel <= 5: + if 0 < clevel <= 3: budget = blosc2.cpu_info.get("l1_data_cache_size") - elif 5 < clevel < 9: + elif 3 < clevel <= 6: + budget = blosc2.cpu_info.get("l2_cache_size") // 2 + elif 6 < clevel < 9: budget = blosc2.cpu_info.get("l2_cache_size") else: return len(payload_sizes) @@ -625,9 +643,12 @@ def _serialize_msgpack_block(self, items: list[Any]) -> bytes: return payload def _serialize_arrow_block(self, items) -> bytes: - pa, _ = self._require_pyarrow() + pa, pa_ipc = self._require_pyarrow() batch = pa.record_batch([items], schema=self._get_arrow_schema()) - payload = batch.serialize().to_pybytes() + sink = pa.BufferOutputStream() + with pa_ipc.new_stream(sink, batch.schema) as writer: + writer.write_batch(batch) + payload = sink.getvalue().to_pybytes() _check_serialized_size(payload) return payload @@ -639,16 +660,34 @@ def _serialize_block(self, items: Any) -> bytes: def _deserialize_msgpack_block(self, payload: bytes) -> list[Any]: return msgpack_unpackb(payload) - def _deserialize_arrow_block(self, payload: bytes) -> list[Any]: + def _deserialize_arrow_block_column(self, payload: bytes): pa, pa_ipc = self._require_pyarrow() - batch = pa_ipc.read_record_batch(pa.BufferReader(payload), self._get_arrow_schema()) - return batch.column(0).to_pylist() + try: + reader = pa_ipc.open_stream(pa.BufferReader(payload)) + batch = reader.read_next_batch() + except (pa.ArrowInvalid, OSError): + # Backward compatibility for older arrow-serializer blocks written + # as bare serialized RecordBatch payloads. Those cannot represent + # dictionary batches reliably, so new blocks use IPC streams. 
+ batch = pa_ipc.read_record_batch(pa.BufferReader(payload), self._get_arrow_schema()) + return batch.column(0) + + def _deserialize_arrow_block(self, payload: bytes) -> list[Any]: + return self._deserialize_arrow_block_column(payload).to_pylist() def _deserialize_block(self, payload: bytes) -> list[Any]: if self._serializer == "arrow": return self._deserialize_arrow_block(payload) return self._deserialize_msgpack_block(payload) + def _deserialize_arrow_block_item(self, payload: bytes, item_index: int) -> Any: + return self._deserialize_arrow_block_column(payload)[item_index].as_py() + + def _deserialize_block_item(self, payload: bytes, item_index: int) -> Any: + if self._serializer == "arrow": + return self._deserialize_arrow_block_item(payload, item_index) + return self._deserialize_msgpack_block(payload)[item_index] + def _vl_cparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.cparams) @@ -903,6 +942,7 @@ def info_items(self) -> list: return [ ("type", f"{self.__class__.__name__}"), ("serializer", self.serializer), + ("items_per_block", self.items_per_block), ("nbatches", nbatches_value), ("nblocks", nblocks_value), ("nitems", sum(batch_sizes)), diff --git a/src/blosc2/c2array.py b/src/blosc2/c2array.py index 9a849650..c662740d 100644 --- a/src/blosc2/c2array.py +++ b/src/blosc2/c2array.py @@ -15,7 +15,6 @@ from collections.abc import Sequence import numpy as np -import requests import blosc2 from blosc2.b2objects import encode_b2object_payload, make_b2object_carrier, write_b2object_payload @@ -31,6 +30,12 @@ """Default timeout for HTTP requests.""" +def _requests(): + import requests + + return requests + + @contextmanager def c2context( *, @@ -109,7 +114,7 @@ def _xget(url, params=None, headers=None, auth_token=None, timeout=TIMEOUT): if auth_token: headers = headers.copy() if headers else {} headers["Cookie"] = auth_token - response = requests.get(url, params=params, headers=headers, timeout=timeout) + response = _requests().get(url, 
params=params, headers=headers, timeout=timeout) response.raise_for_status() return response @@ -117,7 +122,7 @@ def _xget(url, params=None, headers=None, auth_token=None, timeout=TIMEOUT): def _xpost(url, json=None, auth_token=None, timeout=TIMEOUT): auth_token = auth_token or _subscriber_data["auth_token"] headers = {"Cookie": auth_token} if auth_token else None - response = requests.post(url, json=json, headers=headers, timeout=timeout) + response = _requests().post(url, json=json, headers=headers, timeout=timeout) response.raise_for_status() return response.json() @@ -132,7 +137,7 @@ def _sub_url(urlbase, path): def login(username, password, urlbase): url = _sub_url(urlbase, "auth/jwt/login") creds = {"username": username, "password": password} - resp = requests.post(url, data=creds, timeout=TIMEOUT) + resp = _requests().post(url, data=creds, timeout=TIMEOUT) resp.raise_for_status() return "=".join(list(resp.cookies.items())[0]) @@ -234,7 +239,7 @@ def __init__(self, path: str, /, urlbase: str | None = None, auth_token: str | N # Try to 'open' the remote path try: self.meta = info(self.path, self.urlbase, auth_token=self.auth_token) - except requests.HTTPError as err: + except _requests().HTTPError as err: raise FileNotFoundError(f"Remote path not found: {path}.\nError was: {err}") from err cparams = self.meta["schunk"]["cparams"] # Remove "filters, meta" from cparams; this is an artifact from the server diff --git a/src/blosc2/cli/parquet_to_blosc2.py b/src/blosc2/cli/parquet_to_blosc2.py index ba513c9b..8f5c2507 100644 --- a/src/blosc2/cli/parquet_to_blosc2.py +++ b/src/blosc2/cli/parquet_to_blosc2.py @@ -44,9 +44,10 @@ from typing import Any import blosc2 -from blosc2.schema_compiler import schema_to_dict +from blosc2.schema_compiler import _validate_column_name, schema_to_dict DEFAULT_BATCH_SIZE = 2048 +MAX_ELEMENT_WRITE_BATCH = 5_000_000 # cap on flattened elements yielded per write def require_pyarrow(): @@ -174,7 +175,14 @@ def build_parser() -> 
argparse.ArgumentParser: "--max-rows", type=int, default=None, - help="Maximum number of rows to import from the source parquet file; imports all rows by default.", + help=( + "Maximum number of CTable rows to import. " + "In normal mode this equals the number of Parquet rows read. " + "With separate nested columns enabled for an unnamed-root list> " + "file, the unit is list elements " + "(i.e. the number of rows in the resulting CTable), " + "not outer Parquet rows." + ), ) parser.add_argument( "--batch-size", @@ -186,7 +194,9 @@ def build_parser() -> argparse.ArgumentParser: "--blosc2-batch-size", type=int, default=None, - help="Rows grouped into each persisted BatchArray batch for imported Blosc2 varlen/list columns.", + help="Internal batch_rows for BatchArray/varlen columns in the imported CTable. " + "Defaults to the blocks value from blosc2.compute_chunks_blocks() based on " + "the estimated CTable row count.", ) parser.add_argument( "--blosc2-items-per-block", @@ -197,6 +207,16 @@ def build_parser() -> argparse.ArgumentParser: "Defaults to BatchArray's automatic heuristic." ), ) + parser.add_argument( + "--list-serializer", + choices=["msgpack", "arrow"], + default="arrow", + help=( + "Serializer for imported list columns. 'arrow' is the default and stores Arrow list " + "batches directly, which is much faster for deeply nested lists but requires PyArrow " + "when reading those columns later. Use 'msgpack' to avoid that read-time dependency." 
+ ), + ) parser.add_argument("--use-dict", action="store_true", help="Enable C-Blosc2 dictionary compression.") parser.add_argument( "--float-trunc-prec", @@ -236,7 +256,12 @@ def build_parser() -> argparse.ArgumentParser: "--batch-report-every", type=int, default=1, - help="Print progress every N batches; the final batch is always reported.", + help="With --progress, print progress every N batches; the final batch is always reported.", + ) + parser.add_argument( + "--progress", + action="store_true", + help="Print import progress lines. By default, only the import summary is shown.", ) parser.add_argument( "--profile", @@ -244,6 +269,29 @@ def build_parser() -> argparse.ArgumentParser: help="Run the selected operation under cProfile and print cumulative timing stats.", ) parser.add_argument("--overwrite", action="store_true") + parser.add_argument( + "--decode-dictionaries", + action="store_true", + help=( + "Decode Arrow dictionary-encoded columns to plain vlstring instead of preserving " + "the dictionary encoding. By default, supported dictionary columns " + "(string values with integer indices) are imported as Blosc2 dictionary columns." + ), + ) + parser.add_argument( + "--separate-nested-cols", + action=argparse.BooleanOptionalAction, + default=True, + dest="separate_nested_cols", + help=( + "Import nested columns as separate CTable columns where possible. " + "Top-level struct fields are flattened recursively into dotted leaf columns " + "(e.g. trip.begin.lon). For a single unnamed top-level list> " + "field (the Awkward Array / Chicago-taxi layout), flatten the outer list " + "so that each element becomes a CTable row. Enabled by default; use " + "--no-separate-nested-cols when closer Parquet schema fidelity is desired." 
+ ), + ) return parser @@ -272,11 +320,40 @@ def _release_arrow_temporaries(pa) -> None: pa.default_memory_pool().release_unused() +def ctable_column_name_map(schema) -> dict[str, str]: + """Return a mapping from Arrow field names to CTable-safe column names. + + Remaps invalid names (empty strings, names starting with '_', names + containing '/') to safe substitutes like ``column_0``. + """ + used: set[str] = set() + result: dict[str, str] = {} + for i, field in enumerate(schema): + original = field.name + try: + _validate_column_name(original) + candidate = original + except ValueError: + candidate = f"column_{i}" + if candidate in used: + base = candidate + suffix = 1 + while f"{base}_{suffix}" in used: + suffix += 1 + candidate = f"{base}_{suffix}" + used.add(candidate) + result[original] = candidate + return result + + def classify_columns( # noqa: C901 pa, schema, fixed_string_lengths: dict[str, int] | None = None, fixed_bytes_lengths: dict[str, int] | None = None, + *, + decode_dictionaries: bool = False, + separate_nested_cols: bool = True, ): """Classify Parquet schema columns into importable categories.""" fixed_cols: dict[str, object] = {} @@ -289,8 +366,14 @@ def classify_columns( # noqa: C901 for field in schema: t = field.type if pa.types.is_struct(t): - struct_wrap_cols[field.name] = pa.list_(t) - conversions[field.name] = {"conversion": "struct_wrapped_as_singleton_list"} + if separate_nested_cols: + # Let CTable.from_arrow() apply its normal struct flattening so + # top-level structs become dotted leaf columns. 
+ fixed_cols[field.name] = field + conversions[field.name] = {"conversion": "struct_flattened_to_columns"} + else: + struct_wrap_cols[field.name] = pa.list_(t) + conversions[field.name] = {"conversion": "struct_wrapped_as_singleton_list"} continue if pa.types.is_list(t) or pa.types.is_large_list(t): value_type = t.value_type @@ -299,6 +382,30 @@ def classify_columns( # noqa: C901 else: fixed_cols[field.name] = field continue + if pa.types.is_dictionary(t): + vt = t.value_type + if vt in (pa.string(), pa.large_string(), pa.utf8(), pa.large_utf8()): + if decode_dictionaries: + # Decode to plain vlstring. + fixed_cols[field.name] = pa.field( + field.name, pa.string(), nullable=field.nullable, metadata=field.metadata + ) + conversions[field.name] = { + "conversion": "dictionary_decoded_to_vlstring", + "ordered": bool(t.ordered), + } + else: + fixed_cols[field.name] = field + conversions[field.name] = { + "conversion": "dictionary_preserved", + "ordered": bool(t.ordered), + } + else: + conversions[field.name] = { + "conversion": "skipped", + "reason": f"unsupported dictionary value type: {vt}", + } + continue if pa.types.is_boolean(t): fixed_cols[field.name] = field if field.nullable: @@ -352,21 +459,41 @@ def build_import_schema( fixed_cols: dict, struct_wrap_cols: dict, timestamp_units: dict[str, str] | None = None, + column_name_map: dict[str, str] | None = None, ): """Build the Arrow schema passed to CTable.from_arrow().""" timestamp_units = timestamp_units or {} + column_name_map = column_name_map or {} fields = [] for field in original_schema: + ctable_name = column_name_map.get(field.name, field.name) if field.name in struct_wrap_cols: - fields.append(pa.field(field.name, struct_wrap_cols[field.name], nullable=True)) + fields.append(pa.field(ctable_name, struct_wrap_cols[field.name], nullable=True)) elif field.name in fixed_cols: unit = timestamp_units.get(field.name) if unit is not None: fields.append( - pa.field(field.name, pa.timestamp(unit, tz=field.type.tz), 
nullable=field.nullable) + pa.field(ctable_name, pa.timestamp(unit, tz=field.type.tz), nullable=field.nullable) ) else: - fields.append(field) + # Use the field from fixed_cols in case it was remapped (e.g. dict→string) + fc = fixed_cols[field.name] + if hasattr(fc, "type") and fc.type != field.type: + # fc has the remapped type; use ctable_name for the field name + fields.append( + pa.field( + ctable_name, + fc.type, + nullable=fc.nullable, + metadata=fc.metadata if fc.metadata else None, + ) + ) + elif ctable_name != field.name: + fields.append( + pa.field(ctable_name, field.type, nullable=field.nullable, metadata=field.metadata) + ) + else: + fields.append(field) return pa.schema(fields) @@ -614,11 +741,14 @@ def scan_string_and_bytes_lengths(pa, pf, args, schema) -> tuple[dict[str, int], def transform_batch( - pa, batch, selected_cols: list[str], struct_wrap_cols: dict, timestamp_units: dict[str, str] + pa, + batch, + selected_cols: list[str], + struct_wrap_cols: dict, + timestamp_units: dict[str, str], + import_schema=None, ): """Apply import-time Arrow conversions; pass everything else through.""" - if not struct_wrap_cols and not timestamp_units: - return batch arrays = list(batch.columns) for name, unit in timestamp_units.items(): idx = batch.schema.get_field_index(name) @@ -636,18 +766,34 @@ def transform_batch( continue arr = batch.column(idx) arrays[idx] = pa.array([[v] if v is not None else None for v in arr.to_pylist()], type=target_type) + if import_schema is not None: + # Cast / rename arrays to match import_schema (e.g. dict→string, renamed columns). 
+ for i, field in enumerate(import_schema): + if not arrays[i].type.equals(field.type): + arrays[i] = arrays[i].cast(field.type, safe=True) + return pa.record_batch(arrays, schema=import_schema) + if not struct_wrap_cols and not timestamp_units: + return batch return pa.record_batch(arrays, names=selected_cols) -def store_original_arrow_metadata(ct, original_schema, imported_schema, conversions: dict) -> None: +def store_original_arrow_metadata( + ct, original_schema, imported_schema, conversions: dict, column_name_map: dict | None = None +) -> None: + column_name_map = column_name_map or {} fields_meta = {} for field in original_schema: entry = conversions.get(field.name) if entry is None: continue entry = dict(entry) + ctable_name = column_name_map.get(field.name, field.name) + if ctable_name != field.name: + entry["ctable_name"] = ctable_name entry["original_arrow_type"] = str(field.type) - if field.name in imported_schema.names: + if ctable_name in imported_schema.names: + entry["ctable_arrow_type"] = str(imported_schema.field(ctable_name).type) + elif field.name in imported_schema.names: entry["ctable_arrow_type"] = str(imported_schema.field(field.name).type) fields_meta[field.name] = entry ct._schema.metadata = { @@ -692,6 +838,13 @@ def print_import_plan( fixed_bytes_cols = [ n for n, e in conversions.items() if e.get("conversion") in {"fixed_bytes", "fixed_bytes_nullable"} ] + dict_cols = [n for n, e in conversions.items() if e.get("conversion") == "dictionary_preserved"] + dict_decoded_cols = [ + n for n, e in conversions.items() if e.get("conversion") == "dictionary_decoded_to_vlstring" + ] + flattened_structs = [ + n for n, e in conversions.items() if e.get("conversion") == "struct_flattened_to_columns" + ] wrapped_structs = list(struct_wrap_cols) skipped = {n: e for n, e in conversions.items() if e.get("conversion") == "skipped"} print(f"Input: {input_path} ({input_path.stat().st_size / 1e6:.1f} MB)") @@ -699,14 +852,21 @@ def print_import_plan( 
print(f"CTable store: {ctable_store_kind(output_path)}") print(f"Rows: {pf.metadata.num_rows:,}") if args.max_rows is not None: - print(f"Rows to import: {min(args.max_rows, pf.metadata.num_rows):,}") + print(f"Rows to import: {min(args.max_rows, pf.metadata.num_rows):,} (Parquet rows)") print(f"Parquet columns: {len(parquet_schema)}") print(f"Imported columns: {len(fixed_cols) + len(struct_wrap_cols)}") - print(f" Fixed-width: {len(fixed_cols) - len(vlstring_cols) - len(vlbytes_cols)}") + n_fixed_non_string = ( + len(fixed_cols) - len(vlstring_cols) - len(vlbytes_cols) - len(dict_cols) - len(dict_decoded_cols) + ) + print(f" Fixed-width: {n_fixed_non_string}") print(f" Fixed strings: {len(fixed_string_cols)}") print(f" Fixed bytes: {len(fixed_bytes_cols)}") print(f" vlstring: {len(vlstring_cols)}") print(f" vlbytes: {len(vlbytes_cols)}") + print(f" Dictionary: {len(dict_cols)}") + if dict_decoded_cols: + print(f" Dict→vlstring: {len(dict_decoded_cols)}") + print(f" Struct→columns: {len(flattened_structs)}") print(f" Struct→list: {len(wrapped_structs)}") print(f" Nullable scalars: {len(nullable_scalars)}") print(f" Skipped unsupported: {len(skipped)}") @@ -720,6 +880,7 @@ def print_import_plan( print(f"Blosc2 batch size: {args.blosc2_batch_size:,}") if args.blosc2_items_per_block is not None: print(f"Blosc2 items/block: {args.blosc2_items_per_block:,}") + print(f"List serializer: {args.list_serializer}") print(f"Codec / level: {args.codec} / {args.clevel}") print(f"Use dict: {args.use_dict}") trunc_global = getattr(args, "float_trunc_prec_global", None) @@ -734,7 +895,7 @@ def print_import_plan( print() -def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols, timestamp_units): +def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols, timestamp_units, import_schema=None): rows_done = 0 t0 = time.perf_counter() total = pf.metadata.num_rows if args.max_rows is None else min(args.max_rows, pf.metadata.num_rows) @@ -749,14 +910,16 @@ def 
progress_batches(pa, pf, args, selected_cols, struct_wrap_cols, timestamp_un report_batch_mem = args.mem_report and batch_n % args.mem_every == 0 if report_batch_mem: memory_report(f"batch {batch_n} after parquet read", pa) - batch = transform_batch(pa, raw_batch, selected_cols, struct_wrap_cols, timestamp_units) + batch = transform_batch( + pa, raw_batch, selected_cols, struct_wrap_cols, timestamp_units, import_schema + ) if report_batch_mem: memory_report(f"batch {batch_n} after transform", pa) rows_done += len(batch) elapsed = time.perf_counter() - t0 rate = rows_done / elapsed if elapsed > 0 else 0.0 eta = (total - rows_done) / rate if rate > 0 else 0.0 - if batch_n % args.batch_report_every == 0 or rows_done >= total: + if args.progress and (batch_n % args.batch_report_every == 0 or rows_done >= total): print( f" batch {batch_n:4d} {rows_done:>12,}/{total:,} " f"{elapsed:7.1f}s {rate / 1e3:7.1f}k rows/s ETA {eta:6.0f}s", @@ -769,10 +932,246 @@ def progress_batches(pa, pf, args, selected_cols, struct_wrap_cols, timestamp_un memory_report(f"batch {batch_n} after ctable write", pa) +def _flatten_root_batches_with_progress( + pa, + pf, + inner_schema, + args, + capacity_hint=None, +): + """Yield flattened :class:`pyarrow.RecordBatch` objects from an unnamed-root Parquet file. + + Reads Parquet batches, flattens the outer ``list>`` column via + ``ListArray.flatten()``, and honours ``args.max_rows`` as an element-level + row limit. When ``args.progress`` is enabled, progress is printed per + Parquet batch according to ``args.batch_report_every``. + + Each flattened Parquet batch is yielded as a single write to CTable so that + the per-write Python/Arrow overhead is amortised over as many rows as + possible. Batches exceeding ``MAX_ELEMENT_WRITE_BATCH`` are split into + cap-sized chunks to bound memory usage. + """ + rows_done = 0 + max_rows = args.max_rows + t0 = time.perf_counter() + # total_str is the CTable-row (element) limit for the progress display. 
+ total_str = f"{max_rows:,} CTable rows" if max_rows is not None else "?" + # Use capacity_hint as the estimated total for ETA when max_rows is not set. + estimated_total = max_rows if max_rows is not None else capacity_hint + + for parquet_batch_n, raw_batch in enumerate( + pf.iter_batches(batch_size=args.parquet_batch_size), start=1 + ): + if max_rows is not None and rows_done >= max_rows: + break + + report_batch_mem = args.mem_report and parquet_batch_n % args.mem_every == 0 + if report_batch_mem: + memory_report(f"batch {parquet_batch_n} after parquet read", pa) + + list_array = raw_batch.column(0) + struct_values = list_array.flatten() # skips null outer-list rows + + if len(struct_values) == 0: + continue + + if max_rows is not None: + remaining = max_rows - rows_done + if len(struct_values) > remaining: + struct_values = struct_values.slice(0, remaining) + + # Yield the whole flattened batch as one write; split only when it + # exceeds MAX_ELEMENT_WRITE_BATCH to bound peak memory. 
+ n_elems = len(struct_values) + + elapsed = time.perf_counter() - t0 + rate = rows_done / elapsed if elapsed > 0 and rows_done > 0 else 0.0 + eta_str = ( + f" ETA {(estimated_total - rows_done) / rate:6.0f}s" + if rate > 0 and estimated_total is not None + else "" + ) + report_progress = parquet_batch_n % args.batch_report_every == 0 or ( + max_rows is not None and rows_done + n_elems >= max_rows + ) + n_writes = (n_elems + MAX_ELEMENT_WRITE_BATCH - 1) // MAX_ELEMENT_WRITE_BATCH + if args.progress and report_progress: + print( + f" parquet batch {parquet_batch_n:4d}: " + f"{n_elems:>12,} CTable rows -> {n_writes:,} write(s) " + f"done {rows_done:>12,}/{total_str} " + f"{elapsed:7.1f}s {rate / 1e3:7.1f}k rows/s{eta_str}", + flush=True, + ) + + for offset in range(0, n_elems, MAX_ELEMENT_WRITE_BATCH): + chunk = struct_values.slice(offset, min(MAX_ELEMENT_WRITE_BATCH, n_elems - offset)) + sub_batch = pa.RecordBatch.from_struct_array(chunk) + rows_done += len(sub_batch) + yield sub_batch + + if report_batch_mem: + memory_report(f"batch {parquet_batch_n} after flatten+write", pa) + + if max_rows is not None and rows_done >= max_rows: + break + + +def import_unnamed_root_separate_cols( + args, + input_path: Path, + output_path: Path, + pa, + pf, + parquet_schema, +) -> list[str]: + """Import an unnamed-root ``list>`` Parquet file with nested column separation. + + Each element of the unnamed root list becomes a CTable row. Struct leaves + are stored as separate physical columns with dotted logical paths such as + ``trip.begin.lon`` and ``payment.fare``. + + Returns the list of imported CTable column names. + """ + from blosc2.schema_compiler import schema_to_dict + + inner_schema = blosc2.CTable._inner_schema_for_unnamed_root(pa, parquet_schema) + total_parquet_rows = pf.metadata.num_rows if pf.metadata is not None else None + + # ------------------------------------------------------------------ + # Estimate total element count by sampling the first Parquet batch. 
+ # This is used as capacity_hint so that compute_chunks_blocks() picks + # chunk/block sizes proportional to the actual data volume rather than + # defaulting to (1, 1) when the element count is unknown. + # pf.iter_batches() creates a fresh iterator each call, so sampling + # here does not affect the import iterator created later. + # ------------------------------------------------------------------ + capacity_hint = None + estimated_batch_rows = None + if total_parquet_rows is not None and total_parquet_rows > 0: + try: + sample = next( + pf.iter_batches(batch_size=min(args.parquet_batch_size, total_parquet_rows)), + None, + ) + if sample is not None and len(sample) > 0: + n_outer_sampled = len(sample) + n_elems_sampled = len(sample.column(0).flatten()) + avg_per_outer_row = n_elems_sampled / n_outer_sampled + estimated_batch_rows = max(1, round(args.parquet_batch_size * avg_per_outer_row)) + estimate = round(total_parquet_rows * avg_per_outer_row) + if args.max_rows is not None: + estimate = min(estimate, args.max_rows) + capacity_hint = max(1, estimate) + except Exception: + pass # sampling failure is non-fatal; from_arrow falls back to _EXPECTED_SIZE_DEFAULT + + if args.blosc2_batch_size is None: + if args.list_serializer == "arrow": + # Arrow list storage appends incoming Arrow chunks directly, without + # materializing Python nested-list objects. Use the natural flattened + # Parquet-batch scale (about 1M rows for Chicago taxi), capped only for + # pathological batches, so the displayed BatchArray size matches the + # actual write granularity better than the absolute cap would. + args.blosc2_batch_size = min( + MAX_ELEMENT_WRITE_BATCH, + estimated_batch_rows if estimated_batch_rows is not None else MAX_ELEMENT_WRITE_BATCH, + ) + else: + # Msgpack list storage materializes nested Arrow list data as Python objects + # before serializing. 
Keep its internal BatchArray batch_rows at Blosc2's + # cache-tuned block granularity instead of the larger Arrow write scale. + if capacity_hint is not None: + _, blocks = blosc2.compute_chunks_blocks((capacity_hint,)) + args.blosc2_batch_size = max(1, blocks[0]) + else: + args.blosc2_batch_size = DEFAULT_BATCH_SIZE + + print(f"Input: {input_path} ({input_path.stat().st_size / 1e6:.1f} MB)") + print(f"Output: {output_path}") + print(f"CTable store: {ctable_store_kind(output_path)}") + print("Mode: unnamed-root list flattening") + print("Nested columns: separated into dotted CTable columns") + if total_parquet_rows is not None: + print(f"Parquet rows: {total_parquet_rows:,}") + if capacity_hint is not None: + print(f"Est. CTable rows: ~{capacity_hint:,}") + n_inner = len(inner_schema) + print(f"Inner struct fields: {n_inner}") + for f in inner_schema: + print(f" {f.name}: {f.type}") + if args.max_rows is not None: + print(f"Max CTable rows: {args.max_rows:,} (list elements)") + print(f"Parquet batch size: {args.parquet_batch_size:,} outer rows") + blosc2_batch_note = ( + f"auto, max: {MAX_ELEMENT_WRITE_BATCH:,}" + if getattr(args, "blosc2_batch_size_auto", False) + else f"max: {MAX_ELEMENT_WRITE_BATCH:,}" + ) + print(f"Blosc2 batch size: {args.blosc2_batch_size:,} BatchArray rows ({blosc2_batch_note})") + if args.blosc2_items_per_block is not None: + print(f"Blosc2 items/block: {args.blosc2_items_per_block:,}") + print(f"List serializer: {args.list_serializer}") + print(f"Codec / level: {args.codec} / {args.clevel}") + print(f"Use dict: {args.use_dict}") + print() + + cparams = blosc2.CParams(codec=blosc2.Codec[args.codec], clevel=args.clevel, use_dict=args.use_dict) + t0 = time.perf_counter() + maybe_memory_report(args, "before CTable import", pa) + + ct = blosc2.CTable.from_arrow( + inner_schema, + _flatten_root_batches_with_progress(pa, pf, inner_schema, args, capacity_hint=capacity_hint), + urlpath=str(output_path), + mode="w", + cparams=cparams, + 
capacity_hint=capacity_hint, + auto_null_sentinels=True, + blosc2_batch_size=args.blosc2_batch_size, + blosc2_items_per_block=args.blosc2_items_per_block, + list_serializer=args.list_serializer, + ) + + maybe_memory_report(args, "after CTable import", pa) + + # Store the original_root provenance metadata so that reopened CTables know + # they came from an unnamed-root list> file. + nested_meta = ct._schema.metadata.get("nested", {}) + nested_meta["original_root"] = { + "kind": "unnamed_list_struct", + "field_name": "", + "preserve_grouping": False, + } + ct._schema.metadata["nested"] = nested_meta + ct._storage.save_schema(schema_to_dict(ct._schema)) + + maybe_memory_report(args, "after metadata save", pa) + + elapsed = time.perf_counter() - t0 + rows = len(ct) + cols = len(ct.col_names) + col_names = list(ct.col_names) + ct.close() + + maybe_memory_report(args, "after CTable close", pa) + + output_size = ( + output_path.stat().st_size + if output_path.is_file() + else sum(f.stat().st_size for f in output_path.rglob("*") if f.is_file()) + ) + print(f"Done in {elapsed:.2f}s") + print(f"Element rows imported: {rows:,}") + print(f"Columns imported: {cols}") + print(f"Output size: {output_size / 1e6:.1f} MB") + return col_names + + def import_parquet_to_ctable(args, input_path: Path, output_path: Path): if args.parquet_batch_size <= 0: raise ValueError("--parquet-batch-size must be positive") - if args.blosc2_batch_size <= 0: + if args.blosc2_batch_size is not None and args.blosc2_batch_size <= 0: raise ValueError("--blosc2-batch-size must be positive") if args.blosc2_items_per_block is not None and args.blosc2_items_per_block <= 0: raise ValueError("--blosc2-items-per-block must be positive") @@ -799,6 +1198,14 @@ def import_parquet_to_ctable(args, input_path: Path, output_path: Path): maybe_memory_report(args, "after ParquetFile open", pa) parquet_schema = pf.schema_arrow + # ------------------------------------------------------------------ + # Early dispatch: 
--separate-nested-cols for unnamed-root datasets + # ------------------------------------------------------------------ + if getattr(args, "separate_nested_cols", False) and blosc2.CTable._detect_unnamed_root_list_struct( + pa, parquet_schema + ): + return import_unnamed_root_separate_cols(args, input_path, output_path, pa, pf, parquet_schema) + fixed_string_lengths, fixed_bytes_lengths = scan_string_and_bytes_lengths(pa, pf, args, parquet_schema) maybe_memory_report(args, "after string/binary length scan", pa) @@ -806,13 +1213,24 @@ def import_parquet_to_ctable(args, input_path: Path, output_path: Path): maybe_memory_report(args, "after timestamp unit scan", pa) fixed_cols, struct_wrap_cols, conversions, nullable_scalars = classify_columns( - pa, parquet_schema, fixed_string_lengths, fixed_bytes_lengths + pa, + parquet_schema, + fixed_string_lengths, + fixed_bytes_lengths, + decode_dictionaries=getattr(args, "decode_dictionaries", False), + separate_nested_cols=getattr(args, "separate_nested_cols", True), ) maybe_memory_report(args, "after column classification", pa) selected_cols = [f.name for f in parquet_schema if f.name in fixed_cols or f.name in struct_wrap_cols] - import_schema = build_import_schema(pa, parquet_schema, fixed_cols, struct_wrap_cols, timestamp_units) - fixed_scalar_lengths = {**fixed_string_lengths, **fixed_bytes_lengths} or None + column_name_map = ctable_column_name_map(parquet_schema) + import_schema = build_import_schema( + pa, parquet_schema, fixed_cols, struct_wrap_cols, timestamp_units, column_name_map + ) + fixed_scalar_lengths = { + column_name_map.get(name, name): length + for name, length in {**fixed_string_lengths, **fixed_bytes_lengths}.items() + } or None float_trunc_column_cparams = build_float_trunc_column_cparams(pa, import_schema, args) maybe_memory_report(args, "after import schema build", pa) @@ -833,7 +1251,7 @@ def import_parquet_to_ctable(args, input_path: Path, output_path: Path): ct = blosc2.CTable.from_arrow( 
import_schema, - progress_batches(pa, pf, args, selected_cols, struct_wrap_cols, timestamp_units), + progress_batches(pa, pf, args, selected_cols, struct_wrap_cols, timestamp_units, import_schema), urlpath=str(output_path), mode="w", cparams=blosc2.CParams(codec=blosc2.Codec[args.codec], clevel=args.clevel, use_dict=args.use_dict), @@ -844,10 +1262,11 @@ def import_parquet_to_ctable(args, input_path: Path, output_path: Path): auto_null_sentinels=True, blosc2_batch_size=args.blosc2_batch_size, blosc2_items_per_block=args.blosc2_items_per_block, + list_serializer=args.list_serializer, column_cparams=float_trunc_column_cparams or None, ) maybe_memory_report(args, "after CTable import", pa) - store_original_arrow_metadata(ct, parquet_schema, import_schema, conversions) + store_original_arrow_metadata(ct, parquet_schema, import_schema, conversions, column_name_map) maybe_memory_report(args, "after metadata save", pa) elapsed = time.perf_counter() - t0 rows = len(ct) @@ -921,6 +1340,22 @@ def export_ctable_to_parquet(input_path: Path, output_path: Path, *, batch_size: elif conversion in {"vlstring", "vlstring_nullable", "vlbytes", "vlbytes_nullable"}: if str(arr.type) != str(field.type): arr = arr.cast(field.type) + elif conversion in {"dictionary_preserved"}: + # CTable emits dictionary; restore original type if needed. + if str(arr.type) != str(field.type): + arr = arr.cast(field.type, safe=True) + elif conversion in {"dictionary_decoded_to_vlstring"}: + # Was decoded to vlstring on import; restore as dictionary type on export. 
+ if pa.types.is_dictionary(field.type): + encoded = pa.DictionaryArray.from_arrays( + *pa.array(arr.to_pylist()) + .dictionary_encode() + .unify_dictionaries([pa.array(arr.to_pylist()).dictionary_encode()]), + ordered=field.type.ordered, + ) + arr = encoded.cast(field.type) + elif str(arr.type) != str(field.type): + arr = arr.cast(field.type) elif str(arr.type) != str(field.type): arr = pa.array(arr.to_pylist(), type=field.type) arrays.append(arr) @@ -1060,7 +1495,33 @@ def average_parquet_row_group_size(input_path: Path) -> int | None: return max(1, round(metadata.num_rows / metadata.num_row_groups)) +def is_unnamed_root_parquet_input(input_path: Path) -> bool: + if input_path.suffix != ".parquet" or not input_path.exists(): + return False + try: + pa, pq = require_pyarrow() + pf = pq.ParquetFile(input_path) + return blosc2.CTable._detect_unnamed_root_list_struct(pa, pf.schema_arrow) + except Exception: + return False + + def resolve_default_batch_sizes(args, *, parquet_specified: bool, blosc2_specified: bool) -> None: + if getattr(args, "separate_nested_cols", False) and is_unnamed_root_parquet_input(args.input_path): + # In separate-nested mode the two batch-size options use different units: + # Parquet batches are outer rows, while Blosc2 batches are flattened + # CTable rows. Keep them independent so a large write batch does not + # accidentally imply a huge Parquet read batch (and vice versa). + if not parquet_specified: + args.parquet_batch_size = average_parquet_row_group_size(args.input_path) or DEFAULT_BATCH_SIZE + if not blosc2_specified: + # Defer separate-nested defaults until import, where we have a sampled + # estimate of flattened CTable rows per Parquet batch. Arrow uses that + # natural per-Parquet-batch scale; msgpack uses a smaller blocks-based + # scale because it materializes nested Python objects before serializing. 
+ args.blosc2_batch_size = None + return + if parquet_specified and not blosc2_specified: args.blosc2_batch_size = args.parquet_batch_size elif blosc2_specified and not parquet_specified: @@ -1079,6 +1540,7 @@ def main(argv: list[str] | None = None) -> int: argv, "--batch-size" ) blosc2_specified = _option_present(argv, "--blosc2-batch-size") + args.blosc2_batch_size_auto = not blosc2_specified resolve_default_batch_sizes(args, parquet_specified=parquet_specified, blosc2_specified=blosc2_specified) if args.profile: diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 4cb022c0..a00e1725 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -24,7 +24,6 @@ from typing import TYPE_CHECKING, ClassVar import numpy as np -import requests import blosc2 from blosc2 import blosc2_ext @@ -1138,6 +1137,8 @@ def print_versions(): import numexpr print(f"numexpr version: {numexpr.__version__}") + import requests + print(f"requests version: {requests.__version__}") print(f"Python version: {sys.version}") (sysname, _nodename, release, version, machine, processor) = platform.uname() diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 0c3cdd26..1f80dc3d 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -15,7 +15,6 @@ import contextvars import copy import dataclasses -import itertools import os import pprint import re @@ -25,17 +24,29 @@ from dataclasses import MISSING, dataclass from dataclasses import field as dataclass_field from textwrap import TextWrapper -from typing import Any, Generic, Literal, TypeVar +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar import numpy as np import blosc2 from blosc2 import compute_chunks_blocks -from blosc2.ctable_storage import FileTableStorage, InMemoryTableStorage, TableStorage, TreeStoreTableStorage +from blosc2.ctable_storage import ( + FileTableStorage, + InMemoryTableStorage, + TableStorage, + TreeStoreTableStorage, + _column_name_to_relpath, + join_field_path, + split_field_path, +) from 
blosc2.info import InfoReporter, format_nbytes_info from blosc2.list_array import ListArray, coerce_list_cell from blosc2.scalar_array import _ScalarVarLenArray + +if TYPE_CHECKING: + from blosc2.dictionary_column import DictionaryColumn from blosc2.schema import ( + DictionarySpec, ListSpec, ObjectSpec, SchemaSpec, @@ -249,6 +260,9 @@ def __getitem__(self, key): class _CTableInfoReporter(InfoReporter): """Info reporter that also preserves the historic ``t.info()`` call style.""" + def __len__(self) -> int: + return len(self.obj.info_items) + def __repr__(self) -> str: items = self.obj.info_items max_key_len = max(len(k) for k, _ in items) @@ -546,6 +560,12 @@ def is_varlen_scalar(self) -> bool: col = self._table._schema.columns_by_name.get(self._col_name) return col is not None and isinstance(col.spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec)) + @property + def is_dictionary(self) -> bool: + """True if this column is a dictionary-encoded string column.""" + col = self._table._schema.columns_by_name.get(self._col_name) + return col is not None and isinstance(col.spec, DictionarySpec) + @property def _valid_rows(self): if self._mask is None: @@ -580,20 +600,25 @@ def _values_from_key(self, key): # noqa: C901 if not (0 <= key < n_rows): raise IndexError(f"index {key} is out of bounds for column with size {n_rows}") pos_true = _find_physical_index(self._valid_rows, key) + if self.is_dictionary: + return self._raw_col[int(pos_true)] return self._maybe_decode_timestamp_values(self._raw_col[int(pos_true)]) elif isinstance(key, slice): - valid = self._valid_rows - real_pos = blosc2.where(valid, _arange(len(valid))).compute() + real_pos = np.where(self._valid_rows[:])[0] start, stop, step = key.indices(len(real_pos)) if start >= stop: - return [] if (self.is_list or self.is_varlen_scalar) else np.array([], dtype=self.dtype) + return ( + [] + if (self.is_list or self.is_varlen_scalar or self.is_dictionary) + else np.array([], dtype=self.dtype) + ) selected_pos = 
real_pos[start:stop:step] # physical row positions if self.is_computed: lo, hi = int(selected_pos.min()), int(selected_pos.max()) chunk = np.asarray(self._raw_col[lo : hi + 1]) return chunk[selected_pos - lo] - if self.is_list or self.is_varlen_scalar: + if self.is_list or self.is_varlen_scalar or self.is_dictionary: return self._raw_col[selected_pos] return self._maybe_decode_timestamp_values(np.asarray(self._raw_col[selected_pos])) @@ -608,17 +633,17 @@ def _values_from_key(self, key): # noqa: C901 if self.is_computed: raw_np = np.asarray(self._raw_col[:]) return raw_np[phys_indices] - if self.is_list or self.is_varlen_scalar: + if self.is_list or self.is_varlen_scalar or self.is_dictionary: return self._raw_col[phys_indices] return self._maybe_decode_timestamp_values(self._raw_col[phys_indices]) elif isinstance(key, (list, tuple, np.ndarray)): - real_pos = blosc2.where(self._valid_rows, _arange(len(self._valid_rows))).compute() + real_pos = np.where(self._valid_rows[:])[0] phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64) if self.is_computed: raw_np = np.asarray(self._raw_col[:]) return raw_np[phys_indices] - if self.is_list or self.is_varlen_scalar: + if self.is_list or self.is_varlen_scalar or self.is_dictionary: return self._raw_col[phys_indices] return self._maybe_decode_timestamp_values(self._raw_col[phys_indices]) @@ -773,18 +798,39 @@ def __iter__(self): yield from data_chunk[mask_chunk] def __repr__(self) -> str: - preview_items = [] - for value in itertools.islice(self, self._REPR_PREVIEW_ITEMS + 1): - if isinstance(value, np.generic): - value = value.item() - preview_items.append(repr(value)) - - truncated = len(preview_items) > self._REPR_PREVIEW_ITEMS + preview_len = self._REPR_PREVIEW_ITEMS + 1 + if self.is_list: + label = self._table._dtype_info_label( + self.dtype, self._table._schema.columns_by_name[self._col_name].spec + ) + preview_values = [f"<{label}>"] * min(len(self), preview_len) + else: + preview_pos = 
np.where(self._valid_rows[:])[0][:preview_len] + if self.is_dictionary or self.is_varlen_scalar: + preview_values = self._raw_col[preview_pos] + elif len(preview_pos) == 0: + preview_values = [] + else: + preview_values = self._maybe_decode_timestamp_values(self._raw_col[preview_pos]).tolist() + truncated = len(preview_values) > self._REPR_PREVIEW_ITEMS if truncated: - preview_items = preview_items[: self._REPR_PREVIEW_ITEMS] - preview_items.append("...") + preview_values = preview_values[: self._REPR_PREVIEW_ITEMS] + + if self.dtype is not None and self.dtype.kind in "biufc" and preview_values: + arr = np.asarray(preview_values, dtype=self.dtype) + preview = np.array2string(arr, separator=", ", max_line_width=10_000)[1:-1] + if truncated: + preview = f"{preview}, ..." if preview else "..." + else: + preview_items = [] + for value in preview_values: + if isinstance(value, np.generic): + value = value.item() + preview_items.append(repr(value)) + if truncated: + preview_items.append("...") + preview = ", ".join(preview_items) - preview = ", ".join(preview_items) return f"Column({self._col_name!r}, dtype={self.dtype}, len={len(self)}, values=[{preview}])" def __len__(self): @@ -796,6 +842,76 @@ def shape(self) -> tuple[int]: """Logical shape of the live column values.""" return (len(self),) + @property + def info(self) -> _CTableInfoReporter: + """Get information about this column. + + The report includes both logical/live-row details and, when available, + the physical storage details used internally by lazy predicates. 
+ + Examples + -------- + >>> print(t["score"].info) + >>> t["score"].info() + """ + return _CTableInfoReporter(self) + + @property + def info_items(self) -> list[tuple[str, object]]: + """Structured summary items used by :attr:`info`.""" + raw = self._raw_col + table = self._table + col_meta = table._schema.columns_by_name.get(self._col_name) + spec = col_meta.spec if col_meta is not None else None + physical_len = len(raw) if hasattr(raw, "__len__") else None + items: list[tuple[str, object]] = [ + ("type", self.__class__.__name__), + ("name", self._col_name), + ("logical_length", len(self)), + ("physical_length", physical_len), + ("dtype", table._dtype_info_label(self.dtype, spec)), + ("computed", self.is_computed), + ("nullable", self.null_value is not None or getattr(spec, "nullable", False)), + ] + + if self.is_list: + items.append(("storage", "list")) + elif self.is_varlen_scalar: + items.append(("storage", "variable-length scalar")) + elif self.is_dictionary: + items.append(("storage", "dictionary")) + items.append(("dictionary_size", len(raw.dictionary))) + else: + items.append(("storage", "ndarray" if isinstance(raw, blosc2.NDArray) else type(raw).__name__)) + + chunks = getattr(raw, "chunks", None) + blocks = getattr(raw, "blocks", None) + if chunks is not None: + items.append(("chunks", chunks)) + if blocks is not None: + items.append(("blocks", blocks)) + + nbytes = getattr(raw, "nbytes", None) + cbytes = getattr(raw, "cbytes", None) + cratio = getattr(raw, "cratio", None) + if nbytes is not None: + items.append(("nbytes", format_nbytes_info(nbytes))) + if cbytes is not None: + items.append(("cbytes", format_nbytes_info(cbytes))) + if cratio is not None: + items.append(("cratio", f"{cratio:.2f}")) + + urlpath = getattr(raw, "urlpath", None) + if urlpath is not None: + items.append(("urlpath", urlpath)) + cparams = getattr(raw, "cparams", None) + dparams = getattr(raw, "dparams", None) + if cparams is not None: + items.append(("cparams", cparams)) + if 
dparams is not None: + items.append(("dparams", dparams)) + return items + @property def ndim(self) -> int: """Number of logical dimensions.""" @@ -812,6 +928,11 @@ def _ensure_queryable(self) -> None: f"Column {self._col_name!r} is a vlstring/vlbytes column; " "lazy expressions and vectorized comparisons are not supported yet." ) + if self.is_dictionary: + raise NotImplementedError( + f"Column {self._col_name!r} is a dictionary column; " + "use == and isin() for dictionary column comparisons." + ) @staticmethod def _unwrap_operand(other): @@ -964,17 +1085,97 @@ def __le__(self, other): return self._raw_col <= self._coerce_timestamp_operand(other) def __eq__(self, other): + if self.is_dictionary: + return self._dictionary_eq(other) self._ensure_queryable() if self._is_nullable_bool and isinstance(other, (bool, np.bool_)): return self._raw_col == int(other) return self._raw_col == self._coerce_timestamp_operand(other) def __ne__(self, other): + if self.is_dictionary: + result = self._dictionary_eq(other) + if isinstance(result, np.ndarray): + return ~result + return ~np.asarray(result, dtype=bool) self._ensure_queryable() if self._is_nullable_bool and isinstance(other, (bool, np.bool_)): return self._raw_col == int(not other) return self._raw_col != self._coerce_timestamp_operand(other) + def _dictionary_eq(self, other): + """Return a physical-slot boolean predicate for dictionary equality. + + Regular fixed-width columns build predicates against their raw physical + arrays, whose length is the table slot capacity. Dictionary predicates + need to use the same coordinate system so they can be combined with + regular predicates before aggregate/view code intersects them with + ``_valid_rows``. 
+ """ + dc = self._raw_col # DictionaryColumn + spec = self._table._schema.columns_by_name[self._col_name].spec + if other is None: + target_code = spec.null_code + elif isinstance(other, str): + try: + target_code = dc.value_to_code(other) + except KeyError: + return blosc2.zeros(len(self._table._valid_rows), dtype=np.bool_) + else: + raise TypeError( + f"Dictionary column {self._col_name!r} can only be compared with str or None, " + f"got {type(other).__name__!r}." + ) + pred = dc.codes == np.int32(target_code) + valid = self._lazy_valid_rows() + if len(dc.codes) != len(self._table._valid_rows): + physical = blosc2.zeros(len(self._table._valid_rows), dtype=np.bool_) + physical[: len(dc.codes)] = pred + pred = physical + return pred & valid + + def isin(self, values) -> np.ndarray: + """Return a boolean array True where the live value is in *values*. + + For dictionary columns this performs efficient integer-code membership + testing (no decoding of all values). Values absent from the + dictionary are treated as not-present. + + For non-dictionary columns this decodes all live values and tests + membership in a set. + """ + if self.is_dictionary: + return self._dictionary_isin(values) + live_values = self[:] + test_set = set(values) + if isinstance(live_values, np.ndarray): + return np.array([v in test_set for v in live_values.tolist()], dtype=bool) + return np.array([v in test_set for v in live_values], dtype=bool) + + def _dictionary_isin(self, values) -> np.ndarray: + """Return a boolean array for in-membership tests against a dictionary column.""" + dc = self._raw_col # DictionaryColumn + spec = self._table._schema.columns_by_name[self._col_name].spec + valid = self._valid_rows + live_pos = np.where(valid[:])[0] + if len(live_pos) == 0: + return np.zeros(0, dtype=bool) + # Map requested values to codes, ignoring absent values. 
+ target_codes: set[int] = set() + for v in values: + if v is None: + target_codes.add(spec.null_code) + elif isinstance(v, str): + with contextlib.suppress(KeyError): + target_codes.add(dc.value_to_code(v)) + if not target_codes: + return np.zeros(len(live_pos), dtype=bool) + live_codes = np.asarray(dc.codes[live_pos], dtype=np.int32) + mask = np.zeros(len(live_codes), dtype=bool) + for code in target_codes: + mask |= live_codes == np.int32(code) + return mask + def __gt__(self, other): self._ensure_queryable() return self._raw_col > self._coerce_timestamp_operand(other) @@ -1179,8 +1380,11 @@ def is_null(self) -> np.ndarray: For varlen scalar columns (vlstring/vlbytes) nullability is represented as native ``None`` values, so this returns True wherever the value is - ``None``. + ``None``. For dictionary columns, returns True where the code equals + the null_code (``-1`` by default). """ + if self.is_dictionary: + return self._dictionary_eq(None) if self.is_varlen_scalar: return np.array([v is None for v in self], dtype=np.bool_) return self._null_mask_for(self[:]) @@ -1195,6 +1399,8 @@ def null_count(self) -> int: Returns ``0`` in O(1) if no ``null_value`` is configured for this column and the column is not a varlen scalar column. 
""" + if self.is_dictionary: + return int(self.is_null().sum()) if self.is_varlen_scalar: return sum(1 for v in self if v is None) if self.null_value is None: @@ -1285,7 +1491,9 @@ def _normalize_sum_where(self, where): return None if isinstance(where, str): self._table._guard_varlen_scalar_expression(where) - where = blosc2.lazyexpr(where, self._table._where_expression_operands()) + operands = self._table._where_expression_operands() + where, operands = self._table._rewrite_nested_expression(where, operands) + where = blosc2.lazyexpr(where, operands) if isinstance(where, np.ndarray) and where.dtype == np.bool_: where = blosc2.asarray(where) if isinstance(where, Column): @@ -1309,7 +1517,10 @@ def _lazy_nonnull_mask(self, where=None): if not isinstance(raw, (blosc2.NDArray, blosc2.LazyExpr)): return NotImplemented - all_rows_visible = self._mask is None and self._table._n_rows == len(self._table._valid_rows) + table_n_rows = self._table._known_n_rows() + all_rows_visible = ( + self._mask is None and table_n_rows is not None and table_n_rows == len(self._table._valid_rows) + ) mask = None if all_rows_visible else self._lazy_valid_rows() if where is not None: mask = where if mask is None else mask & where @@ -1340,7 +1551,7 @@ def _sum_lazy_fastpath(self, acc_dtype, where=None, *, jit=None, jit_backend=Non where is None and self._table.base is not None and total_rows - and self._table._n_rows / total_rows < 0.25 + and self._table.nrows / total_rows < 0.25 ): return NotImplemented @@ -1666,6 +1877,171 @@ def _fmt_bytes(n: int) -> str: # We use a plain dict so that nothing extra needs to be imported. +class _StructPathColumn: + """Virtual read-only column representing a struct prefix path. + + Values are reconstructed per row from descendant dotted leaf columns. 
+ """ + + def __init__(self, table: CTable, prefix: str, leaves: list[str]): + self._table = table + self._prefix = prefix + self._leaves = list(leaves) + + def _leaf_is_null_at_logical(self, leaf: str, idx: int) -> bool: + col = self._table[leaf] + v = col[idx] + nv = col.null_value + if nv is None: + return v is None + try: + return bool(col._null_mask_for(np.asarray([v]))[0]) + except Exception: + return v is None + + def _row_value_at_logical(self, idx: int): + # If every descendant leaf is null at this row, represent the struct as None. + if self._leaves and all(self._leaf_is_null_at_logical(leaf, idx) for leaf in self._leaves): + return None + prefix_parts = split_field_path(self._prefix) + result: dict[str, Any] = {} + for leaf in self._leaves: + parts = split_field_path(leaf) + rel_parts = parts[len(prefix_parts) :] + if not rel_parts: + continue + node = result + for part in rel_parts[:-1]: + child = node.get(part) + if not isinstance(child, dict): + child = {} + node[part] = child + node = child + node[rel_parts[-1]] = self._table._normalize_scalar_value(self._table[leaf][idx]) + return result + + def __getitem__(self, key): + if isinstance(key, int): + return self._row_value_at_logical(key) + if isinstance(key, slice): + start, stop, step = key.indices(self._table.nrows) + return [self._row_value_at_logical(i) for i in range(start, stop, step)] + if isinstance(key, (list, np.ndarray)): + if len(key) == 0: + return [] + if isinstance(key, np.ndarray) and key.dtype == np.bool_: + idxs = np.where(key)[0] + elif isinstance(key[0], (bool, np.bool_)): + idxs = [i for i, v in enumerate(key) if v] + else: + idxs = [int(i) for i in key] + return [self._row_value_at_logical(i) for i in idxs] + raise TypeError(f"Invalid index type: {type(key)}") + + def __iter__(self): + for i in range(self._table.nrows): + yield self._row_value_at_logical(i) + + +class _NestedColumnNamespace: + """Attribute proxy for dotted nested column paths. 
+ + Allows `t.trip.begin.lon` when the physical leaf column is named + `"trip.begin.lon"`. + """ + + def __init__(self, table: CTable, prefix: str): + self._table = table + self._prefix = prefix + + def __getattr__(self, name: str): + path = join_field_path((*split_field_path(self._prefix), name)) + if path in self._table._cols or path in self._table._computed_cols: + return Column(self._table, path) + path_parts = split_field_path(path) + for col_name in self._table.col_names: + parts = split_field_path(col_name) + if parts[: len(path_parts)] == path_parts and len(parts) > len(path_parts): + return _NestedColumnNamespace(self._table, path) + raise AttributeError(path) + + def __repr__(self) -> str: + return f"" + + +class _LazyColumnDict(dict): + """Dict-like column cache that opens persistent columns on first use. + + Persistent CTables can be wide, and opening every stored column eagerly is + expensive for workloads that touch only a small subset of columns, e.g. + ``blosc2.open(path).trip.km.sum()`` on a nested table. Keep the public and + internal ``_cols`` access pattern mostly unchanged while deferring each + ``storage.open_*_column()`` call until that column is actually requested. + + Methods that logically need all materialized columns, such as ``items()`` + and ``values()``, force-load the cache for compatibility with normal + ``dict`` usage. Name-oriented operations, such as ``keys()``, iteration, + ``len()``, and ``in``, operate from the schema column list without opening + the column payloads. 
+ """ + + def __init__(self, table: CTable, storage: TableStorage, col_names: list[str]): + super().__init__() + self._table = table + self._storage = storage + self._col_names = list(col_names) + self._available = set(col_names) + + def _load(self, name: str): + if name not in self._available: + raise KeyError(name) + if not dict.__contains__(self, name): + dict.__setitem__(self, name, self._table._open_column_from_storage(self._storage, name)) + return dict.__getitem__(self, name) + + def _load_all(self) -> None: + for name in self._col_names: + self._load(name) + + def __getitem__(self, name: str): + return self._load(name) + + def get(self, name: str, default=None): + return self._load(name) if name in self._available else default + + def __contains__(self, name: object) -> bool: + return name in self._available + + def __iter__(self): + return iter(self._col_names) + + def __len__(self) -> int: + return len(self._col_names) + + def keys(self): + return dict.fromkeys(self._col_names).keys() + + def items(self): + self._load_all() + return dict.items(self) + + def values(self): + self._load_all() + return dict.values(self) + + def __setitem__(self, name: str, value) -> None: + if name not in self._available: + self._available.add(name) + self._col_names.append(name) + dict.__setitem__(self, name, value) + + def __delitem__(self, name: str) -> None: + self._available.remove(name) + self._col_names.remove(name) + if dict.__contains__(self, name): + dict.__delitem__(self, name) + + class CTable(Generic[RowT]): """Columnar compressed table with typed columns and row-oriented access.""" @@ -1679,6 +2055,23 @@ class CTable(Generic[RowT]): #: :meth:`add_column` and :meth:`drop_column` are blocked on views. 
base: CTable | None + @property + def _n_rows(self) -> int: + """Number of live rows, computed lazily for reopened tables.""" + n_rows = getattr(self, "_n_rows_cached", None) + if n_rows is None: + n_rows = int(blosc2.count_nonzero(self._valid_rows)) + self._n_rows_cached = n_rows + return n_rows + + @_n_rows.setter + def _n_rows(self, value: int | None) -> None: + self._n_rows_cached = value + + def _known_n_rows(self) -> int | None: + """Return cached live-row count without triggering a scan.""" + return getattr(self, "_n_rows_cached", None) + def __init__( self, row_type: type[RowT], @@ -1742,17 +2135,11 @@ def __init__( ) self.col_names = [c["name"] for c in schema_dict["columns"]] self._valid_rows = storage.open_valid_rows() + self._cols = _LazyColumnDict(self, storage, self.col_names) for name in self.col_names: cc = self._schema.columns_by_name[name] - if self._is_list_column(cc): - col = storage.open_list_column(name) - elif self._is_varlen_scalar_column(cc): - col = storage.open_varlen_scalar_column(name, cc.spec) - else: - col = storage.open_column(name) - self._cols[name] = col self._col_widths[name] = max(len(name), cc.display_width) - self._n_rows = int(blosc2.count_nonzero(self._valid_rows)) + self._n_rows = None self._last_pos = None # resolve lazily on first write # ---- Restore computed/materialized column metadata (if any) ---- self._computed_cols = {} @@ -1823,6 +2210,10 @@ def _is_list_column(col: CompiledColumn) -> bool: def _is_varlen_scalar_column(col: CompiledColumn) -> bool: return isinstance(col.spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec)) + @staticmethod + def _is_dictionary_column(col: CompiledColumn) -> bool: + return isinstance(col.spec, DictionarySpec) + @staticmethod def _is_list_spec(spec: SchemaSpec) -> bool: return isinstance(spec, ListSpec) @@ -1886,7 +2277,10 @@ def _resolve_nullable_specs( for col in schema.columns: spec = col.spec if ( - isinstance(spec, (ListSpec, VLStringSpec, VLBytesSpec, StructSpec, 
ObjectSpec)) + isinstance( + spec, + (ListSpec, VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec, DictionarySpec), + ) or getattr(spec, "null_value", None) is not None ): continue @@ -1912,7 +2306,11 @@ def _resolve_nullable_specs( def _flush_varlen_columns(self) -> None: for col in self._schema.columns: - if self._is_list_column(col) or self._is_varlen_scalar_column(col): + if ( + self._is_list_column(col) + or self._is_varlen_scalar_column(col) + or self._is_dictionary_column(col) + ): self._cols[col.name].flush() def _init_columns( @@ -1939,6 +2337,14 @@ def _init_columns( dparams=col_storage.get("dparams"), ) continue + if self._is_dictionary_column(col): + self._cols[col.name] = storage.create_dictionary_column( + col.name, + spec=col.spec, + cparams=col_storage.get("cparams"), + dparams=col_storage.get("dparams"), + ) + continue # Recompute chunks/blocks using the actual dtype so that wide # string columns (e.g. U183642) don't produce multi-GB chunks. chunks = col_storage["chunks"] @@ -1978,22 +2384,47 @@ def _resolve_column_storage( result["dparams"] = dparams return result + @staticmethod + def _flatten_nested_dict(d: dict, prefix: str = "") -> dict: + """Recursively flatten a nested dict into a dotted-key flat dict. + + Works for both single-row dicts ``{field: value}`` and column-batch + dicts ``{field: array}``. Leaves non-dict values unchanged. + + Example:: + + {"trip": {"begin": {"lon": 1.0}}} -> {"trip.begin.lon": 1.0} + """ + result = {} + for k, v in d.items(): + full_key = join_field_path((*split_field_path(prefix), k)) if prefix else join_field_path((k,)) + if isinstance(v, dict): + result.update(CTable._flatten_nested_dict(v, full_key)) + else: + result[full_key] = v + return result + def _normalize_row_input(self, data: Any) -> dict[str, Any]: """Normalize a row input to a ``{col_name: value}`` dict. 
Accepted shapes: - list / tuple → positional, zipped with stored column names (computed columns skipped) - - dict → used as-is - - dataclass → ``dataclasses.asdict`` + - dict → used as-is (nested dicts are flattened to dotted keys) + - dataclass → ``dataclasses.asdict`` (nested fields flattened) - np.void / structured scalar → field-name access """ stored = self._append_input_col_names if isinstance(data, dict): + if any(isinstance(v, dict) for v in data.values()): + return self._flatten_nested_dict(data) return data if isinstance(data, (list, tuple)): return dict(zip(stored, data, strict=False)) if dataclasses.is_dataclass(data) and not isinstance(data, type): - return dataclasses.asdict(data) + d = dataclasses.asdict(data) + if any(isinstance(v, dict) for v in d.values()): + return self._flatten_nested_dict(d) + return d if isinstance(data, (np.void, np.record)): return {name: data[name] for name in stored} # Fallback: try positional indexing @@ -2009,6 +2440,9 @@ def _coerce_row_to_storage(self, row: dict[str, Any]) -> dict[str, Any]: elif self._is_varlen_scalar_column(col): # Coercion is handled inside _ScalarVarLenArray.append. result[col.name] = val + elif self._is_dictionary_column(col): + # Pass str/None through; DictionaryColumn.__setitem__ encodes. 
+ result[col.name] = val elif isinstance(col.spec, timestamp): if val is None: result[col.name] = col.spec.null_value @@ -2022,6 +2456,17 @@ def _coerce_row_to_storage(self, row: dict[str, Any]) -> dict[str, Any]: result[col.name] = np.array(val, dtype=col.dtype).item() return result + def _open_column_from_storage(self, storage: TableStorage, name: str): + """Open one stored column from *storage*.""" + cc = self._schema.columns_by_name[name] + if self._is_list_column(cc): + return storage.open_list_column(name) + if self._is_varlen_scalar_column(cc): + return storage.open_varlen_scalar_column(name, cc.spec) + if self._is_dictionary_column(cc): + return storage.open_dictionary_column(name, cc.spec) + return storage.open_column(name) + def _resolve_last_pos(self) -> int: """Return the physical index of the next write slot. @@ -2067,6 +2512,9 @@ def _grow(self) -> None: cc = self._schema.columns_by_name[name] if self._is_list_column(cc) or self._is_varlen_scalar_column(cc): continue + if self._is_dictionary_column(cc): + col_arr.resize((c * 2,)) + continue col_arr.resize((c * 2,)) self._valid_rows.resize((c * 2,)) @@ -2083,10 +2531,14 @@ def _display_positions(self, head_tail: int = 10): return all_pos, np.array([], dtype=all_pos.dtype), 0 return all_pos[:head_tail], all_pos[-head_tail:], hidden - def _display_widths(self) -> dict[str, int]: + def _display_widths(self, col_names: list[str] | None = None) -> dict[str, int]: widths: dict[str, int] = {} - single_col = len(self.col_names) == 1 - for name in self.col_names: + col_names = self.col_names if col_names is None else col_names + single_col = len(col_names) == 1 + for name in col_names: + if name == "...": + widths[name] = 3 + continue spec = self._schema.columns_by_name.get(name) dtype_label = self._dtype_info_label(self._col_dtype(name), spec.spec if spec else None) widths[name] = max(self._col_widths[name], len(dtype_label)) @@ -2094,25 +2546,80 @@ def _display_widths(self) -> dict[str, int]: widths[name] = 
max(widths[name], 80) return widths + def _display_columns(self) -> tuple[list[str], int]: + """Return terminal-width-friendly display columns and hidden count.""" + col_names = list(self.col_names) + widths = self._display_widths(col_names) + widths["..."] = 3 + total_width = sum(widths[n] + 2 for n in col_names) + 2 * max(0, len(col_names) - 1) + term_width = shutil.get_terminal_size((120, 20)).columns + if total_width <= term_width or len(col_names) <= 2: + return col_names, 0 + + selected: list[str] = [] + left = 0 + right = len(col_names) - 1 + used = 0 + + def extra_width(name: str, n_existing: int) -> int: + return widths[name] + 2 + (2 if n_existing else 0) + + # Account for an ellipsis column between left and right blocks. + used += widths["..."] + 2 + while left <= right: + left_name = col_names[left] + need = extra_width(left_name, len(selected) + 1) + if used + need > term_width: + break + selected.append(left_name) + used += need + left += 1 + if left > right: + break + + right_name = col_names[right] + need = extra_width(right_name, len(selected) + 1) + if used + need > term_width: + break + selected.append(right_name) + used += need + right -= 1 + + left_cols = [n for n in col_names if n in selected and col_names.index(n) < left] + right_cols = [n for n in col_names if n in selected and col_names.index(n) > right] + display_cols = left_cols + ["..."] + right_cols + hidden = len(col_names) - len(left_cols) - len(right_cols) + return display_cols, hidden + @staticmethod def _format_cell(value, width: int) -> str: - s = str(value) + if isinstance(value, np.datetime64): + s = str(value).replace("T", " ") + if s.endswith(".000"): + s = s[:-4] + else: + s = str(value) if len(s) > width: s = s[: width - 1] + "…" return f" {s:<{width}} " - def _format_display_row(self, values: dict, widths: dict[str, int]) -> str: - return " ".join(self._format_cell(values[n], widths[n]) for n in self.col_names) + def _format_display_row(self, values: dict, widths: dict[str, 
int], col_names: list[str]) -> str: + return " ".join(self._format_cell(values[n], widths[n]) for n in col_names) - def _rows_to_dicts(self, positions) -> list[dict]: + def _rows_to_dicts(self, positions, col_names: list[str] | None = None) -> list[dict]: if len(positions) == 0: return [] - col_data = {n: self._fetch_col_at_positions(n, positions) for n in self.col_names} + col_names = self.col_names if col_names is None else col_names + real_cols = [n for n in col_names if n != "..."] + col_data = {n: self._fetch_col_at_positions(n, positions) for n in real_cols} rows = [] for i in range(len(positions)): row = {} - for n in self.col_names: - row[n] = self._normalize_scalar_value(col_data[n][i]) + for n in col_names: + # Keep NumPy scalar types for display so their compact string + # formatting is preserved (notably float32, e.g. 224.97 + # instead of Python float's 224.97000122070312). + row[n] = "..." if n == "..." else col_data[n][i] rows.append(row) return rows @@ -2121,31 +2628,44 @@ def __str__(self) -> str: nrows = self._n_rows ncols = len(self.col_names) head_pos, tail_pos, hidden = self._display_positions() - widths = self._display_widths() - sep = " ".join("─" * (w + 2) for w in widths.values()) + display_cols, hidden_cols = self._display_columns() + widths = self._display_widths(display_cols) + sep = " ".join("─" * (widths[n] + 2) for n in display_cols) + + dtype_row = {} + for n in display_cols: + if n == "...": + dtype_row[n] = "..." 
+ else: + dtype_row[n] = self._dtype_info_label( + self._col_dtype(n), + self._schema.columns_by_name[n].spec if n in self._schema.columns_by_name else None, + ) lines = [ - self._format_display_row({n: n for n in self.col_names}, widths), - self._format_display_row( - { - n: self._dtype_info_label( - self._col_dtype(n), - self._schema.columns_by_name[n].spec if n in self._schema.columns_by_name else None, - ) - for n in self.col_names - }, - widths, - ), + self._format_display_row({n: n for n in display_cols}, widths, display_cols), + self._format_display_row(dtype_row, widths, display_cols), sep, ] - lines.extend(self._format_display_row(row, widths) for row in self._rows_to_dicts(head_pos)) + lines.extend( + self._format_display_row(row, widths, display_cols) + for row in self._rows_to_dicts(head_pos, display_cols) + ) if hidden > 0: - lines.append(self._format_display_row(dict.fromkeys(self.col_names, "..."), widths)) - lines.extend(self._format_display_row(row, widths) for row in self._rows_to_dicts(tail_pos)) + lines.append(self._format_display_row(dict.fromkeys(display_cols, "..."), widths, display_cols)) + lines.extend( + self._format_display_row(row, widths, display_cols) + for row in self._rows_to_dicts(tail_pos, display_cols) + ) lines.append(sep) footer = f"{nrows:,} rows × {ncols} columns" + notes = [] if hidden > 0: - footer += f" ({hidden:,} rows hidden)" + notes.append(f"{hidden:,} rows hidden") + if hidden_cols > 0: + notes.append(f"{hidden_cols:,} columns hidden") + if notes: + footer += f" ({', '.join(notes)})" lines.append(footer) return "\n".join(lines) @@ -2170,6 +2690,17 @@ def _row_namedtuple_type(self): self._row_namedtuple_type_cache_cols = visible return self._row_namedtuple_type_cache + def _row_namedtuple_type_for_fields(self, fields: tuple[str, ...]): + cache = getattr(self, "_row_namedtuple_type_cache_by_fields", None) + if cache is None: + cache = {} + self._row_namedtuple_type_cache_by_fields = cache + row_type = cache.get(fields) + 
if row_type is None: + row_type = _make_namedtuple_row_type(fields) + cache[fields] = row_type + return row_type + @staticmethod def _normalize_scalar_value(value): if isinstance(value, np.generic): @@ -2195,8 +2726,32 @@ def _materialize_row(self, index: int): if not (0 <= index < n_rows): raise IndexError(f"row index {index} is out of bounds for table with {n_rows} rows") pos = _find_physical_index(self._valid_rows, index) - row_type = self._row_namedtuple_type() - return row_type(*(self._physical_row_value(name, int(pos)) for name in self.col_names)) + + nested_meta = self._schema.metadata.get("nested") if self._schema.metadata else None + reconstruct = isinstance(nested_meta, dict) and bool(nested_meta.get("reconstruct_rows", False)) + if not reconstruct: + row_type = self._row_namedtuple_type() + return row_type(*(self._physical_row_value(name, int(pos)) for name in self.col_names)) + + row_dict: dict[str, Any] = {} + for name in self.col_names: + value = self._physical_row_value(name, int(pos)) + parts = split_field_path(name) + if len(parts) <= 1: + row_dict[name] = value + continue + node = row_dict + for part in parts[:-1]: + child = node.get(part) + if not isinstance(child, dict): + child = {} + node[part] = child + node = child + node[parts[-1]] = value + + fields = tuple(row_dict.keys()) + row_type = self._row_namedtuple_type_for_fields(fields) + return row_type(*(row_dict[f] for f in fields)) def iter_sorted( self, @@ -2275,6 +2830,12 @@ def iter_sorted( # Open existing table (classmethod) # ------------------------------------------------------------------ + @classmethod + def _open_from_existing_filestore(cls, urlpath: str, *, mode: str, store: blosc2.TreeStore) -> CTable: + """Open a root CTable reusing an already-opened TreeStore.""" + storage = FileTableStorage(urlpath, mode, store=store) + return cls._open_from_storage(storage) + @classmethod def open(cls, urlpath: str, *, mode: str = "r") -> CTable: """Open a persistent CTable from *urlpath*. 
@@ -2463,6 +3024,23 @@ def _save_to_storage(self, storage: TableStorage) -> None: disk_col.extend(self._cols[name][int(pos)] for pos in live_pos) disk_col.flush() continue + if self._is_dictionary_column(col): + src_dc = self._cols[name] + disk_dc = storage.create_dictionary_column( + name, + spec=col.spec, + cparams=col.config.cparams if col.config.cparams is not None else self._table_cparams, + dparams=col.config.dparams if col.config.dparams is not None else self._table_dparams, + ) + # Copy dictionary values first + for v in src_dc.dictionary: + disk_dc.encode(v) + disk_dc.flush() + # Copy live codes + if n_live > 0: + raw_codes = np.asarray(src_dc.codes[live_pos], dtype=np.int32) + disk_dc.codes[:n_live] = raw_codes + continue dtype_chunks, dtype_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype) col_storage = self._resolve_column_storage(col, dtype_chunks, dtype_blocks) disk_col = storage.create_column( @@ -2551,17 +3129,12 @@ def _open_from_storage(cls, storage: TableStorage) -> CTable: obj.base = None obj._valid_rows = storage.open_valid_rows() + obj._cols = _LazyColumnDict(obj, storage, col_names) for name in col_names: cc = schema.columns_by_name[name] - if obj._is_list_column(cc): - obj._cols[name] = storage.open_list_column(name) - elif obj._is_varlen_scalar_column(cc): - obj._cols[name] = storage.open_varlen_scalar_column(name, cc.spec) - else: - obj._cols[name] = storage.open_column(name) obj._col_widths[name] = max(len(name), cc.display_width) - obj._n_rows = int(blosc2.count_nonzero(obj._valid_rows)) + obj._n_rows = None obj._last_pos = None obj._computed_cols = {} obj._materialized_cols = {} @@ -2632,6 +3205,8 @@ def load(cls, urlpath: str) -> CTable: disk_cols[col.name] = file_storage.open_list_column(col.name) elif cls._is_varlen_scalar_column(col): disk_cols[col.name] = file_storage.open_varlen_scalar_column(col.name, col.spec) + elif cls._is_dictionary_column(col): + disk_cols[col.name] = file_storage.open_dictionary_column(col.name, 
col.spec) else: disk_cols[col.name] = file_storage.open_column(col.name) phys_size = len(disk_valid) @@ -2664,6 +3239,17 @@ def load(cls, urlpath: str) -> CTable: mem_col.flush() mem_cols[name] = mem_col continue + if cls._is_dictionary_column(col): + mem_col = mem_storage.create_dictionary_column(name, spec=col.spec) + disk_dc = disk_cols[name] + # Copy dictionary values + for v in disk_dc.dictionary: + mem_col.encode(v) + # Copy codes + if phys_size > 0: + mem_col.codes[:phys_size] = disk_dc.codes[:phys_size] + mem_cols[name] = mem_col + continue col_chunks, col_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype) mem_col = mem_storage.create_column( name, @@ -2830,17 +3416,27 @@ def select(self, cols: list[str]) -> CTable: Parameters ---------- cols: - Ordered list of column names to keep. + Ordered list of column names to keep. For tables with **nested + (dotted) column names**, a struct-prefix name automatically expands + to all descendant leaves:: + + t.select(["trip.begin"]) # expands to trip.begin.lon, trip.begin.lat + t.select(["trip"]) # expands to all trip.* leaves Raises ------ KeyError - If any name in *cols* is not a column of this table. + If any name in *cols* is not a column of this table (and does not + match any struct prefix). ValueError If *cols* is empty. """ if not cols: raise ValueError("select() requires at least one column name.") + expanded_cols = [] + for name in cols: + expanded_cols.extend(self._expand_logical_column_selector(name)) + cols = expanded_cols for name in cols: if name not in self._cols and name not in self._computed_cols: raise KeyError(f"No column named {name!r}. 
Available: {self.col_names}") @@ -3063,17 +3659,44 @@ def _resolve_arrow_columns(self, columns, include_computed: bool = True) -> list names = list(self.col_names) if not include_computed: names = [name for name in names if name not in self._computed_cols] + + # If top-level struct aliases are present in schema metadata (virtual + # entries not physically stored), prefer exporting them instead of + # their descendant dotted leaves. + virtual_structs = [ + n + for n, cc in self._schema.columns_by_name.items() + if n not in self.col_names and isinstance(cc.spec, StructSpec) + ] + for alias in sorted(virtual_structs, key=len, reverse=True): + alias_parts = split_field_path(alias) + children = [ + n + for n in names + if split_field_path(n)[: len(alias_parts)] == alias_parts + and len(split_field_path(n)) > len(alias_parts) + ] + if not children: + continue + first = min(names.index(c) for c in children) + child_set = set(children) + names = [n for n in names if n not in child_set] + names.insert(first, alias) else: - names = list(columns) + names = [] + for name in columns: + names.extend(self._expand_logical_column_selector(name)) if len(set(names)) != len(names): raise ValueError("columns must be unique") for name in names: - if name not in self.col_names: + if name not in self.col_names and name not in self._schema.columns_by_name: raise KeyError(f"No column named {name!r}. 
Available: {self.col_names}") return names @staticmethod def _pa_type_from_spec(pa, spec): + if isinstance(spec, DictionarySpec): + return pa.dictionary(pa.int32(), pa.string(), ordered=spec.ordered) if isinstance(spec, VLStringSpec): return pa.string() if isinstance(spec, VLBytesSpec): @@ -3102,17 +3725,35 @@ def _pa_type_from_spec(pa, spec): return pa.large_binary() return pa.from_numpy_dtype(dtype) + def _export_arrow_names(self, names: list[str]) -> list[str]: + nested = self._schema.metadata.get("nested") if self._schema.metadata else None + exported = list(names) + if isinstance(nested, dict): + root_meta = nested.get("root") + if isinstance(root_meta, dict): + physical = root_meta.get("physical") + if isinstance(physical, str) and physical: + exported = ["" if n == physical else n for n in exported] + for i, n in enumerate(names): + cc = self._schema.columns_by_name.get(n) + if n not in self.col_names and cc is not None and isinstance(cc.spec, StructSpec): + parts = split_field_path(n) + if len(parts) == 1: + exported[i] = parts[0] + return exported + def _arrow_schema_for_columns(self, columns=None, *, include_computed: bool = True): pa = self._require_pyarrow("to_arrow()/to_parquet()") names = self._resolve_arrow_columns(columns, include_computed=include_computed) + arrow_names = self._export_arrow_names(names) fields = [] - for name in names: + for name, arrow_name in zip(names, arrow_names, strict=True): cc = self._schema.columns_by_name.get(name) if cc is not None: pa_type = self._pa_type_from_spec(pa, cc.spec) else: pa_type = pa.from_numpy_dtype(np.asarray(self[name][:0]).dtype) - fields.append(pa.field(name, pa_type)) + fields.append(pa.field(arrow_name, pa_type)) return pa.schema(fields) def iter_arrow_batches( @@ -3127,11 +3768,17 @@ def iter_arrow_batches( self._validate_arrow_batch_size(batch_size) self._flush_varlen_columns() names = self._resolve_arrow_columns(columns, include_computed=include_computed) + arrow_names = 
self._export_arrow_names(names) for start in range(0, self._n_rows, batch_size): stop = min(start + batch_size, self._n_rows) arrays = [] for name in names: + cc = self._schema.columns_by_name.get(name) + if name not in self.col_names and cc is not None and isinstance(cc.spec, StructSpec): + values = self[name][start:stop] + arrays.append(pa.array(values, type=self._pa_type_from_spec(pa, cc.spec))) + continue col = self[name] if col.is_list: spec = self._schema.columns_by_name[name].spec @@ -3142,6 +3789,34 @@ def iter_arrow_batches( values = col[start:stop] # list of str/bytes/None arrays.append(pa.array(values, type=self._pa_type_from_spec(pa, spec))) continue + if col.is_dictionary: + dc = self._cols[name] # DictionaryColumn + spec = self._schema.columns_by_name[name].spec + # Get physical positions for live rows in [start, stop) + valid = self._valid_rows + real_pos = blosc2.where(valid, _arange(len(valid))).compute() + batch_real_pos = real_pos[start:stop] + if len(batch_real_pos) == 0: + pa_dict = pa.array(dc.dictionary, type=pa.string()) + pa_indices = pa.array([], type=pa.int32()) + arrays.append( + pa.DictionaryArray.from_arrays(pa_indices, pa_dict, ordered=spec.ordered) + ) + else: + raw_codes = np.asarray(dc.codes[batch_real_pos], dtype=np.int32) + null_mask = raw_codes == np.int32(spec.null_code) + safe_codes = raw_codes.copy() + safe_codes[null_mask] = 0 + pa_dict = pa.array(dc.dictionary, type=pa.string()) + pa_indices = pa.array( + safe_codes, + type=pa.int32(), + mask=null_mask if null_mask.any() else None, + ) + arrays.append( + pa.DictionaryArray.from_arrays(pa_indices, pa_dict, ordered=spec.ordered) + ) + continue arr = np.asarray(col[start:stop]) nv = col.null_value null_mask = col._null_mask_for(arr) if nv is not None else None @@ -3175,7 +3850,7 @@ def iter_arrow_batches( ) else: arrays.append(pa.array(arr, mask=null_mask if has_nulls else None)) - yield pa.RecordBatch.from_arrays(arrays, names=names) + yield pa.RecordBatch.from_arrays(arrays, 
names=arrow_names) def to_arrow(self): """Convert all live rows to a :class:`pyarrow.Table`.""" @@ -3191,6 +3866,9 @@ def _auto_null_sentinel(pa, pa_type, *, null_policy: NullPolicy): @staticmethod def _arrow_type_needs_object_fallback(pa, pa_type) -> bool: """True when *pa_type* has no typed CTable mapping.""" + if pa.types.is_dictionary(pa_type): + vt = pa_type.value_type + return vt not in (pa.string(), pa.large_string(), pa.utf8(), pa.large_utf8()) if pa_type in ( pa.int8(), pa.int16(), @@ -3230,6 +3908,47 @@ def _arrow_type_to_spec( # noqa: C901 ): import blosc2.schema as b2s + # Handle Arrow dictionary types (dict-encoded strings) + if pa.types.is_dictionary(pa_type): + vt = pa_type.value_type + if vt in (pa.string(), pa.large_string(), pa.utf8(), pa.large_utf8()): + index_type = pa_type.index_type + # Accept signed and unsigned integer index types; validate fit in int32. + if not (pa.types.is_integer(index_type) or pa.types.is_unsigned_integer(index_type)): + raise TypeError( + f"Dictionary column has unsupported index type {index_type!r}; " + "expected an integer type." + ) + if arrow_col is not None: + # Validate all indices fit in signed int32. + if pa.types.is_unsigned_integer(index_type): + max_idx = arrow_col.combine_chunks().indices.to_pandas().max(skipna=True) + if max_idx is not None and max_idx > np.iinfo(np.int32).max: + raise ValueError( + f"Arrow dictionary column has unsigned indices exceeding int32.max " + f"(max={max_idx})." + ) + combined = ( + arrow_col.combine_chunks() if hasattr(arrow_col, "combine_chunks") else arrow_col + ) + n_cats = len(combined.dictionary) + if n_cats > np.iinfo(np.int32).max: + raise OverflowError( + f"Arrow dictionary has {n_cats} categories, exceeding int32 capacity." 
+ ) + return b2s.dictionary( + index_type=b2s.int32(), + value_type=b2s.vlstring(), + ordered=bool(pa_type.ordered), + nullable=nullable, + ) + if object_fallback: + return b2s.object(nullable=nullable) + raise TypeError( + f"No blosc2 spec for Arrow dictionary type {pa_type!r} with " + f"value type {pa_type.value_type!r}. Only string dictionary values are supported in v1." + ) + mapping = [ (pa.int8(), b2s.int8), (pa.int16(), b2s.int16), @@ -3258,10 +3977,9 @@ def _arrow_type_to_spec( # noqa: C901 if pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): if arrow_col is not None: - py_values = arrow_col.to_pylist() - flat_values = [item for cell in py_values if cell is not None for item in cell] - item_arrow_col = pa.array(flat_values, type=pa_type.value_type) - nullable = nullable or any(v is None for v in py_values) + combined = arrow_col.combine_chunks() if hasattr(arrow_col, "combine_chunks") else arrow_col + item_arrow_col = combined.values + nullable = nullable or combined.null_count > 0 else: item_arrow_col = None nullable = True @@ -3354,10 +4072,12 @@ def _compiled_columns_from_arrow( arrow_col = table_for_inference.column(name) if table_for_inference is not None else None field_is_list = pa.types.is_list(field.type) or pa.types.is_large_list(field.type) field_is_struct = pa.types.is_struct(field.type) + field_is_dictionary = pa.types.is_dictionary(field.type) column_string_max_length = cls._string_max_length_for_column(string_max_length, name) field_is_varlen_scalar = ( not field_is_list and not field_is_struct + and not field_is_dictionary and column_string_max_length is None and ( pa.types.is_string(field.type) @@ -3372,7 +4092,9 @@ def _compiled_columns_from_arrow( field_is_object_fallback = object_fallback and field_needs_object_fallback null_value = None has_null_value_override = name in column_null_values - if has_null_value_override and (field_is_list or field_is_struct or field_is_object_fallback): + if has_null_value_override and ( + 
field_is_list or field_is_struct or field_is_dictionary or field_is_object_fallback + ): raise TypeError(f"column_null_values only supports scalar columns; {name!r} is not scalar") if has_null_value_override and field_is_varlen_scalar: raise TypeError( @@ -3385,7 +4107,11 @@ def _compiled_columns_from_arrow( auto_null_sentinels and field.nullable and not ( - field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback + field_is_list + or field_is_struct + or field_is_dictionary + or field_is_varlen_scalar + or field_is_object_fallback ) ): null_value = cls._auto_null_sentinel(pa, field.type, null_policy=null_policy) @@ -3393,7 +4119,11 @@ def _compiled_columns_from_arrow( arrow_col is not None and arrow_col.null_count and not ( - field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback + field_is_list + or field_is_struct + or field_is_dictionary + or field_is_varlen_scalar + or field_is_object_fallback ) and null_value is None ): @@ -3411,7 +4141,11 @@ def _compiled_columns_from_arrow( object_fallback=object_fallback, ) if null_value is not None and not ( - field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback + field_is_list + or field_is_struct + or field_is_dictionary + or field_is_varlen_scalar + or field_is_object_fallback ): cls._validate_null_value_for_spec(name, spec, null_value) columns.append(cls._compiled_column_from_spec(name, spec)) @@ -3467,7 +4201,7 @@ def _create_arrow_import_columns( new_valid = storage.create_valid_rows( shape=(capacity,), chunks=default_chunks, blocks=default_blocks ) - new_cols: dict[str, blosc2.NDArray | ListArray | _ScalarVarLenArray] = {} + new_cols: dict[str, blosc2.NDArray | ListArray | _ScalarVarLenArray | DictionaryColumn] = {} for col in columns: if cls._is_list_column(col): new_cols[col.name] = storage.create_list_column( @@ -3477,6 +4211,10 @@ def _create_arrow_import_columns( new_cols[col.name] = 
storage.create_varlen_scalar_column( col.name, spec=col.spec, cparams=cparams, dparams=dparams ) + elif cls._is_dictionary_column(col): + new_cols[col.name] = storage.create_dictionary_column( + col.name, spec=col.spec, cparams=cparams, dparams=dparams + ) else: chunks, blocks = default_chunks, default_blocks if col.dtype is not None: @@ -3517,34 +4255,117 @@ def _new_arrow_import_ctable( obj._last_pos = 0 return obj + @staticmethod + def _timestamp_normalizer_for_spec(spec: SchemaSpec): # noqa: C901 + """Build a trusted Arrow-import normalizer for timestamp leaves. + + Arrow already validates list/struct values during import, so list columns + normally skip Python-level coercion. The exception is nested timestamps: + ``to_pylist()`` yields ``datetime``/``numpy.datetime64`` objects, while + msgpack-backed ListArray storage expects integer epoch offsets. Return a + small normalizer that descends only into branches containing timestamps, + or ``None`` when no normalization is needed. + """ + if isinstance(spec, timestamp): + + def normalize_timestamp(value, unit=spec.unit): + if value is None: + return None + if isinstance(value, (int, np.integer)): + return int(value) + return np.datetime64(value).astype(f"datetime64[{unit}]").astype(np.int64).item() + + return normalize_timestamp + + if isinstance(spec, ListSpec): + item_normalizer = CTable._timestamp_normalizer_for_spec(spec.item_spec) + if item_normalizer is None: + return None + + def normalize_list(value, item_normalizer=item_normalizer): + if value is None: + return None + for i, item in enumerate(value): + value[i] = item_normalizer(item) + return value + + return normalize_list + + if isinstance(spec, StructSpec): + field_normalizers = { + name: normalizer + for name, child in spec.fields.items() + if (normalizer := CTable._timestamp_normalizer_for_spec(child)) is not None + } + if not field_normalizers: + return None + + def normalize_struct(value, field_normalizers=field_normalizers): + if value is None: + 
return None + for name, normalizer in field_normalizers.items(): + if name in value: + value[name] = normalizer(value[name]) + return value + + return normalize_struct + + return None + @classmethod def _write_arrow_batches(cls, obj, batches, columns, new_cols, new_valid) -> None: pos = 0 + list_normalizers = { + col.name: cls._timestamp_normalizer_for_spec(col.spec) + for col in columns + if cls._is_list_column(col) + } for batch in batches: end = pos + len(batch) while end > len(new_valid): obj._grow() new_valid = obj._valid_rows - pos = cls._write_arrow_batch(batch, columns, new_cols, new_valid, pos) + pos = cls._write_arrow_batch(batch, columns, new_cols, new_valid, pos, list_normalizers) for col in columns: - if cls._is_list_column(col) or cls._is_varlen_scalar_column(col): + if ( + cls._is_list_column(col) + or cls._is_varlen_scalar_column(col) + or cls._is_dictionary_column(col) + ): new_cols[col.name].flush() obj._n_rows = pos obj._last_pos = pos @classmethod - def _write_arrow_batch(cls, batch, columns, new_cols, new_valid, pos: int) -> int: + def _write_arrow_batch(cls, batch, columns, new_cols, new_valid, pos: int, list_normalizers) -> int: m = len(batch) if m == 0: return pos for col in columns: arrow_col = batch.column(batch.schema.get_field_index(col.name)) if cls._is_list_column(col): + if getattr(col.spec, "serializer", None) == "arrow": + new_cols[col.name].extend_arrow(arrow_col) + continue # Trusted Arrow-import fast path: schema has already been inferred, - # so avoid Python-level per-item coercion/validation here. - new_cols[col.name].extend(arrow_col.to_pylist(), validate=False) + # so avoid Python-level per-item coercion. If nested timestamps + # are present, normalize only those leaves before storing. 
+ values = arrow_col.to_pylist() + normalizer = list_normalizers[col.name] + if normalizer is not None: + values = [normalizer(value) for value in values] + new_cols[col.name].extend(values, validate=False) elif cls._is_varlen_scalar_column(col): new_cols[col.name].extend(arrow_col.to_pylist()) + elif cls._is_dictionary_column(col): + import pyarrow as _pa + + if _pa.types.is_dictionary(arrow_col.type): + # Arrow dictionary array: use unification algorithm. + new_cols[col.name].extend_from_arrow(_pa, arrow_col, pos, m, ordered=col.spec.ordered) + else: + # Plain string array: encode values into the dictionary. + new_cols[col.name][pos : pos + m] = arrow_col.to_pylist() else: new_cols[col.name][pos : pos + m] = cls._arrow_column_to_numpy(arrow_col, col) new_valid[pos : pos + m] = True @@ -3596,8 +4417,157 @@ def _arrow_schema_metadata(schema) -> dict[str, Any]: arrow_meta["schema_ipc_base64"] = schema_ipc_base64 return {"arrow": arrow_meta} + @staticmethod + def _nested_metadata_from_column_names( + column_names: list[str], *, empty_root_physical: str | None = None + ) -> dict: + logical_to_physical = {} + physical_to_storage = {} + for name in column_names: + logical_to_physical[name] = name + physical_to_storage[name] = f"_cols/{_column_name_to_relpath(name)}" + nested = { + "version": 1, + "logical_root": "", + "logical_to_physical": logical_to_physical, + "physical_to_storage": physical_to_storage, + } + if empty_root_physical: + logical_to_physical[""] = empty_root_physical + nested["root"] = {"logical": "", "physical": empty_root_physical} + return nested + + # ------------------------------------------------------------------ + # Unnamed-root list> detection and flattening helpers + # ------------------------------------------------------------------ + + @staticmethod + def _detect_unnamed_root_list_struct(pa, schema) -> bool: + """Return True iff *schema* qualifies for unnamed-root list> flattening. 
+ + Conditions (all must hold): + * exactly one top-level field; + * field name is ``""`` (the canonical unnamed Arrow root); + * field type is ``list>`` or ``large_list>``. + """ + if len(schema) != 1: + return False + field = schema[0] + if field.name != "": + return False + t = field.type + if not (pa.types.is_list(t) or pa.types.is_large_list(t)): + return False + return pa.types.is_struct(t.value_type) + + @staticmethod + def _inner_schema_for_unnamed_root(pa, schema): + """Extract the inner struct schema from a single unnamed root list> schema. + + Returns a new Arrow schema whose top-level fields are the struct fields + of the list value type. The nullable flag of the original unnamed field + is not propagated — individual struct child nullability applies. + """ + field = schema[0] # the unnamed "" field + struct_type = field.type.value_type # struct type inside the list + return pa.schema(list(struct_type)) + + @staticmethod + def _flatten_root_list_struct_batches(pa, inner_schema, batches, max_rows: int | None = None): + """Yield flattened :class:`pyarrow.RecordBatch` objects from an unnamed root stream. + + For each incoming batch (which has a single list> column), + flatten the outer list using ``ListArray.flatten()`` — which skips null + outer list rows — and convert the resulting struct array into a + :class:`~pyarrow.RecordBatch` whose columns correspond to the struct fields. + + Parameters + ---------- + pa: + The ``pyarrow`` module. + inner_schema: + Arrow schema for the inner struct (output of + :meth:`_inner_schema_for_unnamed_root`). + batches: + Iterable of incoming :class:`~pyarrow.RecordBatch` objects from the + unnamed-root Parquet file. + max_rows: + Optional maximum number of flattened element rows to yield. 
+ """ + rows_seen = 0 + for batch in batches: + if max_rows is not None and rows_seen >= max_rows: + break + list_array = batch.column(0) + # flatten() skips null outer list rows and concatenates element values + struct_values = list_array.flatten() + if max_rows is not None: + remaining = max_rows - rows_seen + if len(struct_values) > remaining: + struct_values = struct_values.slice(0, remaining) + n_values = len(struct_values) + if n_values == 0: + # Emit an empty record batch that still carries the inner schema + empty_arrays = [pa.array([], type=f.type) for f in inner_schema] + yield pa.record_batch(empty_arrays, schema=inner_schema) + continue + rows_seen += n_values + yield pa.RecordBatch.from_struct_array(struct_values) + + @staticmethod + def _flatten_arrow_struct_schema(pa, schema): + """Flatten top-level struct fields into dotted leaf fields recursively.""" + + out_fields = [] + + def _walk(field, prefix: tuple[str, ...] = (), parent_nullable: bool = False): + parts = (*prefix, field.name) + name = join_field_path(parts) + nullable = bool(parent_nullable or field.nullable) + if pa.types.is_struct(field.type): + for child in field.type: + _walk(pa.field(child.name, child.type, nullable=child.nullable), parts, nullable) + else: + out_fields.append(pa.field(name, field.type, nullable=nullable)) + + for f in schema: + _walk(f) + return pa.schema(out_fields, metadata=schema.metadata) + + @staticmethod + def _flatten_arrow_struct_batch(pa, batch, flat_schema): + arrays = [] + + def _extract(array, arr_type, parts): + if not parts: + return array + head = parts[0] + if pa.types.is_struct(arr_type): + return _extract(array.field(head), arr_type[head].type, parts[1:]) + raise KeyError("Invalid flattened path") + + for field in flat_schema: + parts = split_field_path(field.name) + col = batch.column(batch.schema.get_field_index(parts[0])) + arr = _extract(col, col.type, parts[1:]) + arrays.append(arr) + return pa.RecordBatch.from_arrays(arrays, schema=flat_schema) 
+ @classmethod - def from_arrow( + def _flatten_arrow_struct_input(cls, pa, schema, batches): + """Return flattened (schema, batches, flattened) for struct-containing Arrow inputs.""" + if not any(pa.types.is_struct(f.type) for f in schema): + return schema, batches, False + flat_schema = cls._flatten_arrow_struct_schema(pa, schema) + + def _gen(): + for b in batches: + yield cls._flatten_arrow_struct_batch(pa, b, flat_schema) + + return flat_schema, _gen(), True + + @classmethod + def from_arrow( # noqa: C901 cls, schema, batches, @@ -3612,19 +4582,37 @@ def from_arrow( auto_null_sentinels: bool = True, blosc2_batch_size: int | None = _BATCH_SIZE_DEFAULT, blosc2_items_per_block: int | None = None, + list_serializer: Literal["msgpack", "arrow"] = "msgpack", object_fallback: bool = False, column_cparams: Mapping[str, dict[str, Any]] | None = None, + separate_nested_cols: bool = False, ) -> CTable: """Build a :class:`CTable` from an Arrow schema and iterable of record batches. + **Nested struct flattening**: top-level Arrow ``struct<…>`` fields are + automatically and recursively flattened into dotted leaf columns. For + example, a field ``trip: struct>`` + becomes two CTable columns ``trip.begin.lon`` and ``trip.begin.lat``. + Each leaf is stored as an independent compressed :class:`~blosc2.NDArray`. + Row reads via ``t[i]`` reconstruct the original nested dict shape. 
Use + ``t["trip.begin.lon"]`` or ``t.trip.begin.lon`` to access a leaf:: + + import pyarrow as pa, blosc2 + trip_type = pa.struct([("begin", pa.struct([("lon", pa.float64())]))]) + schema = pa.schema([pa.field("trip", trip_type)]) + t = blosc2.CTable.from_arrow(schema, batches) + t.col_names # ['trip.begin.lon'] + t["trip.begin.lon"].mean() + t.trip.begin.lon.max() + When *string_max_length* is ``None`` (the default), scalar Arrow ``string`` / ``large_string`` columns are imported as :func:`~blosc2.vlstring` columns and ``binary`` / ``large_binary`` - columns are imported as :func:`~blosc2.vlbytes` columns. Arrow - ``struct`` columns are imported as :func:`~blosc2.struct` columns backed - by batched variable-length storage. Null values for these variable- - length scalar columns are represented as native ``None`` with no - sentinel needed. + columns are imported as :func:`~blosc2.vlbytes` columns. Non-struct + ``struct`` columns (not containing only scalar leaves) are imported as + :func:`~blosc2.struct` columns backed by batched variable-length + storage. Null values for these variable-length scalar columns are + represented as native ``None`` with no sentinel needed. When *string_max_length* is set to a positive integer, scalar string and binary columns are imported as fixed-width @@ -3639,6 +4627,10 @@ def from_arrow( schema-less ``object`` columns) are flushed to their backend. Set it to ``None`` to keep those columns pending until the final flush. + ``list_serializer`` selects the backend serializer for imported list + columns. ``"msgpack"`` is the default; ``"arrow"`` stores Arrow list + batches directly and can be much faster for deeply nested list columns. + Unsupported Arrow types raise by default. Pass ``object_fallback=True`` to import such columns as schema-less :func:`~blosc2.object` columns. This fallback is intentionally not used by :meth:`from_parquet`. 
@@ -3652,13 +4644,57 @@ def from_arrow( raise ValueError("blosc2_batch_size must be a positive integer or None") if blosc2_items_per_block is not None and blosc2_items_per_block <= 0: raise ValueError("blosc2_items_per_block must be a positive integer or None") + if list_serializer not in {"msgpack", "arrow"}: + raise ValueError("list_serializer must be 'msgpack' or 'arrow'") + + # ------------------------------------------------------------------ + # Unnamed-root list> flattening (opt-in) + # ------------------------------------------------------------------ + # When the source schema is a single unnamed "" field of type + # list>, the outer list is a physical Parquet/Awkward + # chunking artifact, not a semantic column. Flatten it so that each + # element becomes a CTable row. The struct fields become ordinary + # top-level columns and are further flattened by the struct-leaf + # machinery below. + original_root_metadata: dict | None = None + if separate_nested_cols and cls._detect_unnamed_root_list_struct(pa, schema): + inner_schema = cls._inner_schema_for_unnamed_root(pa, schema) + batches = cls._flatten_root_list_struct_batches(pa, inner_schema, batches) + schema = inner_schema + original_root_metadata = { + "kind": "unnamed_list_struct", + "field_name": "", + "preserve_grouping": False, + } + batches = iter(batches) first_batch = None table_for_inference = None + original_top_level_struct_specs: dict[str, SchemaSpec] = {} + for f in schema: + if pa.types.is_struct(f.type): + original_top_level_struct_specs[join_field_path((f.name,))] = cls._arrow_type_to_spec( + pa, f.type, nullable=f.nullable, object_fallback=object_fallback + ) if string_max_length is None or isinstance(string_max_length, Mapping): first_batch = next(batches, None) - if first_batch is not None: - table_for_inference = pa.Table.from_batches([first_batch], schema=schema) + + # Flatten top-level Arrow structs into dotted leaf columns so CTable can + # persist nested scalar leaves as physical 
columns. + flattened_structs = False + if first_batch is not None: + import itertools as _it + + schema, flat_batches, flattened_structs = cls._flatten_arrow_struct_input( + pa, schema, _it.chain([first_batch], batches) + ) + batches = iter(flat_batches) + first_batch = next(batches, None) + else: + schema, batches, flattened_structs = cls._flatten_arrow_struct_input(pa, schema, batches) + + if first_batch is not None: + table_for_inference = pa.Table.from_batches([first_batch], schema=schema) columns = cls._compiled_columns_from_arrow( pa, schema, @@ -3669,24 +4705,67 @@ def from_arrow( ) cls._apply_arrow_column_cparams(columns, column_cparams) for col in columns: - if ( - cls._is_list_column(col) and getattr(col.spec, "storage", None) == "batch" - ) or cls._is_varlen_scalar_column(col): + if cls._is_list_column(col): + if getattr(col.spec, "storage", None) == "batch": + col.spec.serializer = list_serializer + if blosc2_batch_size is not None: + col.spec.batch_rows = blosc2_batch_size + if blosc2_items_per_block is not None: + col.spec.items_per_block = blosc2_items_per_block + elif cls._is_varlen_scalar_column(col): if blosc2_batch_size is not None: col.spec.batch_rows = blosc2_batch_size if blosc2_items_per_block is not None: col.spec.items_per_block = blosc2_items_per_block + metadata = cls._arrow_schema_metadata(schema) + empty_root_physical = None + schema_meta = getattr(schema, "metadata", None) or {} + root_key = b"blosc2_empty_root_physical" + if root_key in schema_meta: + raw = schema_meta[root_key] + empty_root_physical = raw.decode() if isinstance(raw, bytes) else str(raw) + metadata["nested"] = cls._nested_metadata_from_column_names( + [col.name for col in columns], empty_root_physical=empty_root_physical + ) + if flattened_structs: + metadata["nested"]["reconstruct_rows"] = True + if original_root_metadata is not None: + metadata["nested"]["original_root"] = original_root_metadata + compiled_columns_by_name = {col.name: col for col in columns} + for 
name, spec in original_top_level_struct_specs.items(): + if name in compiled_columns_by_name: + continue + compiled_columns_by_name[name] = CompiledColumn( + name=name, + py_type=spec.python_type, + spec=spec, + dtype=getattr(spec, "dtype", None), + default=MISSING, + config=ColumnConfig(cparams=None, dparams=None, chunks=None, blocks=None), + display_width=compute_display_width(spec), + ) + compiled = CompiledSchema( row_cls=None, columns=columns, - columns_by_name={col.name: col for col in columns}, - metadata=cls._arrow_schema_metadata(schema), + columns_by_name=compiled_columns_by_name, + metadata=metadata, ) if first_batch is not None: import itertools as _it batches = _it.chain([first_batch], batches) - capacity = max(capacity_hint or 1, 1) + # Use capacity_hint to size initial NDArray chunks/blocks correctly. + # When capacity_hint is None and we are in the unnamed-root flatten path, + # fall back to _EXPECTED_SIZE_DEFAULT (1 M) so that compute_chunks_blocks + # produces a reasonable block size instead of (1,) which causes catastrophic + # storage fragmentation. For non-unnamed-root imports capacity_hint is + # always supplied by from_parquet (pf.metadata.num_rows), so the fallback + # only matters for direct from_arrow() calls without a hint. 
+ if capacity_hint is None and original_root_metadata is not None: + capacity = _EXPECTED_SIZE_DEFAULT + else: + capacity = max(capacity_hint or 1, 1) storage = cls._storage_for_arrow_import(urlpath, mode) new_cols, new_valid = cls._create_arrow_import_columns(storage, columns, capacity, cparams, dparams) storage.save_schema(schema_to_dict(compiled)) @@ -3727,7 +4806,7 @@ def to_parquet( writer.write_table(table, row_group_size=row_group_size or len(batch)) @classmethod - def from_parquet( + def from_parquet( # noqa: C901 cls, path, *, @@ -3741,6 +4820,9 @@ def from_parquet( auto_null_sentinels: bool = True, blosc2_batch_size: int | None = _BATCH_SIZE_DEFAULT, blosc2_items_per_block: int | None = None, + list_serializer: Literal["msgpack", "arrow"] = "arrow", + separate_nested_cols: bool = True, + max_rows: int | None = None, **kwargs, ) -> CTable: """Read a Parquet file into a :class:`CTable`. @@ -3751,11 +4833,24 @@ def from_parquet( This method delegates the actual table construction to :meth:`CTable.from_arrow`, so Arrow schema handling, nullable-column support, - and Blosc2 write tuning follow the same rules as that method. Top-level - Arrow ``struct<...>`` columns are imported as :func:`~blosc2.struct` - columns backed by batched variable-length storage. Unsupported Parquet - types are not silently imported as schema-less :func:`~blosc2.object` - columns; they raise so callers can decide how to handle them explicitly. + and Blosc2 write tuning follow the same rules as that method. + + **Nested struct flattening**: top-level Parquet ``struct<…>`` fields are + automatically and recursively flattened into dotted leaf columns — the same + as in :meth:`from_arrow`. For example, a Parquet file that contains a column + ``trip: struct>`` produces two CTable + columns ``trip.begin.lon`` and ``trip.begin.lat``. 
Row reads reconstruct the + original nested dict shape; individual leaves are accessed via dotted names or + attribute-chain proxies:: + + t = blosc2.CTable.from_parquet("trips.parquet") + t.col_names # e.g. ['trip.begin.lon', 'trip.begin.lat', ...] + t["trip.begin.lon"].mean() + t.trip.begin.lon.max() + + Unsupported Parquet types are not silently imported as schema-less + :func:`~blosc2.object` columns; they raise so callers can decide how to + handle them explicitly. Parameters ---------- @@ -3805,7 +4900,31 @@ def from_parquet( blosc2_items_per_block : int or None, optional Target number of items per internal Blosc2 block. Passed through to - :meth:`CTable.from_arrow`. + :meth:`CTable.from_arrow`. In general, larger number of items + favors compression ratios but make random access slower. + + list_serializer : {"msgpack", "arrow"}, optional + Serializer used for imported list columns. The default, ``"arrow"``, + stores Arrow list batches directly and is much faster for deeply nested + or ``list>`` columns. The tradeoff is that accessing those + list columns later requires PyArrow. Use ``"msgpack"`` to keep + list-column stores independent of PyArrow at read time; it can be + smaller for simple lists but is much slower and more memory-intensive + for deeply nested data. + + separate_nested_cols : bool, optional + Whether to separate qualifying nested columns during import. Defaults to + ``True``. In particular, a single unnamed top-level + ``list>`` field is treated as a root record stream: each list + element becomes a CTable row and struct leaves become ordinary nested + CTable columns. Use ``separate_nested_cols=False`` when closer fidelity to + the original Parquet row/schema shape is more important than the separated + column layout. + + max_rows : int or None, optional + Maximum number of rows to import. For ordinary Parquet files this limits + Parquet/CTable rows. 
For unnamed-root ``list>`` files imported + with ``separate_nested_cols=True``, this limits flattened element rows. **kwargs Additional keyword arguments forwarded to ``pyarrow.parquet.ParquetFile``. @@ -3824,6 +4943,8 @@ def from_parquet( If :mod:`pyarrow` is not installed. ValueError If ``batch_size`` is not greater than 0. + ValueError + If ``max_rows`` is negative. ValueError If ``columns`` contains duplicate names. Exception @@ -3863,6 +4984,8 @@ def from_parquet( pq = cls._require_pyarrow_parquet("from_parquet()") pa = cls._require_pyarrow("from_parquet()") cls._validate_arrow_batch_size(batch_size) + if max_rows is not None and max_rows < 0: + raise ValueError("max_rows must be non-negative") string_max_length = kwargs.pop("string_max_length", None) pf = pq.ParquetFile(path, **kwargs) arrow_schema = pf.schema_arrow @@ -3872,6 +4995,114 @@ def from_parquet( fields = [arrow_schema.field(name) for name in columns] arrow_schema = pa.schema(fields) batches = pf.iter_batches(batch_size=batch_size, columns=columns) + + # Parquet files generated by Awkward-style pipelines may contain an + # unnamed top-level field (""). When separate_nested_cols=True and the + # schema qualifies as an unnamed-root list>, skip the + # rename-to-root logic and pass the original schema directly to + # from_arrow, which will perform the element-level flattening. + # Otherwise, normalize empty column names to non-empty names as before. 
+ _is_unnamed_root_flatten = separate_nested_cols and cls._detect_unnamed_root_list_struct( + pa, arrow_schema + ) + if not _is_unnamed_root_flatten and any(name == "" for name in arrow_schema.names): + used = {n for n in arrow_schema.names if n} + + def _fresh_root_name() -> str: + base = "root" + if base not in used: + used.add(base) + return base + i = 1 + while True: + candidate = f"{base}_{i}" + if candidate not in used: + used.add(candidate) + return candidate + i += 1 + + original_names = list(arrow_schema.names) + renamed = [_fresh_root_name() if n == "" else n for n in original_names] + arrow_schema = pa.schema( + [arrow_schema.field(i).with_name(renamed[i]) for i in range(len(renamed))] + ) + # Preserve canonical unnamed-root intent in schema metadata. + try: + first_root = next(renamed[i] for i, old in enumerate(original_names) if old == "") + except StopIteration: + first_root = renamed[0] if renamed else "root" + current_meta = dict(arrow_schema.metadata or {}) + current_meta[b"blosc2_empty_root_physical"] = first_root.encode() + arrow_schema = arrow_schema.with_metadata(current_meta) + + def _renamed_batches(batch_iter, names): + for b in batch_iter: + yield b.rename_columns(names) + + batches = _renamed_batches(batches, renamed) + + def _limited_batches(batch_iter, limit: int): + rows_seen = 0 + for batch in batch_iter: + if rows_seen >= limit: + break + remaining = limit - rows_seen + if len(batch) > remaining: + batch = batch.slice(0, remaining) + rows_seen += len(batch) + yield batch + + # For unnamed-root flattening, max_rows applies to flattened element rows, + # not to the outer Parquet rows. Pre-flatten here when a limit is requested + # so the limit can be enforced precisely before handing batches to from_arrow. 
+ if _is_unnamed_root_flatten and max_rows is not None: + inner_schema = cls._inner_schema_for_unnamed_root(pa, arrow_schema) + limited_flat_batches = cls._flatten_root_list_struct_batches( + pa, inner_schema, batches, max_rows=max_rows + ) + ct = cls.from_arrow( + inner_schema, + limited_flat_batches, + urlpath=urlpath, + mode=mode, + cparams=cparams, + dparams=dparams, + validate=validate, + capacity_hint=max_rows, + string_max_length=string_max_length, + auto_null_sentinels=auto_null_sentinels, + blosc2_batch_size=blosc2_batch_size, + blosc2_items_per_block=blosc2_items_per_block, + list_serializer=list_serializer, + separate_nested_cols=False, + ) + nested_meta = ct._schema.metadata.get("nested", {}) + nested_meta["original_root"] = { + "kind": "unnamed_list_struct", + "field_name": "", + "preserve_grouping": False, + } + ct._schema.metadata["nested"] = nested_meta + ct._storage.save_schema(schema_to_dict(ct._schema)) + return ct + + if max_rows is not None: + batches = _limited_batches(batches, max_rows) + + # When flattening a root list>, the actual element count is not + # known ahead of time. Pass capacity_hint=None so that from_arrow falls back + # to _EXPECTED_SIZE_DEFAULT (1 M), which gives compute_chunks_blocks() a + # reasonable block size instead of the catastrophic (1, 1) produced by + # capacity=1. The CLI path computes a better estimate by sampling. 
+ if _is_unnamed_root_flatten: + _capacity_hint = None + elif pf.metadata is not None: + _capacity_hint = ( + pf.metadata.num_rows if max_rows is None else min(max_rows, pf.metadata.num_rows) + ) + else: + _capacity_hint = max_rows + return cls.from_arrow( arrow_schema, batches, @@ -3880,11 +5111,13 @@ def from_parquet( cparams=cparams, dparams=dparams, validate=validate, - capacity_hint=pf.metadata.num_rows if pf.metadata is not None else None, + capacity_hint=_capacity_hint, string_max_length=string_max_length, auto_null_sentinels=auto_null_sentinels, blosc2_batch_size=blosc2_batch_size, blosc2_items_per_block=blosc2_items_per_block, + list_serializer=list_serializer, + separate_nested_cols=separate_nested_cols, ) # ------------------------------------------------------------------ @@ -4250,6 +5483,16 @@ def rename_column(self, old: str, new: str) -> None: On disk tables the corresponding persisted column leaf is renamed. + Renaming a flat column to a dotted name (e.g. ``"trip.begin.lon"``) + promotes it to a nested leaf column: it will be stored under the + hierarchical path ``/_cols/trip/begin/lon`` on disk and can be + accessed via ``t["trip.begin.lon"]`` or the attribute-chain proxy + ``t.trip.begin.lon``. 
This is the primary way to define nested + columns when importing from non-Arrow sources:: + + t.rename_column("trip_begin_lon", "trip.begin.lon") + t["trip.begin.lon"].mean() # works as a regular Column + Raises ------ ValueError @@ -4387,7 +5630,9 @@ def _fetch_col_at_positions(self, name: str, positions: np.ndarray): ) col = self._cols[name] spec = self._schema.columns_by_name[name].spec - if self._is_list_spec(spec) or isinstance(spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec)): + if self._is_list_spec(spec) or isinstance( + spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec, DictionarySpec) + ): return col[positions] values = col[positions] if isinstance(spec, timestamp): @@ -4848,7 +6093,11 @@ def _structured_array_dtype(self) -> np.dtype: col_info = self._schema.columns_by_name.get(name) if col_info is None: dtype = np.asarray(self[name][:0]).dtype - elif self._is_list_column(col_info) or self._is_varlen_scalar_column(col_info): + elif ( + self._is_list_column(col_info) + or self._is_varlen_scalar_column(col_info) + or self._is_dictionary_column(col_info) + ): dtype = np.dtype(object) else: dtype = col_info.dtype if col_info.dtype is not None else np.dtype(object) @@ -4867,6 +6116,36 @@ def __array__(self, dtype=None, copy=None): arr = arr.astype(dtype, copy=True if copy is None else copy) return arr.copy() if copy else arr + def _logical_to_physical_name(self, name: str) -> str: + """Resolve a user/logical column path to a stored physical column name.""" + if name in self._cols or name in self._computed_cols: + return name + nested = self._schema.metadata.get("nested") if self._schema.metadata else None + if isinstance(nested, dict): + mapping = nested.get("logical_to_physical") + if isinstance(mapping, dict): + physical = mapping.get(name) + if isinstance(physical, str) and (physical in self._cols or physical in self._computed_cols): + return physical + return name + + def _expand_logical_column_selector(self, name: str) -> list[str]: + 
"""Resolve one logical selector to one or more physical column names. + + If *name* points to a scalar leaf, returns ``[leaf]``. If it points to + a struct-like prefix (e.g. ``"trip"``), expands to descendant leaves. + """ + physical = self._logical_to_physical_name(name) + if physical in self._cols or physical in self._computed_cols: + return [physical] + prefix_parts = split_field_path(physical) + expanded = [ + col for col in self.col_names if split_field_path(col)[: len(prefix_parts)] == prefix_parts + ] + if expanded: + return expanded + return [physical] + def __getitem__(self, key): """Type-driven indexing for columns, rows, projections, and filters. @@ -4874,7 +6153,10 @@ def __getitem__(self, key): - ``str``: return a :class:`Column` when it matches a stored or computed column name; otherwise evaluate it as a boolean expression via - :meth:`where`. + :meth:`where`. Dotted names (e.g. ``"trip.begin.lon"``) select + nested leaf columns directly; a struct-prefix name + (e.g. ``"trip.begin"``) that matches multiple descendant leaves returns + a :class:`_StructPathColumn` view. - boolean :class:`blosc2.LazyExpr` or :class:`blosc2.NDArray`: return the same filtered view as :meth:`where`, e.g. ``t[t.temperature_f > 70]``. - ``int``: return one live row as a namedtuple-like object. 
@@ -4902,10 +6184,20 @@ def __getitem__(self, key): Project columns:: slim = t[["sensor_id", "temperature_f"]] + + Access a nested leaf column with a dotted name or an attribute chain:: + + lons = t["trip.begin.lon"] # Column for the nested leaf + lons = t.trip.begin.lon # equivalent attribute-chain form """ if isinstance(key, str): - if key in self._cols or key in self._computed_cols: - return Column(self, key) + physical = self._logical_to_physical_name(key) + if physical in self._cols or physical in self._computed_cols: + return Column(self, physical) + expanded = self._expand_logical_column_selector(key) + cc = self._schema.columns_by_name.get(physical) + if len(expanded) > 1 or (expanded and cc is not None and isinstance(cc.spec, StructSpec)): + return _StructPathColumn(self, physical, expanded) return self.where(key) if isinstance(key, (blosc2.NDArray, blosc2.LazyExpr)) and getattr(key, "dtype", None) == np.bool_: return self.where(key) @@ -4913,9 +6205,21 @@ def __getitem__(self, key): raise TypeError("Tuple indexing is not supported for CTable in V1") return self._getitem_row_selector(key) + def _nested_namespace(self, prefix: str): + prefix_parts = split_field_path(prefix) + for name in self.col_names: + parts = split_field_path(name) + if parts[: len(prefix_parts)] == prefix_parts and len(parts) > len(prefix_parts): + return _NestedColumnNamespace(self, prefix) + return None + def __getattr__(self, s: str): - if s in self._cols or s in self._computed_cols: - return Column(self, s) + physical = self._logical_to_physical_name(s) + if physical in self._cols or physical in self._computed_cols: + return Column(self, physical) + ns = self._nested_namespace(s) + if ns is not None: + return ns return super().__getattribute__(s) # ------------------------------------------------------------------ @@ -4959,6 +6263,11 @@ def compact(self): replacement.flush() self._cols[name] = replacement continue + if self._is_dictionary_column(col): + # Keep dictionary values 
intact; just compact the codes. + live_codes = np.asarray(v.codes[real_poss[: self._n_rows]], dtype=np.int32) + v.codes[: self._n_rows] = live_codes + continue start = 0 block_size = self._valid_rows.blocks[0] end = min(block_size, self._n_rows) @@ -4980,6 +6289,16 @@ def _normalise_sort_keys( """Validate and normalise sort key arguments; return (cols, ascending).""" if isinstance(cols, str): cols = [cols] + + resolved_cols: list[str] = [] + for name in cols: + expanded = self._expand_logical_column_selector(name) + if len(expanded) != 1: + raise ValueError( + f"Sort key {name!r} resolves to multiple columns {expanded!r}; please choose a leaf column." + ) + resolved_cols.append(expanded[0]) + cols = resolved_cols if isinstance(ascending, bool): ascending = [ascending] * len(cols) if len(cols) != len(ascending): @@ -5094,7 +6413,13 @@ def _build_lex_keys( # Materialise computed column values at live positions raw = np.asarray(cc["lazy"][:])[live_pos] else: - raw = self._cols[name][live_pos] + col_info = self._schema.columns_by_name.get(name) + if col_info is not None and self._is_dictionary_column(col_info): + # Sort dictionary columns by decoded string values. + decoded = self._cols[name][live_pos] + raw = np.array(decoded, dtype=object) + else: + raw = self._cols[name][live_pos] col_info = self._schema.columns_by_name.get(name) nv = getattr(col_info.spec, "null_value", None) if col_info else None @@ -5136,7 +6461,12 @@ def sort_by( cols: Column name or list of column names to sort by. When multiple columns are given, the first is the primary key, the second is - the tiebreaker, and so on. + the tiebreaker, and so on. For tables with **nested (dotted) + column names**, pass the dotted leaf name directly:: + + t.sort_by("trip.begin.lon") + t.sort_by(["trip.begin.lon", "payment.fare"], ascending=[True, False]) + ascending: Sort direction. A single bool applies to all keys; a list must have the same length as *cols*. 
@@ -5186,37 +6516,52 @@ def sort_by( sorted_pos = live_pos[order] if inplace: - for col in self._schema.columns: - arr = self._cols[col.name] - if self._is_list_column(col): - new_arr = ListArray(spec=col.spec) - new_arr.extend((arr[int(pos)] for pos in sorted_pos), validate=False) - new_arr.flush() - self._cols[col.name] = new_arr - else: - arr[:n] = arr[sorted_pos] - self._valid_rows[:n] = True - self._valid_rows[n:] = False - self._n_rows = n - self._last_pos = n - self._mark_all_indexes_stale() + self._sort_by_inplace(sorted_pos, n) return self - else: - # Build a new in-memory table with the sorted rows - result = self._empty_copy() - for col in self._schema.columns: - col_name = col.name - arr = self._cols[col_name] - if self._is_list_column(col): - result._cols[col_name].extend((arr[int(pos)] for pos in sorted_pos), validate=False) - result._cols[col_name].flush() - else: - result._cols[col_name][:n] = arr[sorted_pos] - result._valid_rows[:n] = True - result._valid_rows[n:] = False - result._n_rows = n - result._last_pos = n - return result + + return self._sorted_copy_from_positions(sorted_pos, n) + + def _sort_by_inplace(self, sorted_pos: np.ndarray, n: int) -> None: + for col in self._schema.columns: + arr = self._cols[col.name] + if self._is_list_column(col): + new_arr = ListArray(spec=col.spec) + new_arr.extend((arr[int(pos)] for pos in sorted_pos), validate=False) + new_arr.flush() + self._cols[col.name] = new_arr + elif self._is_dictionary_column(col): + sorted_codes = np.asarray(arr.codes[sorted_pos], dtype=np.int32) + arr.codes[:n] = sorted_codes + else: + arr[:n] = arr[sorted_pos] + self._valid_rows[:n] = True + self._valid_rows[n:] = False + self._n_rows = n + self._last_pos = n + self._mark_all_indexes_stale() + + def _sorted_copy_from_positions(self, sorted_pos: np.ndarray, n: int) -> CTable: + # Build a new in-memory table with the sorted rows + result = self._empty_copy() + for col in self._schema.columns: + col_name = col.name + arr = 
self._cols[col_name] + if self._is_list_column(col): + result._cols[col_name].extend((arr[int(pos)] for pos in sorted_pos), validate=False) + result._cols[col_name].flush() + elif self._is_dictionary_column(col): + # Copy dictionary values, then sorted codes. + for v in arr.dictionary: + result._cols[col_name].encode(v) + sorted_codes = np.asarray(arr.codes[sorted_pos], dtype=np.int32) + result._cols[col_name].codes[:n] = sorted_codes + else: + result._cols[col_name][:n] = arr[sorted_pos] + result._valid_rows[:n] = True + result._valid_rows[n:] = False + result._n_rows = n + result._last_pos = n + return result def copy( self, @@ -5287,6 +6632,13 @@ def copy( src = (arr[int(pos)] for pos in live_pos) if compact else (arr[i] for i in range(n)) result._cols[col_name].extend(src, validate=False) result._cols[col_name].flush() + elif self._is_dictionary_column(col): + # Copy dictionary values, then copy (live) codes. + for v in arr.dictionary: + result._cols[col_name].encode(v) + pos_slice = live_pos if compact else np.arange(n, dtype=np.int64) + raw_codes = np.asarray(arr.codes[pos_slice], dtype=np.int32) + result._cols[col_name].codes[:n] = raw_codes else: result._cols[col_name][:n] = arr[live_pos] if compact else arr[:n] @@ -5324,6 +6676,20 @@ def _empty_copy(self, capacity: int | None = None) -> CTable: cparams=col_storage.get("cparams"), dparams=col_storage.get("dparams"), ) + elif self._is_varlen_scalar_column(col): + new_cols[col.name] = mem_storage.create_varlen_scalar_column( + col.name, + spec=col.spec, + cparams=col_storage.get("cparams"), + dparams=col_storage.get("dparams"), + ) + elif self._is_dictionary_column(col): + new_cols[col.name] = mem_storage.create_dictionary_column( + col.name, + spec=col.spec, + cparams=col_storage.get("cparams"), + dparams=col_storage.get("dparams"), + ) else: new_cols[col.name] = mem_storage.create_column( col.name, @@ -5643,6 +7009,7 @@ def _resolve_index_catalog_entry( if col_name is not None and expression is not None: 
raise ValueError("col_name and expression are mutually exclusive") if col_name is not None: + col_name = self._logical_to_physical_name(col_name) if col_name not in catalog: raise KeyError(f"No index found for column {col_name!r}.") return col_name, catalog[col_name] @@ -5822,7 +7189,14 @@ def create_index( # noqa: C901 tmpdir: str | None = None, **kwargs, ) -> blosc2.Index: - """Build and register an index for a stored column or table expression.""" + """Build and register an index for a stored column or table expression. + + For tables with **nested (dotted) column names**, pass the dotted leaf + name directly:: + + t.create_index("trip.begin.lon") + t.where("trip.begin.lon > -87.7").nrows # index is used automatically + """ if self.base is not None: raise ValueError("Cannot create an index on a view.") if col_name is not None and field is not None: @@ -5832,6 +7206,8 @@ def create_index( # noqa: C901 if operands is not None and expression is None: raise ValueError("operands can only be provided together with expression") col_name = field if field is not None else col_name + if col_name is not None: + col_name = self._logical_to_physical_name(col_name) from blosc2.indexing import ( _IN_MEMORY_INDEXES, @@ -5920,6 +7296,10 @@ def create_index( # noqa: C901 f"Cannot create an index on variable-length scalar column {col_name!r}: " "indexing for vlstring/vlbytes/struct/object columns is not supported yet." ) + # Dictionary columns: index the underlying int32 codes array. 
+ is_dictionary = isinstance(self._schema.columns_by_name[col_name].spec, DictionarySpec) + if is_dictionary: + col_arr = col_arr.codes # index the int32 codes NDArray is_persistent = self._storage.index_anchor_path(col_name) is not None if is_persistent: @@ -6303,6 +7683,9 @@ def info_items(self) -> list[tuple[str, object]]: @staticmethod def _dtype_info_label(dtype: np.dtype | None, spec: SchemaSpec | None = None) -> str: """Return a compact dtype label for info reports.""" + if isinstance(spec, DictionarySpec): + ordered_tag = ", ordered" if spec.ordered else "" + return f"dictionary[str{ordered_tag}]" if isinstance(spec, VLStringSpec): return "vlstring" if isinstance(spec, VLBytesSpec): @@ -6370,6 +7753,19 @@ def append(self, data: list | np.void | np.ndarray) -> None: Materialized columns whose values are omitted are auto-filled from their recorded expression. Raises ``ValueError`` if the table is read-only or a view. + + For tables with **nested (dotted) column names** the row dict may be + supplied either as a flat mapping of dotted keys or as a nested dict + that mirrors the original struct shape — both are accepted and + automatically flattened to the physical dotted leaf names:: + + # flat dotted keys + t.append({"trip.begin.lon": -87.6, "trip.begin.lat": 41.8, + "payment.fare": 12.5}) + + # original nested dict (auto-flattened) + t.append({"trip": {"begin": {"lon": -87.6, "lat": 41.8}}, + "payment": {"fare": 12.5}}) """ if self._read_only: raise ValueError("Table is read-only (opened with mode='r').") @@ -6395,12 +7791,15 @@ def append(self, data: list | np.void | np.ndarray) -> None: col_array = self._cols[name] if self._is_list_column(col) or self._is_varlen_scalar_column(col): col_array.append(row[name]) + elif self._is_dictionary_column(col): + col_array[pos] = row[name] # DictionaryColumn encodes on __setitem__ else: col_array[pos] = row[name] + n_rows = self.nrows self._valid_rows[pos] = True self._last_pos = pos + 1 - self._n_rows += 1 + 
self._n_rows = n_rows + 1 self._mark_all_indexes_stale() def delete(self, ind: int | slice | str | Iterable) -> None: @@ -6426,10 +7825,11 @@ def delete(self, ind: int | slice | str | Iterable) -> None: false_pos = true_pos[ind] n_deleted = len(np.unique(false_pos)) + n_rows = self.nrows valid_rows_np[false_pos] = False self._valid_rows[:] = valid_rows_np # write back in-place; no new array created - self._n_rows -= n_deleted + self._n_rows = n_rows - n_deleted if self._last_pos is None or np.any(false_pos == self._last_pos - 1): self._last_pos = None # last live row deleted; recalculate on next write self._storage.bump_visibility_epoch() @@ -6447,6 +7847,22 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> Pass ``validate=False`` to skip per-row Pydantic validation on trusted bulk imports. Raises ``ValueError`` if the table is read-only or a view. + + For tables with **nested (dotted) column names** both the dict-of-arrays + and list-of-dicts forms accept the original nested dict shape and + auto-flatten it to physical dotted leaf names:: + + # nested dict of arrays + t.extend({ + "trip": {"begin": {"lon": lons, "lat": lats}}, + "payment": {"fare": fares}, + }) + + # list of nested dicts + t.extend([ + {"trip": {"begin": {"lon": -87.6, "lat": 41.8}}, "payment": {"fare": 12.5}}, + {"trip": {"begin": {"lon": -87.5, "lat": 41.7}}, "payment": {"fare": 8.0}}, + ]) """ if self._read_only: raise ValueError("Table is read-only (opened with mode='r').") @@ -6476,6 +7892,8 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> provided_names.add(name) else: if isinstance(data, dict): + if any(isinstance(v, dict) for v in data.values()): + data = self._flatten_nested_dict(data) known_names = [name for name in current_col_names if name in data] if not known_names: raise ValueError("No known stored columns provided for extend().") @@ -6499,6 +7917,26 @@ def extend(self, data: list | CTable | Any, *, validate: bool | 
None = None) -> new_nrows = len(data) raw_columns = {name: data[name] for name in data.dtype.names if name in current_col_names} provided_names = set(raw_columns) + elif data and isinstance(data[0], dict): + # List of dicts: flatten any nested dicts and pivot to column arrays. + flat_rows = [ + self._flatten_nested_dict(row) if any(isinstance(v, dict) for v in row.values()) else row + for row in data + ] + new_nrows = len(flat_rows) + col_set = set(input_col_names) + raw_columns = { + name: [row[name] for row in flat_rows] + for name in input_col_names + if name in flat_rows[0] + } + provided_names = set(raw_columns) + # Fill any remaining columns from the rows (may include extra keys) + for row in flat_rows: + for key in row: + if key in col_set and key not in raw_columns: + raw_columns[key] = [r.get(key) for r in flat_rows] + provided_names.add(key) else: new_nrows = len(data) batch_columns = list(zip(*data, strict=False)) @@ -6522,12 +7960,15 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> scalar_processed_cols: dict[str, blosc2.NDArray] = {} list_processed_cols: dict[str, list] = {} varlen_scalar_processed_cols: dict[str, list] = {} + dict_processed_cols: dict[str, list] = {} for name in current_col_names: col_meta = self._schema.columns_by_name[name] if self._is_list_column(col_meta): list_processed_cols[name] = list(raw_columns[name]) elif self._is_varlen_scalar_column(col_meta): varlen_scalar_processed_cols[name] = list(raw_columns[name]) + elif self._is_dictionary_column(col_meta): + dict_processed_cols[name] = list(raw_columns[name]) else: target_dtype = self._cols[name].dtype if isinstance(col_meta.spec, timestamp): @@ -6568,12 +8009,16 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> self._cols[name].extend(list_processed_cols[name], validate=do_validate) elif self._is_varlen_scalar_column(col_meta): self._cols[name].extend(varlen_scalar_processed_cols[name]) + elif 
self._is_dictionary_column(col_meta): + # DictionaryColumn.__setitem__ with a slice encodes all values. + self._cols[name][start_pos:end_pos] = dict_processed_cols[name] else: self._cols[name][start_pos:end_pos] = scalar_processed_cols[name][:] + n_rows = self.nrows self._valid_rows[start_pos:end_pos] = True self._last_pos = end_pos - self._n_rows += new_nrows + self._n_rows = n_rows + new_nrows self._mark_all_indexes_stale() # ------------------------------------------------------------------ @@ -6584,11 +8029,41 @@ def _where_expression_operands(self) -> dict[str, blosc2.NDArray | blosc2.LazyEx operands = {} for name, arr in self._cols.items(): col = self._schema.columns_by_name.get(name) - if col is not None and not (self._is_list_column(col) or self._is_varlen_scalar_column(col)): + if col is not None and not ( + self._is_list_column(col) + or self._is_varlen_scalar_column(col) + or self._is_dictionary_column(col) + ): operands[name] = arr operands.update({name: cc["lazy"] for name, cc in self._computed_cols.items()}) return operands + def _rewrite_nested_expression( + self, expr: str, operands: dict[str, blosc2.NDArray | blosc2.LazyExpr] + ) -> tuple[str, dict[str, blosc2.NDArray | blosc2.LazyExpr]]: + """Rewrite dotted nested names in *expr* to safe identifiers. + + `blosc2.lazyexpr` does not accept dotted identifiers, but nested leaf + columns are naturally addressed as dotted paths (e.g. ``trip.begin.lon``). + This maps them to temporary aliases and returns rewritten expression and + operand mapping. + """ + dotted = [name for name in operands if "." in name] + if not dotted: + return expr, operands + + rewritten = expr + new_operands = dict(operands) + # Longest names first so trip.begin.lon is rewritten before trip.begin. + for i, name in enumerate(sorted(dotted, key=len, reverse=True)): + alias = f"__nf{i}" + pattern = rf"(? 
None: for col in self._schema.columns: if self._is_varlen_scalar_column(col) and re.search( @@ -6664,6 +8139,12 @@ def where( view = t.where((t["unit price"] * t["quantity"]) > 100) + For tables with **nested (dotted) column names**, dotted leaf names and + attribute-chain proxies work in both string and expression forms:: + + view = t.where("trip.begin.lon > -87.7 and payment.fare > 10") + view = t.where(t.trip.begin.lon > -87.7) + Notes ----- Use bitwise operators (``&``, ``|``, ``~``) or string expressions for @@ -6684,7 +8165,9 @@ def where( """ if isinstance(expr_result, str): self._guard_varlen_scalar_expression(expr_result) - expr_result = blosc2.lazyexpr(expr_result, self._where_expression_operands()) + operands = self._where_expression_operands() + expr_result, operands = self._rewrite_nested_expression(expr_result, operands) + expr_result = blosc2.lazyexpr(expr_result, operands) if isinstance(expr_result, np.ndarray) and expr_result.dtype == np.bool_: expr_result = blosc2.asarray(expr_result) if isinstance(expr_result, Column): diff --git a/src/blosc2/ctable_storage.py b/src/blosc2/ctable_storage.py index fab3ac11..145e161b 100644 --- a/src/blosc2/ctable_storage.py +++ b/src/blosc2/ctable_storage.py @@ -28,6 +28,7 @@ import blosc2 from blosc2.batch_array import BatchArray +from blosc2.dictionary_column import DictionaryColumn from blosc2.list_array import ListArray from blosc2.scalar_array import ( _make_persistent_backend, @@ -94,6 +95,19 @@ def create_varlen_scalar_column( def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray: raise NotImplementedError + def create_dictionary_column( + self, + name: str, + *, + spec, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + ) -> DictionaryColumn: + raise NotImplementedError + + def open_dictionary_column(self, name: str, spec) -> DictionaryColumn: + raise NotImplementedError + def create_valid_rows( self, *, @@ -206,6 +220,17 @@ def 
create_varlen_scalar_column(self, name, *, spec, cparams=None, dparams=None) def open_varlen_scalar_column(self, name, spec): raise RuntimeError("In-memory tables have no on-disk representation to open.") + def create_dictionary_column(self, name, *, spec, cparams=None, dparams=None): + from blosc2.schema import VLStringSpec + + chunks, blocks = (4096,), (256,) + codes = blosc2.zeros((4096,), dtype=np.int32, chunks=chunks, blocks=blocks) + dict_store = _ScalarVarLenArray(VLStringSpec(nullable=False)) + return DictionaryColumn(spec, codes, dict_store) + + def open_dictionary_column(self, name, spec): + raise RuntimeError("In-memory tables have no on-disk representation to open.") + def create_valid_rows(self, *, shape, chunks, blocks): return blosc2.zeros(shape, dtype=np.bool_, chunks=chunks, blocks=blocks) @@ -268,6 +293,63 @@ def index_anchor_path(self, col_name: str) -> str | None: _COLS_DIR = "_cols" +def split_field_path(path: str) -> tuple[str, ...]: + """Split a dotted logical field path into segments. + + A backslash escapes separator characters, so ``"a\\.b.c"`` means the + two-segment path ``("a.b", "c")``. The empty string is the canonical root. + """ + if path == "": + return () + parts: list[str] = [] + buf: list[str] = [] + escaped = False + for ch in path: + if escaped: + buf.append(ch) + escaped = False + elif ch == "\\": + escaped = True + elif ch == ".": + parts.append("".join(buf)) + buf = [] + else: + buf.append(ch) + if escaped: + buf.append("\\") + parts.append("".join(buf)) + return tuple(parts) + + +def join_field_path(parts: tuple[str, ...] 
| list[str]) -> str: + """Join logical path segments using dot syntax with backslash escaping.""" + escaped_parts = [] + for part in parts: + buf: list[str] = [] + for ch in part: + if ch in {"\\", ".", "/"}: + buf.append("\\") + buf.append(ch) + escaped_parts.append("".join(buf)) + return ".".join(escaped_parts) + + +def _encode_storage_segment(segment: str) -> str: + """Percent-encode characters that are structural in logical/storage paths.""" + return segment.replace("%", "%25").replace("/", "%2F").replace(".", "%2E").replace("\\", "%5C") + + +def _column_name_to_relpath(name: str) -> str: + """Map a logical column name to a hierarchical path under ``_cols``. + + Unescaped dots are interpreted as nested path separators + (``a.b.c`` -> ``a/b/c``). Literal dots/slashes/backslashes in field names + can be represented with :func:`join_field_path` and are percent-encoded in + the physical storage path. + """ + return "/".join(_encode_storage_segment(part) for part in split_field_path(name)) + + class FileTableStorage(TableStorage): """Arrays stored as TreeStore leaves inside *urlpath*. @@ -281,13 +363,13 @@ class FileTableStorage(TableStorage): ``'r'`` — open existing read-only. """ - def __init__(self, urlpath: str, mode: str) -> None: + def __init__(self, urlpath: str, mode: str, store: blosc2.TreeStore | None = None) -> None: if mode not in ("r", "a", "w"): raise ValueError(f"mode must be 'r', 'a', or 'w'; got {mode!r}") self._root = urlpath self._mode = mode self._meta: blosc2.SChunk | None = None - self._store: blosc2.TreeStore | None = None + self._store: blosc2.TreeStore | None = store # ------------------------------------------------------------------ # Key helpers @@ -310,8 +392,13 @@ def _list_col_path(self, name: str) -> str: # For .b2d, working_dir == self._root, so behaviour is unchanged. 
return os.path.join(self._open_store().working_dir, rel_key + ".b2b") + def _dict_col_path(self, name: str) -> str: + """Path for the dictionary values store of a dictionary column.""" + rel_key = self._col_key(name).lstrip("/") + return os.path.join(self._open_store().working_dir, rel_key + "_dict.b2b") + def _col_key(self, name: str) -> str: - return f"/{_COLS_DIR}/{name}" + return f"/{_COLS_DIR}/{_column_name_to_relpath(name)}" def _key_to_path(self, key: str) -> str: rel_key = key.lstrip("/") @@ -366,13 +453,14 @@ def create_list_column(self, name, *, spec, cparams, dparams): kwargs["cparams"] = cparams if dparams is not None: kwargs["dparams"] = dparams + os.makedirs(os.path.dirname(self._list_col_path(name)), exist_ok=True) return ListArray(spec=spec, **kwargs) def open_list_column(self, name: str) -> ListArray: store = self._open_store() if store.is_zip_store and self._mode == "r": # In read mode, .b2z is never extracted — read the member at its zip offset directly. - rel = f"{_COLS_DIR}/{name}.b2b" + rel = self._col_key(name).lstrip("/") + ".b2b" if rel not in store.offsets: raise KeyError(f"List column {name!r} not found in {self._root!r}") opened = blosc2.blosc2_ext.open(store.b2z_path, mode="r", offset=store.offsets[rel]["offset"]) @@ -388,7 +476,7 @@ def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray: store = self._open_store() path = self._list_col_path(name) if store.is_zip_store and self._mode == "r": - rel = f"{_COLS_DIR}/{name}.b2b" + rel = self._col_key(name).lstrip("/") + ".b2b" if rel not in store.offsets: raise KeyError(f"Varlen scalar column {name!r} not found in {self._root!r}") backend = BatchArray( @@ -401,6 +489,47 @@ def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray: _validate_role_metadata(backend, spec) return _ScalarVarLenArray(spec, backend) + def create_dictionary_column(self, name, *, spec, cparams=None, dparams=None) -> DictionaryColumn: + from blosc2.schema import VLStringSpec + + # 
Codes: stored as a regular NDArray under _cols/name + codes = self.create_column( + name, + dtype=np.int32, + shape=(4096,), + chunks=(4096,), + blocks=(256,), + cparams=cparams, + dparams=dparams, + ) + # Dictionary values: stored as a varlen scalar (vlstring) at name_dict.b2b + dict_spec = VLStringSpec(nullable=False) + dict_path = self._dict_col_path(name) + dict_backend = _make_persistent_backend(dict_spec, dict_path, "w") + dict_store = _ScalarVarLenArray(dict_spec, dict_backend) + return DictionaryColumn(spec, codes, dict_store) + + def open_dictionary_column(self, name: str, spec) -> DictionaryColumn: + from blosc2.schema import VLStringSpec + + codes = self.open_column(name) + dict_spec = VLStringSpec(nullable=False) + store = self._open_store() + dict_path = self._dict_col_path(name) + if store.is_zip_store and self._mode == "r": + rel = self._col_key(name).lstrip("/") + "_dict.b2b" + if rel not in store.offsets: + raise KeyError(f"Dictionary column dict store {name!r} not found in {self._root!r}") + dict_backend = BatchArray( + _from_schunk=blosc2.blosc2_ext.open( + store.b2z_path, mode="r", offset=store.offsets[rel]["offset"] + ) + ) + else: + dict_backend = _open_persistent_backend(dict_path, self._mode, spec=dict_spec) + dict_store = _ScalarVarLenArray(dict_spec, dict_backend) + return DictionaryColumn(spec, codes, dict_store) + def create_valid_rows(self, *, shape, chunks, blocks): valid_rows = blosc2.zeros( shape, @@ -685,9 +814,12 @@ def _open_leaf(self, logical_key: str) -> Any: full_key = self._table_key(logical_key) return DictStore.__getitem__(self._store, full_key) + def _col_logical_key(self, name: str) -> str: + return f"/{_COLS_DIR}/{_column_name_to_relpath(name)}" + def _list_col_path(self, name: str) -> str: """Filesystem path for a list-style column (``.b2b``).""" - return self._dest_path(f"/_cols/{name}", ".b2b") + return self._dest_path(self._col_logical_key(name), ".b2b") # 
------------------------------------------------------------------ # TableStorage interface — lifecycle @@ -735,16 +867,16 @@ def create_column( kwargs["cparams"] = cparams if dparams is not None: kwargs["dparams"] = dparams - dest_path = self._dest_path(f"/_cols/{name}", ".b2nd") + dest_path = self._dest_path(self._col_logical_key(name), ".b2nd") os.makedirs(os.path.dirname(dest_path), exist_ok=True) col = blosc2.zeros(shape, dtype=dtype, urlpath=dest_path, mode="w", **kwargs) rel_path = os.path.relpath(dest_path, self._working_dir()).replace(os.sep, "/") - self._store.map_tree[self._table_key(f"/_cols/{name}")] = rel_path + self._store.map_tree[self._table_key(self._col_logical_key(name))] = rel_path self._store._modified = True return col def open_column(self, name: str) -> blosc2.NDArray: - return self._open_leaf(f"/_cols/{name}") + return self._open_leaf(self._col_logical_key(name)) def create_list_column( self, @@ -768,7 +900,7 @@ def create_list_column( def open_list_column(self, name: str) -> ListArray: if self._store.is_zip_store and self._mode == "r": - rel = self._table_key(f"/_cols/{name}").lstrip("/") + ".b2b" + rel = self._table_key(self._col_logical_key(name)).lstrip("/") + ".b2b" if rel not in self._store.offsets: raise KeyError(f"List column {name!r} not found in {self._store.localpath!r}") opened = blosc2.blosc2_ext.open( @@ -793,7 +925,7 @@ def create_varlen_scalar_column( def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray: if self._store.is_zip_store and self._mode == "r": - rel = self._table_key(f"/_cols/{name}").lstrip("/") + ".b2b" + rel = self._table_key(self._col_logical_key(name)).lstrip("/") + ".b2b" if rel not in self._store.offsets: raise KeyError(f"Varlen scalar column {name!r} not found in {self._store.localpath!r}") backend = BatchArray( @@ -808,6 +940,59 @@ def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray: _validate_role_metadata(backend, spec) return _ScalarVarLenArray(spec, 
backend) + def _dict_col_path(self, name: str) -> str: + """Path for the dictionary values store of a dictionary column.""" + return self._dest_path(self._col_logical_key(name), "_dict.b2b") + + def create_dictionary_column( + self, + name: str, + *, + spec, + cparams=None, + dparams=None, + ) -> DictionaryColumn: + from blosc2.schema import VLStringSpec + + codes = self.create_column( + name, + dtype=np.int32, + shape=(4096,), + chunks=(4096,), + blocks=(256,), + cparams=cparams, + dparams=dparams, + ) + dict_spec = VLStringSpec(nullable=False) + dict_path = self._dict_col_path(name) + os.makedirs(os.path.dirname(dict_path), exist_ok=True) + dict_backend = _make_persistent_backend(dict_spec, dict_path, "w") + dict_store = _ScalarVarLenArray(dict_spec, dict_backend) + return DictionaryColumn(spec, codes, dict_store) + + def open_dictionary_column(self, name: str, spec) -> DictionaryColumn: + from blosc2.schema import VLStringSpec + + codes = self.open_column(name) + dict_spec = VLStringSpec(nullable=False) + if self._store.is_zip_store and self._mode == "r": + rel = self._table_key(self._col_logical_key(name)).lstrip("/") + "_dict.b2b" + if rel not in self._store.offsets: + raise KeyError( + f"Dictionary column dict store {name!r} not found in {self._store.localpath!r}" + ) + dict_backend = BatchArray( + _from_schunk=blosc2.blosc2_ext.open( + self._store.b2z_path, + mode="r", + offset=self._store.offsets[rel]["offset"], + ) + ) + else: + dict_backend = _open_persistent_backend(self._dict_col_path(name), self._mode, spec=dict_spec) + dict_store = _ScalarVarLenArray(dict_spec, dict_backend) + return DictionaryColumn(spec, codes, dict_store) + def create_valid_rows( self, *, @@ -876,7 +1061,7 @@ def column_names_from_schema(self) -> list[str]: return [c["name"] for c in self.load_schema()["columns"]] def delete_column(self, name: str) -> None: - full_key = self._table_key(f"/_cols/{name}") + full_key = self._table_key(self._col_logical_key(name)) if full_key in 
self._store.map_tree: filepath = self._store.map_tree.pop(full_key) full_path = os.path.join(self._working_dir(), filepath) @@ -890,10 +1075,10 @@ def delete_column(self, name: str) -> None: raise KeyError(name) def rename_column(self, old: str, new: str) -> blosc2.NDArray: - old_key = self._table_key(f"/_cols/{old}") - new_key = self._table_key(f"/_cols/{new}") + old_key = self._table_key(self._col_logical_key(old)) + new_key = self._table_key(self._col_logical_key(new)) if old_key in self._store.map_tree: - new_dest = self._dest_path(f"/_cols/{new}", ".b2nd") + new_dest = self._dest_path(self._col_logical_key(new), ".b2nd") old_dest = os.path.join(self._working_dir(), self._store.map_tree[old_key]) os.makedirs(os.path.dirname(new_dest), exist_ok=True) os.replace(old_dest, new_dest) diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 4a8ff2e1..7a7b38ab 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -343,22 +343,37 @@ def _init_write_append_mode( self._update_map_tree() def _update_map_tree(self): - # Build map_tree from supported external leaves in working dir. + """Build map_tree from supported external leaves in working dir. + + Trust canonical external leaf suffixes on the fast path. Fall back to + metadata probing for legacy or manually renamed leaves with unusual + suffixes, preserving discovery warnings and compatibility. 
+ """ + external_exts = {".b2nd", ".b2f", ".b2b"} for root, _, files in os.walk(self.working_dir): for file in files: filepath = os.path.join(root, file) if os.path.abspath(filepath) == os.path.abspath(self.estore_path): continue rel_path = os.path.relpath(filepath, self.working_dir).replace(os.sep, "/") - if self._probe_external_leaf_path(rel_path): + if os.path.splitext(rel_path)[1] in external_exts or self._probe_external_leaf_path( + rel_path + ): self.map_tree[self._logical_key_from_relpath(rel_path)] = rel_path def _update_map_tree_from_offsets(self): - """Build map_tree from supported external leaves in a zip store.""" + """Build map_tree from supported external leaves in a zip store. + + Zip-backed stores written by DictStore/TreeStore use canonical external + leaf suffixes. Trusting those suffixes avoids opening every member just + to classify it, which is especially important for compact CTable stores + with many columns. + """ + external_exts = {".b2nd", ".b2f", ".b2b"} for filepath in self.offsets: if filepath == "embed.b2e": continue - if self._probe_external_leaf_offset(filepath): + if os.path.splitext(filepath)[1] in external_exts or self._probe_external_leaf_offset(filepath): self.map_tree[self._logical_key_from_relpath(filepath)] = filepath def _annotate_external_value( diff --git a/src/blosc2/dictionary_column.py b/src/blosc2/dictionary_column.py new file mode 100644 index 00000000..f9148d07 --- /dev/null +++ b/src/blosc2/dictionary_column.py @@ -0,0 +1,280 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Dictionary-encoded string column for CTable. + +Physical layout +--------------- +A dictionary column is stored as two components: + +* **codes** — a fixed-width ``int32`` NDArray with one code per physical row + slot. 
The special code ``null_code`` (default ``-1``) marks null slots. +* **dict_store** — a variable-length string array (:class:`_ScalarVarLenArray`) + holding unique category values in first-seen order. + +An in-memory mapping ``_value_to_code: dict[str, int]`` is built lazily from +the persisted dict_store on open and kept in sync during writes. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +if TYPE_CHECKING: + from blosc2.scalar_array import _ScalarVarLenArray + from blosc2.schema import DictionarySpec + + +_NULL_INT32 = np.int32(-1) + + +class DictionaryColumn: + """Row-wise dictionary-encoded string column wrapping codes + dict_store. + + This class is internal; obtain instances via + ``storage.create_dictionary_column()`` or ``storage.open_dictionary_column()``. + + Parameters + ---------- + spec: + The :class:`~blosc2.schema.DictionarySpec` that describes this column. + codes: + A ``blosc2.NDArray`` of dtype ``int32`` with one slot per physical row. + dict_store: + A :class:`~blosc2.scalar_array._ScalarVarLenArray` holding unique + category strings in insertion order (no nulls). + """ + + def __init__(self, spec: DictionarySpec, codes, dict_store: _ScalarVarLenArray) -> None: + self._spec = spec + self._codes = codes # int32 NDArray (physical slot array) + self._dict_store = dict_store # _ScalarVarLenArray of vlstring (unique values) + # Cache: str → int32 code. Built lazily from dict_store on first access. 
+ self._value_to_code: dict[str, int] | None = None + + # ------------------------------------------------------------------ + # Cache management + # ------------------------------------------------------------------ + + def _ensure_cache(self) -> None: + """Build the value→code mapping from the persisted dict_store.""" + if self._value_to_code is not None: + return + self._dict_store.flush() + cache: dict[str, int] = {} + for code, value in enumerate(self._dict_store): + if value is not None: + cache[value] = code + self._value_to_code = cache + + def _invalidate_cache(self) -> None: + self._value_to_code = None + + # ------------------------------------------------------------------ + # Encoding / decoding + # ------------------------------------------------------------------ + + def encode(self, value: str | None) -> int: + """Encode *value* to an int32 code. Appends new values to the dictionary.""" + if value is None: + if not self._spec.nullable: + raise ValueError(f"Dictionary column {self._spec!r} is not nullable; received None.") + return self._spec.null_code + if not isinstance(value, str): + raise TypeError(f"Dictionary column expects str or None values, got {type(value).__name__!r}.") + self._ensure_cache() + assert self._value_to_code is not None + code = self._value_to_code.get(value) + if code is not None: + return code + # New category — append to dictionary. + new_code = len(self._value_to_code) + if new_code > np.iinfo(np.int32).max: + raise OverflowError( + "Dictionary column has exceeded the maximum number of unique values (2^31 - 1)." 
+ ) + self._dict_store.append(value) + self._value_to_code[value] = new_code + return new_code + + def decode(self, code: int) -> str | None: + """Decode an int32 *code* to its string value, or ``None`` for null codes.""" + if code == self._spec.null_code: + return None + self._ensure_cache() + return self._dict_store[int(code)] + + def encode_batch(self, values) -> np.ndarray: + """Encode a sequence of str/None to a numpy ``int32`` array of codes.""" + result = np.empty(len(values), dtype=np.int32) + for i, v in enumerate(values): + result[i] = self.encode(v) + return result + + def value_to_code(self, value: str) -> int: + """Return the code for *value*. Raises :exc:`KeyError` if absent.""" + self._ensure_cache() + assert self._value_to_code is not None + if value not in self._value_to_code: + raise KeyError(value) + return self._value_to_code[value] + + def code_to_value(self, code: int) -> str | None: + """Return the category string for *code*.""" + return self.decode(code) + + # ------------------------------------------------------------------ + # Arrow-optimised batch import + # ------------------------------------------------------------------ + + def extend_from_arrow(self, pa, arrow_col, pos: int, m: int, *, ordered: bool = False) -> None: + """Write *m* rows from an Arrow dictionary array into the codes NDArray at *pos*. + + Performs global dictionary unification: chunk-local codes are remapped + to global codes. ``ordered=True`` raises if chunk dictionary order + differs from the established global order. + """ + local_dict = arrow_col.dictionary.to_pylist() + + # Build local-code → global-code mapping. 
+ local_to_global: dict[int, int] = {} + for local_code, value in enumerate(local_dict): + if value is None: + local_to_global[local_code] = self._spec.null_code + else: + local_to_global[local_code] = self.encode(value) + + if ordered and len(local_dict) > 0: + self._validate_ordered_chunk_dict(local_dict) + + # Translate Arrow indices to global int32 codes. + indices = arrow_col.indices.to_pylist() + global_codes = np.empty(m, dtype=np.int32) + for i, idx in enumerate(indices): + if idx is None: + if not self._spec.nullable: + raise ValueError("Dictionary column is not nullable but Arrow input contains nulls.") + global_codes[i] = self._spec.null_code + else: + global_codes[i] = local_to_global[int(idx)] + + self._codes[pos : pos + m] = global_codes + + def _validate_ordered_chunk_dict(self, local_dict: list) -> None: + """Raise if *local_dict* order differs from the existing global order.""" + self._ensure_cache() + assert self._value_to_code is not None + for local_code, value in enumerate(local_dict): + if value is None: + continue + global_code = self._value_to_code.get(value) + if global_code is not None and global_code != local_code: + raise ValueError( + f"ordered=True dictionary column has inconsistent ordering across Arrow " + f"batches: value {value!r} has global code {global_code} but appears as " + f"local code {local_code} in this chunk." + ) + + # ------------------------------------------------------------------ + # Core interface: __len__, __getitem__, __setitem__ + # ------------------------------------------------------------------ + + def __len__(self) -> int: + """Return the physical slot capacity (same as the codes NDArray length).""" + return len(self._codes) + + def __getitem__(self, key) -> str | None | list: + """Return decoded value(s) for the given index. 
+ + - ``int`` → ``str | None`` + - ``slice`` → ``list`` + - ``numpy.ndarray``/``list`` → ``list`` + """ + if isinstance(key, (int, np.integer)): + return self.decode(int(self._codes[int(key)])) + if isinstance(key, slice): + codes_arr = np.asarray(self._codes[key], dtype=np.int32) + return [self.decode(int(c)) for c in codes_arr] + if isinstance(key, (list, np.ndarray)): + codes_arr = self._codes[key] + if isinstance(codes_arr, np.ndarray): + return [self.decode(int(c)) for c in codes_arr.ravel()] + return [self.decode(int(codes_arr))] + raise TypeError(f"DictionaryColumn indices must be int, slice, or array; got {type(key)!r}") + + def __setitem__(self, key, value) -> None: + """Encode *value* (str/None or list thereof) and write the code(s).""" + if isinstance(key, (int, np.integer)): + self._codes[int(key)] = np.int32(self.encode(value)) + elif isinstance(key, slice): + if isinstance(value, (list, tuple, np.ndarray)): + self._codes[key] = self.encode_batch(list(value)) + else: + # scalar broadcast + code = np.int32(self.encode(value)) + self._codes[key] = code + elif isinstance(key, (list, np.ndarray)): + self._codes[key] = self.encode_batch(list(value)) + else: + raise TypeError(f"DictionaryColumn indices must be int, slice, or array; got {type(key)!r}") + + def resize(self, shape: tuple) -> None: + """Resize the underlying codes NDArray (delegates to the NDArray).""" + self._codes.resize(shape) + + # ------------------------------------------------------------------ + # Flush / close + # ------------------------------------------------------------------ + + def flush(self) -> None: + """Flush pending dict_store batches to the backend.""" + self._dict_store.flush() + + # ------------------------------------------------------------------ + # Public properties + # ------------------------------------------------------------------ + + @property + def codes(self): + """The underlying ``int32`` NDArray of category codes.""" + return self._codes + + @property + def 
dictionary(self) -> list[str]: + """Return the list of unique dictionary values in insertion order.""" + self._dict_store.flush() + return list(self._dict_store) + + @property + def spec(self) -> DictionarySpec: + return self._spec + + @property + def dtype(self): + """Always ``None`` — dictionary columns have no fixed NumPy dtype.""" + return None + + @property + def urlpath(self) -> str | None: + return getattr(self._codes, "urlpath", None) + + @property + def nbytes(self) -> int: + return self._codes.nbytes + self._dict_store.nbytes + + @property + def cbytes(self) -> int: + return self._codes.cbytes + self._dict_store.cbytes + + @property + def cratio(self) -> float: + cb = self.cbytes + if cb == 0: + return float("inf") + return self.nbytes / cb diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 7946aeb3..7858f7a1 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -42,6 +42,8 @@ import numpy as np import blosc2 +from blosc2 import compute_chunks_blocks +from blosc2.info import InfoReporter from .b2objects import ( encode_b2object_payload, @@ -51,13 +53,6 @@ write_b2object_user_vlmeta, ) from .dsl_kernel import DSLKernel, DSLSyntaxError, DSLValidator, specialize_miniexpr_inputs - -if blosc2._HAS_NUMBA: - import numba - -from blosc2 import compute_chunks_blocks -from blosc2.info import InfoReporter - from .proxy import convert_dtype from .utils import ( check_smaller_shape, @@ -73,6 +68,7 @@ linalg_funcs, npcumprod, npcumsum, + populate_safe_numpy_globals, process_key, reducers, safe_numpy_globals, @@ -105,6 +101,7 @@ def ne_evaluate(expression, local_dict=None, **kwargs): } if blosc2.IS_WASM: global safe_numpy_globals + populate_safe_numpy_globals(expression) if "out" in kwargs: out = kwargs.pop("out") out[:] = eval(expression, safe_numpy_globals, local_dict) @@ -221,17 +218,7 @@ def _get_result(expression, chunk_operands, ne_args, where=None, indices=None, _ blosc2_funcs = constructors + linalg_funcs + elementwise_funcs + 
reducers # functions that have to be evaluated before chunkwise lazyexpr machinery eager_funcs = linalg_funcs + reducers + ["slice"] + ["." + attr for attr in linalg_attrs] -# Gather all callable functions in numpy -numpy_funcs = { - name - for name, member in inspect.getmembers(np, callable) - if not name.startswith("_") and not isinstance(member, np.ufunc) -} -numpy_ufuncs = {name for name, member in inspect.getmembers(np, lambda x: isinstance(x, np.ufunc))} -# Add these functions to the list of available functions -# (will be evaluated via the array interface) -additional_funcs = sorted((numpy_funcs | numpy_ufuncs) - set(blosc2_funcs)) -functions = blosc2_funcs + additional_funcs +functions = blosc2_funcs _constructor_call_patterns = {name: re.compile(rf"\b{re.escape(name)}\s*\(") for name in constructors} @@ -264,21 +251,34 @@ def _find_constructor_call(expression: str, constructor: str) -> re.Match | None def get_expr_globals(expression): """Build a dictionary of functions needed for evaluating the expression.""" _globals = {"np": np, "blosc2": blosc2} - # Only check for functions that actually appear in the expression - # This avoids many unnecessary string searches + # Only check for functions that actually appear in the expression. for func in functions: if func in expression: - # Try blosc2 first if hasattr(blosc2, func): _globals[func] = getattr(blosc2, func) - # Fall back to numpy else: try: _globals[func] = safe_numpy_globals[func] - # Function not found in either module except KeyError as e: raise AttributeError(f"Function {func} not found in blosc2 or numpy") from e + # Lazily support bare numpy calls not covered by the Blosc2 function list. 
+ populate_safe_numpy_globals(expression) + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError: + return _globals + for node in ast.walk(tree): + if not isinstance(node, ast.Call) or not isinstance(node.func, ast.Name): + continue + func = node.func.id + if func in _globals: + continue + if hasattr(blosc2, func): + _globals[func] = getattr(blosc2, func) + elif func in safe_numpy_globals: + _globals[func] = safe_numpy_globals[func] + return _globals @@ -4755,6 +4755,8 @@ def _reconstruct_lazyudf(expr, lazyarray, operands_dict, array): "blosc2": blosc2, } if blosc2._HAS_NUMBA: + import numba + SAFE_GLOBALS["numba"] = numba # Register the source so inspect can find it diff --git a/src/blosc2/list_array.py b/src/blosc2/list_array.py index 181da0a3..ec890de3 100644 --- a/src/blosc2/list_array.py +++ b/src/blosc2/list_array.py @@ -20,7 +20,7 @@ from blosc2.batch_array import BatchArray from blosc2.info import InfoReporter, format_nbytes_info from blosc2.objectarray import ObjectArray -from blosc2.schema import ListSpec, SchemaSpec, StructSpec +from blosc2.schema import DictionarySpec, ListSpec, SchemaSpec, StructSpec, timestamp from blosc2.schema import list as list_spec_builder _SUPPORTED_SERIALIZERS = {"msgpack", "arrow"} @@ -132,6 +132,14 @@ def _coerce_scalar_item(spec: SchemaSpec, value: Any) -> Any: # noqa: C901 if isinstance(spec, StructSpec): return _coerce_struct_item(spec, value) + if isinstance(spec, ListSpec): + return coerce_list_cell(spec, value) + if isinstance(spec, DictionarySpec): + if value is None: + raise ValueError("ListArray does not support nullable items inside a list in V1") + if not isinstance(value, str): + value = str(value) + return value if getattr(spec, "python_type", None) is str: if not isinstance(value, str): @@ -146,7 +154,12 @@ def _coerce_scalar_item(spec: SchemaSpec, value: Any) -> Any: # noqa: C901 dtype = getattr(spec, "dtype", None) if dtype is None: raise TypeError(f"Unsupported list item spec 
{type(spec).__name__!r}") - value = np.array(value, dtype=dtype).item() + if isinstance(spec, timestamp) and ( + isinstance(value, (np.datetime64, str)) or hasattr(value, "isoformat") + ): + value = np.datetime64(value).astype(f"datetime64[{spec.unit}]").astype(np.int64).item() + else: + value = np.array(value, dtype=dtype).item() ge = getattr(spec, "ge", None) if ge is not None and value < ge: @@ -380,6 +393,28 @@ def extend(self, values: Iterable[Any], *, validate: bool = True) -> None: self._pending_cells.extend(cells) self._flush_full_batches() + def extend_arrow(self, arrow_array) -> None: + """Append a PyArrow list array without materializing Python cells. + + This requires batch storage with ``serializer='arrow'`` and is intended + for trusted Arrow/Parquet import paths. + """ + pa = _require_pyarrow() + if isinstance(arrow_array, pa.ChunkedArray): + chunks = arrow_array.chunks + else: + chunks = [arrow_array] + if self.spec.storage != "batch" or self.spec.serializer != "arrow": + values = arrow_array.to_pylist() if hasattr(arrow_array, "to_pylist") else list(arrow_array) + self.extend(values, validate=False) + return + for chunk in chunks: + if len(chunk) == 0: + continue + self._backend.append(chunk) + self._persisted_row_count += len(chunk) + self._invalidate_batch_caches() + def flush(self) -> None: """Persist any pending rows when using the batch backend.""" if self.spec.storage != "batch": @@ -455,6 +490,11 @@ def _get_many_grouped(self, indices: list[int]) -> list[Any]: def _get_many(self, indices: list[int]) -> list[Any]: if self.spec.storage == "vl": return [self._backend[index] for index in indices] + # For small selections from block-addressable batches, scalar access is + # much cheaper than materializing the full containing batch. This is + # common for filtered column previews and small logical slices. 
+ if getattr(self._backend, "items_per_block", None) is not None and len(indices) <= 1024: + return [self[index] for index in indices] if len(indices) <= 1: return self._get_many_grouped(indices) monotonic = True @@ -489,7 +529,7 @@ def __getitem__(self, index: int | slice | list[int] | tuple[int, ...] | np.ndar if index >= self._persisted_row_count: return self._pending_cells[index - self._persisted_row_count] batch_index, inner_index = self._locate_persisted_row(index) - return self._get_batch_values(batch_index)[inner_index] + return self._backend[batch_index][inner_index] def __setitem__(self, index: int, value: Any) -> None: """Replace one list cell.""" @@ -568,6 +608,13 @@ def batch_rows(self) -> int | None: return self.spec.batch_rows return None + @property + def items_per_block(self) -> int | None: + """Maximum number of list cells per internal compressed block.""" + if self.spec.storage != "batch": + return None + return self._backend.items_per_block + @property def nbytes(self) -> int: """Uncompressed byte size reported by the backend.""" @@ -597,6 +644,8 @@ def info_items(self) -> list: ("backend", self.spec.storage), ("serializer", self.spec.serializer), ("rows", len(self)), + ("batch_rows", self.batch_rows), + ("items_per_block", self.items_per_block), ("pending_rows", len(self._pending_cells) if self.spec.storage == "batch" else 0), ("nbytes", format_nbytes_info(self.nbytes)), ("cbytes", format_nbytes_info(self.cbytes)), diff --git a/src/blosc2/schema.py b/src/blosc2/schema.py index c39c034e..b9d00852 100644 --- a/src/blosc2/schema.py +++ b/src/blosc2/schema.py @@ -761,6 +761,114 @@ def vlbytes( ) +# --------------------------------------------------------------------------- +# Dictionary spec +# --------------------------------------------------------------------------- + + +class DictionarySpec(SchemaSpec): + """Dictionary-encoded string column stored as int32 codes with a global string dictionary. 
+ + Each row value is a plain Python ``str`` (or ``None`` when nullable). + Internally the column stores compact integer codes (``int32``) in an NDArray, + with a separate append-only variable-length string array holding the unique + category values. This matches Arrow dictionary encoding semantics. + + Parameters + ---------- + index_type: + Must be :class:`int32`. The physical dtype for category codes. + value_type: + Must be :class:`VLStringSpec`. The type of dictionary values. + ordered: + If ``True``, the dictionary has semantic ordering. Ordered comparisons + (``<``, ``>``) are not implemented in v1 but the flag is stored and + exported to Arrow. + nullable: + If ``True`` (default), null row slots are allowed. Nulls are represented + internally by the reserved code ``null_code`` (default ``-1``). + null_code: + The reserved code value for null slots. Default is ``-1``. + """ + + python_type = str + dtype = None # physical codes are int32, but logical type is str + + def __init__( + self, + *, + index_type=None, + value_type=None, + ordered: _builtin_bool = False, + nullable: _builtin_bool = True, + null_code: int = -1, + ): + from blosc2.schema import int32 as _int32 + + if index_type is not None and not isinstance(index_type, _int32): + raise TypeError( + f"DictionarySpec index_type must be blosc2.int32() in v1; got {type(index_type).__name__!r}" + ) + if value_type is not None and not isinstance(value_type, VLStringSpec): + raise TypeError( + "DictionarySpec value_type must be blosc2.vlstring() in v1; " + f"got {type(value_type).__name__!r}" + ) + self.index_type = index_type if index_type is not None else _int32() + self.value_type = value_type if value_type is not None else VLStringSpec() + self.ordered = _builtin_bool(ordered) + self.nullable = _builtin_bool(nullable) + self.null_code = int(null_code) + + def to_pydantic_kwargs(self) -> dict[str, Any]: + return {} + + def to_metadata_dict(self) -> dict[str, Any]: + return { + "kind": "dictionary", + 
"index_type": self.index_type.to_metadata_dict(), + "value_type": self.value_type.to_metadata_dict(), + "ordered": self.ordered, + "nullable": self.nullable, + "null_code": self.null_code, + } + + +def dictionary( + *, + index_type=None, + value_type=None, + ordered: bool = False, + nullable: bool = True, +) -> DictionarySpec: + """Build a dictionary-encoded string column descriptor. + + Dictionary columns store repeated string values as compact ``int32`` codes + with a separate global dictionary of unique string values. This matches + Arrow dictionary encoding and is ideal for low-cardinality string columns + such as categories or enumerated values. + + Parameters + ---------- + index_type: + The physical type for category codes. Must be ``blosc2.int32()`` in v1. + Defaults to ``blosc2.int32()`` when not specified. + value_type: + The type of dictionary values. Must be ``blosc2.vlstring()`` in v1. + Defaults to ``blosc2.vlstring()`` when not specified. + ordered: + If ``True``, dictionary order is semantically meaningful. + nullable: + If ``True`` (default), null row values are allowed (stored as code ``-1``). + """ + return DictionarySpec( + index_type=index_type, + value_type=value_type, + ordered=ordered, + nullable=nullable, + ) + + def struct(fields: dict[str, SchemaSpec], *, nullable: bool = False) -> StructSpec: """Build a structured schema descriptor for dict-like CTable values. 
diff --git a/src/blosc2/schema_compiler.py b/src/blosc2/schema_compiler.py index a6cfe04f..006f9e22 100644 --- a/src/blosc2/schema_compiler.py +++ b/src/blosc2/schema_compiler.py @@ -22,6 +22,7 @@ from blosc2.schema import ( BLOSC2_FIELD_METADATA_KEY, + DictionarySpec, ListSpec, ObjectSpec, SchemaSpec, @@ -76,6 +77,8 @@ "vlbytes": VLBytesSpec, "object": ObjectSpec, "timestamp": timestamp, + # dictionary + "dictionary": DictionarySpec, } # --------------------------------------------------------------------------- @@ -102,6 +105,8 @@ def compute_display_width(spec: SchemaSpec) -> int: """Return a reasonable terminal display width for *spec*'s column.""" + if isinstance(spec, DictionarySpec): + return 32 if isinstance(spec, (VLStringSpec, VLBytesSpec, ObjectSpec)): return 40 if isinstance(spec, (ListSpec, StructSpec)): @@ -211,7 +216,8 @@ def validate_annotation_matches_spec(name: str, annotation: Any, spec: SchemaSpe origin = typing.get_origin(annotation) if origin not in (list, list): raise TypeError( - f"Column {name!r}: annotation {annotation!r} is incompatible with list spec; expected list[T]." + f"Column {name!r}: annotation {annotation!r} is incompatible with list spec; " + "expected list[T]." ) args = typing.get_args(annotation) if len(args) != 1: @@ -225,6 +231,14 @@ def validate_annotation_matches_spec(name: str, annotation: Any, spec: SchemaSpe ) return + if isinstance(spec, DictionarySpec): + if annotation is not str: + raise TypeError( + f"Column {name!r}: annotation {annotation!r} is incompatible with " + f"DictionarySpec (expected str)." 
+ ) + return + if isinstance(spec, timestamp): if annotation in (object, np.datetime64, datetime.datetime, str, int): return @@ -259,15 +273,15 @@ def _validate_column_name(name: str) -> None: * must be a non-empty string * must not start with ``_`` (reserved for internal table layout) - * must not contain ``/`` (used as path separator in persistent layout) * must not be one of the reserved internal names + + Literal ``/`` characters are allowed in logical names; persistent CTable + storage percent-encodes path segments before writing under ``_cols``. """ if not name: raise ValueError("Column name cannot be empty.") if name.startswith("_"): raise ValueError(f"Column name cannot start with '_' (reserved for internal use): {name!r}") - if "/" in name: - raise ValueError(f"Column name cannot contain '/': {name!r}") if name in _RESERVED_COLUMN_NAMES: raise ValueError(f"Column name {name!r} is reserved for internal CTable use.") @@ -404,6 +418,10 @@ def spec_from_metadata_dict(data: dict[str, Any]) -> SchemaSpec: return ListSpec(item_spec, **data) if kind == "struct": return StructSpec.from_metadata_dict({"fields": data.pop("fields"), **data}) + if kind == "dictionary": + index_type = spec_from_metadata_dict(data.pop("index_type")) + value_type = spec_from_metadata_dict(data.pop("value_type")) + return DictionarySpec(index_type=index_type, value_type=value_type, **data) spec_cls = _KIND_TO_SPEC.get(kind) if spec_cls is None: raise ValueError(f"Unknown column kind {kind!r}") @@ -447,8 +465,9 @@ def schema_to_dict(schema: CompiledSchema) -> dict[str, Any]: entry["blocks"] = list(col.config.blocks) cols.append(entry) + schema_version = 2 if schema.metadata.get("nested") is not None else 1 result = { - "version": 1, + "version": schema_version, "row_cls": schema.row_cls.__name__ if schema.row_cls is not None else None, "columns": cols, } @@ -470,7 +489,7 @@ def schema_from_dict(data: dict[str, Any]) -> CompiledSchema: If *data* uses an unknown schema version or an unknown 
column kind. """ version = data.get("version", 1) - if version != 1: + if version not in (1, 2): raise ValueError(f"Unsupported schema version {version!r}") columns: list[CompiledColumn] = [] diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 35182bea..e765dcdc 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1742,12 +1742,9 @@ def _open_treestore_root_object(store, urlpath, mode): if manifest["kind"] == "ctable": if mode not in {"r", "a"}: return store - # Discard the probe store without repacking — it was only opened - # to peek at the manifest. A full close() would trigger to_b2z() - # even though nothing was modified, and CTable.open() below will - # create its own store anyway. - store.discard() - return blosc2.CTable.open(urlpath, mode=mode) + # Reuse the TreeStore that was opened to inspect the root manifest. + # This avoids a second TreeStore open when dispatching root CTables. + return blosc2.CTable._open_from_existing_filestore(urlpath, mode=mode, store=store) return store diff --git a/src/blosc2/utils.py b/src/blosc2/utils.py index 4e02341c..6483982f 100644 --- a/src/blosc2/utils.py +++ b/src/blosc2/utils.py @@ -7,6 +7,7 @@ import ast import builtins +import contextlib import inspect import math import sys @@ -88,41 +89,6 @@ def format_expr_scalar(value): return value -global safe_numpy_globals -# Use numpy eval when running in WebAssembly -safe_numpy_globals = {"np": np} -# Add all first-level numpy functions -safe_numpy_globals.update( - {name: getattr(np, name) for name in dir(np) if callable(getattr(np, name)) and not name.startswith("_")} -) - -if not NUMPY_GE_2_0: # handle non-array-api compliance - safe_numpy_globals["acos"] = np.arccos - safe_numpy_globals["acosh"] = np.arccosh - safe_numpy_globals["asin"] = np.arcsin - safe_numpy_globals["asinh"] = np.arcsinh - safe_numpy_globals["atan"] = np.arctan - safe_numpy_globals["atanh"] = np.arctanh - safe_numpy_globals["atan2"] = np.arctan2 - 
safe_numpy_globals["permute_dims"] = np.transpose - safe_numpy_globals["pow"] = np.power - safe_numpy_globals["bitwise_left_shift"] = np.left_shift - safe_numpy_globals["bitwise_right_shift"] = np.right_shift - safe_numpy_globals["bitwise_invert"] = np.bitwise_not - safe_numpy_globals["concat"] = np.concatenate - safe_numpy_globals["matrix_transpose"] = np.transpose - safe_numpy_globals["vecdot"] = npvecdot - safe_numpy_globals["cumulative_sum"] = npcumsum - safe_numpy_globals["cumulative_prod"] = npcumprod - -# handle different naming conventions between numpy and blosc2 -safe_numpy_globals["contains"] = _string_contains -safe_numpy_globals["startswith"] = _string_startswith -safe_numpy_globals["endswith"] = _string_endswith -safe_numpy_globals["upper"] = _string_upper -safe_numpy_globals["lower"] = _string_lower - - elementwise_funcs = [ "abs", "acos", @@ -267,6 +233,66 @@ def format_expr_scalar(value): constructors += ["reshape"] +_NUMPY_ALIASES = { + "acos": np.arccos, + "acosh": np.arccosh, + "asin": np.arcsin, + "asinh": np.arcsinh, + "atan": np.arctan, + "atanh": np.arctanh, + "atan2": np.arctan2, + "concat": getattr(np, "concat", np.concatenate), + "contains": _string_contains, + "cumulative_prod": npcumprod, + "cumulative_sum": npcumsum, + "endswith": _string_endswith, + "lower": _string_lower, + "matrix_transpose": getattr(np, "matrix_transpose", np.transpose), + "permute_dims": nptranspose, + "pow": np.power, + "startswith": _string_startswith, + "upper": _string_upper, + "vecdot": npvecdot, +} +if not NUMPY_GE_2_0: # handle non-array-api compliance + _NUMPY_ALIASES.update( + { + "bitwise_invert": np.bitwise_not, + "bitwise_left_shift": np.left_shift, + "bitwise_right_shift": np.right_shift, + } + ) + +# Use numpy eval when running in WebAssembly. Keep this intentionally small: +# scanning every callable in numpy triggers lazy imports such as numpy.f2py and +# numpy.testing during ``import blosc2``. 
+safe_numpy_globals = {"np": np, **_NUMPY_ALIASES} +for _name in set(elementwise_funcs + linalg_funcs + reducers + constructors): + if _name not in safe_numpy_globals and not _name.startswith("_"): + with contextlib.suppress(AttributeError): + _value = getattr(np, _name) + if callable(_value): + safe_numpy_globals[_name] = _value + + +def populate_safe_numpy_globals(expression: str) -> None: + """Add bare numpy call names used by *expression* to safe_numpy_globals.""" + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError: + return + for node in ast.walk(tree): + if not isinstance(node, ast.Call) or not isinstance(node.func, ast.Name): + continue + name = node.func.id + if name in safe_numpy_globals or name.startswith("_"): + continue + with contextlib.suppress(AttributeError): + value = getattr(np, name) + if callable(value): + safe_numpy_globals[name] = value + + # --- Shape utilities --- def linalg_shape(func_name, args, kwargs): # noqa: C901 # --- Linear algebra and tensor manipulation --- diff --git a/tests/ctable/test_arrow_interop.py b/tests/ctable/test_arrow_interop.py index f3f28d9c..8bc1bde9 100644 --- a/tests/ctable/test_arrow_interop.py +++ b/tests/ctable/test_arrow_interop.py @@ -7,6 +7,7 @@ """Tests for CTable.to_arrow() and CTable.from_arrow().""" +import datetime from dataclasses import dataclass import numpy as np @@ -310,6 +311,34 @@ def test_from_arrow_list_struct_nullable_values_roundtrip(): assert t[2].nutriments == [{"name": "energy", "value": 42.0}] +def test_from_arrow_list_struct_timestamp_roundtrip(): + event_type = pa.struct( + [ + pa.field("when", pa.timestamp("ms")), + pa.field("value", pa.float64()), + ] + ) + at = pa.table( + { + "events": pa.array( + [ + [{"when": datetime.datetime(2020, 1, 1), "value": 1.5}], + None, + ], + type=pa.list_(event_type), + ) + } + ) + + t = CTable.from_arrow(at.schema, at.to_batches()) + assert t[0].events == [{"when": 1577836800000, "value": 1.5}] + assert t[1].events is None + + out = 
t.to_arrow() + assert out.schema.field("events").type == pa.list_(event_type) + assert out.column("events").to_pylist()[0][0]["when"].isoformat() == "2020-01-01T00:00:00" + + def test_from_arrow_unsupported_type_raises(): at = pa.table({"duration": pa.array([1, 2, 3], type=pa.duration("s"))}) with pytest.raises(TypeError, match="No blosc2 spec"): diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py index 41f2cfa1..63b06947 100644 --- a/tests/ctable/test_column.py +++ b/tests/ctable/test_column.py @@ -26,6 +26,12 @@ class StrRow: label: str = blosc2.field(blosc2.string(max_length=16)) +@dataclass +class DictRow: + vendor: str = blosc2.field(blosc2.dictionary()) + fare: float = blosc2.field(blosc2.float64()) + + DATA20 = [(i, float(i * 10), True) for i in range(20)] @@ -50,6 +56,49 @@ def test_column_metadata(): assert tabla.score._mask is None +def test_column_float32_repr_uses_numpy_formatting(): + """Column/table repr uses compact NumPy-style formatting for float32 previews.""" + + @dataclass + class Float32Row: + value: float = blosc2.field(blosc2.float32()) + + tabla = CTable(Float32Row, new_data=[(222.22,), (210.8,)]) + col_text = repr(tabla.value) + table_text = str(tabla) + + assert "222.22" in col_text + assert "222.22000122070312" not in col_text + assert "222.22" in table_text + assert "222.22000122070312" not in table_text + + +def test_column_info(): + """Column.info reports logical and physical storage details.""" + tabla = CTable(Row, new_data=DATA20) + info = tabla.score.info + text = repr(info) + + assert len(info) == len(tabla.score.info_items) + assert ("type", "Column") in tabla.score.info_items + assert ("name", "score") in tabla.score.info_items + assert "logical_length" in text + assert "physical_length" in text + assert "logical_shape" not in text + assert "table_physical_length" not in text + assert "storage" in text + + +def test_dictionary_column_info(): + """Dictionary Column.info reports dictionary-specific details 
without code-shape duplication.""" + tabla = CTable(DictRow, new_data=[("Uber", 10.5), ("Lyft", 7.2), ("Uber", 15.0)]) + text = repr(tabla.vendor.info) + + assert "dictionary_size" in text + assert "dictionary[str]" in text + assert "codes_shape" not in text + + def test_column_getitem_no_holes(): """int, slice, and list indexing on a full table.""" tabla = CTable(Row, new_data=DATA20) diff --git a/tests/ctable/test_ctable_indexing.py b/tests/ctable/test_ctable_indexing.py index 672c3942..000d5ed0 100644 --- a/tests/ctable/test_ctable_indexing.py +++ b/tests/ctable/test_ctable_indexing.py @@ -306,6 +306,7 @@ def test_catalog_survives_reopen(tmpdir): assert not idxs[0].stale +@pytest.mark.heavy def test_where_with_index_matches_scan_persistent(tmpdir): path = str(tmpdir / "table.b2d") t = _make_table(200, persistent_path=path) @@ -320,6 +321,7 @@ def test_where_with_index_matches_scan_persistent(tmpdir): assert ids_idx == ids_scan +@pytest.mark.heavy def test_persistent_index_drop_releases_sidecars_without_gc(tmpdir): import gc diff --git a/tests/ctable/test_dictionary_column.py b/tests/ctable/test_dictionary_column.py new file mode 100644 index 00000000..13dfbb8e --- /dev/null +++ b/tests/ctable/test_dictionary_column.py @@ -0,0 +1,485 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### +"""Tests for the CTable dictionary column type.""" + +from __future__ import annotations + +from dataclasses import dataclass + +import pytest + +import blosc2 +from blosc2 import CTable, DictionarySpec +from blosc2.dictionary_column import DictionaryColumn +from blosc2.schema_compiler import compile_schema, schema_from_dict, schema_to_dict + +pa = pytest.importorskip("pyarrow") +pq = pytest.importorskip("pyarrow.parquet") + + +# --------------------------------------------------------------------------- +# Unit tests: DictionarySpec and schema compiler +# --------------------------------------------------------------------------- + + +class TestDictionarySpec: + def test_default_construction(self): + spec = blosc2.dictionary() + assert spec.ordered is False + assert spec.nullable is True + assert spec.null_code == -1 + + def test_wrong_index_type_raises(self): + with pytest.raises(TypeError, match="int32"): + blosc2.dictionary(index_type=blosc2.int64()) + + def test_wrong_value_type_raises(self): + with pytest.raises(TypeError, match="vlstring"): + blosc2.dictionary(value_type=blosc2.string(max_length=32)) + + def test_metadata_roundtrip(self): + spec = blosc2.dictionary(ordered=True, nullable=False) + d = spec.to_metadata_dict() + assert d["kind"] == "dictionary" + assert d["ordered"] is True + assert d["nullable"] is False + assert d["null_code"] == -1 + + def test_schema_serialization_roundtrip(self): + @dataclass + class Row: + vendor: str = blosc2.field(blosc2.dictionary()) + fare: float = blosc2.field(blosc2.float64()) + + schema = compile_schema(Row) + d = schema_to_dict(schema) + schema2 = schema_from_dict(d) + col = schema2.columns_by_name["vendor"] + assert isinstance(col.spec, DictionarySpec) + assert col.spec.ordered is False + assert col.spec.nullable is True + + def test_dataclass_annotation_must_be_str(self): + from 
blosc2.schema_compiler import validate_annotation_matches_spec + + spec = blosc2.dictionary() + with pytest.raises(TypeError, match="str"): + validate_annotation_matches_spec("x", int, spec) + + def test_dataclass_annotation_str_ok(self): + from blosc2.schema_compiler import validate_annotation_matches_spec + + spec = blosc2.dictionary() + validate_annotation_matches_spec("x", str, spec) # should not raise + + +# --------------------------------------------------------------------------- +# CTable behavior tests +# --------------------------------------------------------------------------- + + +@dataclass +class TripRow: + vendor: str = blosc2.field(blosc2.dictionary()) + fare: float = blosc2.field(blosc2.float64()) + + +DATA = [ + {"vendor": "Uber", "fare": 10.5}, + {"vendor": "Lyft", "fare": 7.2}, + {"vendor": "Uber", "fare": 15.0}, + {"vendor": "Via", "fare": 5.0}, +] + +# Tuple form for extend() +DATA_TUPLES = [ + ("Uber", 10.5), + ("Lyft", 7.2), + ("Uber", 15.0), + ("Via", 5.0), +] + + +def _logical_mask_values(ct, mask): + """Materialize a physical predicate as logical/live-row values.""" + arr = mask.compute() if isinstance(mask, blosc2.LazyExpr) else mask + arr = arr[:] if isinstance(arr, blosc2.NDArray) else arr + return arr[ct._valid_rows[:]].tolist() + + +class TestCTableBehavior: + def test_append_and_read(self): + ct = CTable(TripRow) + for row in DATA: + ct.append(row) + assert ct.nrows == 4 + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", "Via"] + assert ct["vendor"][0] == "Uber" + assert ct["vendor"][1] == "Lyft" + + def test_repeated_strings_reuse_codes(self): + ct = CTable(TripRow) + for row in DATA: + ct.append(row) + codes = ct._cols["vendor"].codes[:4].tolist() + assert codes[0] == codes[2] # "Uber" appears twice with same code + assert len(ct._cols["vendor"].dictionary) == 3 # Uber, Lyft, Via + + def test_null_slot(self): + ct = CTable(TripRow) + ct.append({"vendor": None, "fare": 0.0}) + assert ct["vendor"][0] is None + assert 
ct._cols["vendor"].codes[0] == -1 + + def test_nullable_false_rejects_null(self): + @dataclass + class NNRow: + vendor: str = blosc2.field(blosc2.dictionary(nullable=False)) + fare: float = blosc2.field(blosc2.float64()) + + ct = CTable(NNRow) + with pytest.raises((ValueError, TypeError)): + ct.append({"vendor": None, "fare": 0.0}) + + def test_invalid_type_raises(self): + ct = CTable(TripRow) + with pytest.raises((TypeError, ValueError)): + ct.append({"vendor": 42, "fare": 0.0}) + + def test_extend_batch(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + assert ct.nrows == 4 + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", "Via"] + + def test_codes_and_dictionary_properties(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + dc = ct._cols["vendor"] + assert isinstance(dc, DictionaryColumn) + assert list(dc.dictionary) == ["Uber", "Lyft", "Via"] + codes = dc.codes[:4].tolist() + assert codes == [0, 1, 0, 2] + + def test_equality_filter(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + mask = ct["vendor"] == "Uber" + assert _logical_mask_values(ct, mask) == [True, False, True, False] + + def test_equality_absent_value_returns_false(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + mask = ct["vendor"] == "Waymo" + assert _logical_mask_values(ct, mask) == [False, False, False, False] + + def test_equality_none(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + ct.append({"vendor": None, "fare": 0.0}) + mask = ct["vendor"] == None # noqa: E711 + assert _logical_mask_values(ct, mask) == [False, False, False, False, True] + + def test_dictionary_predicate_combines_with_regular_predicate_in_aggregate(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + assert ct["fare"].sum(where=(ct["fare"] > 6) & (ct["vendor"] == "Uber")) == pytest.approx(25.5) + + def test_isin(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + mask = ct["vendor"].isin(["Uber", "Via"]) + assert mask.tolist() == [True, False, True, True] + + def 
test_isin_absent_values(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + mask = ct["vendor"].isin(["Waymo"]) + assert all(not v for v in mask.tolist()) + + def test_is_null(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + ct.append({"vendor": None, "fare": 0.0}) + assert _logical_mask_values(ct, ct["vendor"].is_null()) == [False, False, False, False, True] + + def test_null_count(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + ct.append({"vendor": None, "fare": 0.0}) + assert ct["vendor"].null_count() == 1 + + def test_is_dictionary_property(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + assert ct["vendor"].is_dictionary is True + assert ct["fare"].is_dictionary is False + + +# --------------------------------------------------------------------------- +# Persistence tests +# --------------------------------------------------------------------------- + + +class TestPersistence: + def test_b2d_roundtrip(self, tmp_path): + p = str(tmp_path / "trips.b2d") + ct = CTable(TripRow, urlpath=p, mode="w") + ct.extend(DATA_TUPLES) + ct.close() + + ct2 = CTable.open(p, mode="r") + assert ct2.nrows == 4 + assert ct2["vendor"][:] == ["Uber", "Lyft", "Uber", "Via"] + assert ct2._cols["vendor"].dictionary == ["Uber", "Lyft", "Via"] + ct2.close() + + def test_b2z_roundtrip(self, tmp_path): + p = str(tmp_path / "trips.b2z") + ct = CTable(TripRow, urlpath=p, mode="w") + ct.extend(DATA_TUPLES) + ct.close() + + ct2 = CTable.open(p, mode="r") + assert ct2.nrows == 4 + assert ct2["vendor"][:] == ["Uber", "Lyft", "Uber", "Via"] + ct2.close() + + +# --------------------------------------------------------------------------- +# Arrow import / export tests +# --------------------------------------------------------------------------- + + +class TestArrowInterop: + def _make_arrow_table(self, index_type=None, value_type=None, values=None, ordered=False): + if index_type is None: + index_type = pa.int32() + if value_type is None: + value_type = pa.string() + 
if values is None: + values = ["Uber", "Lyft", "Uber", None] + return pa.table( + { + "vendor": pa.array(values, type=pa.dictionary(index_type, value_type, ordered=ordered)), + "fare": pa.array([10.5, 7.2, 15.0, 0.0], type=pa.float64()), + } + ) + + def test_import_dict_int32(self): + at = self._make_arrow_table(index_type=pa.int32()) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", None] + + def test_import_dict_int8(self): + at = self._make_arrow_table(index_type=pa.int8()) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", None] + + def test_import_dict_int16(self): + at = self._make_arrow_table(index_type=pa.int16()) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", None] + + def test_import_dict_int64(self): + at = self._make_arrow_table(index_type=pa.int64()) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", None] + + def test_import_dict_uint8(self): + at = self._make_arrow_table(index_type=pa.uint8()) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", None] + + def test_import_dict_uint32(self): + at = self._make_arrow_table(index_type=pa.uint32()) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", None] + + def test_import_nulls_preserved(self): + at = self._make_arrow_table(values=["A", None, "B", None]) + ct = CTable.from_arrow(at.schema, at.to_batches()) + assert ct["vendor"][:] == ["A", None, "B", None] + assert ct._cols["vendor"].codes[:4].tolist() == [0, -1, 1, -1] + + def test_export_produces_dict_type(self): + at = self._make_arrow_table() + ct = CTable.from_arrow(at.schema, at.to_batches()) + (batch,) = ct.iter_arrow_batches() + field = batch.schema.field("vendor") + assert pa.types.is_dictionary(field.type) + assert 
field.type.index_type == pa.int32() + assert field.type.value_type == pa.string() + + def test_export_values_match(self): + at = self._make_arrow_table() + ct = CTable.from_arrow(at.schema, at.to_batches()) + (batch,) = ct.iter_arrow_batches() + assert batch.column("vendor").to_pylist() == ["Uber", "Lyft", "Uber", None] + + def test_parquet_roundtrip(self, tmp_path): + path = tmp_path / "test.parquet" + at = self._make_arrow_table(values=["Uber", "Lyft", "Uber", "Via"]) + pq.write_table(at, path) + ct = CTable.from_parquet(path) + assert isinstance(ct._schema.columns_by_name["vendor"].spec, DictionarySpec) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", "Via"] + + path2 = tmp_path / "roundtrip.parquet" + ct.to_parquet(path2) + at2 = pq.read_table(path2) + assert pa.types.is_dictionary(at2.schema.field("vendor").type) + assert at2.column("vendor").to_pylist() == ["Uber", "Lyft", "Uber", "Via"] + + def test_chunked_dict_unification(self): + """Two batches with different chunk-local dictionaries → global unification.""" + batch1 = pa.record_batch( + {"vendor": pa.array(["Uber", "Lyft"], type=pa.dictionary(pa.int32(), pa.string()))}, + schema=pa.schema([pa.field("vendor", pa.dictionary(pa.int32(), pa.string()))]), + ) + batch2 = pa.record_batch( + {"vendor": pa.array(["Via", "Uber"], type=pa.dictionary(pa.int32(), pa.string()))}, + schema=pa.schema([pa.field("vendor", pa.dictionary(pa.int32(), pa.string()))]), + ) + schema = pa.schema([pa.field("vendor", pa.dictionary(pa.int32(), pa.string()))]) + ct = CTable.from_arrow(schema, [batch1, batch2]) + assert ct["vendor"][:] == ["Uber", "Lyft", "Via", "Uber"] + codes = ct._cols["vendor"].codes[:4].tolist() + # Uber should have the same code in both positions + assert codes[0] == codes[3] + + def test_ordered_dict_inconsistent_order_raises(self): + schema = pa.schema([pa.field("x", pa.dictionary(pa.int32(), pa.string(), ordered=True))]) + batch1 = pa.record_batch( + {"x": pa.array(["A", "B"], 
type=pa.dictionary(pa.int32(), pa.string(), ordered=True))}, + schema=schema, + ) + # Batch2 has different order for existing values + batch2 = pa.record_batch( + {"x": pa.array(["B", "A"], type=pa.dictionary(pa.int32(), pa.string(), ordered=True))}, + schema=schema, + ) + with pytest.raises(ValueError, match="ordered"): + CTable.from_arrow(schema, [batch1, batch2]) + + def test_unsupported_dict_value_type_raises(self): + schema = pa.schema([pa.field("x", pa.dictionary(pa.int32(), pa.int64()))]) + at = pa.table({"x": pa.array([1, 2], type=pa.dictionary(pa.int32(), pa.int64()))}) + with pytest.raises(TypeError, match="dictionary"): + CTable.from_arrow(schema, at.to_batches()) + + +# --------------------------------------------------------------------------- +# Index tests +# --------------------------------------------------------------------------- + + +class TestIndex: + def test_create_index(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + idx = ct.create_index("vendor") + assert idx is not None + + def test_index_metadata_is_logical(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + ct.create_index("vendor") + catalog = ct._storage.load_index_catalog() + assert "vendor" in catalog + + def test_equality_uses_codes(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + mask = ct["vendor"] == "Uber" + assert _logical_mask_values(ct, mask) == [True, False, True, False] + + def test_isin_uses_codes(self): + ct = CTable(TripRow) + ct.extend(DATA_TUPLES) + mask = ct["vendor"].isin(["Lyft", "Via"]) + assert mask.tolist() == [False, True, False, True] + + def test_append_after_index(self, tmp_path): + p = str(tmp_path / "indexed.b2d") + ct = CTable(TripRow, urlpath=p, mode="w") + ct.extend(DATA_TUPLES) + ct.create_index("vendor") + ct.append({"vendor": "Uber", "fare": 20.0}) + assert ct.nrows == 5 + mask = ct["vendor"] == "Uber" + assert mask.sum() == 3 + ct.close() + + +# --------------------------------------------------------------------------- +# CLI 
tests +# --------------------------------------------------------------------------- + + +def test_cli_preserves_dict_by_default(tmp_path): + from blosc2.cli.parquet_to_blosc2 import main + + path = tmp_path / "dict.parquet" + out = tmp_path / "dict.b2d" + at = pa.table( + {"vendor": pa.array(["Uber", "Lyft", "Uber", "Via"], type=pa.dictionary(pa.int32(), pa.string()))} + ) + pq.write_table(at, path) + + assert main([str(path), str(out)]) == 0 + + ct = CTable.open(str(out), mode="r") + assert isinstance(ct._schema.columns_by_name["vendor"].spec, DictionarySpec) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber", "Via"] + ct.close() + + +def test_cli_decode_dictionaries_flag(tmp_path): + from blosc2.cli.parquet_to_blosc2 import main + from blosc2.schema import VLStringSpec + + path = tmp_path / "dict.parquet" + out = tmp_path / "dict_decoded.b2d" + at = pa.table( + {"vendor": pa.array(["Uber", "Lyft", "Uber"], type=pa.dictionary(pa.int32(), pa.string()))} + ) + pq.write_table(at, path) + + assert main(["--decode-dictionaries", str(path), str(out)]) == 0 + + ct = CTable.open(str(out), mode="r") + assert isinstance(ct._schema.columns_by_name["vendor"].spec, VLStringSpec) + assert ct["vendor"][:] == ["Uber", "Lyft", "Uber"] + ct.close() + + +def test_cli_dict_export_roundtrip(tmp_path): + from blosc2.cli.parquet_to_blosc2 import main + + path = tmp_path / "dict.parquet" + out = tmp_path / "dict.b2d" + exported = tmp_path / "dict_exported.parquet" + + at = pa.table( + { + "vendor": pa.array(["Uber", "Lyft", None, "Via"], type=pa.dictionary(pa.int32(), pa.string())), + "score": pa.array([1, 2, 3, 4], type=pa.int32()), + } + ) + pq.write_table(at, path) + + assert main([str(path), str(out)]) == 0 + assert main(["--export", str(out), str(exported)]) == 0 + + rt = pq.read_table(exported) + assert rt.column("vendor").to_pylist() == ["Uber", "Lyft", None, "Via"] + assert rt.column("score").to_pylist() == [1, 2, 3, 4] + + +if __name__ == "__main__": + pytest.main(["-v", 
__file__]) diff --git a/tests/ctable/test_getitem_access.py b/tests/ctable/test_getitem_access.py new file mode 100644 index 00000000..d0f25ad6 --- /dev/null +++ b/tests/ctable/test_getitem_access.py @@ -0,0 +1,145 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable +from blosc2.ctable import Column + + +@dataclass +class AccessRow: + id: int = blosc2.field(blosc2.int64()) + score: float = blosc2.field(blosc2.float64()) + active: bool = blosc2.field(blosc2.bool()) + note: str = blosc2.field(blosc2.vlstring(nullable=True)) + tags: list[int] = blosc2.field(blosc2.list(blosc2.int64(), nullable=True)) # noqa: RUF009 + + +DATA = [ + (0, 1.5, True, "zero", [0, 1]), + (1, 2.5, False, None, None), + (2, 3.5, True, "two", [2]), + (3, 4.5, False, "three", [3, 4]), +] + + +def test_getitem_string_column(): + t = CTable(AccessRow, new_data=DATA) + col = t["id"] + assert isinstance(col, Column) + assert list(col) == [0, 1, 2, 3] + + +def test_getitem_int_returns_namedtuple_row(): + t = CTable(AccessRow, new_data=DATA) + row = t[1] + assert row.id == 1 + assert row.score == 2.5 + assert row.active is False + assert row.note is None + assert row.tags is None + assert row["id"] == 1 + assert row[0] == 1 + assert row.as_dict()["score"] == 2.5 + + +def test_getitem_int_negative_and_bounds(): + t = CTable(AccessRow, new_data=DATA) + assert t[-1].id == 3 + with pytest.raises(IndexError): + _ = t[len(DATA)] + + +def test_getitem_slice_returns_view(): + t = CTable(AccessRow, new_data=DATA) + sub = t[1:3] + assert isinstance(sub, CTable) + assert list(sub.id) == [1, 2] + assert sub.base is t + + +def test_getitem_integer_list_and_bool_mask_return_views(): + 
t = CTable(AccessRow, new_data=DATA) + gathered = t[[3, 0, 2]] + assert isinstance(gathered, CTable) + assert set(gathered.id) == {0, 2, 3} + + mask = np.array([True, False, True, False]) + filtered = t[mask] + assert isinstance(filtered, CTable) + assert list(filtered.id) == [0, 2] + + +def test_getitem_list_of_strings_projects_columns(): + t = CTable(AccessRow, new_data=DATA) + sub = t[["id", "note"]] + assert isinstance(sub, CTable) + assert sub.col_names == ["id", "note"] + assert list(sub.id) == [0, 1, 2, 3] + assert list(sub.note) == ["zero", None, "two", "three"] + + +def test_getitem_string_expression_filters_rows(): + t = CTable(AccessRow, new_data=DATA) + sub = t["id >= 2"] + assert isinstance(sub, CTable) + assert list(sub.id) == [2, 3] + + +def test_where_columns_projects_after_filter(): + t = CTable(AccessRow, new_data=DATA) + sub = t.where("id >= 1", columns=["id", "note"]) + assert sub.col_names == ["id", "note"] + assert list(sub.id) == [1, 2, 3] + assert list(sub.note) == [None, "two", "three"] + + +def test_getitem_invalid_key_type_raises(): + t = CTable(AccessRow, new_data=DATA) + with pytest.raises(TypeError): + _ = t[1.5] + with pytest.raises(TypeError): + _ = t[(1, 2)] + + +def test_getitem_projection_unknown_column_raises(): + t = CTable(AccessRow, new_data=DATA) + with pytest.raises(KeyError): + _ = t[["id", "missing"]] + + +def test_getitem_non_boolean_expression_raises(): + t = CTable(AccessRow, new_data=DATA) + with pytest.raises(TypeError): + _ = t["id + 1"] + + +def test_ctable_array_materialization_uses_structured_dtype(): + t = CTable(AccessRow, new_data=DATA) + arr = np.asarray(t) + assert arr.dtype.fields is not None + assert arr.dtype["id"] == np.dtype(np.int64) + assert arr.dtype["score"] == np.dtype(np.float64) + assert arr.dtype["active"] == np.dtype(np.bool_) + assert arr.dtype["note"] == np.dtype(object) + assert arr.dtype["tags"] == np.dtype(object) + assert arr[1]["id"] == 1 + assert arr[1]["note"] is None + assert 
arr[2]["tags"] == [2] + + +def test_ctable_view_array_materialization(): + t = CTable(AccessRow, new_data=DATA) + arr = np.asarray(t[1:3]) + assert arr.shape == (2,) + assert arr[0]["id"] == 1 + assert arr[1]["note"] == "two" diff --git a/tests/ctable/test_nested_access_storage.py b/tests/ctable/test_nested_access_storage.py new file mode 100644 index 00000000..6eaa70b7 --- /dev/null +++ b/tests/ctable/test_nested_access_storage.py @@ -0,0 +1,165 @@ +from dataclasses import dataclass + +import pytest + +import blosc2 + +try: + import pyarrow as pa + import pyarrow.parquet as pq +except ImportError: # pragma: no cover - optional dependency + pa = None + pq = None + +pytestmark = pytest.mark.skipif(pa is None, reason="pyarrow is required for nested Arrow/Parquet tests") + + +@dataclass +class AccessRow: + trip_begin_lon: float + payment_fare: float + + +@dataclass +class PersistRow: + a: int + + +def test_dotted_column_attribute_namespace_and_where_string(): + t = blosc2.CTable(AccessRow) + t.append((1.0, 10.0)) + t.append((2.0, 30.0)) + t.append((3.0, 40.0)) + + t.rename_column("trip_begin_lon", "trip.begin.lon") + t.rename_column("payment_fare", "payment.fare") + + assert t["trip.begin.lon"].sum() == 6.0 + assert t.trip.begin.lon.max() == 3.0 + + view1 = t.where("payment.fare > 20") + assert view1.nrows == 2 + + view2 = t.where(t.payment.fare > 20) + assert view2.nrows == 2 + + +def test_dotted_column_persists_under_hierarchical_cols(tmp_path): + t = blosc2.CTable(PersistRow) + t.append((1,)) + t.rename_column("a", "trip.begin.lon") + + path = tmp_path / "nested.b2d" + t.save(str(path), overwrite=True) + + leaf = path / "_cols" / "trip" / "begin" / "lon.b2nd" + assert leaf.exists() + + opened = blosc2.CTable.open(str(path)) + assert opened["trip.begin.lon"][0] == 1 + + +def test_select_struct_prefix_expands_descendants(): + t = blosc2.CTable(AccessRow) + t.append((1.0, 10.0)) + t.rename_column("trip_begin_lon", "trip.begin.lon") + t.rename_column("payment_fare", 
"payment.fare") + + s = t.select(["trip"]) + assert s.col_names == ["trip.begin.lon"] + + +def test_from_arrow_flattens_struct_columns_to_dotted_leaves(): + trip_type = pa.struct([("begin", pa.struct([("lon", pa.float64()), ("lat", pa.float64())]))]) + schema = pa.schema([pa.field("trip", trip_type)]) + batch = pa.record_batch( + [ + pa.array( + [ + {"begin": {"lon": 1.1, "lat": 2.2}}, + {"begin": {"lon": 3.3, "lat": 4.4}}, + ], + type=trip_type, + ) + ], + schema=schema, + ) + + t = blosc2.CTable.from_arrow(schema, [batch]) + assert "trip.begin.lon" in t.col_names + assert "trip.begin.lat" in t.col_names + assert t["trip.begin.lon"][1] == 3.3 + + row0 = t[0] + assert isinstance(row0.trip, dict) + assert row0.trip["begin"]["lon"] == 1.1 + assert row0.trip["begin"]["lat"] == 2.2 + + +def test_nested_field_name_escaping_for_literal_dot_and_slash(tmp_path): + trip_type = pa.struct([pa.field("begin/point", pa.struct([pa.field("lon.deg", pa.float64())]))]) + schema = pa.schema([pa.field("trip.info", trip_type)]) + batch = pa.record_batch( + [ + pa.array( + [ + {"begin/point": {"lon.deg": 1.0}}, + {"begin/point": {"lon.deg": 2.0}}, + ], + type=trip_type, + ) + ], + schema=schema, + ) + + path = tmp_path / "escaped.b2d" + t = blosc2.CTable.from_arrow(schema, [batch], urlpath=str(path)) + + leaf_name = r"trip\.info.begin\/point.lon\.deg" + assert t.col_names == [leaf_name] + assert t[leaf_name][1] == 2.0 + assert t[r"trip\.info"][0] == {"begin/point": {"lon.deg": 1.0}} + assert t.where(r"trip\.info.begin\/point.lon\.deg > 1.5").nrows == 1 + + leaf_path = path / "_cols" / "trip%2Einfo" / "begin%2Fpoint" / "lon%2Edeg.b2nd" + assert leaf_path.exists() + + opened = blosc2.CTable.open(str(path)) + assert opened.col_names == [leaf_name] + assert opened[leaf_name][1] == 2.0 + + out = t.to_arrow() + assert out.schema.names == ["trip.info"] + assert out.column("trip.info").to_pylist()[1]["begin/point"]["lon.deg"] == 2.0 + + +def test_nested_struct_parquet_roundtrip(tmp_path): + 
trip_type = pa.struct([("begin", pa.struct([("lon", pa.float64()), ("lat", pa.float64())]))]) + schema = pa.schema([pa.field("trip", trip_type)]) + table = pa.table( + { + "trip": pa.array( + [ + {"begin": {"lon": 1.1, "lat": 2.2}}, + {"begin": {"lon": 3.3, "lat": 4.4}}, + {"begin": {"lon": 5.5, "lat": 6.6}}, + ], + type=trip_type, + ) + }, + schema=schema, + ) + + src = tmp_path / "src.parquet" + pq.write_table(table, src) + + t = blosc2.CTable.from_parquet(src) + assert t.col_names == ["trip.begin.lon", "trip.begin.lat"] + assert t[2].trip["begin"]["lon"] == 5.5 + + dst = tmp_path / "dst.parquet" + t.to_parquet(dst) + out = pq.read_table(dst) + assert out.num_rows == 3 + assert out.schema.names == ["trip"] + assert out.column("trip").to_pylist()[0]["begin"]["lon"] == 1.1 diff --git a/tests/ctable/test_nested_metadata_root.py b/tests/ctable/test_nested_metadata_root.py new file mode 100644 index 00000000..a8c61364 --- /dev/null +++ b/tests/ctable/test_nested_metadata_root.py @@ -0,0 +1,93 @@ +import pytest + +import blosc2 +from blosc2.schema_compiler import schema_from_dict, schema_to_dict + +try: + import pyarrow as pa +except ImportError: # pragma: no cover - optional dependency + pa = None + +pytestmark = pytest.mark.skipif(pa is None, reason="pyarrow is required for nested Arrow/Parquet tests") + + +def _table_with_empty_root_alias(): + md = {b"blosc2_empty_root_physical": b"root"} + schema = pa.schema([pa.field("root", pa.float64())]).with_metadata(md) + batch = pa.record_batch([pa.array([1.0, 2.0, 3.0])], schema=schema) + return blosc2.CTable.from_arrow(schema, [batch]) + + +def test_schema_version_2_with_nested_metadata_roundtrip(): + schema = pa.schema([pa.field("x.y", pa.float64())]) + batch = pa.record_batch([pa.array([1.0, 2.0])], schema=schema) + t = blosc2.CTable.from_arrow(schema, [batch]) + + d = schema_to_dict(t._schema) + assert d["version"] == 2 + assert "nested" in d["metadata"] + + restored = schema_from_dict(d) + assert 
restored.metadata["nested"]["physical_to_storage"]["x.y"] == "_cols/x/y" + + +def test_empty_root_metadata_exports_back_to_empty_arrow_name(): + t = _table_with_empty_root_alias() + out = t.to_arrow() + assert out.schema.names == [""] + + +def test_empty_root_logical_alias_getitem_select_and_index(): + t = _table_with_empty_root_alias() + assert t[""][0] == 1.0 + s = t.select([""]) + assert s.col_names == ["root"] + + ix = t.create_index(col_name="") + assert ix is not None + + # index management should accept logical alias too + t.rebuild_index(col_name="") + t.drop_index(col_name="") + + +def test_sort_by_nested_prefix_requires_leaf_column(): + schema = pa.schema([pa.field("trip.begin.lon", pa.float64()), pa.field("trip.begin.lat", pa.float64())]) + batch = pa.record_batch([pa.array([2.0, 1.0]), pa.array([20.0, 10.0])], schema=schema) + t = blosc2.CTable.from_arrow(schema, [batch]) + + with pytest.raises(ValueError): + t.sort_by("trip") + + s = t.sort_by("trip.begin.lon") + assert s["trip.begin.lon"][0] == 1.0 + + +@pytest.mark.heavy +def test_nested_ops_compat_matrix_smoke(): + n = 20_000 + lon = pa.array([float(i % 1000) for i in range(n)], type=pa.float64()) + lat = pa.array([float((i * 2) % 1000) for i in range(n)], type=pa.float64()) + fare = pa.array([float(i % 50) for i in range(n)], type=pa.float64()) + schema = pa.schema( + [ + pa.field("trip.begin.lon", pa.float64()), + pa.field("trip.begin.lat", pa.float64()), + pa.field("payment.fare", pa.float64()), + ] + ) + batch = pa.record_batch([lon, lat, fare], schema=schema) + + t = blosc2.CTable.from_arrow(schema, [batch]) + + view = t.where("payment.fare > 25") + assert 0 < view.nrows < n + + t.create_index(col_name="payment.fare") + t.rebuild_index(col_name="payment.fare") + + sorted_t = t.sort_by("trip.begin.lon") + assert sorted_t["trip.begin.lon"][0] <= sorted_t["trip.begin.lon"][1] + + proj = t.select(["trip"]) + assert proj.col_names == ["trip.begin.lon", "trip.begin.lat"] diff --git 
a/tests/ctable/test_parquet_interop.py b/tests/ctable/test_parquet_interop.py index 64f73f56..193eddf6 100644 --- a/tests/ctable/test_parquet_interop.py +++ b/tests/ctable/test_parquet_interop.py @@ -8,6 +8,7 @@ """Tests for CTable.to_parquet(), from_parquet(), iter_arrow_batches(), and from_arrow().""" +import io from dataclasses import dataclass import numpy as np @@ -662,6 +663,29 @@ def test_invalid_batch_size_from_parquet(self, tmp_path): with pytest.raises(ValueError, match="batch_size"): CTable.from_parquet(path, batch_size=0) + def test_invalid_max_rows_from_parquet(self, tmp_path): + t = CTable(Row, new_data=DATA10) + path = tmp_path / "x.parquet" + t.to_parquet(path) + with pytest.raises(ValueError, match="max_rows"): + CTable.from_parquet(path, max_rows=-1) + + def test_max_rows_from_parquet_limits_rows(self, tmp_path): + t = CTable(Row, new_data=DATA10) + path = tmp_path / "x.parquet" + t.to_parquet(path) + out = CTable.from_parquet(path, batch_size=4, max_rows=6) + assert len(out) == 6 + np.testing.assert_array_equal(out["id"][:], np.arange(6)) + + def test_max_rows_zero_from_parquet_imports_empty_table(self, tmp_path): + t = CTable(Row, new_data=DATA10) + path = tmp_path / "x.parquet" + t.to_parquet(path) + out = CTable.from_parquet(path, max_rows=0) + assert len(out) == 0 + assert out.col_names == ["id", "score", "active", "label"] + def test_string_truncation_error(self, tmp_path): """Importing longer strings than max_length raises ValueError.""" at = pa.table({"name": pa.array(["a" * 300, "b"], type=pa.string())}) @@ -672,6 +696,104 @@ def test_string_truncation_error(self, tmp_path): CTable.from_parquet(path, string_max_length=10) +def test_parquet_cli_progress_is_opt_in(tmp_path, capsys): + from blosc2.cli.parquet_to_blosc2 import main + + path = tmp_path / "progress.parquet" + out = tmp_path / "progress.b2d" + pq.write_table(pa.table({"x": pa.array([1, 2, 3], type=pa.int64())}), path) + + assert main(["--parquet-batch-size", "1", str(path), 
str(out)]) == 0 + captured = capsys.readouterr() + assert " batch" not in captured.out + + out_progress = tmp_path / "progress_enabled.b2d" + assert main(["--progress", "--parquet-batch-size", "1", str(path), str(out_progress)]) == 0 + captured = capsys.readouterr() + assert " batch" in captured.out + + +def test_parquet_cli_nested_progress_skips_write_lines(tmp_path, capsys): + from blosc2.cli.parquet_to_blosc2 import main + + buf, _ = _make_taxi_parquet_buf(n_outer_rows=3) + path = tmp_path / "taxi.parquet" + out = tmp_path / "taxi.b2d" + path.write_bytes(buf.getvalue()) + + assert ( + main( + [ + "--progress", + "--parquet-batch-size", + "1", + "--blosc2-batch-size", + "1", + str(path), + str(out), + ] + ) + == 0 + ) + captured = capsys.readouterr() + assert " parquet batch" in captured.out + assert " write" not in captured.out + + +def test_parquet_cli_separate_nested_flattens_top_level_structs(tmp_path, capsys): + from blosc2.cli.parquet_to_blosc2 import main + + trip_type = pa.struct( + [ + pa.field("sec", pa.float32()), + pa.field("begin", pa.struct([pa.field("lon", pa.float64()), pa.field("lat", pa.float64())])), + ] + ) + path = tmp_path / "struct.parquet" + out = tmp_path / "struct.b2d" + table = pa.table( + { + "trip": pa.array( + [ + {"sec": 10.0, "begin": {"lon": -87.6, "lat": 41.8}}, + {"sec": 20.0, "begin": {"lon": -87.7, "lat": 41.9}}, + ], + type=trip_type, + ), + "fare": pa.array([15.0, 25.0], type=pa.float32()), + } + ) + pq.write_table(table, path) + + assert main([str(path), str(out)]) == 0 + captured = capsys.readouterr() + assert "Struct→columns: 1" in captured.out + + ct = CTable.open(str(out), mode="r") + assert ct.col_names == ["trip.sec", "trip.begin.lon", "trip.begin.lat", "fare"] + np.testing.assert_allclose(ct["trip.begin.lon"][:], [-87.6, -87.7]) + ct.close() + + +def test_parquet_cli_no_separate_nested_preserves_top_level_struct_as_list(tmp_path): + from blosc2.cli.parquet_to_blosc2 import main + + trip_type = 
pa.struct([pa.field("sec", pa.float32())]) + path = tmp_path / "struct.parquet" + out = tmp_path / "struct.b2d" + pq.write_table( + pa.table({"trip": pa.array([{"sec": 10.0}, {"sec": 20.0}], type=trip_type)}), + path, + ) + + assert main(["--no-separate-nested-cols", str(path), str(out)]) == 0 + + ct = CTable.open(str(out), mode="r") + assert ct.col_names == ["trip"] + assert ct["trip"][:] == [[{"sec": 10.0}], [{"sec": 20.0}]] + ct.close() + + def test_parquet_cli_timestamp_unit_auto(tmp_path): from blosc2.cli.parquet_to_blosc2 import main @@ -696,5 +818,495 @@ def test_parquet_cli_timestamp_unit_auto(tmp_path): assert table._cols["ts"][:].tolist() == [1735689600, 1735689601, 1735689602] +# --------------------------------------------------------------------------- +# separate_nested_cols / unnamed-root list> import +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Shared schema / data helpers +# --------------------------------------------------------------------------- + + +def _make_taxi_schema(): + """Return a simplified taxi-like Arrow schema (inner struct fields).""" + trip_type = pa.struct( + [ + pa.field("sec", pa.float32()), + pa.field( + "begin", + pa.struct([pa.field("lon", pa.float64()), pa.field("lat", pa.float64())]), + ), + ] + ) + payment_type = pa.struct( + [ + pa.field("fare", pa.float64()), + pa.field("tips", pa.float64()), + ] + ) + return pa.struct( + [ + pa.field("trip", trip_type), + pa.field("payment", payment_type), + pa.field("company", pa.string()), + ] + ) + + +def _make_taxi_parquet_buf(n_outer_rows=2): + """Create an in-memory Parquet buffer with an unnamed root list>. + + *n_outer_rows* controls how many Parquet rows (outer lists) to create. + Each outer list contains 1–3 trip records. 
+ """ + root_struct = _make_taxi_schema() + root_list = pa.list_(root_struct) + + all_rows = [ + [ + { + "trip": {"sec": 10.0, "begin": {"lon": -87.6, "lat": 41.8}}, + "payment": {"fare": 15.0, "tips": 2.0}, + "company": "Taxi Corp", + }, + { + "trip": {"sec": 20.0, "begin": {"lon": -87.7, "lat": 41.9}}, + "payment": {"fare": 25.0, "tips": 3.0}, + "company": "Blue Cab", + }, + ], + [ + { + "trip": {"sec": 5.0, "begin": {"lon": -87.5, "lat": 41.7}}, + "payment": {"fare": 10.0, "tips": 1.0}, + "company": "Taxi Corp", + }, + ], + [ + { + "trip": {"sec": 30.0, "begin": {"lon": -87.3, "lat": 41.6}}, + "payment": {"fare": 5.0, "tips": 0.5}, + "company": "City Cab", + }, + { + "trip": {"sec": 15.0, "begin": {"lon": -87.4, "lat": 41.5}}, + "payment": {"fare": 12.0, "tips": 1.5}, + "company": "Blue Cab", + }, + { + "trip": {"sec": 8.0, "begin": {"lon": -87.2, "lat": 41.4}}, + "payment": {"fare": 9.0, "tips": 0.0}, + "company": "Taxi Corp", + }, + ], + ] + rows = all_rows[:n_outer_rows] + arr = pa.array(rows, type=root_list) + buf = io.BytesIO() + pq.write_table(pa.table({"": arr}), buf) + buf.seek(0) + return buf, rows + + +def _count_elements(rows): + """Count the total number of list elements across outer rows.""" + return sum(len(r) for r in rows) + + +# --------------------------------------------------------------------------- +# Detection helper tests +# --------------------------------------------------------------------------- + + +class TestDetectUnnamedRootListStruct: + def test_detects_single_unnamed_list_struct(self): + root_struct = _make_taxi_schema() + schema = pa.schema([pa.field("", pa.list_(root_struct))]) + assert CTable._detect_unnamed_root_list_struct(pa, schema) is True + + def test_detects_large_list_variant(self): + root_struct = _make_taxi_schema() + schema = pa.schema([pa.field("", pa.large_list(root_struct))]) + assert CTable._detect_unnamed_root_list_struct(pa, schema) is True + + def test_rejects_named_field(self): + root_struct = 
_make_taxi_schema() + schema = pa.schema([pa.field("events", pa.list_(root_struct))]) + assert CTable._detect_unnamed_root_list_struct(pa, schema) is False + + def test_rejects_multiple_fields(self): + root_struct = _make_taxi_schema() + schema = pa.schema([pa.field("", pa.list_(root_struct)), pa.field("id", pa.int64())]) + assert CTable._detect_unnamed_root_list_struct(pa, schema) is False + + def test_rejects_non_list_unnamed_field(self): + root_struct = _make_taxi_schema() + schema = pa.schema([pa.field("", root_struct)]) + assert CTable._detect_unnamed_root_list_struct(pa, schema) is False + + def test_rejects_list_of_scalar(self): + schema = pa.schema([pa.field("", pa.list_(pa.int64()))]) + assert CTable._detect_unnamed_root_list_struct(pa, schema) is False + + +# --------------------------------------------------------------------------- +# Phase 1 acceptance tests +# --------------------------------------------------------------------------- + + +class TestUnnamedRootImport: + """Acceptance tests for Phase 1: unnamed-root list> import.""" + + def _make_ct(self, n_outer_rows=2, **kwargs): + buf, rows = _make_taxi_parquet_buf(n_outer_rows) + ct = CTable.from_parquet(buf, separate_nested_cols=True, **kwargs) + return ct, rows + + # ------------------------------------------------------------------ + # Row count + # ------------------------------------------------------------------ + + def test_nrows_equals_element_count_2_outer(self): + ct, rows = self._make_ct(n_outer_rows=2) + assert len(ct) == _count_elements(rows) # 3 + + def test_from_parquet_separates_nested_cols_by_default(self): + buf, rows = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf) + assert len(ct) == _count_elements(rows) + assert "column_0" not in ct.col_names + assert "trip.begin.lon" in ct.col_names + + def test_nrows_equals_element_count_3_outer(self): + ct, rows = self._make_ct(n_outer_rows=3) + assert len(ct) == _count_elements(rows) # 6 + + def 
test_max_rows_limits_flattened_element_rows(self): + ct, rows = self._make_ct(n_outer_rows=3, max_rows=4, batch_size=1) + expected = [r["payment"]["fare"] for outer in rows for r in outer][:4] + assert len(ct) == 4 + np.testing.assert_allclose(ct["payment.fare"][:].tolist(), expected) + assert ct._schema.metadata["nested"]["original_root"]["kind"] == "unnamed_list_struct" + + def test_max_rows_zero_imports_empty_flattened_table(self): + ct, _ = self._make_ct(n_outer_rows=3, max_rows=0) + assert len(ct) == 0 + assert "column_0" not in ct.col_names + assert "trip.begin.lon" in ct.col_names + assert ct._schema.metadata["nested"]["original_root"]["kind"] == "unnamed_list_struct" + + # ------------------------------------------------------------------ + # Column names — no column_0, no unnamed root in col_names + # ------------------------------------------------------------------ + + def test_col_names_no_column_0(self): + ct, _ = self._make_ct() + assert "column_0" not in ct.col_names + assert "" not in ct.col_names + + def test_col_names_contains_leaf_paths(self): + ct, _ = self._make_ct() + expected = { + "trip.sec", + "trip.begin.lon", + "trip.begin.lat", + "payment.fare", + "payment.tips", + "company", + } + assert set(ct.col_names) == expected + + # ------------------------------------------------------------------ + # Column access and analytics + # ------------------------------------------------------------------ + + def test_payment_fare_mean(self): + ct, rows = self._make_ct(n_outer_rows=2) + fares = [r["payment"]["fare"] for outer in rows for r in outer] + expected = np.mean(fares) + np.testing.assert_allclose(ct["payment.fare"].mean(), expected) + + def test_trip_begin_lon_mean(self): + ct, rows = self._make_ct(n_outer_rows=2) + lons = [r["trip"]["begin"]["lon"] for outer in rows for r in outer] + expected = np.mean(lons) + np.testing.assert_allclose(ct["trip.begin.lon"].mean(), expected) + + def test_payment_fare_values(self): + ct, rows = 
self._make_ct(n_outer_rows=2) + expected = [r["payment"]["fare"] for outer in rows for r in outer] + np.testing.assert_allclose(ct["payment.fare"][:].tolist(), expected) + + def test_company_column_values(self): + ct, rows = self._make_ct(n_outer_rows=2) + expected = [r["company"] for outer in rows for r in outer] + assert list(ct["company"][:]) == expected + + # ------------------------------------------------------------------ + # where() filtering + # ------------------------------------------------------------------ + + def test_where_payment_fare_gt_12(self): + ct, rows = self._make_ct(n_outer_rows=2) + all_fares = [r["payment"]["fare"] for outer in rows for r in outer] + expected_count = sum(1 for f in all_fares if f > 12) + result = ct.where("payment.fare > 12") + assert len(result) == expected_count + + def test_where_payment_fare_gt_20(self): + ct, rows = self._make_ct(n_outer_rows=2) + all_fares = [r["payment"]["fare"] for outer in rows for r in outer] + expected_count = sum(1 for f in all_fares if f > 20) + result = ct.where("payment.fare > 20") + assert len(result) == expected_count + + # ------------------------------------------------------------------ + # Provenance metadata + # ------------------------------------------------------------------ + + def test_original_root_metadata_present(self): + ct, _ = self._make_ct() + nested = ct._schema.metadata.get("nested", {}) + assert "original_root" in nested + + def test_original_root_metadata_kind(self): + ct, _ = self._make_ct() + orig = ct._schema.metadata["nested"]["original_root"] + assert orig["kind"] == "unnamed_list_struct" + + def test_original_root_metadata_field_name(self): + ct, _ = self._make_ct() + orig = ct._schema.metadata["nested"]["original_root"] + assert orig["field_name"] == "" + + def test_original_root_metadata_preserve_grouping_false(self): + ct, _ = self._make_ct() + orig = ct._schema.metadata["nested"]["original_root"] + assert orig["preserve_grouping"] is False + + # 
------------------------------------------------------------------ + # Persistence: .b2d reopen + # ------------------------------------------------------------------ + + def test_b2d_reopen_nrows(self, tmp_path): + buf, rows = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf, separate_nested_cols=True, urlpath=str(tmp_path / "taxi.b2d")) + ct.close() + ct2 = CTable.open(str(tmp_path / "taxi.b2d"), mode="r") + assert len(ct2) == _count_elements(rows) + ct2.close() + + def test_b2d_reopen_col_names(self, tmp_path): + buf, _ = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf, separate_nested_cols=True, urlpath=str(tmp_path / "taxi.b2d")) + col_names = ct.col_names + ct.close() + ct2 = CTable.open(str(tmp_path / "taxi.b2d"), mode="r") + assert ct2.col_names == col_names + ct2.close() + + def test_b2d_reopen_values(self, tmp_path): + buf, rows = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf, separate_nested_cols=True, urlpath=str(tmp_path / "taxi.b2d")) + expected_fares = [r["payment"]["fare"] for outer in rows for r in outer] + ct.close() + ct2 = CTable.open(str(tmp_path / "taxi.b2d"), mode="r") + np.testing.assert_allclose(ct2["payment.fare"][:].tolist(), expected_fares) + ct2.close() + + def test_b2d_reopen_original_root_metadata(self, tmp_path): + buf, _ = _make_taxi_parquet_buf() + ct = CTable.from_parquet(buf, separate_nested_cols=True, urlpath=str(tmp_path / "taxi.b2d")) + ct.close() + ct2 = CTable.open(str(tmp_path / "taxi.b2d"), mode="r") + orig = ct2._schema.metadata["nested"]["original_root"] + assert orig["kind"] == "unnamed_list_struct" + ct2.close() + + def test_b2z_reopen(self, tmp_path): + buf, rows = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf, separate_nested_cols=True, urlpath=str(tmp_path / "taxi.b2z")) + ct.close() + ct2 = CTable.open(str(tmp_path / "taxi.b2z"), mode="r") + assert len(ct2) == _count_elements(rows) + assert "trip.begin.lon" in ct2.col_names + 
ct2.close() + + # ------------------------------------------------------------------ + # to_arrow() emits clean logical nested table + # ------------------------------------------------------------------ + + def test_to_arrow_no_unnamed_column(self): + ct, _ = self._make_ct() + arrow_table = ct.to_arrow() + assert "" not in arrow_table.schema.names + assert "column_0" not in arrow_table.schema.names + + def test_to_arrow_has_trip_and_payment_top_level(self): + ct, _ = self._make_ct() + arrow_table = ct.to_arrow() + names = arrow_table.schema.names + assert "trip" in names + assert "payment" in names + assert "company" in names + + def test_to_arrow_trip_is_struct(self): + ct, _ = self._make_ct() + arrow_table = ct.to_arrow() + assert pa.types.is_struct(arrow_table.schema.field("trip").type) + + def test_to_arrow_payment_fare_values(self): + ct, rows = self._make_ct(n_outer_rows=2) + arrow_table = ct.to_arrow() + expected = [r["payment"]["fare"] for outer in rows for r in outer] + payment_col = arrow_table.column("payment") + actual = [row.as_py()["fare"] for row in payment_col] + np.testing.assert_allclose(actual, expected) + + # ------------------------------------------------------------------ + # from_arrow with separate_nested_cols=True + # ------------------------------------------------------------------ + + def test_from_arrow_separate_nested_cols(self): + """from_arrow accepts separate_nested_cols=True directly.""" + root_struct = _make_taxi_schema() + root_list = pa.list_(root_struct) + data = [ + [ + { + "trip": {"sec": 10.0, "begin": {"lon": -87.6, "lat": 41.8}}, + "payment": {"fare": 15.0, "tips": 2.0}, + "company": "Taxi", + }, + { + "trip": {"sec": 5.0, "begin": {"lon": -87.5, "lat": 41.7}}, + "payment": {"fare": 10.0, "tips": 1.0}, + "company": "Taxi", + }, + ] + ] + arr = pa.array(data, type=root_list) + schema = pa.schema([pa.field("", root_list)]) + batch = pa.record_batch([arr], schema=schema) + ct = CTable.from_arrow(schema, [batch], 
separate_nested_cols=True) + assert len(ct) == 2 + assert "trip.begin.lon" in ct.col_names + assert "payment.fare" in ct.col_names + np.testing.assert_allclose(ct["payment.fare"][:].tolist(), [15.0, 10.0]) + + # ------------------------------------------------------------------ + # Behaviour when separate_nested_cols=False (existing behaviour) + # ------------------------------------------------------------------ + + def test_false_flag_gives_renamed_root_column(self): + """Without separate_nested_cols, the old renaming behaviour applies.""" + buf, _ = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf, separate_nested_cols=False) + # The unnamed "" field should be renamed to "root" + assert "root" in ct.col_names + + def test_false_flag_nrows_equals_parquet_rows(self): + """Without separate_nested_cols, nrows is the number of Parquet outer rows.""" + buf, rows = _make_taxi_parquet_buf(n_outer_rows=2) + ct = CTable.from_parquet(buf, separate_nested_cols=False) + # 2 Parquet rows, not 3 elements + assert len(ct) == len(rows) + + # ------------------------------------------------------------------ + # Edge cases + # ------------------------------------------------------------------ + + def test_empty_outer_list(self): + """Importing a Parquet file where all outer lists are empty gives 0 rows.""" + root_struct = _make_taxi_schema() + root_list = pa.list_(root_struct) + arr = pa.array([[], []], type=root_list) + buf = io.BytesIO() + pq.write_table(pa.table({"": arr}), buf) + buf.seek(0) + ct = CTable.from_parquet(buf, separate_nested_cols=True) + assert len(ct) == 0 + assert set(ct.col_names) == { + "trip.sec", + "trip.begin.lon", + "trip.begin.lat", + "payment.fare", + "payment.tips", + "company", + } + + def test_single_element(self): + """A single-element list imports as one CTable row.""" + root_struct = _make_taxi_schema() + root_list = pa.list_(root_struct) + arr = pa.array( + [ + [ + { + "trip": {"sec": 7.0, "begin": {"lon": -87.0, "lat": 
41.0}}, + "payment": {"fare": 8.0, "tips": 0.5}, + "company": "X", + } + ] + ], + type=root_list, + ) + buf = io.BytesIO() + pq.write_table(pa.table({"": arr}), buf) + buf.seek(0) + ct = CTable.from_parquet(buf, separate_nested_cols=True) + assert len(ct) == 1 + assert ct["payment.fare"][0] == 8.0 + + def test_non_qualifying_schema_ignored_with_flag(self): + """separate_nested_cols=True is silently ignored for a normal (non-qualifying) schema.""" + at = pa.table({"x": pa.array([1, 2, 3], type=pa.int64()), "y": pa.array([4.0, 5.0, 6.0])}) + buf = io.BytesIO() + pq.write_table(at, buf) + buf.seek(0) + ct = CTable.from_parquet(buf, separate_nested_cols=True) + assert len(ct) == 3 + assert ct.col_names == ["x", "y"] + + def test_multiple_batches(self): + """separate_nested_cols works when Parquet is read in small batches.""" + buf, rows = _make_taxi_parquet_buf(n_outer_rows=3) + ct = CTable.from_parquet(buf, separate_nested_cols=True, batch_size=1) + assert len(ct) == _count_elements(rows) + fares = [r["payment"]["fare"] for outer in rows for r in outer] + np.testing.assert_allclose(ct["payment.fare"][:].tolist(), fares) + + def test_nested_list_inside_element_ignored_at_phase1(self): + """A nested list inside the element struct is imported as a ListArray column (phase 1).""" + path_type = pa.struct([pa.field("londiff", pa.float32()), pa.field("latdiff", pa.float32())]) + trip_with_path = pa.struct( + [ + pa.field("sec", pa.float32()), + pa.field("path", pa.list_(path_type)), + ] + ) + root_struct = pa.struct([pa.field("trip", trip_with_path), pa.field("fare", pa.float64())]) + root_list = pa.list_(root_struct) + data = [ + [ + {"trip": {"sec": 10.0, "path": [{"londiff": 0.1, "latdiff": 0.2}]}, "fare": 15.0}, + {"trip": {"sec": 5.0, "path": []}, "fare": 8.0}, + ] + ] + arr = pa.array(data, type=root_list) + buf = io.BytesIO() + pq.write_table(pa.table({"": arr}), buf) + buf.seek(0) + ct = CTable.from_parquet(buf, separate_nested_cols=True) + assert len(ct) == 2 + 
assert "fare" in ct.col_names + assert ct["fare"][:].tolist() == [15.0, 8.0] + # trip.path should be a ListArray column with one list per element row + assert "trip.path" in ct.col_names + assert ct["trip.path"].is_list + assert ct["trip.path"][0] == [{"londiff": pytest.approx(0.1), "latdiff": pytest.approx(0.2)}] + assert ct["trip.path"][1] == [] + + if __name__ == "__main__": pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_table_persistency.py b/tests/ctable/test_table_persistency.py index 3873318c..2f10d493 100644 --- a/tests/ctable/test_table_persistency.py +++ b/tests/ctable/test_table_persistency.py @@ -459,15 +459,10 @@ class Bad: CTable(Bad) -def test_column_name_cannot_contain_slash(): - @dataclass - class Bad: - pass - +def test_column_name_can_contain_slash(): from blosc2.schema_compiler import _validate_column_name - with pytest.raises(ValueError, match="/"): - _validate_column_name("a/b") + _validate_column_name("a/b") def test_column_name_cannot_be_empty(): diff --git a/tests/ctable/test_varlen_columns.py b/tests/ctable/test_varlen_columns.py index 11d643a2..4a527d6e 100644 --- a/tests/ctable/test_varlen_columns.py +++ b/tests/ctable/test_varlen_columns.py @@ -38,6 +38,16 @@ def test_ctable_varlen_append_extend_and_reads(): assert t.tags[2] == ["r", "s"] +def test_list_column_display(): + t = blosc2.CTable(Product, new_data=DATA) + text = str(t) + col_text = repr(t.tags) + + assert "['x', 'y']" in text + assert "" in col_text + assert "['x', 'y']" not in col_text + + def test_ctable_varlen_where_select_head_tail_and_compact(): t = blosc2.CTable(Product, new_data=DATA) view = t.where(t.qty >= 2) diff --git a/tests/ndarray/test_elementwise_funcs.py b/tests/ndarray/test_elementwise_funcs.py index 85d6edca..c82705f2 100644 --- a/tests/ndarray/test_elementwise_funcs.py +++ b/tests/ndarray/test_elementwise_funcs.py @@ -314,6 +314,7 @@ def test_unary_funcs(np_func, blosc_func, dtype, shape, chunkshape): _test_unary_func_impl(np_func, blosc_func, 
dtype, shape, chunkshape) +@pytest.mark.heavy @pytest.mark.parametrize(("np_func", "blosc_func"), UNARY_FUNC_PAIRS) @pytest.mark.parametrize("dtype", STR_DTYPES) @pytest.mark.parametrize("shape", [(10,), (20, 20)]) @@ -338,6 +339,7 @@ def test_binary_funcs(np_func, blosc_func, dtype, shape, chunkshape): _test_binary_func_impl(np_func, blosc_func, dtype, shape, chunkshape) +@pytest.mark.heavy @pytest.mark.parametrize(("np_func", "blosc_func"), BINARY_FUNC_PAIRS) @pytest.mark.parametrize("dtype", STR_DTYPES) @pytest.mark.parametrize(("shape", "chunkshape"), SHAPES_CHUNKS) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 6525a07d..7b33cdff 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -27,6 +27,37 @@ NITEMS_SMALL = 100 NITEMS = 1000 +_UNARY_FUNCTIONS = [ + "sin", + "cos", + "sqrt", + "tan", + "arctan", + "exp", + "log", + "conj", + "real", + "imag", + pytest.param("sinh", marks=pytest.mark.heavy), + pytest.param("cosh", marks=pytest.mark.heavy), + pytest.param("tanh", marks=pytest.mark.heavy), + pytest.param("arcsin", marks=pytest.mark.heavy), + pytest.param("arccos", marks=pytest.mark.heavy), + pytest.param("arcsinh", marks=pytest.mark.heavy), + pytest.param("arccosh", marks=pytest.mark.heavy), + pytest.param("arctanh", marks=pytest.mark.heavy), + pytest.param("expm1", marks=pytest.mark.heavy), + pytest.param("log10", marks=pytest.mark.heavy), + pytest.param("log1p", marks=pytest.mark.heavy), +] + +_LAZYEXPR_OPERAND_MIXES = [ + ("NDArray", "numpy"), + ("NDArray", "NDArray"), + pytest.param(("numpy", "NDArray"), marks=pytest.mark.heavy), + pytest.param(("numpy", "numpy"), marks=pytest.mark.heavy), +] + @pytest.fixture(params=[np.float32, np.float64]) def dtype_fixture(request): @@ -357,32 +388,7 @@ def test_comparison_operators(dtype_fixture, compare_expressions, comparison_ope # Skip this test for blosc2.IS_WASM @pytest.mark.skipif(blosc2.IS_WASM, reason="This test is not supported in WASM") 
-@pytest.mark.parametrize( - "function", - [ - "sin", - "cos", - "tan", - "sqrt", - "sinh", - "cosh", - "tanh", - "arcsin", - "arccos", - "arctan", - "arcsinh", - "arccosh", - "arctanh", - "exp", - "expm1", - "log", - "log10", - "log1p", - "conj", - "real", - "imag", - ], -) +@pytest.mark.parametrize("function", _UNARY_FUNCTIONS) def test_functions(function, dtype_fixture, shape_fixture): nelems = np.prod(shape_fixture) cparams = {"clevel": 0, "codec": blosc2.Codec.LZ4} # Compression parameters @@ -436,10 +442,7 @@ def test_functions(function, dtype_fixture, shape_fixture): np.testing.assert_allclose(expr[()], res_numexpr, rtol=1e-5) -@pytest.mark.parametrize( - "urlpath", - ["arr.b2nd", None], -) +@pytest.mark.parametrize("urlpath", [None, pytest.param("arr.b2nd", marks=pytest.mark.heavy)]) @pytest.mark.parametrize( "function", ["arctan2", "**"], @@ -784,15 +787,15 @@ def test_save_unsafe(): [ "sin", "sqrt", - "cosh", "arctan", - "arcsinh", "exp", - "expm1", "log", "conj", "real", "imag", + pytest.param("cosh", marks=pytest.mark.heavy), + pytest.param("arcsinh", marks=pytest.mark.heavy), + pytest.param("expm1", marks=pytest.mark.heavy), ], ) def test_save_functions(function, dtype_fixture, shape_fixture): @@ -911,10 +914,19 @@ def test_save_many_functions(dtype_fixture, shape_fixture): @pytest.mark.skipif(blosc2.IS_WASM, reason="This test is not supported in WASM") @pytest.mark.parametrize( - "constructor", ["arange", "linspace", "fromiter", "reshape", "zeros", "ones", "full"] + "constructor", + [ + "arange", + "linspace", + "reshape", + "zeros", + "ones", + pytest.param("fromiter", marks=pytest.mark.heavy), + pytest.param("full", marks=pytest.mark.heavy), + ], ) -@pytest.mark.parametrize("shape", [(10,), (10, 10), (10, 10, 10)]) -@pytest.mark.parametrize("dtype", ["int32", "float64", "i2"]) +@pytest.mark.parametrize("shape", [(10,), (10, 10), pytest.param((10, 10, 10), marks=pytest.mark.heavy)]) +@pytest.mark.parametrize("dtype", ["int32", "float64", 
pytest.param("i2", marks=pytest.mark.heavy)]) @pytest.mark.parametrize("disk", [True, False]) def test_save_constructor(disk, shape, dtype, constructor): lshape = math.prod(shape) @@ -1129,15 +1141,7 @@ def test_broadcasting_str(broadcast_fixture): np.testing.assert_allclose(res, nres) -@pytest.mark.parametrize( - "operand_mix", - [ - ("NDArray", "numpy"), - ("NDArray", "NDArray"), - ("numpy", "NDArray"), - ("numpy", "numpy"), - ], -) +@pytest.mark.parametrize("operand_mix", _LAZYEXPR_OPERAND_MIXES) @pytest.mark.parametrize("operand_guess", [True, False]) def test_lazyexpr(array_fixture, operand_mix, operand_guess): a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture @@ -1179,15 +1183,7 @@ def test_lazyexpr(array_fixture, operand_mix, operand_guess): np.testing.assert_allclose(res, nres[0:10:2]) -@pytest.mark.parametrize( - "operand_mix", - [ - ("NDArray", "numpy"), - ("NDArray", "NDArray"), - ("numpy", "NDArray"), - ("numpy", "numpy"), - ], -) +@pytest.mark.parametrize("operand_mix", _LAZYEXPR_OPERAND_MIXES) @pytest.mark.parametrize( "out_param", ["NDArray", "numpy"], @@ -1405,38 +1401,35 @@ def test_get_expr_operands(expression, expected_operands): "scalar", [ "np.int8(0)", - "np.uint8(0)", - "np.int16(0)", - "np.uint16(0)", - "np.int32(0)", - "np.uint32(0)", - "np.int64(0)", "np.float32(0)", "np.float64(0)", "np.complex64(0)", - "np.complex128(0)", + pytest.param("np.uint8(0)", marks=pytest.mark.heavy), + pytest.param("np.int16(0)", marks=pytest.mark.heavy), + pytest.param("np.uint16(0)", marks=pytest.mark.heavy), + pytest.param("np.int32(0)", marks=pytest.mark.heavy), + pytest.param("np.uint32(0)", marks=pytest.mark.heavy), + pytest.param("np.int64(0)", marks=pytest.mark.heavy), + pytest.param("np.complex128(0)", marks=pytest.mark.heavy), ], ) @pytest.mark.parametrize( ("dtype1", "dtype2"), [ (np.int8, np.int8), - (np.int8, np.int16), - (np.int8, np.int32), - (np.int8, np.int64), (np.int8, np.float32), - (np.int8, np.float64), - (np.uint16, np.uint16), 
(np.uint16, np.uint32), - # (np.uint16, np.uint64), # numexpr does not support uint64 - (np.uint16, np.float32), - # (np.uint16, np.float64), - # (np.int32, np.int32), - (np.int32, np.int64), - (np.float32, np.float32), (np.float32, np.float64), - (np.complex64, np.complex64), (np.complex64, np.complex128), + pytest.param(np.int8, np.int16, marks=pytest.mark.heavy), + pytest.param(np.int8, np.int32, marks=pytest.mark.heavy), + pytest.param(np.int8, np.int64, marks=pytest.mark.heavy), + pytest.param(np.int8, np.float64, marks=pytest.mark.heavy), + pytest.param(np.uint16, np.uint16, marks=pytest.mark.heavy), + pytest.param(np.uint16, np.float32, marks=pytest.mark.heavy), + pytest.param(np.int32, np.int64, marks=pytest.mark.heavy), + pytest.param(np.float32, np.float32, marks=pytest.mark.heavy), + pytest.param(np.complex64, np.complex64, marks=pytest.mark.heavy), ], ) def test_dtype_infer(dtype1, dtype2, scalar): diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py index 7839d84b..ee1f738a 100644 --- a/tests/ndarray/test_reductions.py +++ b/tests/ndarray/test_reductions.py @@ -16,6 +16,52 @@ NITEMS_SMALL = 1000 NITEMS = 10_000 +_FAST_REDUCTION_OPS = [ + "sum", + "prod", + "min", + "max", + "any", + "mean", + "argmax", + "cumulative_sum", + pytest.param("all", marks=pytest.mark.heavy), + pytest.param("std", marks=pytest.mark.heavy), + pytest.param("var", marks=pytest.mark.heavy), + pytest.param("argmin", marks=pytest.mark.heavy), + pytest.param("cumulative_prod", marks=pytest.mark.heavy), +] + +_SAVE_REDUCTION_OPS = [ + "sum", + "prod", + "min", + "mean", + "argmax", + "cumulative_sum", + pytest.param("max", marks=pytest.mark.heavy), + pytest.param("any", marks=pytest.mark.heavy), + pytest.param("all", marks=pytest.mark.heavy), + pytest.param("std", marks=pytest.mark.heavy), + pytest.param("var", marks=pytest.mark.heavy), + pytest.param("argmin", marks=pytest.mark.heavy), + pytest.param("cumulative_prod", marks=pytest.mark.heavy), +] + 
+_MINIEXPR_REDUCTION_OPS = [ + "sum", + "prod", + "min", + "mean", + "argmax", + pytest.param("max", marks=pytest.mark.heavy), + pytest.param("any", marks=pytest.mark.heavy), + pytest.param("all", marks=pytest.mark.heavy), + pytest.param("std", marks=pytest.mark.heavy), + pytest.param("var", marks=pytest.mark.heavy), + pytest.param("argmin", marks=pytest.mark.heavy), +] + @pytest.fixture(params=[np.float32, np.float64]) def dtype_fixture(request): @@ -189,7 +235,19 @@ def test_reduce_params(array_fixture, axis, keepdims, dtype_out, reduce_op, kwar # TODO: "prod" is not supported here because it overflows with current values @pytest.mark.parametrize( "reduce_op", - ["cumulative_sum", "sum", "min", "max", "mean", "std", "var", "any", "all", "argmax", "argmin"], + [ + "cumulative_sum", + "sum", + "min", + "mean", + "argmax", + pytest.param("max", marks=pytest.mark.heavy), + pytest.param("std", marks=pytest.mark.heavy), + pytest.param("var", marks=pytest.mark.heavy), + pytest.param("any", marks=pytest.mark.heavy), + pytest.param("all", marks=pytest.mark.heavy), + pytest.param("argmin", marks=pytest.mark.heavy), + ], ) @pytest.mark.parametrize("axis", [None, 0, 1]) def test_reduce_expr_arr(array_fixture, axis, reduce_op): @@ -223,23 +281,7 @@ def test_reduce_expr_arr(array_fixture, axis, reduce_op): # Test broadcasting -@pytest.mark.parametrize( - "reduce_op", - [ - "sum", - "mean", - "std", - "var", - "min", - "max", - "any", - "all", - "argmax", - "argmin", - "cumulative_sum", - "cumulative_prod", - ], -) +@pytest.mark.parametrize("reduce_op", _FAST_REDUCTION_OPS) @pytest.mark.parametrize("axis", [0, (0, 1), None]) @pytest.mark.parametrize("keepdims", [True, False]) @pytest.mark.parametrize( @@ -247,7 +289,7 @@ def test_reduce_expr_arr(array_fixture, axis, reduce_op): [ ((5, 5, 5), (5, 5), (5,)), ((10, 10, 10), (10, 10), (10,)), - ((100, 100, 100), (100, 100), (100,)), + pytest.param(((100, 100, 100), (100, 100), (100,)), marks=pytest.mark.heavy), ], ) def 
test_broadcast_params(axis, keepdims, reduce_op, shapes): @@ -393,32 +435,15 @@ def test_reduce_slice(reduce_op): [ ((10, 50, 70), (10, 25, 50)), ((20, 50, 100), (10, 50, 100)), - ((10, 50, 100), (6, 25, 75)), - ((15, 30, 75), (7, 20, 50)), - ((1, 50, 100), (1, 50, 60)), + pytest.param((10, 50, 100), (6, 25, 75), marks=pytest.mark.heavy), + pytest.param((15, 30, 75), (7, 20, 50), marks=pytest.mark.heavy), + pytest.param((1, 50, 100), (1, 50, 60), marks=pytest.mark.heavy), ], ) @pytest.mark.parametrize("disk", [True, False]) -@pytest.mark.parametrize("fill_value", [1, 0, 0.32]) -@pytest.mark.parametrize( - "reduce_op", - [ - "sum", - "prod", - "min", - "max", - "any", - "all", - "mean", - "std", - "var", - "argmax", - "argmin", - "cumulative_sum", - "cumulative_prod", - ], -) -@pytest.mark.parametrize("axis", [None, 0, 1]) +@pytest.mark.parametrize("fill_value", [1, 0, pytest.param(0.32, marks=pytest.mark.heavy)]) +@pytest.mark.parametrize("reduce_op", _FAST_REDUCTION_OPS) +@pytest.mark.parametrize("axis", [None, 0, pytest.param(1, marks=pytest.mark.heavy)]) def test_fast_path(chunks, blocks, disk, fill_value, reduce_op, axis): shape = (20, 50, 100) urlpath = "a1.b2nd" if disk else None @@ -455,15 +480,13 @@ def test_fast_path(chunks, blocks, disk, fill_value, reduce_op, axis): ("chunks", "blocks"), [ ((2, 5, 10), (1, 5, 10)), - ((1, 3, 7), (1, 3, 5)), - ((5, 6, 10), (3, 3, 7)), + pytest.param((1, 3, 7), (1, 3, 5), marks=pytest.mark.heavy), + pytest.param((5, 6, 10), (3, 3, 7), marks=pytest.mark.heavy), ], ) @pytest.mark.parametrize("disk", [True, False]) -@pytest.mark.parametrize("fill_value", [0, 1, 0.32]) -@pytest.mark.parametrize( - "reduce_op", ["sum", "prod", "min", "max", "any", "all", "mean", "std", "var", "argmax", "argmin"] -) +@pytest.mark.parametrize("fill_value", [0, 1, pytest.param(0.32, marks=pytest.mark.heavy)]) +@pytest.mark.parametrize("reduce_op", _MINIEXPR_REDUCTION_OPS) def test_miniexpr_slice(chunks, blocks, disk, fill_value, reduce_op): shape 
= (10, 10, 12) axis = None @@ -486,26 +509,11 @@ def test_miniexpr_slice(chunks, blocks, disk, fill_value, reduce_op): @pytest.mark.parametrize("disk", [True, False]) -@pytest.mark.parametrize("fill_value", [0, 1, 0.32]) @pytest.mark.parametrize( - "reduce_op", - [ - "sum", - "prod", - "min", - "max", - "any", - "all", - "mean", - "std", - "var", - "argmax", - "argmin", - "cumulative_sum", - "cumulative_prod", - ], + "fill_value", [1, pytest.param(0, marks=pytest.mark.heavy), pytest.param(0.32, marks=pytest.mark.heavy)] ) -@pytest.mark.parametrize("axis", [0, (0, 1), None]) +@pytest.mark.parametrize("reduce_op", _SAVE_REDUCTION_OPS) +@pytest.mark.parametrize("axis", [0, None, pytest.param((0, 1), marks=pytest.mark.heavy)]) def test_save_version1(disk, fill_value, reduce_op, axis): shape = (20, 50, 100) if reduce_op in ("argmax", "argmin", "cumulative_sum", "cumulative_prod"): @@ -547,26 +555,11 @@ def test_save_version1(disk, fill_value, reduce_op, axis): @pytest.mark.parametrize("disk", [True, False]) -@pytest.mark.parametrize("fill_value", [0, 1, 0.32]) @pytest.mark.parametrize( - "reduce_op", - [ - "sum", - "prod", - "min", - "max", - "any", - "all", - "mean", - "std", - "var", - "argmax", - "argmin", - "cumulative_sum", - "cumulative_prod", - ], + "fill_value", [1, pytest.param(0, marks=pytest.mark.heavy), pytest.param(0.32, marks=pytest.mark.heavy)] ) -@pytest.mark.parametrize("axis", [0, (0, 1), None]) +@pytest.mark.parametrize("reduce_op", _SAVE_REDUCTION_OPS) +@pytest.mark.parametrize("axis", [0, None, pytest.param((0, 1), marks=pytest.mark.heavy)]) def test_save_version2(disk, fill_value, reduce_op, axis): shape = (20, 50, 100) if reduce_op in ("argmax", "argmin", "cumulative_sum", "cumulative_prod"): @@ -607,26 +600,11 @@ def test_save_version2(disk, fill_value, reduce_op, axis): @pytest.mark.parametrize("disk", [True, False]) -@pytest.mark.parametrize("fill_value", [0, 1, 0.32]) @pytest.mark.parametrize( - "reduce_op", - [ - "sum", - "prod", - "min", - 
"max", - "any", - "all", - "mean", - "std", - "var", - "argmax", - "argmin", - "cumulative_sum", - "cumulative_prod", - ], + "fill_value", [1, pytest.param(0, marks=pytest.mark.heavy), pytest.param(0.32, marks=pytest.mark.heavy)] ) -@pytest.mark.parametrize("axis", [0, (0, 1), None]) +@pytest.mark.parametrize("reduce_op", _SAVE_REDUCTION_OPS) +@pytest.mark.parametrize("axis", [0, None, pytest.param((0, 1), marks=pytest.mark.heavy)]) def test_save_version3(disk, fill_value, reduce_op, axis): shape = (20, 50, 100) if reduce_op in ("argmax", "argmin", "cumulative_sum", "cumulative_prod"): @@ -667,26 +645,11 @@ def test_save_version3(disk, fill_value, reduce_op, axis): @pytest.mark.parametrize("disk", [True, False]) -@pytest.mark.parametrize("fill_value", [0, 1, 0.32]) @pytest.mark.parametrize( - "reduce_op", - [ - "sum", - "prod", - "min", - "max", - "any", - "all", - "mean", - "std", - "var", - "argmax", - "argmin", - "cumulative_sum", - "cumulative_prod", - ], + "fill_value", [1, pytest.param(0, marks=pytest.mark.heavy), pytest.param(0.32, marks=pytest.mark.heavy)] ) -@pytest.mark.parametrize("axis", [0, (0, 1), None]) +@pytest.mark.parametrize("reduce_op", _SAVE_REDUCTION_OPS) +@pytest.mark.parametrize("axis", [0, None, pytest.param((0, 1), marks=pytest.mark.heavy)]) def test_save_version4(disk, fill_value, reduce_op, axis): if reduce_op in ("argmax", "argmin", "cumulative_sum", "cumulative_prod"): axis = 1 if isinstance(axis, tuple) else axis diff --git a/tests/test_batch_array.py b/tests/test_batch_array.py index 6ac5fa79..04254e79 100644 --- a/tests/test_batch_array.py +++ b/tests/test_batch_array.py @@ -380,17 +380,24 @@ def test_batcharray_respects_explicit_use_dict_and_non_zstd(): assert barray.cparams.use_dict is False -def test_batcharray_guess_items_per_block_uses_l1_for_clevel_5(monkeypatch): +def test_batcharray_guess_items_per_block_uses_l1_for_low_clevel(monkeypatch): monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100) 
monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 1000) - barray = blosc2.BatchArray(cparams={"clevel": 5}) + barray = blosc2.BatchArray(cparams={"clevel": 3}) assert barray._guess_blocksize([30, 30, 30, 30]) == 3 -def test_batcharray_guess_items_per_block_uses_l2_for_mid_clevel(monkeypatch): +def test_batcharray_guess_items_per_block_uses_half_l2_for_default_clevel(monkeypatch): + monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100) + monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 150) + barray = blosc2.BatchArray(cparams={"clevel": 5}) + assert barray._guess_blocksize([60, 60, 60, 60]) == 1 + + +def test_batcharray_guess_items_per_block_uses_l2_for_high_clevel(monkeypatch): monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100) monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 150) - barray = blosc2.BatchArray(cparams={"clevel": 6}) + barray = blosc2.BatchArray(cparams={"clevel": 7}) assert barray._guess_blocksize([60, 60, 60, 60]) == 2