diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index ab77f625764..43d0c56a80c 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -10,6 +10,7 @@ use crate::CascadingCompressor; use crate::Scheme; use crate::SchemeExt; use crate::SchemeId; +use crate::schemes::binary; use crate::schemes::bool; use crate::schemes::decimal; use crate::schemes::float; @@ -56,6 +57,11 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ &string::FSSTScheme, &string::StringConstantScheme, &string::NullDominatedSparseScheme, + //////////////////////////////////////////////////////////////////////////////////////////////// + // Binary schemes. + //////////////////////////////////////////////////////////////////////////////////////////////// + &binary::BinaryDictScheme, + &binary::BinaryConstantScheme, // Decimal schemes. &decimal::DecimalScheme, // Temporal schemes. @@ -175,6 +181,7 @@ impl BtrBlocksCompressorBuilder { float::NullDominatedSparseScheme.id(), string::StringDictScheme.id(), string::FSSTScheme.id(), + binary::BinaryDictScheme.id(), ]); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 0be774bfbc5..5e4bc4845ef 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -67,10 +67,14 @@ mod tests { use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::Constant; + use vortex_array::arrays::Dict; use vortex_array::arrays::List; use vortex_array::arrays::ListView; use vortex_array::arrays::ListViewArray; + use vortex_array::arrays::VarBinViewArray; use vortex_array::assert_arrays_eq; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; @@ -191,4 +195,35 @@ mod tests { assert_arrays_eq!(compressed, array); Ok(()) } + + #[test] + fn test_binary_constant_compressed() -> VortexResult<()> { + let values = vec![Some(b"constant-bytes".as_slice()); 100]; + let array = VarBinViewArray::from_iter(values, DType::Binary(Nullability::NonNullable)); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress( + &array.clone().into_array(), + &mut SESSION.create_execution_ctx(), + )?; + assert!(compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } + + #[test] + fn test_binary_dict_compressed() -> VortexResult<()> { + let distinct_values: [&[u8]; 3] = [b"alpha", b"beta", b"gamma"]; + let values = (0..1000) + .map(|idx| Some(distinct_values[idx % distinct_values.len()])) + .collect::>(); + let array = VarBinViewArray::from_iter(values, DType::Binary(Nullability::NonNullable)); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress( + &array.clone().into_array(), + &mut SESSION.create_execution_ctx(), + )?; + assert!(compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } } diff --git a/vortex-btrblocks/src/schemes/binary.rs b/vortex-btrblocks/src/schemes/binary.rs new file mode 100644 index 00000000000..2e8b28cd396 --- /dev/null +++ b/vortex-btrblocks/src/schemes/binary.rs @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Binary compression schemes. + +// Re-export builtin schemes from vortex-compressor. +pub use vortex_compressor::builtins::BinaryConstantScheme; +pub use vortex_compressor::builtins::BinaryDictScheme; diff --git a/vortex-btrblocks/src/schemes/float.rs b/vortex-btrblocks/src/schemes/float.rs index 081c066856c..70d420c1ff3 100644 --- a/vortex-btrblocks/src/schemes/float.rs +++ b/vortex-btrblocks/src/schemes/float.rs @@ -61,7 +61,6 @@ pub struct PcoScheme; // Re-export builtin schemes from vortex-compressor. pub use vortex_compressor::builtins::FloatConstantScheme; pub use vortex_compressor::builtins::FloatDictScheme; -pub use vortex_compressor::builtins::is_float_primitive; pub use vortex_compressor::stats::FloatStats; /// RLE scheme for float arrays. @@ -74,7 +73,7 @@ impl Scheme for ALPScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } /// Children: encoded_ints=0. @@ -152,7 +151,7 @@ impl Scheme for ALPRDScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } fn expected_compression_ratio( @@ -212,7 +211,7 @@ impl Scheme for NullDominatedSparseScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } /// Children: indices=0. @@ -297,7 +296,7 @@ impl Scheme for PcoScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } fn expected_compression_ratio( @@ -332,7 +331,7 @@ impl Scheme for FloatRLEScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } /// Children: values=0, indices=1, offsets=2. diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index 183d0c32fb3..55b8e5b53db 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -13,6 +13,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::patched::use_experimental_patches; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::scalar::Scalar; +use vortex_compressor::builtins::BinaryDictScheme; use vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::StringDictScheme; use vortex_compressor::estimate::CompressionEstimate; @@ -89,7 +90,6 @@ pub struct PcoScheme; // Re-export builtin schemes from vortex-compressor. pub use vortex_compressor::builtins::IntConstantScheme; pub use vortex_compressor::builtins::IntDictScheme; -pub use vortex_compressor::builtins::is_integer_primitive; pub use vortex_compressor::stats::IntegerStats; /// RLE scheme for integer arrays. @@ -108,7 +108,7 @@ impl Scheme for FoRScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } /// Dict codes always start at 0, so FoR (which subtracts the min) is a no-op. @@ -126,6 +126,10 @@ impl Scheme for FoRScheme { ancestor: StringDictScheme.id(), children: ChildSelection::One(1), }, + AncestorExclusion { + ancestor: BinaryDictScheme.id(), + children: ChildSelection::One(1), + }, ] } @@ -221,7 +225,7 @@ impl Scheme for ZigZagScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } /// Children: encoded=0. @@ -264,6 +268,10 @@ impl Scheme for ZigZagScheme { ancestor: StringDictScheme.id(), children: ChildSelection::One(1), }, + AncestorExclusion { + ancestor: BinaryDictScheme.id(), + children: ChildSelection::One(1), + }, ] } @@ -317,7 +325,7 @@ impl Scheme for BitPackingScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } fn expected_compression_ratio( @@ -412,7 +420,7 @@ impl Scheme for SparseScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } fn stats_options(&self) -> GenerateStatsOptions { @@ -583,7 +591,7 @@ impl Scheme for RunEndScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } /// Children: values=0, ends=1. @@ -630,6 +638,10 @@ impl Scheme for RunEndScheme { ancestor: StringDictScheme.id(), children: ChildSelection::One(0), }, + AncestorExclusion { + ancestor: BinaryDictScheme.id(), + children: ChildSelection::One(0), + }, ] } @@ -683,7 +695,7 @@ impl Scheme for SequenceScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } /// Sequence encoding on dictionary codes just adds a layer of indirection without compressing @@ -703,6 +715,10 @@ impl Scheme for SequenceScheme { ancestor: StringDictScheme.id(), children: ChildSelection::One(1), }, + AncestorExclusion { + ancestor: BinaryDictScheme.id(), + children: ChildSelection::One(1), + }, ] } @@ -784,7 +800,7 @@ impl Scheme for PcoScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } fn expected_compression_ratio( @@ -938,7 +954,7 @@ impl Scheme for IntRLEScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } /// Children: values=0, indices=1, offsets=2. diff --git a/vortex-btrblocks/src/schemes/mod.rs b/vortex-btrblocks/src/schemes/mod.rs index 36dcaade4cf..16123429e86 100644 --- a/vortex-btrblocks/src/schemes/mod.rs +++ b/vortex-btrblocks/src/schemes/mod.rs @@ -3,6 +3,7 @@ //! Compression scheme implementations. +pub mod binary; pub mod bool; pub mod float; pub mod integer; @@ -13,6 +14,7 @@ pub mod temporal; pub(crate) mod patches; +use vortex_compressor::builtins::BinaryDictScheme; use vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::IntDictScheme; use vortex_compressor::builtins::StringDictScheme; @@ -63,5 +65,9 @@ fn rle_ancestor_exclusions() -> Vec { ancestor: StringDictScheme.id(), children: ChildSelection::One(0), }, + AncestorExclusion { + ancestor: BinaryDictScheme.id(), + children: ChildSelection::One(0), + }, ] } diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 47bea1670a2..db98559c3da 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -55,7 +55,6 @@ pub struct ZstdBuffersScheme; // Re-export builtin schemes from vortex-compressor. pub use vortex_compressor::builtins::StringConstantScheme; pub use vortex_compressor::builtins::StringDictScheme; -pub use vortex_compressor::builtins::is_utf8_string; pub use vortex_compressor::stats::StringStats; impl Scheme for FSSTScheme { @@ -64,7 +63,7 @@ impl Scheme for FSSTScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } /// Children: lengths=0, code_offsets=1. @@ -88,7 +87,7 @@ impl Scheme for FSSTScheme { compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let utf8 = data.array_as_utf8().into_owned(); + let utf8 = data.array_as_varbinview().into_owned(); let compressor_fsst = fsst_train_compressor(&utf8); let fsst = fsst_compress(&utf8, utf8.len(), utf8.dtype(), &compressor_fsst, exec_ctx); @@ -144,7 +143,7 @@ impl Scheme for NullDominatedSparseScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } /// Children: indices=0. @@ -173,7 +172,7 @@ impl Scheme for NullDominatedSparseScheme { exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { let len = data.array_len() as f64; - let stats = data.string_stats(exec_ctx); + let stats = data.varbinview_stats(exec_ctx); let value_count = stats.value_count(); // All-null arrays should be compressed as constant instead anyways. @@ -236,7 +235,7 @@ impl Scheme for ZstdScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } fn expected_compression_ratio( @@ -255,7 +254,7 @@ impl Scheme for ZstdScheme { _compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let compacted = data.array_as_utf8().into_owned().compact_buffers()?; + let compacted = data.array_as_varbinview().into_owned().compact_buffers()?; Ok( vortex_zstd::Zstd::from_var_bin_view_without_dict(&compacted, 3, 8192, exec_ctx)? .into_array(), @@ -270,7 +269,7 @@ impl Scheme for ZstdBuffersScheme { } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } fn expected_compression_ratio( diff --git a/vortex-compressor/src/builtins/constant/binary.rs b/vortex-compressor/src/builtins/constant/binary.rs new file mode 100644 index 00000000000..4ad24fe57b5 --- /dev/null +++ b/vortex-compressor/src/builtins/constant/binary.rs @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding for binary arrays. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::builtins::constant::compress_constant_array_with_validity; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::estimate::DeferredEstimate; +use crate::estimate::EstimateVerdict; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +/// Constant encoding for binary arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct BinaryConstantScheme; + +impl Scheme for BinaryConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.binary.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + canonical.dtype().is_binary() + } + + fn expected_compression_ratio( + &self, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + // Constant detection on a sample is a false positive, since the sample being constant does + // not mean the full array is constant. + if compress_ctx.is_sample() { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); + } + + let array_len = data.array().len(); + let stats = data.varbinview_stats(exec_ctx); + + // We want to use `Constant` if there are only nulls in the array. + if stats.value_count() == 0 { + debug_assert_eq!(stats.null_count() as usize, array_len); + return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse); + } + + // Since the estimated distinct count is always going to be less than or equal to the actual + // distinct count, if this is not equal to 1 the actual is definitely not equal to 1. + if stats.estimated_distinct_count().is_some_and(|c| c > 1) { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); + } + + // Otherwise our best bet is to actually check if the array is constant. + // This is an expensive check, but the alternative of not compressing a constant array is + // far less preferable. + CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( + |_compressor, data, _best_so_far, _ctx, exec_ctx| { + if is_constant(data.array(), exec_ctx)? { + Ok(EstimateVerdict::AlwaysUse) + } else { + Ok(EstimateVerdict::Skip) + } + }, + ))) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &ArrayAndStats, + _compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + compress_constant_array_with_validity(data.array(), exec_ctx) + } +} diff --git a/vortex-compressor/src/builtins/constant/bool.rs b/vortex-compressor/src/builtins/constant/bool.rs index 3716e85e9ea..a3bdcb0216e 100644 --- a/vortex-compressor/src/builtins/constant/bool.rs +++ b/vortex-compressor/src/builtins/constant/bool.rs @@ -9,7 +9,6 @@ use vortex_array::ExecutionCtx; use vortex_error::VortexResult; use crate::CascadingCompressor; -use crate::builtins::BoolConstantScheme; use crate::builtins::constant::compress_constant_array_with_validity; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; @@ -17,6 +16,10 @@ use crate::estimate::EstimateVerdict; use crate::scheme::Scheme; use crate::stats::ArrayAndStats; +/// Constant encoding for bool arrays where all valid values are the same. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct BoolConstantScheme; + impl Scheme for BoolConstantScheme { fn scheme_name(&self) -> &'static str { "vortex.bool.constant" diff --git a/vortex-compressor/src/builtins/constant/float.rs b/vortex-compressor/src/builtins/constant/float.rs index 501e3cd9623..0480a1b7a53 100644 --- a/vortex-compressor/src/builtins/constant/float.rs +++ b/vortex-compressor/src/builtins/constant/float.rs @@ -9,9 +9,7 @@ use vortex_array::ExecutionCtx; use vortex_array::aggregate_fn::fns::is_constant::is_constant; use vortex_error::VortexResult; -use super::is_float_primitive; use crate::CascadingCompressor; -use crate::builtins::FloatConstantScheme; use crate::builtins::constant::compress_constant_array_with_validity; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; @@ -20,13 +18,17 @@ use crate::estimate::EstimateVerdict; use crate::scheme::Scheme; use crate::stats::ArrayAndStats; +/// Constant encoding for float arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatConstantScheme; + impl Scheme for FloatConstantScheme { fn scheme_name(&self) -> &'static str { "vortex.float.constant" } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } fn expected_compression_ratio( diff --git a/vortex-compressor/src/builtins/constant/integer.rs b/vortex-compressor/src/builtins/constant/integer.rs index a10481ad100..3f324c36e17 100644 --- a/vortex-compressor/src/builtins/constant/integer.rs +++ b/vortex-compressor/src/builtins/constant/integer.rs @@ -8,9 +8,7 @@ use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_error::VortexResult; -use super::is_integer_primitive; use crate::CascadingCompressor; -use crate::builtins::IntConstantScheme; use crate::builtins::constant::compress_constant_array_with_validity; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; @@ -18,13 +16,17 @@ use crate::estimate::EstimateVerdict; use crate::scheme::Scheme; use crate::stats::ArrayAndStats; +/// Constant encoding for integer arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntConstantScheme; + impl Scheme for IntConstantScheme { fn scheme_name(&self) -> &'static str { "vortex.int.constant" } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } fn expected_compression_ratio( diff --git a/vortex-compressor/src/builtins/constant/mod.rs b/vortex-compressor/src/builtins/constant/mod.rs index 139cb8d2790..d5927366fdf 100644 --- a/vortex-compressor/src/builtins/constant/mod.rs +++ b/vortex-compressor/src/builtins/constant/mod.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Constant encoding schemes for bool, float, integer, and string arrays. +//! Constant encoding schemes for binary, bool, float, integer, and string arrays. use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; @@ -12,32 +12,19 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexExpect; use vortex_error::VortexResult; -use super::is_float_primitive; -use super::is_integer_primitive; -use super::is_utf8_string; - -/// Constant encoding for bool arrays where all valid values are the same. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct BoolConstantScheme; - -/// Constant encoding for integer arrays with a single distinct value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct IntConstantScheme; - -/// Constant encoding for float arrays with a single distinct value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FloatConstantScheme; - -/// Constant encoding for string arrays with a single distinct value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct StringConstantScheme; - +mod binary; mod bool; mod float; mod integer; mod string; -/// Shared helper for compressing a constant array (bool, int, float, string) into a +pub use binary::BinaryConstantScheme; +pub use bool::BoolConstantScheme; +pub use float::FloatConstantScheme; +pub use integer::IntConstantScheme; +pub use string::StringConstantScheme; + +/// Shared helper for compressing a constant array (binary, bool, int, float, string) into a /// [`ConstantArray`]. /// /// Assumes that the source array has constant valid scalars. diff --git a/vortex-compressor/src/builtins/constant/string.rs b/vortex-compressor/src/builtins/constant/string.rs index fcd6138bca2..f55c1661660 100644 --- a/vortex-compressor/src/builtins/constant/string.rs +++ b/vortex-compressor/src/builtins/constant/string.rs @@ -9,9 +9,7 @@ use vortex_array::ExecutionCtx; use vortex_array::aggregate_fn::fns::is_constant::is_constant; use vortex_error::VortexResult; -use super::is_utf8_string; use crate::CascadingCompressor; -use crate::builtins::StringConstantScheme; use crate::builtins::constant::compress_constant_array_with_validity; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; @@ -20,13 +18,17 @@ use crate::estimate::EstimateVerdict; use crate::scheme::Scheme; use crate::stats::ArrayAndStats; +/// Constant encoding for string arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringConstantScheme; + impl Scheme for StringConstantScheme { fn scheme_name(&self) -> &'static str { "vortex.string.constant" } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } fn expected_compression_ratio( @@ -42,7 +44,7 @@ impl Scheme for StringConstantScheme { } let array_len = data.array().len(); - let stats = data.string_stats(exec_ctx); + let stats = data.varbinview_stats(exec_ctx); // We want to use `Constant` if there are only nulls in the array. if stats.value_count() == 0 { diff --git a/vortex-compressor/src/builtins/dict/binary.rs b/vortex-compressor/src/builtins/dict/binary.rs new file mode 100644 index 00000000000..bd194aa1121 --- /dev/null +++ b/vortex-compressor/src/builtins/dict/binary.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Binary-specific dictionary encoding implementation. +//! +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted +//! for external compatibility. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::dict::DictArrayExt; +use vortex_array::arrays::dict::DictArraySlotsExt; +use vortex_array::arrays::primitive::PrimitiveArrayExt; +use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::builtins::IntDictScheme; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::estimate::DeferredEstimate; +use crate::estimate::EstimateVerdict; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// Dictionary encoding for low-cardinality binary values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct BinaryDictScheme; + +impl Scheme for BinaryDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.binary.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + canonical.dtype().is_binary() + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + /// Binary dict codes (child 1) are compact unsigned integers that should not be dict-encoded + /// again. + /// + /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme, + /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`. + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }] + } + + fn expected_compression_ratio( + &self, + data: &ArrayAndStats, + _compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + let stats = data.varbinview_stats(exec_ctx); + + if stats.value_count() == 0 { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); + } + + let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dictionary scheme. + if estimated_distinct_values_count > stats.value_count() / 2 { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); + } + + // Let sampling determine the expected ratio. + CompressionEstimate::Deferred(DeferredEstimate::Sample) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let dict = dict_encode(data.array(), exec_ctx)?; + + // Values = child 0. + let compressed_values = + compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?; + + // Codes = child 1. + let narrowed_codes = dict + .codes() + .clone() + .execute::(exec_ctx)? + .narrow(exec_ctx)? + .into_array(); + let compressed_codes = + compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs index 7ef175ec904..bf591963db3 100644 --- a/vortex-compressor/src/builtins/dict/float.rs +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -24,9 +24,7 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; -use crate::builtins::FloatDictScheme; use crate::builtins::IntDictScheme; -use crate::builtins::is_float_primitive; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; use crate::estimate::DeferredEstimate; @@ -40,13 +38,17 @@ use crate::stats::FloatErasedStats; use crate::stats::FloatStats; use crate::stats::GenerateStatsOptions; +/// Dictionary encoding for low-cardinality float values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatDictScheme; + impl Scheme for FloatDictScheme { fn scheme_name(&self) -> &'static str { "vortex.float.dict" } fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) + canonical.dtype().is_float() } fn stats_options(&self) -> GenerateStatsOptions { diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index e86f31f69b1..82af7f14de7 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -23,8 +23,6 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::CascadingCompressor; -use crate::builtins::IntDictScheme; -use crate::builtins::is_integer_primitive; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; use crate::estimate::EstimateVerdict; @@ -35,13 +33,17 @@ use crate::stats::GenerateStatsOptions; use crate::stats::IntegerErasedStats; use crate::stats::IntegerStats; +/// Dictionary encoding for low-cardinality integer values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntDictScheme; + impl Scheme for IntDictScheme { fn scheme_name(&self) -> &'static str { "vortex.int.dict" } fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) + canonical.dtype().is_int() } fn stats_options(&self) -> GenerateStatsOptions { diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs index c8e573b4fbc..4862df2b211 100644 --- a/vortex-compressor/src/builtins/dict/mod.rs +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -1,23 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Dictionary encoding schemes for integer, float, and string arrays. - -/// Dictionary encoding for low-cardinality float values. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FloatDictScheme; - -/// Dictionary encoding for low-cardinality integer values. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct IntDictScheme; - -/// Dictionary encoding for low-cardinality string values. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct StringDictScheme; +//! Dictionary encoding schemes for binary, integer, float, and string arrays. +mod binary; mod float; mod integer; mod string; +pub use binary::BinaryDictScheme; +pub use float::FloatDictScheme; pub use float::dictionary_encode as float_dictionary_encode; +pub use integer::IntDictScheme; pub use integer::dictionary_encode as integer_dictionary_encode; +pub use string::StringDictScheme; diff --git a/vortex-compressor/src/builtins/dict/string.rs b/vortex-compressor/src/builtins/dict/string.rs index 94be23fbd64..72db72c2cc5 100644 --- a/vortex-compressor/src/builtins/dict/string.rs +++ b/vortex-compressor/src/builtins/dict/string.rs @@ -21,8 +21,6 @@ use vortex_error::VortexResult; use crate::CascadingCompressor; use crate::builtins::IntDictScheme; -use crate::builtins::StringDictScheme; -use crate::builtins::is_utf8_string; use crate::ctx::CompressorContext; use crate::estimate::CompressionEstimate; use crate::estimate::DeferredEstimate; @@ -34,13 +32,17 @@ use crate::scheme::SchemeExt; use crate::stats::ArrayAndStats; use crate::stats::GenerateStatsOptions; +/// Dictionary encoding for low-cardinality string values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringDictScheme; + impl Scheme for StringDictScheme { fn scheme_name(&self) -> &'static str { "vortex.string.dict" } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } fn stats_options(&self) -> GenerateStatsOptions { @@ -72,7 +74,7 @@ impl Scheme for StringDictScheme { _compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { - let stats = data.string_stats(exec_ctx); + let stats = data.varbinview_stats(exec_ctx); if stats.value_count() == 0 { return CompressionEstimate::Verdict(EstimateVerdict::Skip); diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs index c5bd9f343f5..d014f1a6cf3 100644 --- a/vortex-compressor/src/builtins/mod.rs +++ b/vortex-compressor/src/builtins/mod.rs @@ -10,30 +10,9 @@ //! [`DictArray`]: vortex_array::arrays::DictArray //! [`MaskedArray`]: vortex_array::arrays::MaskedArray -use vortex_array::Canonical; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; - -/// Returns `true` if the canonical array is a primitive with an integer ptype. -pub fn is_integer_primitive(canonical: &Canonical) -> bool { - matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) -} - -/// Returns `true` if the canonical form represents a floating-point primitive. -pub fn is_float_primitive(canonical: &Canonical) -> bool { - matches!(canonical, Canonical::Primitive(p) if !p.ptype().is_int()) -} - -/// Returns `true` if the canonical array is a UTF-8 string type. -pub fn is_utf8_string(canonical: &Canonical) -> bool { - matches!(canonical, - Canonical::VarBinView(v) if - v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) - ) -} - mod dict; +pub use dict::BinaryDictScheme; pub use dict::FloatDictScheme; pub use dict::IntDictScheme; pub use dict::StringDictScheme; @@ -42,6 +21,7 @@ pub use dict::integer_dictionary_encode; mod constant; +pub use constant::BinaryConstantScheme; pub use constant::BoolConstantScheme; pub use constant::FloatConstantScheme; pub use constant::IntConstantScheme; diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 14504474e9d..9d31bbb9330 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -27,8 +27,6 @@ use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::arrays::scalar_fn::AnyScalarFn; use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::arrays::variant::VariantArrayExt; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; use vortex_error::VortexResult; @@ -216,16 +214,8 @@ impl CascadingCompressor { )? .into_array()) } - Canonical::VarBinView(strings) => { - if strings - .dtype() - .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) - { - self.choose_and_compress(Canonical::VarBinView(strings), compress_ctx, exec_ctx) - } else { - // We do not compress binary arrays. - Ok(strings.into_array()) - } + Canonical::VarBinView(varbinview) => { + self.choose_and_compress(Canonical::VarBinView(varbinview), compress_ctx, exec_ctx) } Canonical::Extension(ext_array) => { let before_nbytes = ext_array.as_ref().nbytes(); diff --git a/vortex-compressor/src/stats/cache.rs b/vortex-compressor/src/stats/cache.rs index 6f7020191a1..66ab655728c 100644 --- a/vortex-compressor/src/stats/cache.rs +++ b/vortex-compressor/src/stats/cache.rs @@ -76,14 +76,14 @@ impl StatsCache { /// original array are not reused. /// /// Built-in stats are accessed via typed methods ([`integer_stats`], [`float_stats`], -/// [`string_stats`]) which generate stats lazily on first access using the stored +/// [`varbinview_stats`]) which generate stats lazily on first access using the stored /// [`GenerateStatsOptions`]. /// /// Extension schemes can use [`get_or_insert_with`] for custom stats types. /// /// [`integer_stats`]: ArrayAndStats::integer_stats /// [`float_stats`]: ArrayAndStats::float_stats -/// [`string_stats`]: ArrayAndStats::string_stats +/// [`varbinview_stats`]: ArrayAndStats::varbinview_stats /// [`get_or_insert_with`]: ArrayAndStats::get_or_insert_with pub struct ArrayAndStats { /// The array. This is always in canonical form. @@ -135,8 +135,8 @@ impl ArrayAndStats { /// /// # Panics /// - /// Panics if the array is not a UTF-8 string array. - pub fn array_as_utf8(&self) -> ArrayView<'_, VarBinView> { + /// Panics if the array is not a varbinview array. + pub fn array_as_varbinview(&self) -> ArrayView<'_, VarBinView> { self.array .as_opt::() .vortex_expect("the array is guaranteed to already be canonical by construction") @@ -190,8 +190,8 @@ impl ArrayAndStats { }) } - /// Returns string stats, generating them lazily on first access. - pub fn string_stats(&self, ctx: &mut ExecutionCtx) -> Arc { + /// Returns varbinview stats, generating them lazily on first access. + pub fn varbinview_stats(&self, ctx: &mut ExecutionCtx) -> Arc { let array = self.array.clone(); let opts = self.opts; self.cache.get_or_insert_with::(|| { diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs index 276fa8f056c..8f6e7720837 100644 --- a/vortex-compressor/src/stats/mod.rs +++ b/vortex-compressor/src/stats/mod.rs @@ -8,7 +8,7 @@ mod cache; mod float; mod integer; mod options; -mod string; +mod varbinview; pub use bool::BoolStats; pub use cache::ArrayAndStats; @@ -21,4 +21,4 @@ pub use integer::ErasedStats as IntegerErasedStats; pub use integer::IntegerStats; pub use integer::TypedStats as IntegerTypedStats; pub use options::GenerateStatsOptions; -pub use string::StringStats; +pub use varbinview::StringStats; diff --git a/vortex-compressor/src/stats/string.rs b/vortex-compressor/src/stats/varbinview.rs similarity index 83% rename from vortex-compressor/src/stats/string.rs rename to vortex-compressor/src/stats/varbinview.rs index 8613aa5cc37..486d431e684 100644 --- a/vortex-compressor/src/stats/string.rs +++ b/vortex-compressor/src/stats/varbinview.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! String compression statistics. +//! Variable-length byte/string compression statistics. use vortex_array::ExecutionCtx; use vortex_array::arrays::VarBinViewArray; @@ -12,10 +12,10 @@ use vortex_utils::aliases::hash_set::HashSet; use super::GenerateStatsOptions; -/// Array of variable-length byte arrays, and relevant stats for compression. +/// Array of variable-length byte/string values, and relevant stats for compression. #[derive(Clone, Debug)] pub struct StringStats { - /// The estimated number of distinct strings, or `None` if not computed. + /// The estimated number of distinct values, or `None` if not computed. /// This _must_ be non-zero. estimated_distinct_count: Option, /// The number of non-null values. @@ -24,10 +24,10 @@ pub struct StringStats { null_count: u32, } -/// Estimate the number of distinct strings in the var bin view array. -fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult { - let views = strings.views(); - // Iterate the views. Two strings which are equal must have the same first 8-bytes. +/// Estimate the number of distinct values in the var bin view array. +fn estimate_distinct_count(varbinview: &VarBinViewArray) -> VortexResult { + let views = varbinview.views(); + // Iterate the views. Two values which are equal must have the same first 8-bytes. // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all // share a 4-byte prefix and have the same length. let mut distinct = HashSet::with_capacity(views.len() / 2); @@ -84,7 +84,7 @@ impl StringStats { .vortex_expect("StringStats::generate_opts should not fail") } - /// Returns the estimated number of distinct strings, or `None` if not computed. + /// Returns the estimated number of distinct values, or `None` if not computed. /// /// This estimation is always going to be less than or equal to the actual distinct count. pub fn estimated_distinct_count(&self) -> Option {