Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions vortex-btrblocks/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::CascadingCompressor;
use crate::Scheme;
use crate::SchemeExt;
use crate::SchemeId;
use crate::schemes::binary;
use crate::schemes::bool;
use crate::schemes::decimal;
use crate::schemes::float;
Expand Down Expand Up @@ -56,6 +57,11 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[
&string::FSSTScheme,
&string::StringConstantScheme,
&string::NullDominatedSparseScheme,
////////////////////////////////////////////////////////////////////////////////////////////////
// Binary schemes.
////////////////////////////////////////////////////////////////////////////////////////////////
&binary::BinaryDictScheme,
&binary::BinaryConstantScheme,
// Decimal schemes.
&decimal::DecimalScheme,
// Temporal schemes.
Expand Down Expand Up @@ -175,6 +181,7 @@ impl BtrBlocksCompressorBuilder {
float::NullDominatedSparseScheme.id(),
string::StringDictScheme.id(),
string::FSSTScheme.id(),
binary::BinaryDictScheme.id(),
]);

#[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
Expand Down
35 changes: 35 additions & 0 deletions vortex-btrblocks/src/canonical_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,14 @@ mod tests {
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::Constant;
use vortex_array::arrays::Dict;
use vortex_array::arrays::List;
use vortex_array::arrays::ListView;
use vortex_array::arrays::ListViewArray;
use vortex_array::arrays::VarBinViewArray;
use vortex_array::assert_arrays_eq;
use vortex_array::dtype::DType;
use vortex_array::dtype::Nullability;
use vortex_array::session::ArraySession;
use vortex_array::validity::Validity;
use vortex_buffer::BitBuffer;
Expand Down Expand Up @@ -191,4 +195,35 @@ mod tests {
assert_arrays_eq!(compressed, array);
Ok(())
}

#[test]
fn test_binary_constant_compressed() -> VortexResult<()> {
let values = vec![Some(b"constant-bytes".as_slice()); 100];
let array = VarBinViewArray::from_iter(values, DType::Binary(Nullability::NonNullable));
let btr = BtrBlocksCompressor::default();
let compressed = btr.compress(
&array.clone().into_array(),
&mut SESSION.create_execution_ctx(),
)?;
assert!(compressed.is::<Constant>());
assert_arrays_eq!(compressed, array);
Ok(())
}

#[test]
fn test_binary_dict_compressed() -> VortexResult<()> {
let distinct_values: [&[u8]; 3] = [b"alpha", b"beta", b"gamma"];
let values = (0..1000)
.map(|idx| Some(distinct_values[idx % distinct_values.len()]))
.collect::<Vec<_>>();
let array = VarBinViewArray::from_iter(values, DType::Binary(Nullability::NonNullable));
let btr = BtrBlocksCompressor::default();
let compressed = btr.compress(
&array.clone().into_array(),
&mut SESSION.create_execution_ctx(),
)?;
assert!(compressed.is::<Dict>());
assert_arrays_eq!(compressed, array);
Ok(())
}
}
8 changes: 8 additions & 0 deletions vortex-btrblocks/src/schemes/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Binary compression schemes.

// Re-export builtin schemes from vortex-compressor.
pub use vortex_compressor::builtins::BinaryConstantScheme;
pub use vortex_compressor::builtins::BinaryDictScheme;
11 changes: 5 additions & 6 deletions vortex-btrblocks/src/schemes/float.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ pub struct PcoScheme;
// Re-export builtin schemes from vortex-compressor.
pub use vortex_compressor::builtins::FloatConstantScheme;
pub use vortex_compressor::builtins::FloatDictScheme;
pub use vortex_compressor::builtins::is_float_primitive;
pub use vortex_compressor::stats::FloatStats;

/// RLE scheme for float arrays.
Expand All @@ -74,7 +73,7 @@ impl Scheme for ALPScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_float_primitive(canonical)
canonical.dtype().is_float()
}

/// Children: encoded_ints=0.
Expand Down Expand Up @@ -152,7 +151,7 @@ impl Scheme for ALPRDScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_float_primitive(canonical)
canonical.dtype().is_float()
}

fn expected_compression_ratio(
Expand Down Expand Up @@ -212,7 +211,7 @@ impl Scheme for NullDominatedSparseScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_float_primitive(canonical)
canonical.dtype().is_float()
}

/// Children: indices=0.
Expand Down Expand Up @@ -297,7 +296,7 @@ impl Scheme for PcoScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_float_primitive(canonical)
canonical.dtype().is_float()
}

fn expected_compression_ratio(
Expand Down Expand Up @@ -332,7 +331,7 @@ impl Scheme for FloatRLEScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_float_primitive(canonical)
canonical.dtype().is_float()
}

/// Children: values=0, indices=1, offsets=2.
Expand Down
34 changes: 25 additions & 9 deletions vortex-btrblocks/src/schemes/integer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::patched::use_experimental_patches;
use vortex_array::arrays::primitive::PrimitiveArrayExt;
use vortex_array::scalar::Scalar;
use vortex_compressor::builtins::BinaryDictScheme;
use vortex_compressor::builtins::FloatDictScheme;
use vortex_compressor::builtins::StringDictScheme;
use vortex_compressor::estimate::CompressionEstimate;
Expand Down Expand Up @@ -89,7 +90,6 @@ pub struct PcoScheme;
// Re-export builtin schemes from vortex-compressor.
pub use vortex_compressor::builtins::IntConstantScheme;
pub use vortex_compressor::builtins::IntDictScheme;
pub use vortex_compressor::builtins::is_integer_primitive;
pub use vortex_compressor::stats::IntegerStats;

/// RLE scheme for integer arrays.
Expand All @@ -108,7 +108,7 @@ impl Scheme for FoRScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

/// Dict codes always start at 0, so FoR (which subtracts the min) is a no-op.
Expand All @@ -126,6 +126,10 @@ impl Scheme for FoRScheme {
ancestor: StringDictScheme.id(),
children: ChildSelection::One(1),
},
AncestorExclusion {
ancestor: BinaryDictScheme.id(),
children: ChildSelection::One(1),
},
]
}

Expand Down Expand Up @@ -221,7 +225,7 @@ impl Scheme for ZigZagScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

/// Children: encoded=0.
Expand Down Expand Up @@ -264,6 +268,10 @@ impl Scheme for ZigZagScheme {
ancestor: StringDictScheme.id(),
children: ChildSelection::One(1),
},
AncestorExclusion {
ancestor: BinaryDictScheme.id(),
children: ChildSelection::One(1),
},
]
}

Expand Down Expand Up @@ -317,7 +325,7 @@ impl Scheme for BitPackingScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

fn expected_compression_ratio(
Expand Down Expand Up @@ -412,7 +420,7 @@ impl Scheme for SparseScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

fn stats_options(&self) -> GenerateStatsOptions {
Expand Down Expand Up @@ -583,7 +591,7 @@ impl Scheme for RunEndScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

/// Children: values=0, ends=1.
Expand Down Expand Up @@ -630,6 +638,10 @@ impl Scheme for RunEndScheme {
ancestor: StringDictScheme.id(),
children: ChildSelection::One(0),
},
AncestorExclusion {
ancestor: BinaryDictScheme.id(),
children: ChildSelection::One(0),
},
]
}

Expand Down Expand Up @@ -683,7 +695,7 @@ impl Scheme for SequenceScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

/// Sequence encoding on dictionary codes just adds a layer of indirection without compressing
Expand All @@ -703,6 +715,10 @@ impl Scheme for SequenceScheme {
ancestor: StringDictScheme.id(),
children: ChildSelection::One(1),
},
AncestorExclusion {
ancestor: BinaryDictScheme.id(),
children: ChildSelection::One(1),
},
]
}

Expand Down Expand Up @@ -784,7 +800,7 @@ impl Scheme for PcoScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

fn expected_compression_ratio(
Expand Down Expand Up @@ -938,7 +954,7 @@ impl Scheme for IntRLEScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
canonical.dtype().is_int()
}

/// Children: values=0, indices=1, offsets=2.
Expand Down
6 changes: 6 additions & 0 deletions vortex-btrblocks/src/schemes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

//! Compression scheme implementations.

pub mod binary;
pub mod bool;
pub mod float;
pub mod integer;
Expand All @@ -13,6 +14,7 @@ pub mod temporal;

pub(crate) mod patches;

use vortex_compressor::builtins::BinaryDictScheme;
use vortex_compressor::builtins::FloatDictScheme;
use vortex_compressor::builtins::IntDictScheme;
use vortex_compressor::builtins::StringDictScheme;
Expand Down Expand Up @@ -63,5 +65,9 @@ fn rle_ancestor_exclusions() -> Vec<AncestorExclusion> {
ancestor: StringDictScheme.id(),
children: ChildSelection::One(0),
},
AncestorExclusion {
ancestor: BinaryDictScheme.id(),
children: ChildSelection::One(0),
},
]
}
15 changes: 7 additions & 8 deletions vortex-btrblocks/src/schemes/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ pub struct ZstdBuffersScheme;
// Re-export builtin schemes from vortex-compressor.
pub use vortex_compressor::builtins::StringConstantScheme;
pub use vortex_compressor::builtins::StringDictScheme;
pub use vortex_compressor::builtins::is_utf8_string;
pub use vortex_compressor::stats::StringStats;

impl Scheme for FSSTScheme {
Expand All @@ -64,7 +63,7 @@ impl Scheme for FSSTScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_utf8_string(canonical)
canonical.dtype().is_utf8()
}

/// Children: lengths=0, code_offsets=1.
Expand All @@ -88,7 +87,7 @@ impl Scheme for FSSTScheme {
compress_ctx: CompressorContext,
exec_ctx: &mut ExecutionCtx,
) -> VortexResult<ArrayRef> {
let utf8 = data.array_as_utf8().into_owned();
let utf8 = data.array_as_varbinview().into_owned();
let compressor_fsst = fsst_train_compressor(&utf8);
let fsst = fsst_compress(&utf8, utf8.len(), utf8.dtype(), &compressor_fsst, exec_ctx);

Expand Down Expand Up @@ -144,7 +143,7 @@ impl Scheme for NullDominatedSparseScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_utf8_string(canonical)
canonical.dtype().is_utf8()
}

/// Children: indices=0.
Expand Down Expand Up @@ -173,7 +172,7 @@ impl Scheme for NullDominatedSparseScheme {
exec_ctx: &mut ExecutionCtx,
) -> CompressionEstimate {
let len = data.array_len() as f64;
let stats = data.string_stats(exec_ctx);
let stats = data.varbinview_stats(exec_ctx);
let value_count = stats.value_count();

// All-null arrays should be compressed as constant instead anyways.
Expand Down Expand Up @@ -236,7 +235,7 @@ impl Scheme for ZstdScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_utf8_string(canonical)
canonical.dtype().is_utf8()
}

fn expected_compression_ratio(
Expand All @@ -255,7 +254,7 @@ impl Scheme for ZstdScheme {
_compress_ctx: CompressorContext,
exec_ctx: &mut ExecutionCtx,
) -> VortexResult<ArrayRef> {
let compacted = data.array_as_utf8().into_owned().compact_buffers()?;
let compacted = data.array_as_varbinview().into_owned().compact_buffers()?;
Ok(
vortex_zstd::Zstd::from_var_bin_view_without_dict(&compacted, 3, 8192, exec_ctx)?
.into_array(),
Expand All @@ -270,7 +269,7 @@ impl Scheme for ZstdBuffersScheme {
}

fn matches(&self, canonical: &Canonical) -> bool {
is_utf8_string(canonical)
canonical.dtype().is_utf8()
}

fn expected_compression_ratio(
Expand Down
Loading
Loading