From 666b542120046f20735049c123e2a27b400eab71 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 27 Jun 2026 22:47:52 +0200 Subject: [PATCH] fastcdc: add FastCDC chunker with a keyed Gear hash Add a new "fastcdc" content-defined chunker selectable via --chunker-params. It uses the FastCDC Gear rolling hash (fp = (fp << 1) + Gear[byte]), which is window-less and cheaper per byte than buzhash's cyclic-polynomial update, so it chunks noticeably faster (see "borg benchmark cpu" output), while producing the same chunk-size distribution and deduplication. The Gear table is keyed: it is derived from the repo id key via CSPRNG (own "fastcdc" domain), exactly like the buzhash64 table, so chunk cut points stay unpredictable without the key (anti-fingerprinting). It implements the same FastCDC techniques as buzhash64 (sub-minimum skipping, normalized chunking with a required nc_level, min/max clamping); the mask uses the high bits of the hash (Gear accumulates entropy there). chunker-params: "fastcdc,chunk_min,chunk_max,chunk_mask,nc_level" - there is no window field, because Gear is window-less. e.g. fastcdc,19,23,21,2 Also: borg benchmark cpu now measures the fastcdc chunker; tests in borg.testsuite.chunkers (golden vector, size distribution, keyed gear table, param parsing, slow fuzz); docs and changelog. Benchmarks (scripts/chunker_bench.py, buzhash64 vs fastcdc, both nc_level=2, incompressible data unless noted): 5 GiB, 2 MiB target (default params): buzhash64: CV 0.294, 1011 MB/s fastcdc: CV 0.295, 1313 MB/s (+30%) 64 MiB, 64 KiB target: buzhash64: CV 0.374, shift-resilience 0.9928, 963 MB/s fastcdc: CV 0.359, shift-resilience 0.9929, 1331 MB/s (+38%) Re-backup of a 2.5 GiB file after scattered single-byte edits (dedup ratio, 0.5 = v2 fully deduplicated, lower is better): 64 edits: buzhash64 0.5237, fastcdc 0.5236 320 edits: buzhash64 0.6133, fastcdc 0.6161 borg benchmark cpu, 1 GB: fastcdc 3.80s, buzhash 4.36s, buzhash64 8.13s, fixed 0.56s. 
Chunk-size distribution, deduplication and shift-resilience match buzhash64 within noise; fastcdc is consistently faster. Also: fix bug when computing the mask, one needs to use 1ULL instead of 1, so the shifting computation is done in a uint64, not in a 32bit int. Co-Authored-By: Claude Opus 4.8 --- .gitignore | 1 + docs/changes.rst | 6 + docs/global.rst.inc | 1 + docs/internals/data-structures.rst | 18 ++ scripts/chunker_bench.py | 9 +- setup.py | 3 + src/borg/archiver/benchmark_cmd.py | 2 + src/borg/archiver/completion_cmd.py | 8 +- src/borg/chunkers/__init__.py | 16 +- src/borg/chunkers/fastcdc.pyx | 285 ++++++++++++++++++++ src/borg/constants.py | 3 + src/borg/helpers/parseformat.py | 18 ++ src/borg/testsuite/chunkers/fastcdc_test.py | 161 +++++++++++ 13 files changed, 523 insertions(+), 8 deletions(-) create mode 100644 src/borg/chunkers/fastcdc.pyx create mode 100644 src/borg/testsuite/chunkers/fastcdc_test.py diff --git a/.gitignore b/.gitignore index f2f7461228..a2254bafd7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ src/borg/crypto/low_level.c src/borg/item.c src/borg/chunkers/buzhash.c src/borg/chunkers/buzhash64.c +src/borg/chunkers/fastcdc.c src/borg/chunkers/reader.c src/borg/checksums.c src/borg/platform/darwin.c diff --git a/docs/changes.rst b/docs/changes.rst index 8caa0d0e8b..9aac15a9f5 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -176,6 +176,12 @@ New features: ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level`` (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``). buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x. +- new ``fastcdc`` chunker: a FastCDC content-defined chunker using a window-less, keyed Gear + rolling hash (the gear table is derived from the repo's id key, like buzhash64, so cut points + stay unpredictable without the key). 
It supports the same normalized chunking as buzhash64 and + produces the same chunk-size distribution and deduplication, but chunks roughly 1.3-1.5x faster. + Select it via ``--chunker-params fastcdc,chunk_min,chunk_max,chunk_mask,nc_level`` (no window + field; e.g. ``fastcdc,19,23,21,2``). ``borg benchmark cpu`` now reports its throughput too. - repo-create: split ``--encryption`` into orthogonal options. ``--encryption`` now selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb`` or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function diff --git a/docs/global.rst.inc b/docs/global.rst.inc index a3c8df1cc8..196391a265 100644 --- a/docs/global.rst.inc +++ b/docs/global.rst.inc @@ -19,6 +19,7 @@ .. _OpenSSL: https://www.openssl.org/ .. _`Python 3`: https://www.python.org/ .. _Buzhash: https://en.wikipedia.org/wiki/Buzhash +.. _FastCDC: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia .. _msgpack: https://msgpack.org/ .. _`msgpack-python`: https://pypi.org/project/msgpack-python/ .. _llfuse: https://pypi.org/project/llfuse/ diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index 0a1e86edb0..2f63218758 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -403,6 +403,8 @@ Borg has these chunkers: - "buzhash": variable, content-defined blocksize, uses a rolling hash computed by the Buzhash_ algorithm. - "buzhash64": similar to "buzhash", but improved 64bit implementation +- "fastcdc": variable, content-defined blocksize, uses the window-less, keyed + Gear rolling hash (FastCDC_); faster than buzhash, same deduplication. For some more general usage hints see also ``--chunker-params``. @@ -483,6 +485,22 @@ The buzhash table is cryptographically derived from secret key material. These changes should improve resistance against attacks and also solve some of the issues of the original (32bit / XORed table) implementation. 
+"fastcdc" chunker ++++++++++++++++++ + +FastCDC_ content-defined chunker using the Gear rolling hash. Unlike buzhash it +is window-less (each byte's influence simply decays out of the hash), so its +update is cheaper and it chunks noticeably faster, while producing the same +deduplication and (with normalized chunking) the same chunk-size distribution. + +Like "buzhash64", the Gear table is cryptographically derived from secret key +material, so chunk cut points are unpredictable without the key. + +``borg create --chunker-params fastcdc,CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,NC_LEVEL`` + +There is no window size (Gear is window-less). NC_LEVEL is the normalized +chunking level (0 disables it); 2 is a good default. E.g.: ``fastcdc,19,23,21,2``. + .. _cache: The cache diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py index a90b66a04c..1d439d0099 100644 --- a/scripts/chunker_bench.py +++ b/scripts/chunker_bench.py @@ -129,7 +129,10 @@ def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal params = [min_exp, max_exp, mask_bits, win] kw = dict(key=None, sparse=False) if algo == "buzhash64": - params.append(nc_level) # nc_level is a positional buzhash64 param + params.append(nc_level) # nc_level is a positional param + kw["normal_size"] = normal_size + elif algo == "fastcdc": + params = [min_exp, max_exp, mask_bits, nc_level] # fastcdc is window-less kw["normal_size"] = normal_size chunker = get_chunker(algo, *params, **kw) sizes = [] @@ -285,11 +288,11 @@ def main(): print(f"shift test: {args.shift_edits} edits repeats: {args.repeat}") print("-" * 118) - # build (algo, nc_level) variants; for buzhash64 also run the requested NC level + # build (algo, nc_level) variants; for buzhash64/fastcdc also run the requested NC level variants = [] for algo in args.algo: variants.append((algo, 0)) - if algo == "buzhash64" and args.nc_level > 0: + if algo in ("buzhash64", "fastcdc") and args.nc_level > 0: variants.append((algo, 
args.nc_level)) for algo, nc in variants: diff --git a/setup.py b/setup.py index 19f1c92789..1ad76e8048 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ crypto_legacy_ll_source = "src/borg/legacy/crypto/low_level.pyx" buzhash_source = "src/borg/chunkers/buzhash.pyx" buzhash64_source = "src/borg/chunkers/buzhash64.pyx" +fastcdc_source = "src/borg/chunkers/fastcdc.pyx" reader_source = "src/borg/chunkers/reader.pyx" hashindex_source = "src/borg/hashindex.pyx" item_source = "src/borg/item.pyx" @@ -73,6 +74,7 @@ crypto_legacy_ll_source, buzhash_source, buzhash64_source, + fastcdc_source, reader_source, hashindex_source, item_source, @@ -189,6 +191,7 @@ def lib_ext_kwargs(pc, prefix_env_var, lib_name, lib_pkg_name, pc_version, lib_s Extension("borg.item", [item_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags), + Extension("borg.chunkers.fastcdc", [fastcdc_source], extra_compile_args=cflags), Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags), ] diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index 099cac8b36..71615e6456 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -204,6 +204,8 @@ def chunkit(ch): "chunkit(ch)", locals(), ), + # fastcdc (window-less keyed gear hash); gear table creation is slow, keep it in setup + ("fastcdc,19,23,21,2", "ch = get_chunker('fastcdc', 19, 23, 21, 2, sparse=False)", "chunkit(ch)", locals()), ("fixed,1048576", "ch = get_chunker('fixed', 1048576, sparse=False)", "chunkit(ch)", locals()), ]: dt = timeit(func, setup, number=number_default, globals=vars) diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py index 1bce0cb15d..6b73444913 100644 --- a/src/borg/archiver/completion_cmd.py +++ b/src/borg/archiver/completion_cmd.py @@ -708,7 +708,13 @@ def 
do_completion(self, args): comp_spec_choices_str = " ".join(comp_spec_choices) # Chunker params choices (static list) - chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"] + chunker_params_choices = [ + "default", + "fixed,4194304", + "buzhash,19,23,21,4095", + "buzhash64,19,23,21,4095,2", + "fastcdc,19,23,21,2", + ] chunker_params_choices_str = " ".join(chunker_params_choices) # Relative time marker choices (static list) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index 7282b5e8eb..461cc90d16 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -1,5 +1,6 @@ from .buzhash import Chunker from .buzhash64 import ChunkerBuzHash64 +from .fastcdc import ChunkerFastCDC from .failing import ChunkerFailing from .fixed import ChunkerFixed from .reader import * # noqa @@ -10,16 +11,23 @@ def get_chunker(algo, *params, **kw): sparse = kw.get("sparse", False) # key.chunk_seed only has 32 bits seed = key.chunk_seed if key is not None else 0 - # for buzhash64, we want a much longer key, so we derive it from the id key - bh64_key = ( - key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32 - ) if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": + # for buzhash64, we want a much longer key, so we derive it from the id key. # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level); # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto). + bh64_key = ( + key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32 + ) return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse) + if algo == "fastcdc": + # keyed gear table, derived from the id key (own domain). 
params is + # (chunk_min_exp, chunk_max_exp, hash_mask_bits, nc_level) - no window (Gear is window-less). + fc_key = ( + key.derive_key(salt=b"", domain=b"fastcdc", size=32, from_id_key=True) if key is not None else b"\0" * 32 + ) + return ChunkerFastCDC(fc_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/fastcdc.pyx b/src/borg/chunkers/fastcdc.pyx new file mode 100644 index 0000000000..2248ff5406 --- /dev/null +++ b/src/borg/chunkers/fastcdc.pyx @@ -0,0 +1,285 @@ +# cython: language_level=3 + +import cython +import time + +from cpython.bytes cimport PyBytes_AsString +from libc.stdint cimport uint8_t, uint64_t +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memmove, memset + +from ..crypto.low_level import CSPRNG + +from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros +from .reader import FileReader, Chunk + +# FastCDC content-defined chunker (Xia et al., USENIX ATC 2016). +# +# Differences vs. the buzhash64 chunker in this package: +# * It uses the Gear rolling hash: fp = (fp << 1) + Gear[byte]. This is a single shift, +# add and table lookup per byte (no window, no "remove" term), so it is cheaper than +# buzhash's cyclic-polynomial update. +# * The Gear table is keyed from a 256-bit key via the same CSPRNG used by buzhash64, so +# cut points are unpredictable without the key (anti-fingerprinting), just like buzhash64. +# * Because the Gear hash accumulates information in its HIGH bits (the low bits only depend +# on the most recent bytes), the cut-decision mask uses the high bits of the hash. +# +# It implements the same FastCDC techniques the buzhash64 chunker uses: sub-minimum cut-point +# skipping, normalized chunking (strict/loose mask around a "normal" size), and min/max clamping. 
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef uint64_t* fastcdc_init_gear(bytes key) except NULL:
+    """Generate a keyed 256-entry, 64-bit Gear table deterministically from a 256-bit key (caller must free it)."""
+    rng = CSPRNG(key)
+    cdef bytes rnd = rng.random_bytes(2048)  # 256 * sizeof(uint64_t)
+    cdef const uint8_t* rp = <const uint8_t*>PyBytes_AsString(rnd)
+    cdef uint64_t* gear = <uint64_t*>malloc(2048)
+    if gear == NULL:
+        raise MemoryError("Failed to allocate fastcdc gear table")
+    cdef int i, j
+    cdef uint64_t v
+    for i in range(256):
+        v = 0
+        for j in range(8):
+            v |= (<uint64_t>rp[i * 8 + j]) << (8 * j)  # widen first (an int shift loses the high bytes); little-endian
+        gear[i] = v
+    return gear
+
+
+cdef inline uint64_t _high_mask(int bits):
+    """A mask with one-bits in the most significant positions (Gear's strong bits)."""
+    if bits <= 0:
+        return 0
+    if bits >= 64:
+        return 0xFFFFFFFFFFFFFFFF
+    return ((1ULL << bits) - 1) << (64 - bits)  # 1ULL: shift in a uint64, not in a (possibly 32bit) int
+
+
+cdef class ChunkerFastCDC:
+    """
+    FastCDC content-defined chunker, variable chunk sizes, keyed Gear hash.
+
+    Unlike the buzhash chunkers, Gear is window-less, so there is no hash_window_size parameter.
+    """
+    cdef uint64_t chunk_mask
+    cdef uint64_t mask_s, mask_l  # normalized chunking: strict / loose masks
+    cdef size_t normal_size  # chunk length at which we switch mask_s -> mask_l
+    cdef int nc_level  # normalized chunking level (0 = disabled)
+    cdef uint64_t* gear
+    cdef uint8_t* data
+    cdef object _fd
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded
+    cdef readonly float chunking_time
+    cdef object file_reader
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int nc_level=0, size_t normal_size=0, bint sparse=False):
+        self.gear = NULL
+        self.data = NULL
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        assert min_size + 1 <= max_size, "too small max_size"
+
+        self.chunk_mask = _high_mask(hash_mask_bits)
+        self.min_size = min_size
+        # Normalized chunking, identical structure to the buzhash64 chunker (see there), but with
+        # the mask one-bits placed in the high bits of the Gear hash.
+        assert nc_level >= 0
+        assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits"
+        assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits"
+        self.nc_level = nc_level
+        if nc_level:
+            self.mask_s = _high_mask(hash_mask_bits + nc_level)
+            self.mask_l = _high_mask(hash_mask_bits - nc_level)
+            self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level)))
+        else:
+            self.mask_s = self.chunk_mask
+            self.mask_l = self.chunk_mask
+            self.normal_size = 0
+        self.gear = fastcdc_init_gear(key)
+        self.buf_size = max_size
+        self.data = <uint8_t*>malloc(self.buf_size)
+        if self.data == NULL:
+            raise MemoryError("Failed to allocate chunker buffer")
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        if self.gear != NULL:
+            free(self.gear)
+            self.gear = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """Compact the buffer and read more data into it; returns 1 (0 is reserved for exceptions)."""
+        cdef ssize_t n
+        cdef object chunk
+
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining
+
+        if self.eof or n == 0:
+            return 1
+
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            if chunk.meta["allocation"] == CH_DATA:
+                memcpy(self.data + self.position + self.remaining, PyBytes_AsString(chunk.data), n)
+            else:
+                memset(self.data + self.position + self.remaining, 0, n)
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+        return 1
+
+    cdef object process(self) except *:
+        """Gear-scan the buffer and return the next chunk as a memoryview; raises StopIteration when done."""
+        cdef uint64_t fp = 0, mask, mask_s = self.mask_s, mask_l = self.mask_l
+        cdef int nc_level = self.nc_level
+        cdef size_t n, old_last, min_size = self.min_size
+        cdef size_t normal_size = self.normal_size, normal_pos, chunk_len, did
+        cdef uint8_t* p
+        cdef uint8_t* stop
+        cdef uint8_t* cut
+        cdef uint64_t* gear = self.gear
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        # ensure at least min_size + 1 bytes are buffered, or we are at eof
+        while self.remaining < min_size + 1 and not self.eof:
+            if not self.fill():
+                return None
+
+        # at eof with only a remainder (< min_size + 1): emit it as the final chunk
+        if self.eof and self.remaining < min_size + 1:
+            self.done = 1
+            if self.remaining:
+                old_last = self.last
+                self.position += self.remaining
+                self.last = self.position
+                n = self.last - old_last
+                self.remaining = 0
+                self.bytes_yielded += n
+                return memoryview((self.data + old_last)[:n])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # skip the sub-minimum region (no cut allowed below min_size), then gear-scan
+        self.position += min_size
+        self.remaining -= min_size
+        fp = 0
+
+        while True:
+            chunk_len = self.position - self.last
+            mask = mask_s if (nc_level and chunk_len < normal_size) else mask_l
+
+            if self.remaining == 0:
+                if self.eof:
+                    break  # cut at end of data
+                if not self.fill():
+                    return None
+                if self.remaining == 0:
+                    break  # buffer full -> chunk reached max_size -> forced cut
+                continue
+
+            p = self.data + self.position
+            stop = p + self.remaining
+            if nc_level and chunk_len < normal_size:
+                # do not scan past the strict->loose transition; re-evaluate the mask there
+                normal_pos = self.last + normal_size
+                if (self.data + normal_pos) < stop:
+                    stop = self.data + normal_pos
+
+            cut = NULL
+            while p < stop:
+                fp = (fp << 1) + gear[p[0]]
+                if (fp & mask) == 0:
+                    cut = p
+                    break
+                p += 1
+
+            if cut != NULL:
+                p = cut + 1  # cut right after the byte that triggered the boundary
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+                break
+            else:
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def fastcdc_get_gear_table(bytes key):
+    """Get the keyed gear table generated from <key> as a list of 256 ints (for tests / inspection)."""
+    cdef uint64_t* gear = fastcdc_init_gear(key)
+    cdef int i
+    try:
+        return [gear[i] for i in range(256)]
+    finally:
+        free(gear)
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 3750af71a1..5b411e183e 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -103,6 +103,7 @@
 # chunker algorithms
 CH_BUZHASH = "buzhash"
 CH_BUZHASH64 = "buzhash64"
+CH_FASTCDC = "fastcdc"
 CH_FIXED = "fixed"
 CH_FAIL = "fail"
 
@@ -119,6 +120,8 @@
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL)
+# fastcdc uses a window-less Gear hash, so it has no window_size parameter.
+FASTCDC_PARAMS = (CH_FASTCDC, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, NC_LEVEL) # chunker params for the items metadata stream, finer granularity ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 72183885fd..b5b2928d93 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -324,6 +324,24 @@ def ChunkerParams(s): ) # note that for buzhash64, there is no problem with even window_size. return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level + if algo == CH_FASTCDC and count == 5: + # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level + # fastcdc uses a window-less Gear hash, so there is no window_size field. + # nc_level is required; use nc_level 0 to disable normalized chunking. + chunk_min, chunk_max, chunk_mask = (int(p) for p in params[1:4]) + nc_level = int(params[4]) + if not (chunk_min <= chunk_mask <= chunk_max): + raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max") + if chunk_min < 6: + # see comment in 'fixed' algo check + raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)") + if chunk_max > 23: + raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. 
chunk size)")
+        if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48):
+            raise ArgumentTypeError(
+                "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48"
+            )
+        return CH_FASTCDC, chunk_min, chunk_max, chunk_mask, nc_level
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
     if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
diff --git a/src/borg/testsuite/chunkers/fastcdc_test.py b/src/borg/testsuite/chunkers/fastcdc_test.py
new file mode 100644
index 0000000000..cf06193a6e
--- /dev/null
+++ b/src/borg/testsuite/chunkers/fastcdc_test.py
@@ -0,0 +1,161 @@
+from hashlib import sha256
+from io import BytesIO
+import os
+import random
+
+import pytest
+
+from . import cf, cf_expand
+from ...chunkers import ChunkerFastCDC, get_chunker
+from ...chunkers.fastcdc import fastcdc_get_gear_table
+from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
+
+
+# two fixed 32-byte chunker keys, originally generated with os.urandom(32)
+key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
+key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
+
+
+def H(data):
+    return sha256(data).digest()
+
+
+def test_chunkpoints_fastcdc_unchanged():
+    def twist(size):
+        x = 1
+        a = bytearray(size)
+        for i in range(size):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return a
+
+    data = twist(100000)
+
+    runs = []
+    for nc_level in (0, 2, 3):
+        for minexp in (4, 6, 7, 11, 12):
+            for maxexp in (15, 17):
+                if minexp >= maxexp:
+                    continue
+                for maskbits in (4, 7, 10, 12):
+                    if maskbits - nc_level < 1:  # nc_level needs room below the base mask bits
+                        continue
+                    for key in (key0, key1):
+                        fh = BytesIO(data)
+                        chunker = ChunkerFastCDC(key, minexp, maxexp, maskbits, nc_level)
+                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
+                        runs.append(H(b"".join(chunks)))
+
+    # The "correct" hash below matches the existing chunker behavior.
+    # Future chunker optimizations must not change this, or existing repos will bloat.
+    overall_hash = H(b"".join(runs))
+    print(overall_hash.hex())
+    assert overall_hash == hex_to_bin("50d39b6f30214d78f665ff97a4800142cddcb6a7c5995e5d162f9c6dceb20cfe")
+
+
+def test_fastcdc_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask, nc_level = 10, 16, 14, 2  # chunk size target 16 KiB, clip at 1 KiB and 64 KiB
+    chunker = ChunkerFastCDC(key0, min_exp, max_exp, mask, nc_level)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will be about 64 chunks (1 MiB of data / 16 KiB target chunk size)
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to the gear hash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
+
+
+def test_fastcdc_gear_table():
+    # Test that the function returns a list of 256 integers, each fitting into 64 bits
+    table0 = fastcdc_get_gear_table(key0)
+    assert len(table0) == 256
+    for value in table0:
+        assert isinstance(value, int)
+        assert 0 <= value < 2**64
+
+    # deterministic (same key produces same table)
+    assert table0 == fastcdc_get_gear_table(key0)
+
+    # different keys produce different tables
+    table1 = fastcdc_get_gear_table(key1)
+    assert table0 != table1
+
+
+def test_fastcdc_get_chunker():
+    # without a key, get_chunker uses an all-zero key; chunking must still work and be deterministic
+    data = os.urandom(2 * 1024 * 1024)
+    a = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    b = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    assert a == b
+    assert b"".join(a) == data
+
+
+def test_fastcdc_params_parsing():
+    from argparse import ArgumentTypeError
+
+    from ...helpers import ChunkerParams
+
+    # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level (no window field)
+    assert ChunkerParams("fastcdc,19,23,21,2") == (CH_FASTCDC, 19, 23, 21, 2)
+    assert ChunkerParams("fastcdc,10,23,16,0") == (CH_FASTCDC, 10, 23, 16, 0)
+    # a 6-field (buzhash64-style, with window) fastcdc must be rejected
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,4095,2")
+    # nc_level out of range (chunk_mask - nc_level < 1)
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,21")
+    # chunk_min <= chunk_mask <= chunk_max violated
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,24,2")
+
+
+@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
+@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
+def test_fuzz_fastcdc(worker):
+    # Fuzz fastcdc with random and uniform data of misc. sizes and misc. keys; one test instance per cpu.
+    def rnd_key():
+        return os.urandom(32)
+
+    # decompose FASTCDC_PARAMS = (algo, min_exp, max_exp, mask_bits, nc_level)
+    algo, min_exp, max_exp, mask_bits, nc_level = FASTCDC_PARAMS
+    assert algo == CH_FASTCDC
+
+    keys = [b"\0" * 32] + [rnd_key() for _ in range(10)]
+    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
+
+    for key in keys:
+        chunker = ChunkerFastCDC(key, min_exp, max_exp, mask_bits, nc_level)
+        for size in sizes:
+            # Random data
+            data = os.urandom(size)
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-same data (non-zero)
+            data = b"\x42" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-zero data
+            data = b"\x00" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data