From 666b542120046f20735049c123e2a27b400eab71 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Sat, 27 Jun 2026 22:47:52 +0200
Subject: [PATCH] fastcdc: add FastCDC chunker with a keyed Gear hash

Add a new "fastcdc" content-defined chunker selectable via --chunker-params.
It uses the FastCDC Gear rolling hash (fp = (fp << 1) + Gear[byte]), which is
window-less and cheaper per byte than buzhash's cyclic-polynomial update, so it
chunks noticeably faster (see "borg benchmark cpu" output), while producing
the same chunk-size distribution and deduplication.

The Gear table is keyed: it is derived from the repo id key via CSPRNG (own
"fastcdc" domain), exactly like the buzhash64 table, so chunk cut points stay
unpredictable without the key (anti-fingerprinting). It implements the same
FastCDC techniques as buzhash64 (sub-minimum skipping, normalized chunking with
a required nc_level, min/max clamping); the mask uses the high bits of the hash
(Gear accumulates entropy there).

chunker-params: "fastcdc,chunk_min,chunk_max,chunk_mask,nc_level" - there is no
window field, because Gear is window-less. e.g. fastcdc,19,23,21,2

Also: borg benchmark cpu now measures the fastcdc chunker; tests in
borg.testsuite.chunkers (golden vector, size distribution, keyed gear table,
param parsing, slow fuzz); docs and changelog.

Benchmarks (scripts/chunker_bench.py, buzhash64 vs fastcdc, both nc_level=2,
incompressible data unless noted):

  5 GiB, 2 MiB target (default params):
    buzhash64: CV 0.294, 1011 MB/s
    fastcdc:   CV 0.295, 1313 MB/s   (+30%)

  64 MiB, 64 KiB target:
    buzhash64: CV 0.374, shift-resilience 0.9928,  963 MB/s
    fastcdc:   CV 0.359, shift-resilience 0.9929, 1331 MB/s   (+38%)

  Re-backup of a 2.5 GiB file after scattered single-byte edits (dedup ratio,
  0.5 = v2 fully deduplicated, lower is better):
     64 edits:  buzhash64 0.5237, fastcdc 0.5236
    320 edits:  buzhash64 0.6133, fastcdc 0.6161

  borg benchmark cpu, 1 GB: fastcdc 3.80s, buzhash 4.36s, buzhash64 8.13s,
  fixed 0.56s.

Chunk-size distribution, deduplication and shift-resilience match buzhash64
within noise; fastcdc is consistently faster.

Also: fix a bug when computing the mask: one needs to use 1ULL instead of
1, so that the shift is computed in a uint64, not in a 32-bit int.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitignore                                  |   1 +
 docs/changes.rst                            |   6 +
 docs/global.rst.inc                         |   1 +
 docs/internals/data-structures.rst          |  18 ++
 scripts/chunker_bench.py                    |   9 +-
 setup.py                                    |   3 +
 src/borg/archiver/benchmark_cmd.py          |   2 +
 src/borg/archiver/completion_cmd.py         |   8 +-
 src/borg/chunkers/__init__.py               |  16 +-
 src/borg/chunkers/fastcdc.pyx               | 285 ++++++++++++++++++++
 src/borg/constants.py                       |   3 +
 src/borg/helpers/parseformat.py             |  18 ++
 src/borg/testsuite/chunkers/fastcdc_test.py | 161 +++++++++++
 13 files changed, 523 insertions(+), 8 deletions(-)
 create mode 100644 src/borg/chunkers/fastcdc.pyx
 create mode 100644 src/borg/testsuite/chunkers/fastcdc_test.py

diff --git a/.gitignore b/.gitignore
index f2f7461228..a2254bafd7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ src/borg/crypto/low_level.c
 src/borg/item.c
 src/borg/chunkers/buzhash.c
 src/borg/chunkers/buzhash64.c
+src/borg/chunkers/fastcdc.c
 src/borg/chunkers/reader.c
 src/borg/checksums.c
 src/borg/platform/darwin.c
diff --git a/docs/changes.rst b/docs/changes.rst
index 8caa0d0e8b..9aac15a9f5 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -176,6 +176,12 @@ New features:
   ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level``
   (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``).
   buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x.
+- new ``fastcdc`` chunker: a FastCDC content-defined chunker using a window-less, keyed Gear
+  rolling hash (the gear table is derived from the repo's id key, like buzhash64, so cut points
+  stay unpredictable without the key). It supports the same normalized chunking as buzhash64 and
+  produces the same chunk-size distribution and deduplication, but chunks roughly 1.3-1.5x faster.
+  Select it via ``--chunker-params fastcdc,chunk_min,chunk_max,chunk_mask,nc_level`` (no window
+  field; e.g. ``fastcdc,19,23,21,2``). ``borg benchmark cpu`` now reports its throughput too.
 - repo-create: split ``--encryption`` into orthogonal options. ``--encryption`` now
   selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb``
   or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function
diff --git a/docs/global.rst.inc b/docs/global.rst.inc
index a3c8df1cc8..196391a265 100644
--- a/docs/global.rst.inc
+++ b/docs/global.rst.inc
@@ -19,6 +19,7 @@
 .. _OpenSSL: https://www.openssl.org/
 .. _`Python 3`: https://www.python.org/
 .. _Buzhash: https://en.wikipedia.org/wiki/Buzhash
+.. _FastCDC: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia
 .. _msgpack: https://msgpack.org/
 .. _`msgpack-python`: https://pypi.org/project/msgpack-python/
 .. _llfuse: https://pypi.org/project/llfuse/
diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
index 0a1e86edb0..2f63218758 100644
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@@ -403,6 +403,8 @@ Borg has these chunkers:
 - "buzhash": variable, content-defined blocksize, uses a rolling hash
   computed by the Buzhash_ algorithm.
 - "buzhash64": similar to "buzhash", but improved 64bit implementation
+- "fastcdc": variable, content-defined blocksize, uses the window-less, keyed
+  Gear rolling hash (FastCDC_); faster than buzhash, same deduplication.
 
 For some more general usage hints see also ``--chunker-params``.
 
@@ -483,6 +485,22 @@ The buzhash table is cryptographically derived from secret key material.
 These changes should improve resistance against attacks and also solve
 some of the issues of the original (32bit / XORed table) implementation.
 
+"fastcdc" chunker
++++++++++++++++++
+
+FastCDC_ content-defined chunker using the Gear rolling hash. Unlike buzhash it
+is window-less (each byte's influence simply decays out of the hash), so its
+update is cheaper and it chunks noticeably faster, while producing the same
+deduplication and (with normalized chunking) the same chunk-size distribution.
+
+Like "buzhash64", the Gear table is cryptographically derived from secret key
+material, so chunk cut points are unpredictable without the key.
+
+``borg create --chunker-params fastcdc,CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,NC_LEVEL``
+
+There is no window size (Gear is window-less). NC_LEVEL is the normalized
+chunking level (0 disables it); 2 is a good default. E.g.: ``fastcdc,19,23,21,2``.
+
 .. _cache:
 
 The cache
diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py
index a90b66a04c..1d439d0099 100644
--- a/scripts/chunker_bench.py
+++ b/scripts/chunker_bench.py
@@ -129,7 +129,10 @@ def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal
     params = [min_exp, max_exp, mask_bits, win]
     kw = dict(key=None, sparse=False)
     if algo == "buzhash64":
-        params.append(nc_level)  # nc_level is a positional buzhash64 param
+        params.append(nc_level)  # nc_level is a positional param
+        kw["normal_size"] = normal_size
+    elif algo == "fastcdc":
+        params = [min_exp, max_exp, mask_bits, nc_level]  # fastcdc is window-less
         kw["normal_size"] = normal_size
     chunker = get_chunker(algo, *params, **kw)
     sizes = []
@@ -285,11 +288,11 @@ def main():
     print(f"shift test: {args.shift_edits} edits   repeats: {args.repeat}")
     print("-" * 118)
 
-    # build (algo, nc_level) variants; for buzhash64 also run the requested NC level
+    # build (algo, nc_level) variants; for buzhash64/fastcdc also run the requested NC level
     variants = []
     for algo in args.algo:
         variants.append((algo, 0))
-        if algo == "buzhash64" and args.nc_level > 0:
+        if algo in ("buzhash64", "fastcdc") and args.nc_level > 0:
             variants.append((algo, args.nc_level))
 
     for algo, nc in variants:
diff --git a/setup.py b/setup.py
index 19f1c92789..1ad76e8048 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@
 crypto_legacy_ll_source = "src/borg/legacy/crypto/low_level.pyx"
 buzhash_source = "src/borg/chunkers/buzhash.pyx"
 buzhash64_source = "src/borg/chunkers/buzhash64.pyx"
+fastcdc_source = "src/borg/chunkers/fastcdc.pyx"
 reader_source = "src/borg/chunkers/reader.pyx"
 hashindex_source = "src/borg/hashindex.pyx"
 item_source = "src/borg/item.pyx"
@@ -73,6 +74,7 @@
     crypto_legacy_ll_source,
     buzhash_source,
     buzhash64_source,
+    fastcdc_source,
     reader_source,
     hashindex_source,
     item_source,
@@ -189,6 +191,7 @@ def lib_ext_kwargs(pc, prefix_env_var, lib_name, lib_pkg_name, pc_version, lib_s
         Extension("borg.item", [item_source], extra_compile_args=cflags),
         Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags),
         Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags),
+        Extension("borg.chunkers.fastcdc", [fastcdc_source], extra_compile_args=cflags),
         Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags),
     ]
 
diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py
index 099cac8b36..71615e6456 100644
--- a/src/borg/archiver/benchmark_cmd.py
+++ b/src/borg/archiver/benchmark_cmd.py
@@ -204,6 +204,8 @@ def chunkit(ch):
                 "chunkit(ch)",
                 locals(),
             ),
+            # fastcdc (window-less keyed gear hash); gear table creation is slow, keep it in setup
+            ("fastcdc,19,23,21,2", "ch = get_chunker('fastcdc', 19, 23, 21, 2, sparse=False)", "chunkit(ch)", locals()),
             ("fixed,1048576", "ch = get_chunker('fixed', 1048576, sparse=False)", "chunkit(ch)", locals()),
         ]:
             dt = timeit(func, setup, number=number_default, globals=vars)
diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py
index 1bce0cb15d..6b73444913 100644
--- a/src/borg/archiver/completion_cmd.py
+++ b/src/borg/archiver/completion_cmd.py
@@ -708,7 +708,13 @@ def do_completion(self, args):
         comp_spec_choices_str = " ".join(comp_spec_choices)
 
         # Chunker params choices (static list)
-        chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"]
+        chunker_params_choices = [
+            "default",
+            "fixed,4194304",
+            "buzhash,19,23,21,4095",
+            "buzhash64,19,23,21,4095,2",
+            "fastcdc,19,23,21,2",
+        ]
         chunker_params_choices_str = " ".join(chunker_params_choices)
 
         # Relative time marker choices (static list)
diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py
index 7282b5e8eb..461cc90d16 100644
--- a/src/borg/chunkers/__init__.py
+++ b/src/borg/chunkers/__init__.py
@@ -1,5 +1,6 @@
 from .buzhash import Chunker
 from .buzhash64 import ChunkerBuzHash64
+from .fastcdc import ChunkerFastCDC
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
@@ -10,16 +11,23 @@ def get_chunker(algo, *params, **kw):
     sparse = kw.get("sparse", False)
     # key.chunk_seed only has 32 bits
     seed = key.chunk_seed if key is not None else 0
-    # for buzhash64, we want a much longer key, so we derive it from the id key
-    bh64_key = (
-        key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32
-    )
     if algo == "buzhash":
         return Chunker(seed, *params, sparse=sparse)
     if algo == "buzhash64":
+        # for buzhash64, we want a much longer key, so we derive it from the id key.
         # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level);
         # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto).
+        bh64_key = (
+            key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32
+        )
         return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse)
+    if algo == "fastcdc":
+        # keyed gear table, derived from the id key (own domain). params is
+        # (chunk_min_exp, chunk_max_exp, hash_mask_bits, nc_level) - no window (Gear is window-less).
+        fc_key = (
+            key.derive_key(salt=b"", domain=b"fastcdc", size=32, from_id_key=True) if key is not None else b"\0" * 32
+        )
+        return ChunkerFastCDC(fc_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse)
     if algo == "fixed":
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":
diff --git a/src/borg/chunkers/fastcdc.pyx b/src/borg/chunkers/fastcdc.pyx
new file mode 100644
index 0000000000..2248ff5406
--- /dev/null
+++ b/src/borg/chunkers/fastcdc.pyx
@@ -0,0 +1,285 @@
+# cython: language_level=3
+
+import cython
+import time
+
+from cpython.bytes cimport PyBytes_AsString
+from libc.stdint cimport uint8_t, uint64_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy, memmove, memset
+
+from ..crypto.low_level import CSPRNG
+
+from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
+from .reader import FileReader, Chunk
+
+# FastCDC content-defined chunker (Xia et al., USENIX ATC 2016).
+#
+# Differences vs. the buzhash64 chunker in this package:
+#  * It uses the Gear rolling hash: fp = (fp << 1) + Gear[byte]. This is a single shift,
+#    add and table lookup per byte (no window, no "remove" term), so it is cheaper than
+#    buzhash's cyclic-polynomial update.
+#  * The Gear table is keyed from a 256-bit key via the same CSPRNG used by buzhash64, so
+#    cut points are unpredictable without the key (anti-fingerprinting), just like buzhash64.
+#  * Because the Gear hash accumulates information in its HIGH bits (the low bits only depend
+#    on the most recent bytes), the cut-decision mask uses the high bits of the hash.
+#
+# It implements the same FastCDC techniques the buzhash64 chunker uses: sub-minimum cut-point
+# skipping, normalized chunking (strict/loose mask around a "normal" size), and min/max clamping.
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef uint64_t* fastcdc_init_gear(bytes key) except NULL:  # a NULL return means an exception was raised
+    """Generate a keyed 256-entry, 64-bit Gear table deterministically from a 256-bit key."""
+    rng = CSPRNG(key)
+    cdef bytes rnd = rng.random_bytes(2048)  # 256 * sizeof(uint64_t)
+    cdef const uint8_t* rp = <const uint8_t*>PyBytes_AsString(rnd)
+    cdef uint64_t* gear = <uint64_t*>malloc(2048)  # ownership passes to the caller, who must free() it
+    if gear == NULL:
+        raise MemoryError("Failed to allocate fastcdc gear table")
+    cdef int i, j
+    cdef uint64_t v
+    for i in range(256):
+        v = 0
+        for j in range(8):
+            v |= (<uint64_t>rp[i * 8 + j]) << (8 * j)  # byte j lands in bits 8*j.. (little-endian, host independent)
+        gear[i] = v
+    return gear
+
+
+cdef inline uint64_t _high_mask(int bits):
+    """A mask with <bits> one-bits in the most significant positions (Gear's strong bits)."""
+    if bits <= 0:  # no bits requested: empty mask (also avoids shifting by 64 below)
+        return 0
+    if bits >= 64:  # all bits requested: avoid (1 << 64), a shift by the full type width is undefined in C
+        return <uint64_t>0xFFFFFFFFFFFFFFFF
+    return ((<uint64_t>1 << bits) - 1) << (64 - bits)  # e.g. bits=2 -> 0xC000000000000000
+
+
+cdef class ChunkerFastCDC:
+    """
+    FastCDC content-defined chunker, variable chunk sizes, keyed Gear hash.
+
+    Unlike the buzhash chunkers, Gear is window-less, so there is no hash_window_size parameter.
+    """
+    cdef uint64_t chunk_mask
+    cdef uint64_t mask_s, mask_l  # normalized chunking: strict / loose masks
+    cdef size_t normal_size       # chunk length at which we switch mask_s -> mask_l
+    cdef int nc_level             # normalized chunking level (0 = disabled)
+    cdef uint64_t* gear           # keyed Gear table, malloc'ed in __cinit__, freed in __dealloc__
+    cdef uint8_t* data            # chunker buffer of buf_size bytes, freed in __dealloc__
+    cdef object _fd
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded
+    cdef readonly double chunking_time  # seconds; C double, a C float loses small increments while accumulating
+    cdef object file_reader
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int nc_level=0, size_t normal_size=0, bint sparse=False):
+        self.gear = NULL
+        self.data = NULL
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        assert min_size + 1 <= max_size, "too small max_size"
+
+        self.chunk_mask = _high_mask(hash_mask_bits)
+        self.min_size = min_size
+        # Normalized chunking, identical structure to the buzhash64 chunker (see there), but with
+        # the mask one-bits placed in the high bits of the Gear hash.
+        assert nc_level >= 0
+        assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits"
+        assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits"
+        self.nc_level = nc_level
+        if nc_level:
+            self.mask_s = _high_mask(hash_mask_bits + nc_level)
+            self.mask_l = _high_mask(hash_mask_bits - nc_level)
+            self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level)))
+        else:
+            self.mask_s = self.chunk_mask
+            self.mask_l = self.chunk_mask
+            self.normal_size = 0
+        self.gear = fastcdc_init_gear(key)
+        self.buf_size = max_size
+        self.data = <uint8_t*>malloc(self.buf_size)
+        if self.data == NULL:
+            raise MemoryError("Failed to allocate chunker buffer")
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        if self.gear != NULL:
+            free(self.gear)
+            self.gear = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """Fill the chunker's buffer with more data."""
+        cdef ssize_t n
+        cdef object chunk
+
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)  # drop already yielded bytes
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining  # free space at the end of the buffer
+
+        if self.eof or n == 0:
+            return 1
+
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            if chunk.meta["allocation"] == CH_DATA:
+                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
+            else:
+                memset(self.data + self.position + self.remaining, 0, n)  # no data for this range: fill in zeros
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+        return 1
+
+    cdef object process(self) except *:
+        """Process the chunker's buffer and return the next chunk."""
+        cdef uint64_t fp = 0, mask, mask_s = self.mask_s, mask_l = self.mask_l
+        cdef int nc_level = self.nc_level
+        cdef size_t n, old_last, min_size = self.min_size
+        cdef size_t normal_size = self.normal_size, normal_pos, chunk_len, did
+        cdef uint8_t* p
+        cdef uint8_t* stop
+        cdef uint8_t* cut
+        cdef uint64_t* gear = self.gear
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        # ensure at least min_size + 1 bytes are buffered, or we are at eof
+        while self.remaining < min_size + 1 and not self.eof:
+            if not self.fill():
+                return None
+
+        # at eof with only a remainder (< min_size + 1): emit it as the final chunk
+        if self.eof and self.remaining < min_size + 1:
+            self.done = 1
+            if self.remaining:
+                old_last = self.last
+                self.position += self.remaining
+                self.last = self.position
+                n = self.last - old_last
+                self.remaining = 0
+                self.bytes_yielded += n
+                return memoryview((self.data + old_last)[:n])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # skip the sub-minimum region (no cut allowed below min_size), then gear-scan
+        self.position += min_size
+        self.remaining -= min_size
+        fp = 0
+
+        while True:
+            chunk_len = self.position - self.last
+            mask = mask_s if (nc_level and chunk_len < normal_size) else mask_l
+
+            if self.remaining == 0:
+                if self.eof:
+                    break  # cut at end of data
+                if not self.fill():
+                    return None
+                if self.remaining == 0:
+                    break  # buffer full -> chunk reached max_size -> forced cut
+                continue
+
+            p = self.data + self.position
+            stop = p + self.remaining
+            if nc_level and chunk_len < normal_size:
+                # do not scan past the strict->loose transition; re-evaluate the mask there
+                normal_pos = self.last + normal_size
+                if (self.data + normal_pos) < stop:
+                    stop = self.data + normal_pos
+
+            cut = NULL
+            while p < stop:
+                fp = (fp << 1) + gear[p[0]]
+                if (fp & mask) == 0:
+                    cut = p
+                    break
+                p += 1
+
+            if cut != NULL:
+                p = cut + 1  # cut right after the byte that triggered the boundary
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+                break
+            else:
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        if zeros.startswith(data):  # all-zero chunk: report it as CH_ALLOC, without carrying the data
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def fastcdc_get_gear_table(bytes key):
+    """Get the keyed gear table generated from <key> (for tests / inspection)."""
+    cdef uint64_t* gear = fastcdc_init_gear(key)  # malloc'ed table, owned by this function
+    cdef int i
+    try:
+        return [gear[i] for i in range(256)]  # copy the 256 entries into a Python list of ints
+    finally:
+        free(gear)  # always release the C table, even if building the list fails
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 3750af71a1..5b411e183e 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -103,6 +103,7 @@
 # chunker algorithms
 CH_BUZHASH = "buzhash"
 CH_BUZHASH64 = "buzhash64"
+CH_FASTCDC = "fastcdc"
 CH_FIXED = "fixed"
 CH_FAIL = "fail"
 
@@ -119,6 +120,8 @@
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL)
+# fastcdc uses a window-less Gear hash, so it has no window_size parameter.
+FASTCDC_PARAMS = (CH_FASTCDC, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, NC_LEVEL)
 
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 72183885fd..b5b2928d93 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -324,6 +324,24 @@ def ChunkerParams(s):
             )
         # note that for buzhash64, there is no problem with even window_size.
         return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level
+    if algo == CH_FASTCDC and count == 5:
+        # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level
+        # fastcdc uses a window-less Gear hash, so there is no window_size field.
+        # nc_level is required; use nc_level 0 to disable normalized chunking.
+        chunk_min, chunk_max, chunk_mask = (int(p) for p in params[1:4])
+        nc_level = int(params[4])
+        if not (chunk_min <= chunk_mask <= chunk_max):
+            raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
+        if chunk_min < 6:
+            # see comment in 'fixed' algo check
+            raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)")
+        if chunk_max > 23:
+            raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)")
+        if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48):
+            raise ArgumentTypeError(
+                "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48"
+            )
+        return CH_FASTCDC, chunk_min, chunk_max, chunk_mask, nc_level
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
     if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
diff --git a/src/borg/testsuite/chunkers/fastcdc_test.py b/src/borg/testsuite/chunkers/fastcdc_test.py
new file mode 100644
index 0000000000..cf06193a6e
--- /dev/null
+++ b/src/borg/testsuite/chunkers/fastcdc_test.py
@@ -0,0 +1,161 @@
+from hashlib import sha256
+from io import BytesIO
+import os
+import random
+
+import pytest
+
+from . import cf, cf_expand
+from ...chunkers import ChunkerFastCDC, get_chunker
+from ...chunkers.fastcdc import fastcdc_get_gear_table
+from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
+
+
+# from os.urandom(32)
+key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
+key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
+
+
+def H(data):
+    return sha256(data).digest()  # raw (binary) SHA-256 digest of data
+
+
+def test_chunkpoints_fastcdc_unchanged():
+    def twist(size):  # deterministic pseudo-random bytes (linear congruential generator), so the golden hash is reproducible
+        x = 1
+        a = bytearray(size)
+        for i in range(size):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return a
+
+    data = twist(100000)
+
+    runs = []
+    for nc_level in (0, 2, 3):
+        for minexp in (4, 6, 7, 11, 12):
+            for maxexp in (15, 17):
+                if minexp >= maxexp:
+                    continue
+                for maskbits in (4, 7, 10, 12):
+                    if maskbits - nc_level < 1:  # nc_level needs room below the base mask bits
+                        continue
+                    for key in (key0, key1):
+                        fh = BytesIO(data)
+                        chunker = ChunkerFastCDC(key, minexp, maxexp, maskbits, nc_level)
+                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
+                        runs.append(H(b"".join(chunks)))  # one digest per parameter combination
+
+    # The "correct" hash below matches the existing chunker behavior.
+    # Future chunker optimizations must not change this, or existing repos will bloat.
+    overall_hash = H(b"".join(runs))
+    print(overall_hash.hex())
+    assert overall_hash == hex_to_bin("50d39b6f30214d78f665ff97a4800142cddcb6a7c5995e5d162f9c6dceb20cfe")
+
+
+def test_fastcdc_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask, nc_level = 10, 16, 14, 2  # chunk size target 16 KiB, clip at 1 KiB and 64 KiB
+    chunker = ChunkerFastCDC(key0, min_exp, max_exp, mask, nc_level)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will be about 64 chunks (1 MiB of data / 16 KiB target chunk size)
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to the gear hash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
+
+
+def test_fastcdc_gear_table():
+    # the function must return a list of 256 integers, each within the unsigned 64-bit range
+    table0 = fastcdc_get_gear_table(key0)
+    assert len(table0) == 256
+    for value in table0:
+        assert isinstance(value, int)
+        assert 0 <= value < 2**64
+
+    # deterministic (same key produces same table)
+    assert table0 == fastcdc_get_gear_table(key0)
+
+    # different keys produce different tables
+    table1 = fastcdc_get_gear_table(key1)
+    assert table0 != table1
+
+
+def test_fastcdc_get_chunker():
+    # without a key, get_chunker uses an all-zero key; chunking must still work and be deterministic
+    data = os.urandom(2 * 1024 * 1024)
+    a = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    b = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    assert a == b  # two independent chunkers must cut the same input identically
+    assert b"".join(a) == data  # the chunks must reassemble to exactly the input
+
+
+def test_fastcdc_params_parsing():
+    from argparse import ArgumentTypeError
+
+    from ...helpers import ChunkerParams
+
+    # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level (no window field)
+    assert ChunkerParams("fastcdc,19,23,21,2") == (CH_FASTCDC, 19, 23, 21, 2)
+    assert ChunkerParams("fastcdc,10,23,16,0") == (CH_FASTCDC, 10, 23, 16, 0)
+    # a 6-field (buzhash64-style, with window) fastcdc must be rejected
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,4095,2")
+    # nc_level out of range (chunk_mask - nc_level < 1; here 21 - 21 = 0)
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,21")
+    # chunk_min <= chunk_mask <= chunk_max violated (chunk_mask 24 > chunk_max 23)
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,24,2")
+
+
+@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
+@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
+def test_fuzz_fastcdc(worker):
+    # Fuzz fastcdc with random and uniform data of misc. sizes and misc keys.
+    def rnd_key():
+        return os.urandom(32)
+
+    # decompose FASTCDC_PARAMS = (algo, min_exp, max_exp, mask_bits, nc_level)
+    algo, min_exp, max_exp, mask_bits, nc_level = FASTCDC_PARAMS
+    assert algo == CH_FASTCDC
+
+    keys = [b"\0" * 32] + [rnd_key() for _ in range(10)]  # the all-zero key plus 10 random keys
+    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]  # 50 random sizes from 1 B to 4 MiB
+
+    for key in keys:
+        chunker = ChunkerFastCDC(key, min_exp, max_exp, mask_bits, nc_level)  # one chunker, reused for all inputs
+        for size in sizes:
+            # Random data
+            data = os.urandom(size)
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-same data (non-zero)
+            data = b"\x42" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-zero data
+            data = b"\x00" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data