diff --git a/.gitignore b/.gitignore
index 27a84ea..10bcf41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@ coverage/
 coverage.xml
 benchmark-results.json
 benchmarks/_raw.json
+benchmarks/_ci/
diff --git a/Makefile b/Makefile
index b14f3d6..d3d6d9b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,19 @@
-.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts
+.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts
 
-update-baselines:
-	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
-	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI.
+# Prefer downloading benchmark-results.json from a CI artifact, then:
+#   PYTHONPATH=. python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+seed-baselines-local:
+	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
+	PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	PYTHONPATH=. python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5
+
+# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
+update-baselines: seed-baselines-local
 
 check-benchmarks:
-	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
-	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+	PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	PYTHONPATH=. python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
 
 clean-benchmark-artifacts:
 	rm -f benchmarks/_raw.json benchmark-results.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 59f70fc..5ef54c2 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -23,8 +23,8 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark
 
 | Group | What |
 |-------|------|
-| parse | `parse_session` on 10 / 500 / 5000+ line JSONL |
-| export | `run_bulk_export` over 10 / 50 / 100 sessions |
+| parse | `parse_session` on 10 / 500 / 5000+ line JSONL; large-file peak heap (`test_parse_large_peak_memory`) |
+| export | `run_bulk_export` latency over 10 / 50 / 100 sessions; ZIP export peak heap (`test_bulk_export_zip_peak_memory`) |
 | search | `GET /api/search` over a 50-session synthetic corpus |
 | cache | cold vs warm `get_cached_session` (informational; not gated) |
 
@@ -32,29 +32,39 @@ Large JSONL files (5000+ lines) are generated at test session scope under pytest
 
 Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat as v1 baselines, not exhaustive perf proof.
 
-The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session.
+The memory ceiling test (`test_large_parse_peak_memory_under_ceiling`) runs in the main `pytest` job. Tracked peak-memory benchmarks (`test_parse_large_peak_memory`, `test_bulk_export_zip_peak_memory`) run under `--benchmark-only` and record the mean per-round `tracemalloc` peak as `extra_info.peak_bytes`, which the regression gate compares instead of `stats.mean`.
 
 ## CI gate
 
 The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**.
 
-**Gated:** parse medium/large, export 10/50/100 sessions.
+**Gated:** parse medium/large + large peak memory; export 10/50/100 session latency + ZIP peak memory.
 
-**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate.
+**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. These may appear in `baselines.json` for reference but are skipped by `check_benchmark_regression.py`. Benchmarks without a baseline entry print a warning and do not fail the gate.
+
+Gated benchmarks that are listed in `baselines.json` but absent from the current results (for example, a renamed or removed test) fail the gate.
 
 ## Refresh baselines
 
-After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job):
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible:
+
+```bash
+PYTHONPATH=. python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+```
+
+For a quick local snapshot only (may not match CI timings):
 
 ```bash
-make update-baselines
+make seed-baselines-local
 ```
 
+`make update-baselines` is a deprecated alias for `seed-baselines-local`; it runs the same recipe, including the local-timings warning printed to stderr. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew.
+
 Or manually:
 
 ```bash
-pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
-python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+PYTHONPATH=. python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5
 ```
 
-Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. Download `benchmark-results.json` from a CI artifact to seed baselines if needed.
+Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures.
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 813a72f..07fb84e 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,17 +1,24 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.",
-  "updated": "2026-06-17T21:00:00Z",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #97, run 28126772276). Excluded from gate (recorded for reference): test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Memory benchmarks use extra_info.peak_bytes (bytes); latency uses stats.mean (seconds).",
+  "updated": "2026-06-24T20:15:37Z",
   "machine": "Linux",
   "groups": {
     "parse": {
-      "test_parse_session_medium": 0.002956,
-      "test_parse_session_large": 0.029678
+      "test_parse_session_small": 0.00010518068718225604,
+      "test_parse_session_medium": 0.002991333112179635,
+      "test_parse_session_large": 0.032311203818181436,
+      "test_parse_large_peak_memory": 2032028.0
     },
     "export": {
-      "test_bulk_export_session_count[sessions-10]": 0.004278,
-      "test_bulk_export_session_count[sessions-50]": 0.021144,
-      "test_bulk_export_session_count[sessions-100]": 0.042003
+      "test_bulk_export_session_count[sessions-10]": 0.0042825538530803925,
+      "test_bulk_export_session_count[sessions-50]": 0.021406330209302382,
+      "test_bulk_export_session_count[sessions-100]": 0.04229194749999898,
+      "test_bulk_export_zip_peak_memory[sessions-10]": 350628.0,
+      "test_bulk_export_zip_peak_memory[sessions-50]": 506454.0,
+      "test_bulk_export_zip_peak_memory[sessions-100]": 694088.0
     },
-    "search": {}
+    "search": {
+      "test_search_full_corpus": 0.0011120838654706596
+    }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 7842021..3a27dab 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -22,7 +22,43 @@ class BenchmarkDataError(ValueError):
     """Raised when benchmark JSON input is malformed or missing required fields."""
 
 
-def load_results(results_path: str | Path) -> dict[str, float]:
+def entry_uses_peak_bytes(entry: dict[str, object]) -> bool:
+    """True when the gated metric for *entry* is extra_info.peak_bytes."""
+    extra = entry.get("extra_info")
+    return isinstance(extra, dict) and "peak_bytes" in extra
+
+
+def metric_is_bytes(name: str, entry: dict[str, object] | None = None) -> bool:
+    """Shared heuristic for metric kind (bytes vs seconds) in gate and display."""
+    if entry is not None and entry_uses_peak_bytes(entry):
+        return True
+    return "peak_memory" in name
+
+
+def benchmark_entry_mean(entry: dict[str, object]) -> float:
+    """Return gated metric: peak_bytes from extra_info when present, else stats.mean."""
+    if entry_uses_peak_bytes(entry):
+        extra = entry["extra_info"]
+        if not isinstance(extra, dict):
+            raise BenchmarkDataError(f"extra_info for {entry.get('name')!r} is not a dict")
+        try:
+            return float(extra["peak_bytes"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"benchmark {entry.get('name')!r} missing 'stats.mean' or extra_info.peak_bytes"
+            ) from exc
+    try:
+        stats = entry["stats"]
+        return float(stats["mean"])  # type: ignore[index]
+    except (KeyError, TypeError, ValueError) as exc:
+        raise BenchmarkDataError(
+            f"benchmark {entry.get('name')!r} missing 'stats.mean' or extra_info.peak_bytes"
+        ) from exc
+
+
+def load_results(
+    results_path: str | Path,
+) -> tuple[dict[str, float], dict[str, dict[str, object]]]:
     path = Path(results_path)
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
@@ -38,21 +74,25 @@ def load_results(results_path: str | Path) -> dict[str, float]:
         raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
 
     results: dict[str, float] = {}
+    entries_by_name: dict[str, dict[str, object]] = {}
     for index, entry in enumerate(benchmarks):
         if not isinstance(entry, dict):
             raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
         try:
             name = entry["name"]
-            mean = float(entry["stats"]["mean"])
+            mean = benchmark_entry_mean(entry)
+        except BenchmarkDataError:
+            raise
         except (KeyError, TypeError, ValueError) as exc:
             raise BenchmarkDataError(
-                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+                f"{path} benchmarks[{index}] missing 'name' or measurable value"
             ) from exc
         name = str(name)
         if name in results:
             raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
         results[name] = mean
-    return results
+        entries_by_name[name] = entry
+    return results, entries_by_name
 
 
 def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
@@ -96,23 +136,29 @@ def check_regression(
     threshold: float = THRESHOLD,
 ) -> int:
     """Return 0 when within threshold; 1 when any gated benchmark regresses."""
-    flat = load_results(results_path)
+    flat, entries_by_name = load_results(results_path)
     baseline_means = load_baseline_means(baselines_path)
 
     failures: list[str] = []
+    missing: list[str] = []
     for name, base in baseline_means.items():
         if name in EXCLUDED_FROM_GATE:
             continue
         cur = flat.get(name)
         if cur is None:
-            print(f"WARN: no current result for baseline {name!r}; skipping")
+            print(f"FAIL: no current result for gated baseline {name!r}")
+            missing.append(name)
             continue
         if base == 0:
             print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
             continue
         ratio = cur / base
         tag = "FAIL" if ratio > threshold else "ok"
-        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
+        entry = entries_by_name.get(name)
+        if metric_is_bytes(name, entry):
+            print(f"[{tag}] {name}: {cur:.0f} bytes vs {base:.0f} bytes ({ratio:.2f}x)")
+        else:
+            print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
         if ratio > threshold:
             failures.append(name)
 
@@ -124,6 +170,9 @@ def check_regression(
 
     if failures:
         print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
+    if missing:
+        print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results")
+    if failures or missing:
         return 1
     return 0
 
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index 88be0eb..a3b4114 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -8,10 +8,10 @@
 from datetime import UTC, datetime
 from pathlib import Path
 
-try:
-    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
-except ModuleNotFoundError:
-    from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+from scripts.check_benchmark_regression import (
+    BenchmarkDataError,
+    benchmark_entry_mean,
+)
 
 GATED_GROUPS = ("parse", "export", "search")
 
@@ -50,24 +50,30 @@ def reduce_baselines(
             raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
         try:
             name = entry["name"]
-            mean = float(entry["stats"]["mean"])
+            mean = benchmark_entry_mean(entry)
+        except BenchmarkDataError:
+            raise
         except (KeyError, TypeError, ValueError) as exc:
             raise BenchmarkDataError(
-                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+                f"{path} benchmarks[{index}] missing 'name' or measurable value"
             ) from exc
+        bench_name = str(name)
         group = entry.get("group")
         if group not in GATED_GROUPS:
             continue
-        if str(name) in EXCLUDED_FROM_GATE:
-            continue
-        groups[group][str(name)] = mean * slack
+        groups[group][bench_name] = mean * slack
 
+    slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
     output: dict[str, object] = {
         "_note": (
-            "Gated means from ubuntu-latest CI (post-cache). "
-            "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)."
+            "Gated means from ubuntu-latest CI benchmark-results.json."
+            f"{slack_note} "
+            "Excluded from gate (recorded for reference): test_parse_session_small, "
+            "test_search_full_corpus (sub-ms CI noise). "
+            "Memory benchmarks use extra_info.peak_bytes (bytes); "
+            "latency uses stats.mean (seconds)."
         ),
         "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
         "machine": machine,
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
index cd4369c..2bc4876 100644
--- a/tests/benchmarks/conftest.py
+++ b/tests/benchmarks/conftest.py
@@ -3,8 +3,12 @@
 from __future__ import annotations
 
 import json
+import tracemalloc
+from collections.abc import Callable
 from copy import deepcopy
+from datetime import UTC, datetime, timedelta
 from pathlib import Path
+from typing import Any, TypeVar
 
 import pytest
 
@@ -13,14 +17,47 @@
 FIXTURES = Path(__file__).resolve().parents[1] / "fixtures"
 TEMPLATE_LINE = (FIXTURES / "session_with_tools.jsonl").read_text(encoding="utf-8").splitlines()[0]
 
+T = TypeVar("T")
 
-def write_jsonl(path: Path, line_count: int) -> Path:
+_EXPORT_SESSION_BASE = datetime(2026, 6, 12, 0, 0, tzinfo=UTC)
+
+
+def export_session_first_timestamp(index: int) -> str:
+    """Return a unique, valid ISO timestamp for export-corpus session *index*."""
+    return (_EXPORT_SESSION_BASE + timedelta(minutes=index)).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+class TracemallocPeak:
+    """Measure peak Python heap bytes for one callable invocation."""
+
+    def measure(self, func: Callable[..., T], /, *args: Any, **kwargs: Any) -> tuple[T, int]:
+        was_tracing = tracemalloc.is_tracing()
+        tracemalloc.start()
+        tracemalloc.clear_traces()
+        try:
+            result = func(*args, **kwargs)
+            _, peak = tracemalloc.get_traced_memory()
+            return result, peak
+        finally:
+            if not was_tracing:
+                tracemalloc.stop()
+
+
+@pytest.fixture
+def tracemalloc_peak() -> TracemallocPeak:
+    return TracemallocPeak()
+
+
+def write_jsonl(path: Path, line_count: int, *, first_timestamp: str | None = None) -> Path:
     """Write a JSONL session file with *line_count* rows derived from the template fixture."""
     template = json.loads(TEMPLATE_LINE)
     with path.open("w", encoding="utf-8") as f:
         for i in range(line_count):
             entry = deepcopy(template)
-            entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z"
+            if i == 0 and first_timestamp is not None:
+                entry["timestamp"] = first_timestamp
+            else:
+                entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z"
             if i % 3 == 1:
                 msg = entry.setdefault("message", {})
                 if isinstance(msg, dict) and "content" in msg:
@@ -72,7 +109,9 @@ def export_corpus(tmp_path: Path, request: pytest.FixtureRequest) -> Path:
     project = tmp_path / "bench-project"
     project.mkdir()
     for i in range(count):
-        write_jsonl(project / f"session_{i:04d}.jsonl", 20)
+        # Unique first_timestamp per session so export filenames do not collide in ZIP benches.
+        first_ts = export_session_first_timestamp(i)
+        write_jsonl(project / f"session_{i:04d}.jsonl", 20, first_timestamp=first_ts)
     return project
 
 
diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py
index 46c0eaf..39abcbb 100644
--- a/tests/benchmarks/test_export_bench.py
+++ b/tests/benchmarks/test_export_bench.py
@@ -2,11 +2,17 @@
 
 from __future__ import annotations
 
+import io
+import zipfile
 from pathlib import Path
 
 import pytest
 
-from utils.export_engine import NoopSink, run_bulk_export
+from utils.export_engine import BulkExportResult, NoopSink, ZipSink, run_bulk_export
+
+
+def _bench_projects(export_corpus: Path) -> list[dict[str, str]]:
+    return [{"name": "bench-project", "path": str(export_corpus), "display_name": "Bench"}]
 
 
 @pytest.mark.benchmark(group="export")
@@ -20,7 +26,7 @@ def test_bulk_export_session_count(
     benchmark,
     export_corpus: Path,
 ) -> None:
-    projects = [{"name": "bench-project", "path": str(export_corpus), "display_name": "Bench"}]
+    projects = _bench_projects(export_corpus)
 
     def _run() -> object:
         # NoopSink + since="all" + empty last_export_sessions: no disk/state writes per round.
@@ -37,3 +43,45 @@ def _run() -> object:
 
     result = benchmark(_run)
     assert result.exported_session_count > 0
+
+
+@pytest.mark.benchmark(group="export")
+@pytest.mark.parametrize(
+    "export_corpus",
+    [10, 50, 100],
+    indirect=True,
+    ids=["sessions-10", "sessions-50", "sessions-100"],
+)
+def test_bulk_export_zip_peak_memory(
+    benchmark,
+    export_corpus: Path,
+    tracemalloc_peak,
+) -> None:
+    projects = _bench_projects(export_corpus)
+    peaks: list[int] = []
+    results: list[BulkExportResult] = []
+
+    def _run() -> None:
+        def _export() -> BulkExportResult:
+            buf = io.BytesIO()
+            with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
+                sink = ZipSink(zf)
+                return run_bulk_export(
+                    projects=projects,
+                    since="all",
+                    rules=[],
+                    last_export_sessions={},
+                    sink=sink,
+                    fmt="md",
+                    path_layout="api",
+                    manifest_style="api",
+                )
+
+        result, peak = tracemalloc_peak.measure(_export)
+        results.append(result)
+        peaks.append(peak)
+
+    benchmark(_run)
+    assert results and results[-1].exported_session_count > 0
+    assert peaks, "benchmark produced no peak memory samples"
+    benchmark.extra_info["peak_bytes"] = int(sum(peaks) / len(peaks))
diff --git a/tests/benchmarks/test_parse_memory.py b/tests/benchmarks/test_parse_memory.py
index de1c886..46738cd 100644
--- a/tests/benchmarks/test_parse_memory.py
+++ b/tests/benchmarks/test_parse_memory.py
@@ -1,14 +1,18 @@
-"""Peak memory ceiling for large-file parse_session (regular pytest, not benchmark-only)."""
+"""Peak memory for large-file parse_session: ceiling test + tracked benchmark."""
 
 from __future__ import annotations
 
-import tracemalloc
 from pathlib import Path
 
+import pytest
+
 from utils.jsonl_parser import parse_session
 
 
-def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None:
+def test_large_parse_peak_memory_under_ceiling(
+    parse_large_file: Path,
+    tracemalloc_peak,
+) -> None:
     path = parse_large_file
     file_bytes = path.stat().st_size
     # Issue #7 ceiling: Python heap peak (tracemalloc) vs on-disk JSONL size. Parsed
@@ -16,13 +20,25 @@ def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None:
     # a comment here if the parser legitimately grows.
     ceiling = file_bytes * 10
 
-    tracemalloc.start()
-    tracemalloc.clear_traces()
-    try:
-        result = parse_session(str(path))
-        assert len(result["messages"]) > 0, "parse_session returned no messages"
-        _, peak = tracemalloc.get_traced_memory()
-    finally:
-        tracemalloc.stop()
-
+    result, peak = tracemalloc_peak.measure(parse_session, str(path))
+    assert len(result["messages"]) > 0, "parse_session returned no messages"
     assert peak < ceiling, f"peak {peak} bytes exceeds 10x file size {file_bytes}"
+
+
+@pytest.mark.benchmark(group="parse")
+def test_parse_large_peak_memory(
+    benchmark,
+    parse_large_file: Path,
+    tracemalloc_peak,
+) -> None:
+    path = str(parse_large_file)
+    peaks: list[int] = []
+
+    def _run() -> None:
+        _, peak = tracemalloc_peak.measure(parse_session, path)
+        peaks.append(peak)
+
+    benchmark(_run)
+    assert peaks, "benchmark produced no peak memory samples"
+    # Gate uses extra_info.peak_bytes, not stats.mean (tracemalloc-inflated wall time).
+    benchmark.extra_info["peak_bytes"] = int(sum(peaks) / len(peaks))
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index 49c4f49..891766f 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -39,12 +39,12 @@ def test_missing_baseline_warns_without_failing(
         results,
         [
             {"name": "test_new_bench", "stats": {"mean": 0.01}},
-            {"name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+            {"name": GATED_BENCH, "stats": {"mean": 0.002}},
         ],
     )
     _write_baselines(
         baselines,
-        {"parse": {"test_parse_session_small": 0.0001}},
+        {"parse": {GATED_BENCH: 0.002}},
     )
 
     assert check_regression(results, baselines) == 0
@@ -152,9 +152,7 @@ def test_excluded_benchmark_in_baselines_is_not_gated(
     assert "REGRESSION" not in capsys.readouterr().out
 
 
-def test_missing_current_result_warns_without_failing(
-    tmp_path, capsys: pytest.CaptureFixture[str]
-) -> None:
+def test_missing_current_result_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
     results = tmp_path / "results.json"
     baselines = tmp_path / "baselines.json"
     _write_results(results, [])
@@ -163,8 +161,10 @@ def test_missing_current_result_warns_without_failing(
         {"parse": {GATED_BENCH: 0.002}},
     )
 
-    assert check_regression(results, baselines) == 0
-    assert "no current result for baseline" in capsys.readouterr().out
+    assert check_regression(results, baselines) == 1
+    out = capsys.readouterr().out
+    assert "MISSING" in out
+    assert "no current result for gated baseline" in out
 
 
 def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
@@ -179,6 +179,80 @@ def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixtu
     assert "ERROR:" in capsys.readouterr().err
 
 
+def test_load_results_prefers_peak_bytes_extra_info(tmp_path) -> None:
+    path = tmp_path / "results.json"
+    _write_results(
+        path,
+        [
+            {
+                "name": "test_parse_large_peak_memory",
+                "stats": {"mean": 0.05},
+                "extra_info": {"peak_bytes": 12_345_678},
+            }
+        ],
+    )
+
+    assert load_results(path)[0]["test_parse_large_peak_memory"] == 12_345_678.0
+
+
+def test_metric_is_bytes_uses_extra_info_without_name_hint() -> None:
+    from scripts.check_benchmark_regression import metric_is_bytes
+
+    entry = {
+        "name": "test_export_latency",
+        "stats": {"mean": 0.05},
+        "extra_info": {"peak_bytes": 1_000_000},
+    }
+    assert metric_is_bytes("test_export_latency", entry)
+
+
+def test_memory_metric_regression_uses_bytes(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [
+            {
+                "name": "test_parse_large_peak_memory",
+                "stats": {"mean": 0.05},
+                "extra_info": {"peak_bytes": 15_000_000},
+            }
+        ],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_large_peak_memory": 10_000_000}},
+    )
+
+    assert check_regression(results, baselines) == 1
+    out = capsys.readouterr().out
+    assert "bytes" in out
+    assert "REGRESSION" in out
+
+
+def test_benchmark_entry_mean_rejects_non_dict_extra_info() -> None:
+    from scripts.check_benchmark_regression import benchmark_entry_mean
+
+    with pytest.raises(BenchmarkDataError, match="extra_info"):
+        benchmark_entry_mean(
+            {
+                "name": "test_parse_large_peak_memory",
+                "extra_info": "not-a-dict",
+            }
+        )
+
+
+def test_load_results_preserves_benchmark_data_error_message(tmp_path) -> None:
+    path = tmp_path / "results.json"
+    _write_results(
+        path,
+        [{"name": "test_parse_large_peak_memory", "extra_info": {"peak_bytes": "bad"}}],
+    )
+
+    with pytest.raises(BenchmarkDataError, match="extra_info.peak_bytes"):
+        load_results(path)
+
+
 def test_duplicate_baseline_name_raises(tmp_path) -> None:
     baselines = tmp_path / "baselines.json"
     _write_baselines(
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
index 8919b84..bad0a56 100644
--- a/tests/test_reduce_baselines.py
+++ b/tests/test_reduce_baselines.py
@@ -38,11 +38,29 @@ def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None:
     output = reduce_baselines(raw, out)
 
     assert output["machine"] == "Linux"
+    assert set(output["groups"].keys()) == {"parse", "export", "search"}
     assert "test_parse_session_medium" in output["groups"]["parse"]
-    assert "test_parse_session_small" not in output["groups"]["parse"]
+    assert "test_parse_session_small" in output["groups"]["parse"]
+    assert output["groups"]["search"] == {}
     assert "cache" not in output["groups"]
 
 
+def test_reduce_baselines_includes_search_benchmark(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {"group": "search", "name": "test_search_full_corpus", "stats": {"mean": 0.001}},
+            {"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}},
+        ],
+    )
+
+    output = reduce_baselines(raw, out)
+
+    assert output["groups"]["search"]["test_search_full_corpus"] == pytest.approx(0.001)
+
+
 def test_reduce_baselines_applies_slack(tmp_path) -> None:
     raw = tmp_path / "raw.json"
     out = tmp_path / "baselines.json"
@@ -79,6 +97,26 @@ def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None:
     assert exc_info.value.code == 2
 
 
+def test_reduce_baselines_uses_peak_bytes_extra_info(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {
+                "group": "parse",
+                "name": "test_parse_large_peak_memory",
+                "stats": {"mean": 0.05},
+                "extra_info": {"peak_bytes": 10_000_000},
+            }
+        ],
+    )
+
+    output = reduce_baselines(raw, out)
+
+    assert output["groups"]["parse"]["test_parse_large_peak_memory"] == 10_000_000.0
+
+
 def test_reduce_baselines_machine_info_non_dict(tmp_path) -> None:
     raw = tmp_path / "raw.json"
     raw.write_text(