cppalliance · wpak-ai · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,4 @@ coverage/
 coverage.xml
 benchmark-results.json
 benchmarks/_raw.json
+benchmarks/_ci/
diff --git a/Makefile b/Makefile
@@ -1,12 +1,19 @@
-.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts
+.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts
 
-update-baselines:
-	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
-	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI.
+# Prefer downloading benchmark-results.json from a CI artifact, then:
+#   PYTHONPATH=. python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+seed-baselines-local:
+	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
+	PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	PYTHONPATH=. python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5
+
+# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
+update-baselines: seed-baselines-local
 
 check-benchmarks:
-	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
-	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+	PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	PYTHONPATH=. python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
 
 clean-benchmark-artifacts:
 	rm -f benchmarks/_raw.json benchmark-results.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -23,38 +23,48 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark
 
 | Group | What |
 |-------|------|
-| parse | `parse_session` on 10 / 500 / 5000+ line JSONL |
-| export | `run_bulk_export` over 10 / 50 / 100 sessions |
+| parse | `parse_session` on 10 / 500 / 5000+ line JSONL; large-file peak heap (`test_parse_large_peak_memory`) |
+| export | `run_bulk_export` latency over 10 / 50 / 100 sessions; ZIP export peak heap (`test_bulk_export_zip_peak_memory`) |
 | search | `GET /api/search` over a 50-session synthetic corpus |
 | cache | cold vs warm `get_cached_session` (informational; not gated) |
 
 Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git.
 
 Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat as v1 baselines, not exhaustive perf proof.
 
-The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session.
+The memory ceiling test (`test_large_parse_peak_memory_under_ceiling`) runs in the main `pytest` job. Tracked peak-memory benchmarks (`test_parse_large_peak_memory`, `test_bulk_export_zip_peak_memory`) run under `--benchmark-only` and store `extra_info.peak_bytes` for the regression gate.
 
 ## CI gate
 
 The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**.
 
-**Gated:** parse medium/large, export 10/50/100 sessions.
+**Gated:** parse medium/large + large peak memory; export 10/50/100 session latency + ZIP peak memory.
 
-**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate.
+**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. These may appear in `baselines.json` for reference but are skipped by `check_benchmark_regression.py`. Benchmarks without a baseline entry print a warning and do not fail the gate.
+
+Missing gated benchmarks (renamed or removed tests still listed in `baselines.json`) fail the gate.
 
 ## Refresh baselines
 
-After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job):
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible:
+
+```bash
+PYTHONPATH=. python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+```
+
+For a quick local snapshot only (may not match CI timings):
 
 ```bash
-make update-baselines
+make seed-baselines-local
 ```
 
+`make update-baselines` is a deprecated alias for `seed-baselines-local`: it runs the same steps and prints the same local-timings warning (there is no separate deprecation notice). Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew.
+
 Or manually:
 
 ```bash
-pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
-python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+PYTHONPATH=. python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5
 ```
 
-Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. Download `benchmark-results.json` from a CI artifact to seed baselines if needed.
+Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures.
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
@@ -1,17 +1,24 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.",
-  "updated": "2026-06-17T21:00:00Z",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #97, run 28126772276). Excluded from gate (recorded for reference): test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Memory benchmarks use extra_info.peak_bytes (bytes); latency uses stats.mean (seconds).",
+  "updated": "2026-06-24T20:15:37Z",
   "machine": "Linux",
   "groups": {
     "parse": {
-      "test_parse_session_medium": 0.002956,
-      "test_parse_session_large": 0.029678
+      "test_parse_session_small": 0.00010518068718225604,
+      "test_parse_session_medium": 0.002991333112179635,
+      "test_parse_session_large": 0.032311203818181436,
+      "test_parse_large_peak_memory": 2032028.0
     },
     "export": {
-      "test_bulk_export_session_count[sessions-10]": 0.004278,
-      "test_bulk_export_session_count[sessions-50]": 0.021144,
-      "test_bulk_export_session_count[sessions-100]": 0.042003
+      "test_bulk_export_session_count[sessions-10]": 0.0042825538530803925,
+      "test_bulk_export_session_count[sessions-50]": 0.021406330209302382,
+      "test_bulk_export_session_count[sessions-100]": 0.04229194749999898,
+      "test_bulk_export_zip_peak_memory[sessions-10]": 350628.0,
+      "test_bulk_export_zip_peak_memory[sessions-50]": 506454.0,
+      "test_bulk_export_zip_peak_memory[sessions-100]": 694088.0
     },
-    "search": {}
+    "search": {
+      "test_search_full_corpus": 0.0011120838654706596
+    }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
@@ -22,7 +22,43 @@ class BenchmarkDataError(ValueError):
     """Raised when benchmark JSON input is malformed or missing required fields."""
 
 
-def load_results(results_path: str | Path) -> dict[str, float]:
+def entry_uses_peak_bytes(entry: dict[str, object]) -> bool:
+    """True when the gated metric for *entry* is extra_info.peak_bytes."""
+    extra = entry.get("extra_info")
+    return isinstance(extra, dict) and "peak_bytes" in extra
+
+
+def metric_is_bytes(name: str, entry: dict[str, object] | None = None) -> bool:
+    """Shared heuristic for metric kind (bytes vs seconds) in gate and display."""
+    if entry is not None and entry_uses_peak_bytes(entry):
+        return True
+    return "peak_memory" in name
+
+
+def benchmark_entry_mean(entry: dict[str, object]) -> float:
+    """Return gated metric: peak_bytes from extra_info when present, else stats.mean."""
+    if entry_uses_peak_bytes(entry):
+        extra = entry["extra_info"]
+        if not isinstance(extra, dict):
+            raise BenchmarkDataError(f"extra_info for {entry.get('name')!r} is not a dict")
+        try:
+            return float(extra["peak_bytes"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"benchmark {entry.get('name')!r} missing 'stats.mean' or extra_info.peak_bytes"
+            ) from exc
+    try:
+        stats = entry["stats"]
+        return float(stats["mean"])  # type: ignore[index]
+    except (KeyError, TypeError, ValueError) as exc:
+        raise BenchmarkDataError(
+            f"benchmark {entry.get('name')!r} missing 'stats.mean' or extra_info.peak_bytes"
+        ) from exc
+
+
+def load_results(
+    results_path: str | Path,
+) -> tuple[dict[str, float], dict[str, dict[str, object]]]:
     path = Path(results_path)
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
@@ -38,21 +74,25 @@ def load_results(results_path: str | Path) -> dict[str, float]:
         raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
 
     results: dict[str, float] = {}
+    entries_by_name: dict[str, dict[str, object]] = {}
     for index, entry in enumerate(benchmarks):
         if not isinstance(entry, dict):
             raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
         try:
             name = entry["name"]
-            mean = float(entry["stats"]["mean"])
+            mean = benchmark_entry_mean(entry)
+        except BenchmarkDataError:
+            raise
         except (KeyError, TypeError, ValueError) as exc:
             raise BenchmarkDataError(
-                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+                f"{path} benchmarks[{index}] missing 'name' or measurable value"
             ) from exc
         name = str(name)
         if name in results:
             raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
         results[name] = mean
-    return results
+        entries_by_name[name] = entry
+    return results, entries_by_name
 
 
 def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
@@ -96,23 +136,29 @@ def check_regression(
     threshold: float = THRESHOLD,
 ) -> int:
     """Return 0 when within threshold; 1 when any gated benchmark regresses."""
-    flat = load_results(results_path)
+    flat, entries_by_name = load_results(results_path)
     baseline_means = load_baseline_means(baselines_path)
 
     failures: list[str] = []
+    missing: list[str] = []
     for name, base in baseline_means.items():
         if name in EXCLUDED_FROM_GATE:
             continue
         cur = flat.get(name)
         if cur is None:
-            print(f"WARN: no current result for baseline {name!r}; skipping")
+            print(f"FAIL: no current result for gated baseline {name!r}")
+            missing.append(name)
             continue
         if base == 0:
             print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
             continue
         ratio = cur / base
         tag = "FAIL" if ratio > threshold else "ok"
-        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
+        entry = entries_by_name.get(name)
+        if metric_is_bytes(name, entry):
+            print(f"[{tag}] {name}: {cur:.0f} bytes vs {base:.0f} bytes ({ratio:.2f}x)")
+        else:
+            print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
         if ratio > threshold:
             failures.append(name)
 
@@ -124,6 +170,9 @@ def check_regression(
 
     if failures:
         print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
+    if missing:
+        print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results")
+    if failures or missing:
         return 1
     return 0
 

diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
@@ -8,10 +8,10 @@
 from datetime import UTC, datetime
 from pathlib import Path
 
-try:
-    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
-except ModuleNotFoundError:
-    from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+from scripts.check_benchmark_regression import (
+    BenchmarkDataError,
+    benchmark_entry_mean,
+)
 
 GATED_GROUPS = ("parse", "export", "search")
 
@@ -50,24 +50,30 @@ def reduce_baselines(
             raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
         try:
             name = entry["name"]
-            mean = float(entry["stats"]["mean"])
+            mean = benchmark_entry_mean(entry)
+        except BenchmarkDataError:
+            raise
         except (KeyError, TypeError, ValueError) as exc:
             raise BenchmarkDataError(
-                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+                f"{path} benchmarks[{index}] missing 'name' or measurable value"
             ) from exc
+        bench_name = str(name)
         group = entry.get("group")
         if group not in GATED_GROUPS:
             continue
-        if str(name) in EXCLUDED_FROM_GATE:
-            continue
-        groups[group][str(name)] = mean * slack
+        groups[group][bench_name] = mean * slack
 
+    slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
     output: dict[str, object] = {
         "_note": (
-            "Gated means from ubuntu-latest CI (post-cache). "
-            "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)."
+            "Gated means from ubuntu-latest CI benchmark-results.json."
+            f"{slack_note} "
+            "Excluded from gate (recorded for reference): test_parse_session_small, "
+            "test_search_full_corpus (sub-ms CI noise). "
+            "Memory benchmarks use extra_info.peak_bytes (bytes); "
+            "latency uses stats.mean (seconds)."
         ),
         "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
         "machine": machine,

diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
@@ -3,8 +3,12 @@
 from __future__ import annotations
 
 import json
+import tracemalloc
+from collections.abc import Callable
 from copy import deepcopy
+from datetime import UTC, datetime, timedelta
 from pathlib import Path
+from typing import Any, TypeVar
 
 import pytest
 
@@ -13,14 +17,47 @@
 FIXTURES = Path(__file__).resolve().parents[1] / "fixtures"
 TEMPLATE_LINE = (FIXTURES / "session_with_tools.jsonl").read_text(encoding="utf-8").splitlines()[0]
 
+T = TypeVar("T")
 
-def write_jsonl(path: Path, line_count: int) -> Path:
+_EXPORT_SESSION_BASE = datetime(2026, 6, 12, 0, 0, tzinfo=UTC)
+
+
+def export_session_first_timestamp(index: int) -> str:
+    """Return a unique, valid ISO timestamp for export-corpus session *index*."""
+    return (_EXPORT_SESSION_BASE + timedelta(minutes=index)).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+class TracemallocPeak:
+    """Measure peak Python heap bytes for one callable invocation."""
+
+    def measure(self, func: Callable[..., T], /, *args: Any, **kwargs: Any) -> tuple[T, int]:
+        was_tracing = tracemalloc.is_tracing()
+        tracemalloc.start()
+        tracemalloc.clear_traces()
+        try:
+            result = func(*args, **kwargs)
+            _, peak = tracemalloc.get_traced_memory()
+            return result, peak
+        finally:
+            if not was_tracing:
+                tracemalloc.stop()
+
+
+@pytest.fixture
+def tracemalloc_peak() -> TracemallocPeak:
+    return TracemallocPeak()
+
+
+def write_jsonl(path: Path, line_count: int, *, first_timestamp: str | None = None) -> Path:
     """Write a JSONL session file with *line_count* rows derived from the template fixture."""
     template = json.loads(TEMPLATE_LINE)
     with path.open("w", encoding="utf-8") as f:
         for i in range(line_count):
             entry = deepcopy(template)
-            entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z"
+            if i == 0 and first_timestamp is not None:
+                entry["timestamp"] = first_timestamp
+            else:
+                entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z"
             if i % 3 == 1:
                 msg = entry.setdefault("message", {})
                 if isinstance(msg, dict) and "content" in msg:
@@ -72,7 +109,9 @@ def export_corpus(tmp_path: Path, request: pytest.FixtureRequest) -> Path:
     project = tmp_path / "bench-project"
     project.mkdir()
     for i in range(count):
-        write_jsonl(project / f"session_{i:04d}.jsonl", 20)
+        # Unique first_timestamp per session so export filenames do not collide in ZIP benches.
+        first_ts = export_session_first_timestamp(i)
+        write_jsonl(project / f"session_{i:04d}.jsonl", 20, first_timestamp=first_ts)
     return project