diff --git a/.gitignore b/.gitignore index 27a84ea..10bcf41 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ coverage/ coverage.xml benchmark-results.json benchmarks/_raw.json +benchmarks/_ci/ diff --git a/Makefile b/Makefile index b14f3d6..d3d6d9b 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,19 @@ -.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts +.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts -update-baselines: - pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= - python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json +# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI. +# Prefer downloading benchmark-results.json from a CI artifact, then: +# PYTHONPATH=. python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 +seed-baselines-local: + @echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2 + PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= + PYTHONPATH=. python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 + +# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above. +update-baselines: seed-baselines-local check-benchmarks: - pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= - python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= + PYTHONPATH=. 
python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json clean-benchmark-artifacts: rm -f benchmarks/_raw.json benchmark-results.json diff --git a/benchmarks/README.md b/benchmarks/README.md index 59f70fc..5ef54c2 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -23,8 +23,8 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark | Group | What | |-------|------| -| parse | `parse_session` on 10 / 500 / 5000+ line JSONL | -| export | `run_bulk_export` over 10 / 50 / 100 sessions | +| parse | `parse_session` on 10 / 500 / 5000+ line JSONL; large-file peak heap (`test_parse_large_peak_memory`) | +| export | `run_bulk_export` latency over 10 / 50 / 100 sessions; ZIP export peak heap (`test_bulk_export_zip_peak_memory`) | | search | `GET /api/search` over a 50-session synthetic corpus | | cache | cold vs warm `get_cached_session` (informational; not gated) | @@ -32,29 +32,39 @@ Large JSONL files (5000+ lines) are generated at test session scope under pytest Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat as v1 baselines, not exhaustive perf proof. -The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session. +The memory ceiling test (`test_large_parse_peak_memory_under_ceiling`) runs in the main `pytest` job. Tracked peak-memory benchmarks (`test_parse_large_peak_memory`, `test_bulk_export_zip_peak_memory`) run under `--benchmark-only` and store `extra_info.peak_bytes` for the regression gate. ## CI gate The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. 
CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**. -**Gated:** parse medium/large, export 10/50/100 sessions. +**Gated:** parse medium/large + large peak memory; export 10/50/100 session latency + ZIP peak memory. -**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate. +**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. These may appear in `baselines.json` for reference but are skipped by `check_benchmark_regression.py`. Benchmarks without a baseline entry print a warning and do not fail the gate. + +Missing gated benchmarks (renamed or removed tests still listed in `baselines.json`) fail the gate. ## Refresh baselines -After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job): +After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible: + +```bash +PYTHONPATH=. python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 +``` + +For a quick local snapshot only (may not match CI timings): ```bash -make update-baselines +make seed-baselines-local ``` +`make update-baselines` is a deprecated alias for `seed-baselines-local` and prints a warning. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew. + Or manually: ```bash -pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= -python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json +PYTHONPATH=. pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= +PYTHONPATH=. 
python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 ``` -Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. Download `benchmark-results.json` from a CI artifact to seed baselines if needed. +Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 813a72f..07fb84e 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,17 +1,24 @@ { - "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.", - "updated": "2026-06-17T21:00:00Z", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #97, run 28126772276). Excluded from gate (recorded for reference): test_parse_session_small, test_search_full_corpus (sub-ms CI noise). 
Memory benchmarks use extra_info.peak_bytes (bytes); latency uses stats.mean (seconds).", + "updated": "2026-06-24T20:15:37Z", "machine": "Linux", "groups": { "parse": { - "test_parse_session_medium": 0.002956, - "test_parse_session_large": 0.029678 + "test_parse_session_small": 0.00010518068718225604, + "test_parse_session_medium": 0.002991333112179635, + "test_parse_session_large": 0.032311203818181436, + "test_parse_large_peak_memory": 2032028.0 }, "export": { - "test_bulk_export_session_count[sessions-10]": 0.004278, - "test_bulk_export_session_count[sessions-50]": 0.021144, - "test_bulk_export_session_count[sessions-100]": 0.042003 + "test_bulk_export_session_count[sessions-10]": 0.0042825538530803925, + "test_bulk_export_session_count[sessions-50]": 0.021406330209302382, + "test_bulk_export_session_count[sessions-100]": 0.04229194749999898, + "test_bulk_export_zip_peak_memory[sessions-10]": 350628.0, + "test_bulk_export_zip_peak_memory[sessions-50]": 506454.0, + "test_bulk_export_zip_peak_memory[sessions-100]": 694088.0 }, - "search": {} + "search": { + "test_search_full_corpus": 0.0011120838654706596 + } } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 7842021..3a27dab 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -22,7 +22,43 @@ class BenchmarkDataError(ValueError): """Raised when benchmark JSON input is malformed or missing required fields.""" -def load_results(results_path: str | Path) -> dict[str, float]: +def entry_uses_peak_bytes(entry: dict[str, object]) -> bool: + """True when the gated metric for *entry* is extra_info.peak_bytes.""" + extra = entry.get("extra_info") + return isinstance(extra, dict) and "peak_bytes" in extra + + +def metric_is_bytes(name: str, entry: dict[str, object] | None = None) -> bool: + """Shared heuristic for metric kind (bytes vs seconds) in gate and display.""" + if entry is not None and entry_uses_peak_bytes(entry): + 
return True + return "peak_memory" in name + + +def benchmark_entry_mean(entry: dict[str, object]) -> float: + """Return gated metric: peak_bytes from extra_info when present, else stats.mean.""" + if entry_uses_peak_bytes(entry): + extra = entry["extra_info"] + if not isinstance(extra, dict): + raise BenchmarkDataError(f"extra_info for {entry.get('name')!r} is not a dict") + try: + return float(extra["peak_bytes"]) + except (KeyError, TypeError, ValueError) as exc: + raise BenchmarkDataError( + f"benchmark {entry.get('name')!r} missing 'stats.mean' or extra_info.peak_bytes" + ) from exc + try: + stats = entry["stats"] + return float(stats["mean"]) # type: ignore[index] + except (KeyError, TypeError, ValueError) as exc: + raise BenchmarkDataError( + f"benchmark {entry.get('name')!r} missing 'stats.mean' or extra_info.peak_bytes" + ) from exc + + +def load_results( + results_path: str | Path, +) -> tuple[dict[str, float], dict[str, dict[str, object]]]: path = Path(results_path) try: data = json.loads(path.read_text(encoding="utf-8")) @@ -38,21 +74,25 @@ def load_results(results_path: str | Path) -> dict[str, float]: raise BenchmarkDataError(f"{path} 'benchmarks' must be an array") results: dict[str, float] = {} + entries_by_name: dict[str, dict[str, object]] = {} for index, entry in enumerate(benchmarks): if not isinstance(entry, dict): raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") try: name = entry["name"] - mean = float(entry["stats"]["mean"]) + mean = benchmark_entry_mean(entry) + except BenchmarkDataError: + raise except (KeyError, TypeError, ValueError) as exc: raise BenchmarkDataError( - f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" + f"{path} benchmarks[{index}] missing 'name' or measurable value" ) from exc name = str(name) if name in results: raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}") results[name] = mean - return results + entries_by_name[name] = entry + return results, entries_by_name 
def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: @@ -96,23 +136,29 @@ def check_regression( threshold: float = THRESHOLD, ) -> int: """Return 0 when within threshold; 1 when any gated benchmark regresses.""" - flat = load_results(results_path) + flat, entries_by_name = load_results(results_path) baseline_means = load_baseline_means(baselines_path) failures: list[str] = [] + missing: list[str] = [] for name, base in baseline_means.items(): if name in EXCLUDED_FROM_GATE: continue cur = flat.get(name) if cur is None: - print(f"WARN: no current result for baseline {name!r}; skipping") + print(f"FAIL: no current result for gated baseline {name!r}") + missing.append(name) continue if base == 0: print(f"WARN: baseline for {name!r} is zero; skipping ratio check") continue ratio = cur / base tag = "FAIL" if ratio > threshold else "ok" - print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") + entry = entries_by_name.get(name) + if metric_is_bytes(name, entry): + print(f"[{tag}] {name}: {cur:.0f} bytes vs {base:.0f} bytes ({ratio:.2f}x)") + else: + print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") if ratio > threshold: failures.append(name) @@ -124,6 +170,9 @@ def check_regression( if failures: print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}") + if missing: + print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results") + if failures or missing: return 1 return 0 diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index 88be0eb..a3b4114 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -8,10 +8,10 @@ from datetime import UTC, datetime from pathlib import Path -try: - from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError -except ModuleNotFoundError: - from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError +from scripts.check_benchmark_regression import ( + BenchmarkDataError, 
+ benchmark_entry_mean, +) GATED_GROUPS = ("parse", "export", "search") @@ -50,24 +50,30 @@ def reduce_baselines( raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") try: name = entry["name"] - mean = float(entry["stats"]["mean"]) + mean = benchmark_entry_mean(entry) + except BenchmarkDataError: + raise except (KeyError, TypeError, ValueError) as exc: raise BenchmarkDataError( - f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" + f"{path} benchmarks[{index}] missing 'name' or measurable value" ) from exc + bench_name = str(name) group = entry.get("group") if group not in GATED_GROUPS: continue - if str(name) in EXCLUDED_FROM_GATE: - continue - groups[group][str(name)] = mean * slack + groups[group][bench_name] = mean * slack + slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else "" machine_info = raw.get("machine_info") machine = machine_info.get("system") if isinstance(machine_info, dict) else None output: dict[str, object] = { "_note": ( - "Gated means from ubuntu-latest CI (post-cache). " - "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)." + "Gated means from ubuntu-latest CI benchmark-results.json." + f"{slack_note} " + "Excluded from gate (recorded for reference): test_parse_session_small, " + "test_search_full_corpus (sub-ms CI noise). " + "Memory benchmarks use extra_info.peak_bytes (bytes); " + "latency uses stats.mean (seconds)." 
), "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), "machine": machine, diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index cd4369c..2bc4876 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -3,8 +3,12 @@ from __future__ import annotations import json +import tracemalloc +from collections.abc import Callable from copy import deepcopy +from datetime import UTC, datetime, timedelta from pathlib import Path +from typing import Any, TypeVar import pytest @@ -13,14 +17,47 @@ FIXTURES = Path(__file__).resolve().parents[1] / "fixtures" TEMPLATE_LINE = (FIXTURES / "session_with_tools.jsonl").read_text(encoding="utf-8").splitlines()[0] +T = TypeVar("T") -def write_jsonl(path: Path, line_count: int) -> Path: +_EXPORT_SESSION_BASE = datetime(2026, 6, 12, 0, 0, tzinfo=UTC) + + +def export_session_first_timestamp(index: int) -> str: + """Return a unique, valid ISO timestamp for export-corpus session *index*.""" + return (_EXPORT_SESSION_BASE + timedelta(minutes=index)).strftime("%Y-%m-%dT%H:%M:%SZ") + + +class TracemallocPeak: + """Measure peak Python heap bytes for one callable invocation.""" + + def measure(self, func: Callable[..., T], /, *args: Any, **kwargs: Any) -> tuple[T, int]: + was_tracing = tracemalloc.is_tracing() + tracemalloc.start() + tracemalloc.clear_traces() + try: + result = func(*args, **kwargs) + _, peak = tracemalloc.get_traced_memory() + return result, peak + finally: + if not was_tracing: + tracemalloc.stop() + + +@pytest.fixture +def tracemalloc_peak() -> TracemallocPeak: + return TracemallocPeak() + + +def write_jsonl(path: Path, line_count: int, *, first_timestamp: str | None = None) -> Path: """Write a JSONL session file with *line_count* rows derived from the template fixture.""" template = json.loads(TEMPLATE_LINE) with path.open("w", encoding="utf-8") as f: for i in range(line_count): entry = deepcopy(template) - entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z" + if i == 0 
and first_timestamp is not None: + entry["timestamp"] = first_timestamp + else: + entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z" if i % 3 == 1: msg = entry.setdefault("message", {}) if isinstance(msg, dict) and "content" in msg: @@ -72,7 +109,9 @@ def export_corpus(tmp_path: Path, request: pytest.FixtureRequest) -> Path: project = tmp_path / "bench-project" project.mkdir() for i in range(count): - write_jsonl(project / f"session_{i:04d}.jsonl", 20) + # Unique first_timestamp per session so export filenames do not collide in ZIP benches. + first_ts = export_session_first_timestamp(i) + write_jsonl(project / f"session_{i:04d}.jsonl", 20, first_timestamp=first_ts) return project diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py index 46c0eaf..39abcbb 100644 --- a/tests/benchmarks/test_export_bench.py +++ b/tests/benchmarks/test_export_bench.py @@ -2,11 +2,17 @@ from __future__ import annotations +import io +import zipfile from pathlib import Path import pytest -from utils.export_engine import NoopSink, run_bulk_export +from utils.export_engine import BulkExportResult, NoopSink, ZipSink, run_bulk_export + + +def _bench_projects(export_corpus: Path) -> list[dict[str, str]]: + return [{"name": "bench-project", "path": str(export_corpus), "display_name": "Bench"}] @pytest.mark.benchmark(group="export") @@ -20,7 +26,7 @@ def test_bulk_export_session_count( benchmark, export_corpus: Path, ) -> None: - projects = [{"name": "bench-project", "path": str(export_corpus), "display_name": "Bench"}] + projects = _bench_projects(export_corpus) def _run() -> object: # NoopSink + since="all" + empty last_export_sessions: no disk/state writes per round. 
@@ -37,3 +43,45 @@ def _run() -> object: result = benchmark(_run) assert result.exported_session_count > 0 + + +@pytest.mark.benchmark(group="export") +@pytest.mark.parametrize( + "export_corpus", + [10, 50, 100], + indirect=True, + ids=["sessions-10", "sessions-50", "sessions-100"], +) +def test_bulk_export_zip_peak_memory( + benchmark, + export_corpus: Path, + tracemalloc_peak, +) -> None: + projects = _bench_projects(export_corpus) + peaks: list[int] = [] + results: list[BulkExportResult] = [] + + def _run() -> None: + def _export() -> BulkExportResult: + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + sink = ZipSink(zf) + return run_bulk_export( + projects=projects, + since="all", + rules=[], + last_export_sessions={}, + sink=sink, + fmt="md", + path_layout="api", + manifest_style="api", + ) + + result, peak = tracemalloc_peak.measure(_export) + results.append(result) + peaks.append(peak) + + benchmark(_run) + assert results and results[-1].exported_session_count > 0 + assert peaks, "benchmark produced no peak memory samples" + benchmark.extra_info["peak_bytes"] = int(sum(peaks) / len(peaks)) diff --git a/tests/benchmarks/test_parse_memory.py b/tests/benchmarks/test_parse_memory.py index de1c886..46738cd 100644 --- a/tests/benchmarks/test_parse_memory.py +++ b/tests/benchmarks/test_parse_memory.py @@ -1,14 +1,18 @@ -"""Peak memory ceiling for large-file parse_session (regular pytest, not benchmark-only).""" +"""Peak memory for large-file parse_session: ceiling test + tracked benchmark.""" from __future__ import annotations -import tracemalloc from pathlib import Path +import pytest + from utils.jsonl_parser import parse_session -def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None: +def test_large_parse_peak_memory_under_ceiling( + parse_large_file: Path, + tracemalloc_peak, +) -> None: path = parse_large_file file_bytes = path.stat().st_size # Issue #7 ceiling: Python heap peak (tracemalloc) vs 
on-disk JSONL size. Parsed @@ -16,13 +20,25 @@ def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None: # a comment here if the parser legitimately grows. ceiling = file_bytes * 10 - tracemalloc.start() - tracemalloc.clear_traces() - try: - result = parse_session(str(path)) - assert len(result["messages"]) > 0, "parse_session returned no messages" - _, peak = tracemalloc.get_traced_memory() - finally: - tracemalloc.stop() - + result, peak = tracemalloc_peak.measure(parse_session, str(path)) + assert len(result["messages"]) > 0, "parse_session returned no messages" assert peak < ceiling, f"peak {peak} bytes exceeds 10x file size {file_bytes}" + + +@pytest.mark.benchmark(group="parse") +def test_parse_large_peak_memory( + benchmark, + parse_large_file: Path, + tracemalloc_peak, +) -> None: + path = str(parse_large_file) + peaks: list[int] = [] + + def _run() -> None: + _, peak = tracemalloc_peak.measure(parse_session, path) + peaks.append(peak) + + benchmark(_run) + assert peaks, "benchmark produced no peak memory samples" + # Gate uses extra_info.peak_bytes, not stats.mean (tracemalloc-inflated wall time). 
+ benchmark.extra_info["peak_bytes"] = int(sum(peaks) / len(peaks)) diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 49c4f49..891766f 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -39,12 +39,12 @@ def test_missing_baseline_warns_without_failing( results, [ {"name": "test_new_bench", "stats": {"mean": 0.01}}, - {"name": "test_parse_session_small", "stats": {"mean": 0.0001}}, + {"name": GATED_BENCH, "stats": {"mean": 0.002}}, ], ) _write_baselines( baselines, - {"parse": {"test_parse_session_small": 0.0001}}, + {"parse": {GATED_BENCH: 0.002}}, ) assert check_regression(results, baselines) == 0 @@ -152,9 +152,7 @@ def test_excluded_benchmark_in_baselines_is_not_gated( assert "REGRESSION" not in capsys.readouterr().out -def test_missing_current_result_warns_without_failing( - tmp_path, capsys: pytest.CaptureFixture[str] -) -> None: +def test_missing_current_result_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: results = tmp_path / "results.json" baselines = tmp_path / "baselines.json" _write_results(results, []) @@ -163,8 +161,10 @@ def test_missing_current_result_warns_without_failing( {"parse": {GATED_BENCH: 0.002}}, ) - assert check_regression(results, baselines) == 0 - assert "no current result for baseline" in capsys.readouterr().out + assert check_regression(results, baselines) == 1 + out = capsys.readouterr().out + assert "MISSING" in out + assert "no current result for gated baseline" in out def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: @@ -179,6 +179,80 @@ def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixtu assert "ERROR:" in capsys.readouterr().err +def test_load_results_prefers_peak_bytes_extra_info(tmp_path) -> None: + path = tmp_path / "results.json" + _write_results( + path, + [ + { + "name": "test_parse_large_peak_memory", + "stats": {"mean": 0.05}, + 
"extra_info": {"peak_bytes": 12_345_678}, + } + ], + ) + + assert load_results(path)[0]["test_parse_large_peak_memory"] == 12_345_678.0 + + +def test_metric_is_bytes_uses_extra_info_without_name_hint() -> None: + from scripts.check_benchmark_regression import metric_is_bytes + + entry = { + "name": "test_export_latency", + "stats": {"mean": 0.05}, + "extra_info": {"peak_bytes": 1_000_000}, + } + assert metric_is_bytes("test_export_latency", entry) + + +def test_memory_metric_regression_uses_bytes(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [ + { + "name": "test_parse_large_peak_memory", + "stats": {"mean": 0.05}, + "extra_info": {"peak_bytes": 15_000_000}, + } + ], + ) + _write_baselines( + baselines, + {"parse": {"test_parse_large_peak_memory": 10_000_000}}, + ) + + assert check_regression(results, baselines) == 1 + out = capsys.readouterr().out + assert "bytes" in out + assert "REGRESSION" in out + + +def test_benchmark_entry_mean_rejects_non_dict_extra_info() -> None: + from scripts.check_benchmark_regression import benchmark_entry_mean + + with pytest.raises(BenchmarkDataError, match="extra_info"): + benchmark_entry_mean( + { + "name": "test_parse_large_peak_memory", + "extra_info": "not-a-dict", + } + ) + + +def test_load_results_preserves_benchmark_data_error_message(tmp_path) -> None: + path = tmp_path / "results.json" + _write_results( + path, + [{"name": "test_parse_large_peak_memory", "extra_info": {"peak_bytes": "bad"}}], + ) + + with pytest.raises(BenchmarkDataError, match="extra_info.peak_bytes"): + load_results(path) + + def test_duplicate_baseline_name_raises(tmp_path) -> None: baselines = tmp_path / "baselines.json" _write_baselines( diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py index 8919b84..bad0a56 100644 --- a/tests/test_reduce_baselines.py +++ b/tests/test_reduce_baselines.py @@ -38,11 +38,29 
@@ def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None: output = reduce_baselines(raw, out) assert output["machine"] == "Linux" + assert set(output["groups"].keys()) == {"parse", "export", "search"} assert "test_parse_session_medium" in output["groups"]["parse"] - assert "test_parse_session_small" not in output["groups"]["parse"] + assert "test_parse_session_small" in output["groups"]["parse"] + assert output["groups"]["search"] == {} assert "cache" not in output["groups"] +def test_reduce_baselines_includes_search_benchmark(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + {"group": "search", "name": "test_search_full_corpus", "stats": {"mean": 0.001}}, + {"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}}, + ], + ) + + output = reduce_baselines(raw, out) + + assert output["groups"]["search"]["test_search_full_corpus"] == pytest.approx(0.001) + + def test_reduce_baselines_applies_slack(tmp_path) -> None: raw = tmp_path / "raw.json" out = tmp_path / "baselines.json" @@ -79,6 +97,26 @@ def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None: assert exc_info.value.code == 2 +def test_reduce_baselines_uses_peak_bytes_extra_info(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "group": "parse", + "name": "test_parse_large_peak_memory", + "stats": {"mean": 0.05}, + "extra_info": {"peak_bytes": 10_000_000}, + } + ], + ) + + output = reduce_baselines(raw, out) + + assert output["groups"]["parse"]["test_parse_large_peak_memory"] == 10_000_000.0 + + def test_reduce_baselines_machine_info_non_dict(tmp_path) -> None: raw = tmp_path / "raw.json" raw.write_text(