From 5fa9d3f38cff93d9ba6ef8c3e23aa4cf5668f9f7 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 10 Jun 2026 18:15:59 -0600 Subject: [PATCH] =?UTF-8?q?feat(loops):=20anytime=20metrics=20=E2=80=94=20?= =?UTF-8?q?time-to-satisfactory=20from=20the=20waterfall,=20free?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Derived entirely from existing spans (no new instrumentation): per-task hill-climb curves (best-so-far score after each shot with elapsed wall-clock time and cumulative spend) and the standard anytime-optimization summary per (strategy, satisficing target): median time-to-target, shots-to-target, COCO ERT (Σ all wall-time including failures / #successes — the honest all-in cost per success), $/success, and the AUC of the anytime curve with a sparkline render. Satisfaction follows the COCO/BBOB convention — a SET of satisficing targets measured independently (targets: [0.5, 0.8, 1]) — or per-task bars via targetFor (task-generic satisfaction). steering-modes prints the table per arm; per-model comparison = arms with different WORKER_MODEL. 
--- bench/src/steering-modes.mts | 5 + src/runtime/anytime.ts | 181 +++++++++++++++++++++++++++++++++++ src/runtime/index.ts | 7 ++ tests/loops/anytime.test.ts | 69 +++++++++++++ 4 files changed, 262 insertions(+) create mode 100644 src/runtime/anytime.ts create mode 100644 tests/loops/anytime.test.ts diff --git a/bench/src/steering-modes.mts b/bench/src/steering-modes.mts index 5d33001..498d1ff 100644 --- a/bench/src/steering-modes.mts +++ b/bench/src/steering-modes.mts @@ -25,8 +25,10 @@ import { writeFileSync } from 'node:fs' import { type AgenticTask, + anytimeReport, type BenchmarkReport, createWaterfallCollector, + renderAnytimeTable, defineStrategy, promotionGate, type PromotionVerdict, @@ -269,6 +271,9 @@ async function main(): Promise { if (process.env.WATERFALL) { console.error(waterfall.render({ width: 40, maxRows: 24 })) } + console.error( + renderAnytimeTable(anytimeReport(wf.spans, { targets: [0.5, 1] })), + ) } console.error( diff --git a/src/runtime/anytime.ts b/src/runtime/anytime.ts new file mode 100644 index 0000000..641ae6e --- /dev/null +++ b/src/runtime/anytime.ts @@ -0,0 +1,181 @@ +/** + * anytimeReport — time-to-satisfactory-output metrics, derived entirely from the + * waterfall's spans (no new instrumentation): per task, the best-so-far score after each + * shot with its elapsed wall-clock and cumulative spend; per strategy, the standard + * anytime-optimization metrics: + * + * TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median + * over tasks that reached it) + * STT shots-to-target — attempts until best-so-far ≥ target + * ERT expected running time (the COCO benchmarking convention): TOTAL time spent + * across all tasks — including failures' full budgets — divided by the number of + * tasks that reached the target. The honest "how long per success, all-in". + * AUC the anytime curve's area (mean best-so-far score across the budget, per shot + * index) — higher = climbs earlier. 
/**
 * Anytime ("time-to-satisfactory-output") metrics, derived purely from shot spans that
 * were already recorded — nothing here adds instrumentation.
 *
 * Spans are grouped by `runId`, parsed as `agentic:<strategy>:<taskId>`; only spans whose
 * label starts with `shot:` take part. Per task we build the hill-climb curve (best-so-far
 * score after each shot, with elapsed wall-clock and cumulative spend); per strategy we
 * summarize time-to-target, shots-to-target, ERT, $/success and the curve's AUC.
 *
 * Satisfaction follows the COCO/BBOB convention: a SET of satisficing targets (e.g.
 * [0.5, 0.8, 1] on the normalized check score), each measured independently per
 * (task, target) pair — or a single task-specific bar per task via `targetFor`.
 */

/**
 * The span fields the anytime metrics read. Kept structural so richer span objects can be
 * passed unchanged.
 * NOTE(review): intended to be satisfied by `WaterfallSpan` from ./waterfall — confirm if
 * that type's field optionality ever changes.
 */
export interface AnytimeSpan {
  /** Only labels starting with `shot:` are counted; every other span is ignored. */
  label: string
  /** Groups shots into one task run; parsed as `agentic:<strategy>:<taskId>`. */
  runId: string
  startMs: number
  /** Settle time. A span without one is positioned at its `startMs`. */
  endMs?: number | null
  /** Spend added to the task's cumulative usd by this shot. */
  usd: number
  /** Shot score; anything that is not a number leaves best-so-far unchanged. */
  score?: number | null
}

export interface AnytimeTaskCurve {
  taskId: string
  strategy: string
  /** One point per shot, ordered by settle time: elapsed ms since the task's earliest
   *  shot start, cumulative usd, and the running max score. */
  points: Array<{ elapsedMs: number; cumUsd: number; best: number }>
  /** Per satisficing target (keyed by the target value as a string): the first point
   *  where best ≥ target, or null when it was never reached. */
  hits: Record<string, { ms: number; shots: number; usd: number } | null>
}

export interface AnytimeStrategySummary {
  strategy: string
  /** The satisficing target this row summarizes (NaN when per-task bars were used). */
  target: number
  tasks: number
  reachedTarget: number
  /** Median time-to-target over the tasks that reached it (null when none did). */
  medianTttMs: number | null
  medianShotsToTarget: number | null
  /** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
  ertMs: number | null
  /** Same construction over dollars: Σ all spend / #successes. */
  erUsd: number | null
  /** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
  curveByShot: number[]
  /** Mean of `curveByShot`; lies in [0,1] when scores are normalized to [0,1]. */
  auc: number
}

export interface AnytimeReport {
  targets: number[]
  perTask: AnytimeTaskCurve[]
  /** One summary per (strategy, target) pair — the COCO-style multi-target view. */
  perStrategy: AnytimeStrategySummary[]
}

type TargetHit = NonNullable<AnytimeTaskCurve['hits'][string]>

/** Greedy first group: the split between strategy and taskId happens at the LAST colon. */
const RUN_ID = /^agentic:(.+):(.+)$/
const SPARK_GLYPHS = '▁▂▃▄▅▆▇█'
const MISSING = '—'
/** Widths of the table columns; the first is left-aligned, the rest right-aligned. */
const COLUMN_WIDTHS = [19, 4, 7, 8, 9, 11, 9, 4, 0]

/** Median of `xs` (mean of the two middle values for even length); null when empty. */
const median = (xs: readonly number[]): number | null => {
  if (xs.length === 0) return null
  const sorted = [...xs].sort((a, b) => a - b)
  const mid = Math.floor(sorted.length / 2)
  const upper = sorted[mid] as number
  return sorted.length % 2 === 1 ? upper : ((sorted[mid - 1] as number) + upper) / 2
}

/** Bucket `items` by key, preserving first-seen key order and item order. */
function groupBy<T>(items: Iterable<T>, keyOf: (item: T) => string): Map<string, T[]> {
  const groups = new Map<string, T[]>()
  for (const item of items) {
    const key = keyOf(item)
    const bucket = groups.get(key)
    if (bucket) bucket.push(item)
    else groups.set(key, [item])
  }
  return groups
}

/** Build one task's hill-climb curve and its first hit per satisficing bar. */
function buildTaskCurve(
  runId: string,
  shots: readonly AnytimeSpan[],
  targets: readonly number[],
  targetFor: ((taskId: string) => number) | undefined,
): AnytimeTaskCurve {
  const parsed = RUN_ID.exec(runId)
  // A runId that does not match the pattern is used verbatim for both fields.
  const strategy = parsed?.[1] ?? runId
  const taskId = parsed?.[2] ?? runId

  const settledAt = (s: AnytimeSpan): number => s.endMs ?? s.startMs
  const ordered = [...shots].sort((a, b) => settledAt(a) - settledAt(b))
  // Loop instead of Math.min(...spread) so very long runs cannot overflow the call stack.
  let t0 = Number.POSITIVE_INFINITY
  for (const s of ordered) t0 = Math.min(t0, s.startMs)

  const bars = targetFor ? [targetFor(taskId)] : targets
  const hits: AnytimeTaskCurve['hits'] = {}
  for (const bar of bars) hits[String(bar)] = null

  const points: AnytimeTaskCurve['points'] = []
  let best = 0
  let cumUsd = 0
  for (const s of ordered) {
    cumUsd += s.usd
    if (typeof s.score === 'number' && s.score > best) best = s.score
    const elapsedMs = settledAt(s) - t0
    points.push({ elapsedMs, cumUsd, best })
    for (const bar of bars) {
      const key = String(bar)
      // Record only the FIRST point at which the bar is met.
      if (hits[key] === null && best >= bar) {
        hits[key] = { ms: elapsedMs, shots: points.length, usd: cumUsd }
      }
    }
  }
  return { taskId, strategy, points, hits }
}

/** Produce the summary rows (one per target, or a single NaN-target row) of a strategy. */
function summarizeStrategy(
  strategy: string,
  tasks: readonly AnytimeTaskCurve[],
  targets: readonly number[],
  perTaskBars: boolean,
): AnytimeStrategySummary[] {
  // Totals are charged in full — failures included — which is what makes ERT "all-in".
  let totalMs = 0
  let totalUsd = 0
  let maxShots = 0
  for (const task of tasks) {
    const final = task.points[task.points.length - 1]
    totalMs += final?.elapsedMs ?? 0
    totalUsd += final?.cumUsd ?? 0
    maxShots = Math.max(maxShots, task.points.length)
  }

  const curveByShot: number[] = []
  for (let i = 0; i < maxShots; i += 1) {
    let sum = 0
    for (const task of tasks) {
      // A task with fewer shots carries its final best forward: it stopped, so that is
      // what an operator would hold at this shot index.
      sum += task.points[Math.min(i, task.points.length - 1)]?.best ?? 0
    }
    curveByShot.push(sum / tasks.length)
  }
  const auc =
    curveByShot.length > 0 ? curveByShot.reduce((s, v) => s + v, 0) / curveByShot.length : 0

  const rowTargets = perTaskBars ? [Number.NaN] : targets
  return rowTargets.map((bar) => {
    // With per-task bars every curve holds exactly one hit entry, whatever its key.
    const hitOf = (curve: AnytimeTaskCurve): TargetHit | null =>
      (perTaskBars ? Object.values(curve.hits)[0] : curve.hits[String(bar)]) ?? null
    const reached = tasks.map(hitOf).filter((h): h is TargetHit => h !== null)
    return {
      strategy,
      target: bar,
      tasks: tasks.length,
      reachedTarget: reached.length,
      medianTttMs: median(reached.map((h) => h.ms)),
      medianShotsToTarget: median(reached.map((h) => h.shots)),
      ertMs: reached.length > 0 ? totalMs / reached.length : null,
      erUsd: reached.length > 0 ? totalUsd / reached.length : null,
      curveByShot,
      auc,
    }
  })
}

/**
 * Derive anytime metrics from shot spans.
 *
 * @param spans Recorded spans; only those labelled `shot:*` are used, grouped by `runId`.
 * @param opts `targets` are the satisficing score bars (default [1]; COCO-style
 *   multi-target: [0.5, 0.8, 1]). `targetFor` overrides the bar per task — when set, the
 *   per-task bar replaces every entry of `targets` for that task and each strategy gets a
 *   single summary row whose `target` is NaN.
 * @returns Per-task curves plus one summary per (strategy, target), sorted by strategy
 *   name and then target. `targets` echoes the configured list.
 */
export function anytimeReport(
  spans: readonly AnytimeSpan[],
  opts?: { targets?: number[]; targetFor?: (taskId: string) => number },
): AnytimeReport {
  const targets = opts?.targets ?? [1]
  const targetFor = opts?.targetFor

  const shotsByRun = groupBy(
    spans.filter((s) => s.label.startsWith('shot:')),
    (s) => s.runId,
  )
  const perTask: AnytimeTaskCurve[] = []
  for (const [runId, shots] of shotsByRun) {
    perTask.push(buildTaskCurve(runId, shots, targets, targetFor))
  }

  const perStrategy: AnytimeStrategySummary[] = []
  for (const [strategy, tasks] of groupBy(perTask, (t) => t.strategy)) {
    perStrategy.push(...summarizeStrategy(strategy, tasks, targets, Boolean(targetFor)))
  }
  perStrategy.sort((a, b) => {
    const byName = a.strategy.localeCompare(b.strategy)
    if (byName !== 0) return byName
    const byTarget = a.target - b.target
    // NaN targets (per-task bars) must not leak NaN out of the comparator.
    return Number.isNaN(byTarget) ? 0 : byTarget
  })
  return { targets, perTask, perStrategy }
}

/** Pad cells to the shared column widths so header, values and placeholders line up. */
function formatRow(cells: readonly string[]): string {
  return cells
    .map((cell, i) => {
      const width = COLUMN_WIDTHS[i] ?? 0
      return i === 0 ? cell.padEnd(width) : cell.padStart(width)
    })
    .join('  ')
}

/** Map each curve value to one of eight block glyphs, clamping out-of-range values. */
function sparkline(curve: readonly number[]): string {
  const top = SPARK_GLYPHS.length - 1
  return curve
    .map((v) => {
      // `|| 0` folds NaN to the lowest glyph instead of silently dropping the cell.
      const level = Math.floor(v * SPARK_GLYPHS.length) || 0
      return SPARK_GLYPHS.charAt(Math.min(top, Math.max(0, level)))
    })
    .join('')
}

/**
 * One row per (strategy, satisficing target): the shareable time-to-satisfactory table.
 * Missing values render as an em dash padded to the column width; rows produced with
 * per-task bars show `task` in the target column and `per-task` in the header.
 */
export function renderAnytimeTable(report: AnytimeReport): string {
  const usesPerTaskBars =
    report.perStrategy.length > 0 && report.perStrategy.every((s) => Number.isNaN(s.target))
  const targetsLabel = usesPerTaskBars ? 'per-task' : report.targets.join(', ')
  const seconds = (ms: number | null): string =>
    ms === null ? MISSING : `${(ms / 1000).toFixed(1)}s`

  const lines = [
    `anytime metrics · satisficing targets [${targetsLabel}] · ERT = Σ all wall-time / #successes (COCO)`,
    formatRow([
      'strategy',
      '≥tgt',
      'reach',
      'med-TTT',
      'med-shots',
      'ERT(all-in)',
      '$/success',
      'AUC',
      'curve',
    ]),
  ]
  for (const s of report.perStrategy) {
    lines.push(
      formatRow([
        s.strategy,
        Number.isNaN(s.target) ? 'task' : s.target.toFixed(2),
        `${s.reachedTarget}/${s.tasks}`,
        seconds(s.medianTttMs),
        s.medianShotsToTarget === null ? MISSING : String(s.medianShotsToTarget),
        seconds(s.ertMs),
        s.erUsd === null ? MISSING : `$${s.erUsd.toFixed(4)}`,
        s.auc.toFixed(2),
        sparkline(s.curveByShot),
      ]),
    )
  }
  return lines.join('\n')
}
+ const spans: WaterfallSpan[] = [ + shot('agentic:refine:t1', 0, 0, 2000, 0.01, 0.5), + shot('agentic:refine:t1', 1, 2000, 5000, 0.01, 1), + shot('agentic:refine:t2', 0, 0, 3000, 0.01, 0.5), + shot('agentic:refine:t2', 1, 3000, 6000, 0.01, 0.5), + { ...shot('agentic:refine:t1', 9, 0, 100, 0.001, 0), label: 'analyst:0' }, // ignored + ] + + it('multi-target hits, hill-climb curve, and COCO ERT', () => { + const r = anytimeReport(spans, { targets: [0.5, 1] }) + const t1 = r.perTask.find((t) => t.taskId === 't1') + expect(t1?.hits['0.5']).toEqual({ ms: 2000, shots: 1, usd: 0.01 }) + expect(t1?.hits['1']).toEqual({ ms: 5000, shots: 2, usd: 0.02 }) + + const at1 = r.perStrategy.find((s) => s.target === 1) + expect(at1?.reachedTarget).toBe(1) + expect(at1?.medianTttMs).toBe(5000) + // ERT charges BOTH tasks' wall time (5000 + 6000) to the single success. + expect(at1?.ertMs).toBe(11000) + const at05 = r.perStrategy.find((s) => s.target === 0.5) + expect(at05?.reachedTarget).toBe(2) + // The anytime curve: mean best-so-far per shot index across tasks. + expect(at1?.curveByShot[0]).toBeCloseTo(0.5) + expect(at1?.curveByShot[1]).toBeCloseTo(0.75) + }) + + it('per-task satisficing bars via targetFor', () => { + const r = anytimeReport(spans, { targetFor: (id) => (id === 't2' ? 0.5 : 1) }) + const row = r.perStrategy[0] + expect(row?.reachedTarget).toBe(2) // t1 hits 1.0, t2 hits its own 0.5 bar + }) + + it('renders one row per (strategy, target) with the sparkline curve', () => { + const text = renderAnytimeTable(anytimeReport(spans, { targets: [0.5, 1] })) + expect(text).toContain('refine') + expect(text).toContain('0.50') + expect(text).toContain('1.00') + expect(text).toContain('ERT') + }) +})