From 5fa9d3f38cff93d9ba6ef8c3e23aa4cf5668f9f7 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 10 Jun 2026 18:15:59 -0600
Subject: [PATCH] =?UTF-8?q?feat(loops):=20anytime=20metrics=20=E2=80=94=20?=
 =?UTF-8?q?time-to-satisfactory=20from=20the=20waterfall,=20free?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Derived entirely from existing spans (no new instrumentation): per-task
hill-climb curves (best-so-far score after each shot with elapsed wall and
cumulative spend) and the standard anytime-optimization summary per
(strategy, satisficing target): median time-to-target, shots-to-target,
COCO ERT (Σ all wall-time including failures / #successes — the honest
all-in cost per success), $/success, and the AUC of the anytime curve
with a sparkline render.

Satisfaction follows the COCO/BBOB convention — a SET of satisficing
targets measured independently (targets: [0.5, 0.8, 1]) — or per-task bars
via targetFor (task-generic satisfaction). steering-modes prints the table
per arm; per-model comparison = arms with different WORKER_MODEL.
---
 bench/src/steering-modes.mts |   5 +
 src/runtime/anytime.ts       | 181 +++++++++++++++++++++++++++++++++++
 src/runtime/index.ts         |   7 ++
 tests/loops/anytime.test.ts  |  69 +++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 src/runtime/anytime.ts
 create mode 100644 tests/loops/anytime.test.ts
diff --git a/bench/src/steering-modes.mts b/bench/src/steering-modes.mts
index 5d33001..498d1ff 100644
--- a/bench/src/steering-modes.mts
+++ b/bench/src/steering-modes.mts
@@ -25,8 +25,10 @@
 import { writeFileSync } from 'node:fs'
 import {
   type AgenticTask,
+  anytimeReport,
   type BenchmarkReport,
   createWaterfallCollector,
+  renderAnytimeTable,
   defineStrategy,
   promotionGate,
   type PromotionVerdict,
@@ -269,6 +271,9 @@ async function main(): Promise<void> {
     if (process.env.WATERFALL) {
       console.error(waterfall.render({ width: 40, maxRows: 24 }))
     }
+    console.error(
+      renderAnytimeTable(anytimeReport(wf.spans, { targets: [0.5, 1] })),
+    )
   }
 
   console.error(
diff --git a/src/runtime/anytime.ts b/src/runtime/anytime.ts
new file mode 100644
index 0000000..641ae6e
--- /dev/null
+++ b/src/runtime/anytime.ts
@@ -0,0 +1,181 @@
+/**
+ * anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
+ * waterfall's spans (no new instrumentation): per task, the best-so-far score after each
+ * shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
+ * anytime-optimization metrics:
+ *
+ *   TTT  time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
+ *        over tasks that reached it)
+ *   STT  shots-to-target — attempts until best-so-far ≥ target
+ *   ERT  expected running time (the COCO benchmarking convention): TOTAL time spent
+ *        across all tasks — including failures' full budgets — divided by the number of
+ *        tasks that reached the target. The honest "how long per success, all-in".
+ *   AUC  the anytime curve's area (mean best-so-far score across the budget, per shot
+ *        index) — higher = climbs earlier.
+ *
+ * The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
+ * (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
+ * runtime-to-target per (task, target) pair — optionally overridden per task
+ * (`targetFor`) when satisfaction is task-specific. Spans come from
+ * `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
+ * (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
+ */
+import type { WaterfallSpan } from './waterfall'
+
+export interface AnytimeTaskCurve {
+  taskId: string
+  strategy: string
+  /** One point per `shot:` span, ordered by end time (start time when a span has no end):
+   *  ms elapsed since the task's earliest shot start, cumulative usd, running max score. */
+  points: Array<{ elapsedMs: number; cumUsd: number; best: number }>
+  /** Per satisficing target (keyed by the target value as a string): the first point
+   *  where best ≥ target (`shots` = 1-based shot count), or null when never reached. */
+  hits: Record<string, { ms: number; shots: number; usd: number } | null>
+}
+
+export interface AnytimeStrategySummary {
+  strategy: string
+  /** The satisficing target this row summarizes (NaN when `targetFor` sets per-task bars). */
+  target: number
+  tasks: number
+  reachedTarget: number
+  /** Median time-to-target over the tasks that reached it (null when none did). */
+  medianTttMs: number | null
+  medianShotsToTarget: number | null
+  /** COCO-style ERT: Σ wall-time charged over all tasks (incl. failures) / #successes. */
+  ertMs: number | null
+  /** Same construction over dollars; like `ertMs`, null when no task reached the target. */
+  erUsd: number | null
+  /** Mean best-so-far score by shot index; shorter tasks carry their final best forward. */
+  curveByShot: number[]
+  /** Mean of `curveByShot` (0 when it is empty); within [0,1] whenever the scores are. */
+  auc: number
+}
+
+export interface AnytimeReport {
+  targets: number[]
+  perTask: AnytimeTaskCurve[]
+  /** One row per (strategy, target) — the COCO multi-target view — sorted by strategy, target. */
+  perStrategy: AnytimeStrategySummary[]
+}
+
+const median = (xs: number[]): number | null => {
+  if (xs.length === 0) return null
+  const asc = xs.slice().sort((p, q) => p - q) // sort a copy; the caller's array is untouched
+  const hi = asc[asc.length >> 1] as number // middle (odd length) or upper-middle (even length)
+  return asc.length % 2 === 1 ? hi : ((asc[(asc.length >> 1) - 1] as number) + hi) / 2
+}
+
+/** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
+ *  bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
+ *  `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
+ *  per-task bar replaces every entry of `targets` for that task. ERT and $/success are
+ *  computed per target the COCO way: a task that reaches the bar is charged only up to
+ *  its first hit, a task that never does is charged its whole run, and the total is
+ *  divided by the number of tasks that reached the bar (null when none did). */
+export function anytimeReport(
+  spans: WaterfallSpan[],
+  opts?: { targets?: number[]; targetFor?: (taskId: string) => number },
+): AnytimeReport {
+  const targets = opts?.targets ?? [1]
+  const byRun = new Map<string, WaterfallSpan[]>()
+  for (const s of spans) {
+    if (!s.label.startsWith('shot:')) continue // only shot:N spans count as attempts
+    const list = byRun.get(s.runId) ?? []
+    list.push(s)
+    byRun.set(s.runId, list)
+  }
+
+  const perTask: AnytimeTaskCurve[] = []
+  for (const [runId, shots] of byRun) {
+    const m = runId.match(/^agentic:(.+):(.+)$/) // agentic:<strategy>:<taskId>
+    const strategy = m?.[1] ?? runId
+    const taskId = m?.[2] ?? runId
+    const ordered = [...shots].sort((a, b) => (a.endMs ?? a.startMs) - (b.endMs ?? b.startMs))
+    const t0 = Math.min(...ordered.map((s) => s.startMs))
+    const taskTargets = opts?.targetFor ? [opts.targetFor(taskId)] : targets
+    let best = 0
+    let cumUsd = 0
+    const points: AnytimeTaskCurve['points'] = []
+    const hits: AnytimeTaskCurve['hits'] = {}
+    for (const t of taskTargets) hits[String(t)] = null
+    for (const s of ordered) {
+      cumUsd += s.usd
+      if (typeof s.score === 'number' && s.score > best) best = s.score
+      const elapsedMs = (s.endMs ?? s.startMs) - t0
+      points.push({ elapsedMs, cumUsd, best })
+      for (const t of taskTargets) {
+        if (hits[String(t)] === null && best >= t) {
+          hits[String(t)] = { ms: elapsedMs, shots: points.length, usd: cumUsd }
+        }
+      }
+    }
+    perTask.push({ taskId, strategy, points, hits })
+  }
+
+  const byStrategy = new Map<string, AnytimeTaskCurve[]>()
+  for (const t of perTask) {
+    const list = byStrategy.get(t.strategy) ?? []
+    list.push(t)
+    byStrategy.set(t.strategy, list)
+  }
+
+  const perStrategy: AnytimeStrategySummary[] = []
+  for (const [strategy, tasks] of byStrategy) {
+    const maxShots = Math.max(0, ...tasks.map((t) => t.points.length))
+    const curveByShot: number[] = []
+    for (let i = 0; i < maxShots; i += 1) {
+      // A task with fewer shots carries its final best forward (it has stopped).
+      const vals = tasks.map((t) => t.points[Math.min(i, t.points.length - 1)]?.best ?? 0)
+      curveByShot.push(vals.reduce((s, v) => s + v, 0) / vals.length)
+    }
+    const auc =
+      curveByShot.length > 0 ? curveByShot.reduce((s, v) => s + v, 0) / curveByShot.length : 0
+    const summaryTargets = opts?.targetFor ? [Number.NaN] : targets
+    for (const t of summaryTargets) {
+      const hitOf = (c: AnytimeTaskCurve): AnytimeTaskCurve['hits'][string] =>
+        (opts?.targetFor ? Object.values(c.hits)[0] : c.hits[String(t)]) ?? null
+      // Charge a task up to its first hit, or its whole run when it never reached the bar.
+      const charged = tasks.map((c) => {
+        const last = c.points[c.points.length - 1]
+        return hitOf(c) ?? { ms: last?.elapsedMs ?? 0, usd: last?.cumUsd ?? 0 }
+      })
+      const reached = tasks.map(hitOf).filter((h): h is NonNullable<typeof h> => h !== null)
+      const perHit = (total: number) => (reached.length > 0 ? total / reached.length : null)
+      perStrategy.push({
+        strategy,
+        target: t,
+        tasks: tasks.length,
+        reachedTarget: reached.length,
+        medianTttMs: median(reached.map((h) => h.ms)),
+        medianShotsToTarget: median(reached.map((h) => h.shots)),
+        ertMs: perHit(charged.reduce((sum, c) => sum + c.ms, 0)),
+        erUsd: perHit(charged.reduce((sum, c) => sum + c.usd, 0)),
+        curveByShot,
+        auc,
+      })
+    }
+  }
+  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target)
+  return { targets, perTask, perStrategy }
+}
+
+/** One row per (strategy, target); every value cell is padded to its placeholder's width. */
+export function renderAnytimeTable(report: AnytimeReport): string {
+  const lines = [
+    `anytime metrics · satisficing targets [${report.targets.join(', ')}] · ERT = Σ all wall-time / #successes (COCO)`,
+    'strategy            ≥tgt   reach   med-TTT   med-shots   ERT(all-in)   $/success   AUC   curve',
+  ]
+  for (const s of report.perStrategy) {
+    const curve = s.curveByShot.map((v) => '▁▂▃▄▅▆▇█'[Math.min(7, Math.floor(v * 8))]).join('')
+    const tgt = Number.isNaN(s.target) ? 'task' : s.target.toFixed(2)
+    lines.push(
+      `${s.strategy.padEnd(19)} ${tgt.padStart(4)} ${String(s.reachedTarget).padStart(4)}/${String(s.tasks).padEnd(3)} ` +
+        `${s.medianTttMs === null ? '      —' : `${(s.medianTttMs / 1000).toFixed(1).padStart(6)}s`}   ` +
+        `${s.medianShotsToTarget === null ? '    —' : String(s.medianShotsToTarget).padStart(5)}   ` +
+        `${s.ertMs === null ? '         —' : `${(s.ertMs / 1000).toFixed(1).padStart(9)}s`}   ` +
+        `${s.erUsd === null ? '       —' : `$${s.erUsd.toFixed(4)}`.padStart(8)}   ${s.auc.toFixed(2)}   ${curve}`,
+    )
+  }
+  return lines.join('\n')
+}
diff --git a/src/runtime/index.ts b/src/runtime/index.ts
index 6136746..94af19f 100644
--- a/src/runtime/index.ts
+++ b/src/runtime/index.ts
@@ -34,6 +34,13 @@ export {
   materializeTreeView,
   replaySpawnTree,
 } from '../durable/spawn-journal'
+export {
+  type AnytimeReport,
+  type AnytimeStrategySummary,
+  type AnytimeTaskCurve,
+  anytimeReport,
+  renderAnytimeTable,
+} from './anytime'
 export {
   type AuditIntentInput,
   type AuditIntentOptions,
diff --git a/tests/loops/anytime.test.ts b/tests/loops/anytime.test.ts
new file mode 100644
index 0000000..da80bfa
--- /dev/null
+++ b/tests/loops/anytime.test.ts
@@ -0,0 +1,69 @@
+/**
+ * Anytime metrics from waterfall spans: hill-climb curves, multi-target satisficing
+ * hits (COCO convention), ERT charging failures' time to the successes.
+ */
+import { describe, expect, it } from 'vitest'
+import { anytimeReport, renderAnytimeTable } from '../../src/runtime/anytime'
+import type { WaterfallSpan } from '../../src/runtime/waterfall'
+
+/**
+ * Fixture builder: a `status: 'done'` span labelled `shot:<n>` for run `runId`, with
+ * zeroed token counts. `start` / `end` become the span's startMs / endMs.
+ */
+function shot(
+  runId: string,
+  n: number,
+  start: number,
+  end: number,
+  usd: number,
+  score: number,
+): WaterfallSpan {
+  const id = `${runId}:s${n}`
+  const label = `shot:${n}`
+  const timing = { startMs: start, endMs: end }
+  const tokens = { input: 0, output: 0 }
+  return { id, label, runId, ...timing, status: 'done', usd, tokens, score }
+}
+
+describe('anytimeReport', () => {
+  // Strategy `refine`: task t1 climbs 0.5 → 1.0; task t2 stays at 0.5 (never reaches 1).
+  const spans: WaterfallSpan[] = [
+    shot('agentic:refine:t1', 0, 0, 2000, 0.01, 0.5),
+    shot('agentic:refine:t1', 1, 2000, 5000, 0.01, 1),
+    shot('agentic:refine:t2', 0, 0, 3000, 0.01, 0.5),
+    shot('agentic:refine:t2', 1, 3000, 6000, 0.01, 0.5),
+    { ...shot('agentic:refine:t1', 9, 0, 100, 0.001, 0), label: 'analyst:0' }, // not a shot:* label → ignored
+  ]
+
+  it('multi-target hits, hill-climb curve, and COCO ERT', () => {
+    const r = anytimeReport(spans, { targets: [0.5, 1] })
+    const t1 = r.perTask.find((t) => t.taskId === 't1')
+    expect(t1?.hits['0.5']).toEqual({ ms: 2000, shots: 1, usd: 0.01 })
+    expect(t1?.hits['1']).toEqual({ ms: 5000, shots: 2, usd: 0.02 })
+
+    const at1 = r.perStrategy.find((s) => s.target === 1)
+    expect(at1?.reachedTarget).toBe(1)
+    expect(at1?.medianTttMs).toBe(5000)
+    // ERT charges BOTH tasks' wall time (5000 + 6000) to the single success.
+    expect(at1?.ertMs).toBe(11000)
+    const at05 = r.perStrategy.find((s) => s.target === 0.5)
+    expect(at05?.reachedTarget).toBe(2)
+    // The anytime curve: mean best-so-far per shot index across tasks.
+    expect(at1?.curveByShot[0]).toBeCloseTo(0.5)
+    expect(at1?.curveByShot[1]).toBeCloseTo(0.75)
+  })
+
+  it('per-task satisficing bars via targetFor', () => {
+    const r = anytimeReport(spans, { targetFor: (id) => (id === 't2' ? 0.5 : 1) })
+    const row = r.perStrategy[0]
+    expect(row?.reachedTarget).toBe(2) // t1 hits 1.0, t2 hits its own 0.5 bar
+  })
+
+  it('renders one row per (strategy, target) with the sparkline curve', () => {
+    const text = renderAnytimeTable(anytimeReport(spans, { targets: [0.5, 1] }))
+    expect(text).toContain('refine')
+    expect(text).toContain('0.50')
+    expect(text).toContain('1.00')
+    expect(text).toContain('ERT')
+  })
+})