Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bench/src/steering-modes.mts
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
import { writeFileSync } from 'node:fs'
import {
type AgenticTask,
anytimeReport,
type BenchmarkReport,
createWaterfallCollector,
renderAnytimeTable,
defineStrategy,
promotionGate,
type PromotionVerdict,
Expand Down Expand Up @@ -269,6 +271,9 @@ async function main(): Promise<void> {
if (process.env.WATERFALL) {
console.error(waterfall.render({ width: 40, maxRows: 24 }))
}
console.error(
renderAnytimeTable(anytimeReport(wf.spans, { targets: [0.5, 1] })),
)
}

console.error(
Expand Down
181 changes: 181 additions & 0 deletions src/runtime/anytime.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/**
* anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
* waterfall's spans (no new instrumentation): per task, the best-so-far score after each
* shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
* anytime-optimization metrics:
*
* TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
* over tasks that reached it)
* STT shots-to-target — attempts until best-so-far ≥ target
* ERT expected running time (the COCO benchmarking convention): TOTAL time spent
* across all tasks — including failures' full budgets — divided by the number of
* tasks that reached the target. The honest "how long per success, all-in".
* AUC the anytime curve's area (mean best-so-far score across the budget, per shot
* index) — higher = climbs earlier.
*
* The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
* (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
* runtime-to-target per (task, target) pair — optionally overridden per task
* (`targetFor`) when satisfaction is task-specific. Spans come from
* `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
* (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
*/
import type { WaterfallSpan } from './waterfall'

/** Anytime curve for one (strategy, task) run, built from that run's `shot:` spans. */
export interface AnytimeTaskCurve {
/** Task id taken from the run id (`agentic:<strategy>:<taskId>`); the whole run id when
 * it does not have that shape. */
taskId: string
/** Strategy name taken from the same run id; the whole run id when it does not match. */
strategy: string
/** One entry per shot, in completion order: elapsed ms since the earliest shot start of
 * the task, cumulative usd, and the running max score (which starts at 0). */
points: Array<{ elapsedMs: number; cumUsd: number; best: number }>
/** Per satisficing target (keyed by `String(target)`): the first point where
 * best ≥ target — elapsed ms, 1-based shot count, cumulative usd — or null when the
 * target was never reached within the recorded shots. */
hits: Record<string, { ms: number; shots: number; usd: number } | null>
}

/** Aggregate anytime metrics for one strategy at one satisficing target. */
export interface AnytimeStrategySummary {
/** Strategy name shared by the summarized task curves. */
strategy: string
/** The satisficing target this row summarizes; `NaN` when per-task bars (`targetFor`)
 * were used, because the row then has no single target value. */
target: number
/** Number of task curves recorded for this strategy. */
tasks: number
/** How many of those tasks reached the target. */
reachedTarget: number
/** Median time-to-target over the tasks that reached it (null when none did). */
medianTttMs: number | null
/** Median shot count at which the target was first reached, over the same tasks
 * (null when none did). */
medianShotsToTarget: number | null
/** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
ertMs: number | null
/** Same construction over dollars: Σ all spend / #successes. */
erUsd: number | null
/** Mean best-so-far score by shot index (the anytime curve, averaged over tasks); a
 * task with fewer shots carries its final best forward. */
curveByShot: number[]
/** Mean of `curveByShot` — the area under the per-shot curve divided by its length, so
 * it lies in [0,1] whenever the scores do. 0 for an empty curve. */
auc: number
}

/** Result of `anytimeReport`: per-task curves plus per-strategy summaries. */
export interface AnytimeReport {
/** The `targets` option as passed in (default `[1]`). */
targets: number[]
/** One curve per run id that has at least one `shot:` span. */
perTask: AnytimeTaskCurve[]
/** One summary per (strategy, target) pair — the COCO-style multi-target view. */
perStrategy: AnytimeStrategySummary[]
}

/**
 * Median of a list of numbers, or null for an empty list. The input is copied before
 * sorting, so the caller's array is left untouched. For an even count the result is the
 * mean of the two middle values.
 */
const median = (xs: number[]): number | null => {
  const count = xs.length
  if (count === 0) return null
  const sorted = xs.slice().sort((lo, hi) => lo - hi)
  const half = Math.floor(count / 2)
  const upper = sorted[half] as number
  if (count % 2 === 1) return upper
  const lower = sorted[half - 1] as number
  return (lower + upper) / 2
}

/**
 * Derive anytime (time-to-satisfactory-output) metrics from waterfall spans.
 *
 * Only spans whose label starts with `shot:` are used. They are grouped by `runId`
 * (expected shape `agentic:<strategy>:<taskId>`; a run id that does not match is used
 * verbatim as both strategy and task id) and replayed in completion order to build each
 * task's best-so-far curve.
 *
 * ERT follows the COCO definition and is computed per target: a task that reached the
 * target is charged its time (and spend) up to the hit, a task that never reached it is
 * charged its whole run, and the sum is divided by the number of tasks that reached the
 * target. Charging post-hit time would make a low bar (e.g. 0.5) look slower merely
 * because the run kept climbing toward a higher one.
 *
 * @param spans Waterfall spans, e.g. from `createWaterfallCollector().report()`.
 * @param opts.targets Satisficing score bars, each summarized independently
 *   (default `[1]` = fully resolved; COCO-style multi-target: `[0.5, 0.8, 1]`).
 * @param opts.targetFor Per-task bar (task-specific satisfaction); when set it replaces
 *   every entry of `targets` for that task, and each strategy gets a single summary row
 *   whose `target` is `NaN`.
 * @returns Per-task curves plus one summary per (strategy, target), sorted by strategy
 *   name and then by target.
 */
export function anytimeReport(
  spans: WaterfallSpan[],
  opts?: { targets?: number[]; targetFor?: (taskId: string) => number },
): AnytimeReport {
  type Hit = { ms: number; shots: number; usd: number }
  const targets = opts?.targets ?? [1]
  const targetFor = opts?.targetFor

  // Bucket the shot spans by supervisor run (one run = one strategy on one task).
  const byRun = new Map<string, WaterfallSpan[]>()
  for (const s of spans) {
    if (!s.label.startsWith('shot:')) continue
    const list = byRun.get(s.runId) ?? []
    list.push(s)
    byRun.set(s.runId, list)
  }

  const perTask: AnytimeTaskCurve[] = []
  for (const [runId, shots] of byRun) {
    const m = runId.match(/^agentic:(.+):(.+)$/)
    const strategy = m?.[1] ?? runId
    const taskId = m?.[2] ?? runId
    // Replay in completion order; a span without an end time is placed by its start.
    const ordered = [...shots].sort((a, b) => (a.endMs ?? a.startMs) - (b.endMs ?? b.startMs))
    const t0 = Math.min(...ordered.map((s) => s.startMs))
    const taskTargets = targetFor ? [targetFor(taskId)] : targets
    let best = 0
    let cumUsd = 0
    const points: AnytimeTaskCurve['points'] = []
    const hits: AnytimeTaskCurve['hits'] = {}
    for (const t of taskTargets) hits[String(t)] = null
    for (const s of ordered) {
      cumUsd += s.usd
      if (typeof s.score === 'number' && s.score > best) best = s.score
      const elapsedMs = (s.endMs ?? s.startMs) - t0
      points.push({ elapsedMs, cumUsd, best })
      for (const t of taskTargets) {
        // Record only the FIRST point at which the bar is met.
        if (hits[String(t)] === null && best >= t) {
          hits[String(t)] = { ms: elapsedMs, shots: points.length, usd: cumUsd }
        }
      }
    }
    perTask.push({ taskId, strategy, points, hits })
  }

  const byStrategy = new Map<string, AnytimeTaskCurve[]>()
  for (const t of perTask) {
    const list = byStrategy.get(t.strategy) ?? []
    list.push(t)
    byStrategy.set(t.strategy, list)
  }

  const perStrategy: AnytimeStrategySummary[] = []
  for (const [strategy, tasks] of byStrategy) {
    const maxShots = Math.max(0, ...tasks.map((t) => t.points.length))
    const curveByShot: number[] = []
    for (let i = 0; i < maxShots; i += 1) {
      // A task with fewer shots carries its final best forward (it stopped — its
      // best-so-far is what an operator would have at that point).
      const vals = tasks.map(
        (t) => (t.points[Math.min(i, t.points.length - 1)] as { best: number }).best,
      )
      curveByShot.push(vals.reduce((s, v) => s + v, 0) / vals.length)
    }
    const auc =
      curveByShot.length > 0 ? curveByShot.reduce((s, v) => s + v, 0) / curveByShot.length : 0

    // With per-task bars there is no single target value, so the one row is tagged NaN.
    const summaryTargets = targetFor ? [Number.NaN] : targets
    for (const t of summaryTargets) {
      const hitOf = (curve: AnytimeTaskCurve): Hit | null =>
        targetFor ? (Object.values(curve.hits)[0] ?? null) : (curve.hits[String(t)] ?? null)

      // COCO ERT numerator for THIS target: time/spend until the hit for successes,
      // the full recorded run for failures.
      let chargedMs = 0
      let chargedUsd = 0
      const timesToTarget: number[] = []
      const shotsToTarget: number[] = []
      for (const curve of tasks) {
        const hit = hitOf(curve)
        if (hit !== null) {
          chargedMs += hit.ms
          chargedUsd += hit.usd
          timesToTarget.push(hit.ms)
          shotsToTarget.push(hit.shots)
        } else {
          const last = curve.points[curve.points.length - 1]
          chargedMs += last?.elapsedMs ?? 0
          chargedUsd += last?.cumUsd ?? 0
        }
      }
      const reached = timesToTarget.length

      perStrategy.push({
        strategy,
        target: t,
        tasks: tasks.length,
        reachedTarget: reached,
        medianTttMs: median(timesToTarget),
        medianShotsToTarget: median(shotsToTarget),
        ertMs: reached > 0 ? chargedMs / reached : null,
        erUsd: reached > 0 ? chargedUsd / reached : null,
        curveByShot,
        auc,
      })
    }
  }
  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target)
  return { targets, perTask, perStrategy }
}

/**
 * Render an anytime report as plain text: a header line, a column-title line, then one
 * row per (strategy, satisficing target) summary — the shareable time-to-satisfactory
 * table. Metrics that are null print as an em dash, a `NaN` target prints as `task`, and
 * the anytime curve is drawn as a block-glyph sparkline (one glyph per shot index).
 */
export function renderAnytimeTable(report: AnytimeReport): string {
  const glyphs = '▁▂▃▄▅▆▇█'
  const missing = ' —'
  const asSeconds = (ms: number, width: number): string =>
    `${(ms / 1000).toFixed(1).padStart(width)}s`

  const header = `anytime metrics · satisficing targets [${report.targets.join(', ')}] · ERT = Σ all wall-time / #successes (COCO)`
  const columns = 'strategy ≥tgt reach med-TTT med-shots ERT(all-in) $/success AUC curve'

  const rows = report.perStrategy.map((row) => {
    // Scores map onto eight glyph levels; a score of 1 is clamped onto the top glyph.
    const sparkline = row.curveByShot
      .map((score) => glyphs[Math.min(7, Math.floor(score * 8))])
      .join('')
    const targetLabel = Number.isNaN(row.target) ? 'task' : row.target.toFixed(2)
    const cells = [
      row.strategy.padEnd(19),
      targetLabel.padStart(4),
      `${String(row.reachedTarget).padStart(4)}/${String(row.tasks).padEnd(3)}`,
      row.medianTttMs === null ? missing : asSeconds(row.medianTttMs, 6),
      row.medianShotsToTarget === null ? missing : String(row.medianShotsToTarget).padStart(5),
      row.ertMs === null ? missing : asSeconds(row.ertMs, 9),
      row.erUsd === null ? missing : `$${row.erUsd.toFixed(4)}`,
      row.auc.toFixed(2),
      sparkline,
    ]
    return cells.join(' ')
  })

  return [header, columns, ...rows].join('\n')
}
7 changes: 7 additions & 0 deletions src/runtime/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ export {
materializeTreeView,
replaySpawnTree,
} from '../durable/spawn-journal'
export {
type AnytimeReport,
type AnytimeStrategySummary,
type AnytimeTaskCurve,
anytimeReport,
renderAnytimeTable,
} from './anytime'
export {
type AuditIntentInput,
type AuditIntentOptions,
Expand Down
69 changes: 69 additions & 0 deletions tests/loops/anytime.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* Anytime metrics from waterfall spans: hill-climb curves, multi-target satisficing
* hits (COCO convention), ERT charging failures' time to the successes.
*/
import { describe, expect, it } from 'vitest'
import { anytimeReport, renderAnytimeTable } from '../../src/runtime/anytime'
import type { WaterfallSpan } from '../../src/runtime/waterfall'

/**
 * Fixture factory: one finished (`status: 'done'`) shot span for `runId`, labelled
 * `shot:<n>`, with the given start/end times, spend and score, and zero token counts.
 */
const shot = (
  runId: string,
  n: number,
  start: number,
  end: number,
  usd: number,
  score: number,
): WaterfallSpan => {
  const span: WaterfallSpan = {
    id: `${runId}:s${n}`,
    label: `shot:${n}`,
    runId,
    startMs: start,
    endMs: end,
    status: 'done',
    usd,
    tokens: { input: 0, output: 0 },
    score,
  }
  return span
}

// Suite for anytimeReport / renderAnytimeTable, driven by hand-built shot spans.
describe('anytimeReport', () => {
// Fixture — strategy `refine`: on t1 it climbs 0.5 → 1.0 (shots end at 2000 ms and
// 5000 ms); on t2 it stays at 0.5 (shots end at 3000 ms and 6000 ms), so t2 never reaches 1.
const spans: WaterfallSpan[] = [
shot('agentic:refine:t1', 0, 0, 2000, 0.01, 0.5),
shot('agentic:refine:t1', 1, 2000, 5000, 0.01, 1),
shot('agentic:refine:t2', 0, 0, 3000, 0.01, 0.5),
shot('agentic:refine:t2', 1, 3000, 6000, 0.01, 0.5),
{ ...shot('agentic:refine:t1', 9, 0, 100, 0.001, 0), label: 'analyst:0' }, // not a `shot:` label, so it must be ignored
]

it('multi-target hits, hill-climb curve, and COCO ERT', () => {
const r = anytimeReport(spans, { targets: [0.5, 1] })
// Per-task hits: t1 meets 0.5 on its first shot and 1 on its second.
const t1 = r.perTask.find((t) => t.taskId === 't1')
expect(t1?.hits['0.5']).toEqual({ ms: 2000, shots: 1, usd: 0.01 })
expect(t1?.hits['1']).toEqual({ ms: 5000, shots: 2, usd: 0.02 })

// Summary row for target 1: only t1 reached it.
const at1 = r.perStrategy.find((s) => s.target === 1)
expect(at1?.reachedTarget).toBe(1)
expect(at1?.medianTttMs).toBe(5000)
// ERT charges BOTH tasks' wall time (5000 + 6000) to the single success.
expect(at1?.ertMs).toBe(11000)
// Summary row for target 0.5: both tasks reached it.
const at05 = r.perStrategy.find((s) => s.target === 0.5)
expect(at05?.reachedTarget).toBe(2)
// The anytime curve: mean best-so-far per shot index across tasks.
expect(at1?.curveByShot[0]).toBeCloseTo(0.5)
expect(at1?.curveByShot[1]).toBeCloseTo(0.75)
})

it('per-task satisficing bars via targetFor', () => {
// t2 gets a 0.5 bar, every other task a bar of 1.
const r = anytimeReport(spans, { targetFor: (id) => (id === 't2' ? 0.5 : 1) })
const row = r.perStrategy[0]
expect(row?.reachedTarget).toBe(2) // t1 hits 1.0, t2 hits its own 0.5 bar
})

it('renders one row per (strategy, target) with the sparkline curve', () => {
const text = renderAnytimeTable(anytimeReport(spans, { targets: [0.5, 1] }))
// Smoke check: strategy name, both formatted targets and the ERT column are present.
expect(text).toContain('refine')
expect(text).toContain('0.50')
expect(text).toContain('1.00')
expect(text).toContain('ERT')
})
})
Loading