From 60a9fa3d3e7e9d24aa8f0badbddd1e139f402d4e Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 10 Jun 2026 05:02:23 -0600 Subject: [PATCH] feat(cost): model seating chart, dollar budgets in the fuzz loop, program cost report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ModelSeats + seatPresets + resolveSeat (src/model-seats.ts): one object re-tiers an entire eval program; economy preset uses the fleet-policy ids (cross-family judges, fully priced), frontier is deliberately empty — resolveSeat fails loud on any unset seat, a model id is never a silent default. - BehaviorExplorer cost governance (src/fuzz): costOf + costBudgetUsd + ledger + onCost. Known cost accrues toward a hard ceiling with control-runtime maxCostUsd semantics (nonnegative finite, stop at >=); unknown-cost runs are counted apart, never folded in as $0. Capsule stats gain costUsd/costUnknownRuns only when tracking was wired, and the HTML capsule shows the cost KPI with the unpriced-run count. - costReport + attachCostToReport (src/cost-report.ts): thin projection over CostLedger.summary() adding the per-model rollup (unpriced:true marks a lower-bound $); attachCostToReport is the one stamp every artifact uses and refuses to overwrite an existing cost key. 
--- src/cost-report.test.ts | 97 +++++++++++++++++++++ src/cost-report.ts | 79 +++++++++++++++++ src/fuzz/capsule.ts | 17 ++++ src/fuzz/explorer-cost.test.ts | 155 +++++++++++++++++++++++++++++++++ src/fuzz/explorer.ts | 79 +++++++++++++++-- src/fuzz/index.ts | 1 + src/fuzz/types.ts | 43 +++++++++ src/index.ts | 7 ++ src/model-seats.test.ts | 105 ++++++++++++++++++++++ src/model-seats.ts | 128 +++++++++++++++++++++++++++ 10 files changed, 706 insertions(+), 5 deletions(-) create mode 100644 src/cost-report.test.ts create mode 100644 src/cost-report.ts create mode 100644 src/fuzz/explorer-cost.test.ts create mode 100644 src/model-seats.test.ts create mode 100644 src/model-seats.ts diff --git a/src/cost-report.test.ts b/src/cost-report.test.ts new file mode 100644 index 0000000..bc1c9ed --- /dev/null +++ b/src/cost-report.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from 'vitest' +import { CostLedger } from './cost-ledger' +import { attachCostToReport, costReport } from './cost-report' +import { ValidationError } from './errors' + +function buildLedger(): CostLedger { + const ledger = new CostLedger() + // gpt-4o: 0.0025 in + 0.01 out per 1k + ledger.record({ + model: 'gpt-4o', + channel: 'agent', + usage: { inputTokens: 1000, outputTokens: 1000 }, + }) + ledger.record({ + model: 'gpt-4o', + channel: 'judge', + usage: { inputTokens: 2000, outputTokens: 0 }, + }) + // Unpriced model — costUnknown, the $0 is a lower bound, not a measured zero. 
+ ledger.record({ + model: 'made-up-zzz', + channel: 'judge', + usage: { inputTokens: 1000, outputTokens: 1000 }, + }) + return ledger +} + +describe('costReport', () => { + it('projects per-channel, total, and per-model rollups from the ledger', () => { + const report = costReport(buildLedger()) + + expect(report.perChannel.map((c) => c.channel)).toEqual(['agent', 'judge']) + const judge = report.perChannel.find((c) => c.channel === 'judge') + expect(judge?.calls).toBe(2) + expect(judge?.unpricedCalls).toBe(1) + + expect(report.total.usd).toBeCloseTo(0.0125 + 0.005, 6) + expect(report.total.unknownEntries).toBe(1) + + expect(report.perModel.map((m) => m.model)).toEqual(['gpt-4o', 'made-up-zzz']) + expect(report.perModel[0]).toEqual({ + model: 'gpt-4o', + usd: 0.0175, + entries: 2, + unpriced: false, + }) + }) + + it('flags an unpriced model unpriced:true — its $0 is never a measured zero', () => { + const report = costReport(buildLedger()) + const unpriced = report.perModel.find((m) => m.model === 'made-up-zzz') + expect(unpriced).toEqual({ model: 'made-up-zzz', usd: 0, entries: 1, unpriced: true }) + }) + + it('an actualCostUsd override clears unpriced — observed dollars are real', () => { + const ledger = new CostLedger() + ledger.record({ + model: 'made-up-zzz', + channel: 'agent', + usage: { inputTokens: 100, outputTokens: 100 }, + actualCostUsd: 0.42, + }) + const report = costReport(ledger) + expect(report.perModel[0]).toEqual({ + model: 'made-up-zzz', + usd: 0.42, + entries: 1, + unpriced: false, + }) + expect(report.total.unknownEntries).toBe(0) + }) + + it('an empty ledger projects to zeros, never throws', () => { + const report = costReport(new CostLedger()) + expect(report).toEqual({ + perChannel: [], + total: { usd: 0, unknownEntries: 0 }, + perModel: [], + }) + }) +}) + +describe('attachCostToReport', () => { + it('stamps the projection under cost and preserves the report fields', () => { + const stamped = attachCostToReport({ verdict: 'ship', lift: 
0.04 }, buildLedger()) + expect(stamped.verdict).toBe('ship') + expect(stamped.lift).toBe(0.04) + expect(stamped.cost.total.unknownEntries).toBe(1) + expect(stamped.cost.perModel).toHaveLength(2) + }) + + it('refuses to overwrite an existing cost stamp', () => { + expect(() => attachCostToReport({ cost: 'already-stamped' }, new CostLedger())).toThrow( + ValidationError, + ) + }) +}) diff --git a/src/cost-report.ts b/src/cost-report.ts new file mode 100644 index 0000000..8e8fbe7 --- /dev/null +++ b/src/cost-report.ts @@ -0,0 +1,79 @@ +/** + * Program cost report — a thin projection over `CostLedger.summary()` that + * adds the per-model rollup the summary lacks, plus `attachCostToReport`, the + * one way every artifact (capsules, campaign results, diagnose reports) gets + * its cost stamp. + * + * Honesty contract carried through from the ledger: `total.unknownEntries` + * and `perModel[].unpriced` surface the costUnknown axis — a $0 from an + * unpriced model is a lower bound, never a measured zero. + */ + +import type { ChannelRollup, CostLedger } from './cost-ledger' +import { ValidationError } from './errors' + +export interface ModelCostRollup { + model: string + usd: number + entries: number + /** ≥1 entry for this model was costUnknown — `usd` is a lower bound. An + * `actualCostUsd` override clears the flag for that entry (the dollars are + * observed, even when the model has no pricing). */ + unpriced: boolean +} + +export interface CostReport { + /** Per-channel breakdown — `CostLedgerSummary.byChannel` verbatim. */ + perChannel: ChannelRollup[] + total: { + usd: number + /** Entries whose cost was unknown — non-zero means `usd` is a lower bound. */ + unknownEntries: number + } + /** Per-model spend, sorted by model id. */ + perModel: ModelCostRollup[] +} + +/** Project a ledger into the program cost report. Pure — no I/O, no clock. 
*/ +export function costReport(ledger: CostLedger): CostReport { + const summary = ledger.summary() + const perModel = new Map() + for (const entry of ledger.list()) { + const roll = perModel.get(entry.model) ?? { + model: entry.model, + usd: 0, + entries: 0, + unpriced: false, + } + roll.usd += entry.costUsd + roll.entries += 1 + if (entry.costUnknown) roll.unpriced = true + perModel.set(entry.model, roll) + } + return { + perChannel: summary.byChannel, + total: { + usd: summary.totalCostUsd, + unknownEntries: summary.byChannel.reduce((sum, c) => sum + c.unpricedCalls, 0), + }, + perModel: [...perModel.values()].sort((a, b) => a.model.localeCompare(b.model)), + } +} + +/** + * Stamp a report-shaped object with its cost projection under the `cost` key. + * Generic so capsules, campaign results, and diagnose reports all stamp the + * same way. Throws when the report already carries a `cost` key — silently + * overwriting an existing stamp would corrupt the artifact's provenance. + */ +export function attachCostToReport( + report: R, + ledger: CostLedger, +): R & { cost: CostReport } { + if ('cost' in report) { + throw new ValidationError( + "attachCostToReport: report already has a 'cost' key — refusing to overwrite an existing stamp", + ) + } + return { ...report, cost: costReport(ledger) } +} diff --git a/src/fuzz/capsule.ts b/src/fuzz/capsule.ts index 123da3a..769461f 100644 --- a/src/fuzz/capsule.ts +++ b/src/fuzz/capsule.ts @@ -23,6 +23,9 @@ export interface BuildCapsuleInput { findings: Finding[] candidateFindings: number runsUsed: number + /** Known-dollar / unknown-run split — present only when cost tracking was + * wired; the capsule never fabricates a $0 total. 
*/ + cost?: { costUsd: number; costUnknownRuns: number } } export function buildCapsule(input: BuildCapsuleInput): CapsuleData { @@ -47,6 +50,9 @@ export function buildCapsule(input: BuildCapsuleInput): CapsuleData { candidateFindings: input.candidateFindings, verifiedFindings: input.findings.length, meanRobustness, + ...(input.cost + ? { costUsd: input.cost.costUsd, costUnknownRuns: input.cost.costUnknownRuns } + : {}), }, } } @@ -172,6 +178,16 @@ export function renderCapsuleHtml( '#5ad17a', ) : '' + // Cost KPI only when tracking was wired — an untracked run never shows $0. + // Unpriced runs are named in the label (amber, singular/plural aware): the total is a lower bound. + const cost = + s.costUsd !== undefined + ? kpi( + s.costUnknownRuns ? `cost · ${s.costUnknownRuns} ${s.costUnknownRuns === 1 ? 'run' : 'runs'} unpriced` : 'cost', + `$${s.costUsd.toFixed(2)}`, + s.costUnknownRuns ? '#e5c07b' : '#e6e6e6', + ) + : '' const stamp = opts.generatedAt ?? capsule.generatedAt ?? '' return ` @@ -218,6 +234,7 @@ ${kpi('mean robustness', pct(s.meanRobustness), s.meanRobustness < 0.6 ? '#e58a9 ${kpi('verified findings', String(s.verifiedFindings), s.verifiedFindings > 0 ? '#e58a96' : '#5ad17a')} ${kpi('cells covered', `${s.cellsCovered}/${s.cellsTotal}`)} ${kpi('scenarios run', String(s.totalRuns))} +${cost} ${lift}

Coverage map

diff --git a/src/fuzz/explorer-cost.test.ts b/src/fuzz/explorer-cost.test.ts new file mode 100644 index 0000000..d21276b --- /dev/null +++ b/src/fuzz/explorer-cost.test.ts @@ -0,0 +1,155 @@ +import { describe, expect, it } from 'vitest' +import { CostLedger } from '../cost-ledger' +import { costReport } from '../cost-report' +import { ValidationError } from '../errors' +import { renderCapsuleHtml } from './capsule' +import { BehaviorExplorer } from './explorer' +import type { BehaviorSpace, ExploreOptions } from './types' + +const space: BehaviorSpace = { axes: [{ name: 'difficulty', values: ['easy'] }] } + +// Single-cell space + uniform allocation + concurrency 1 → the evaluation +// (and costOf) order is fully deterministic. +function makeOpts(overrides: Partial>): ExploreOptions { + let n = 0 + return { + target: 'cost-target', + space, + proposer: (ctx) => Array.from({ length: ctx.count }, () => `p-${n++}`), + evaluate: async () => ({ valid: true, score: 0.9 }), + seedsFor: () => ['seed-0'], + scenarioId: (s) => s, + allocation: 'uniform', + budget: 50, + seed: 3, + ...overrides, + } +} + +describe('BehaviorExplorer cost budget', () => { + it('stops the loop when accumulated known cost reaches costBudgetUsd', async () => { + const explorer = new BehaviorExplorer( + makeOpts({ costOf: () => ({ usd: 1 }), costBudgetUsd: 3 }), + ) + const capsule = await explorer.run() + expect(capsule.stats.totalRuns).toBe(3) + expect(capsule.stats.costUsd).toBe(3) + expect(capsule.stats.costUnknownRuns).toBe(0) + }) + + it('a zero budget stops before any evaluation — same >= semantics as control-runtime', async () => { + const explorer = new BehaviorExplorer( + makeOpts({ costOf: () => ({ usd: 1 }), costBudgetUsd: 0 }), + ) + const capsule = await explorer.run() + expect(capsule.stats.totalRuns).toBe(0) + expect(capsule.stats.costUsd).toBe(0) + }) + + it('counts unknown-cost runs separately — never as $0, never against the budget', async () => { + let call = 0 + const onCost: 
Array<{ usd: number; channel: string }> = [] + const explorer = new BehaviorExplorer( + makeOpts({ + budget: 4, + // Runs 1 and 3 cost $1; runs 2 and 4 have unknown cost. + costOf: () => (call++ % 2 === 0 ? { usd: 1 } : null), + costBudgetUsd: 10, + onCost: (e) => onCost.push(e), + }), + ) + const capsule = await explorer.run() + expect(capsule.stats.totalRuns).toBe(4) + expect(capsule.stats.costUsd).toBe(2) + expect(capsule.stats.costUnknownRuns).toBe(2) + expect(onCost).toEqual([ + { usd: 1, channel: 'agent' }, + { usd: 1, channel: 'agent' }, + ]) + }) + + it('records known costs into the supplied ledger with channel agent + actualCostUsd', async () => { + const ledger = new CostLedger() + const explorer = new BehaviorExplorer( + makeOpts({ budget: 2, costOf: () => ({ usd: 0.5, model: 'gpt-4o' }), ledger }), + ) + await explorer.run() + const entries = ledger.list() + expect(entries).toHaveLength(2) + for (const entry of entries) { + expect(entry.channel).toBe('agent') + expect(entry.actualCostUsd).toBe(0.5) + expect(entry.costUnknown).toBe(false) + expect(entry.model).toBe('gpt-4o') + expect(entry.tags?.target).toBe('cost-target') + } + const report = costReport(ledger) + expect(report.perModel).toEqual([{ model: 'gpt-4o', usd: 1, entries: 2, unpriced: false }]) + }) + + it('labels ledger entries unattributed when costOf names no model', async () => { + const ledger = new CostLedger() + const explorer = new BehaviorExplorer( + makeOpts({ budget: 1, costOf: () => ({ usd: 0.25 }), ledger }), + ) + await explorer.run() + expect(ledger.list().map((e) => e.model)).toEqual(['unattributed']) + }) + + it('rejects negative, NaN, and infinite budgets loudly', () => { + const costOf = () => ({ usd: 1 }) + for (const bad of [-1, Number.NaN, Number.POSITIVE_INFINITY]) { + expect(() => new BehaviorExplorer(makeOpts({ costOf, costBudgetUsd: bad }))).toThrow( + /costBudgetUsd must be a nonnegative finite number/, + ) + } + }) + + it('rejects cost options without costOf — the 
explorer cannot know run cost', () => { + expect(() => new BehaviorExplorer(makeOpts({ costBudgetUsd: 3 }))).toThrow(ValidationError) + expect(() => new BehaviorExplorer(makeOpts({ ledger: new CostLedger() }))).toThrow( + ValidationError, + ) + expect(() => new BehaviorExplorer(makeOpts({ onCost: () => {} }))).toThrow(ValidationError) + }) + + it('rejects a fabricated costOf number loudly — null is the only unknown', async () => { + const explorer = new BehaviorExplorer( + makeOpts({ budget: 1, costOf: () => ({ usd: Number.NaN }) }), + ) + await expect(explorer.run()).rejects.toThrow(/costOf returned an invalid usd/) + }) + + it('omits cost stats entirely when cost tracking is not wired — absent, never $0', async () => { + const capsule = await new BehaviorExplorer(makeOpts({ budget: 2 })).run() + expect(capsule.stats.costUsd).toBeUndefined() + expect(capsule.stats.costUnknownRuns).toBeUndefined() + }) +}) + +describe('renderCapsuleHtml cost KPI', () => { + it('shows the known-dollar KPI when cost tracking was wired', async () => { + const explorer = new BehaviorExplorer( + makeOpts({ costOf: () => ({ usd: 1 }), costBudgetUsd: 3 }), + ) + const html = renderCapsuleHtml(await explorer.run()) + expect(html).toContain('$3.00') + expect(html).not.toContain('runs unpriced') + }) + + it('names the unpriced-run count next to the total — the dollar figure is a lower bound', async () => { + let call = 0 + const explorer = new BehaviorExplorer( + makeOpts({ budget: 4, costOf: () => (call++ % 2 === 0 ? 
{ usd: 1 } : null) }), + ) + const html = renderCapsuleHtml(await explorer.run()) + expect(html).toContain('$2.00') + expect(html).toContain('2 runs unpriced') + }) + + it('shows no dollar KPI at all when cost was not tracked', async () => { + const html = renderCapsuleHtml(await new BehaviorExplorer(makeOpts({ budget: 2 })).run()) + expect(/\$\d/.test(html)).toBe(false) + expect(html).not.toContain('runs unpriced') + }) +}) diff --git a/src/fuzz/explorer.ts b/src/fuzz/explorer.ts index 0bdd216..4d778a5 100644 --- a/src/fuzz/explorer.ts +++ b/src/fuzz/explorer.ts @@ -13,6 +13,7 @@ * observations and coverage are projections of it. */ +import { ValidationError } from '../errors' import { varianceBasedCurriculum } from '../rl/active-curriculum' import { buildCapsule } from './capsule' import type { EvalRecord } from './cube' @@ -23,6 +24,7 @@ import type { CapsuleData, Cell, CoverageCell, + Evaluation, ExploreOptions, Finding, Objective, @@ -65,11 +67,31 @@ export class BehaviorExplorer { private runsUsed = 0 private candidateFindings = 0 private rngState: number + /** Accumulated KNOWN dollars — unknown-cost runs never inflate it. 
*/ + private spentKnownUsd = 0 + private costUnknownRuns = 0 constructor(private readonly opts: ExploreOptions) { this.cells = enumerateCells(opts.space) if (this.cells.length === 0) throw new Error('BehaviorExplorer: space has no cells — every axis needs ≥1 value') + if (opts.costBudgetUsd !== undefined) { + if ( + typeof opts.costBudgetUsd !== 'number' || + !Number.isFinite(opts.costBudgetUsd) || + opts.costBudgetUsd < 0 + ) { + throw new RangeError( + `BehaviorExplorer: costBudgetUsd must be a nonnegative finite number, got ${String(opts.costBudgetUsd)}`, + ) + } + } + if (!opts.costOf && (opts.costBudgetUsd !== undefined || opts.ledger || opts.onCost)) { + throw new ValidationError( + 'BehaviorExplorer: costBudgetUsd/ledger/onCost require costOf — the explorer ' + + 'cannot know run cost without it; supply costOf or drop the cost options', + ) + } this.cellById = new Map(this.cells.map((c) => [c.id, c])) this.objective = opts.objective ?? adversarialObjective(0.5) this.threshold = this.objective.threshold ?? 0.5 @@ -124,6 +146,37 @@ export class BehaviorExplorer { } } + /** Mirrors control-runtime: stop once accumulated KNOWN cost ≥ the ceiling. */ + private costExhausted(): boolean { + return this.opts.costBudgetUsd !== undefined && this.spentKnownUsd >= this.opts.costBudgetUsd + } + + /** Fold one run's cost in: null counts as unknown (never $0); a known cost + * accrues toward the budget, lands in the ledger, and fires `onCost`. 
Any other return (undefined, or a non-finite/negative `usd`) throws RangeError. */ + private recordRunCost(scenario: S, cell: Cell, ev: Evaluation): void { + if (!this.opts.costOf) return + const cost = this.opts.costOf(scenario, cell, ev) + if (cost === null) { + this.costUnknownRuns++ + return + } + if (typeof cost?.usd !== 'number' || !Number.isFinite(cost.usd) || cost.usd < 0) { + throw new RangeError( + `BehaviorExplorer: costOf returned an invalid usd (${String(cost?.usd)}) — ` + + 'return null when cost is unknown, never a fabricated number', + ) + } + this.spentKnownUsd += cost.usd + this.opts.ledger?.record({ + model: cost.model ?? 'unattributed', + channel: 'agent', + usage: { inputTokens: 0, outputTokens: 0 }, + actualCostUsd: cost.usd, + tags: { target: this.opts.target, cell: cell.id }, + }) + this.opts.onCost?.({ usd: cost.usd, channel: 'agent' }) + } + /** Elites whose INPUT cell matches — what the proposer mutates/deepens from. */ private elitesFor(cellId: string): S[] { const out: S[] = [] @@ -134,14 +187,16 @@ export class BehaviorExplorer { /** One allocate → propose → evaluate → gate → archive round. 
*/ async step(): Promise<{ runs: number; findings: Finding[] }> { const remaining = this.opts.budget - this.runsUsed - if (remaining <= 0 || this.opts.signal?.aborted) return { runs: 0, findings: [] } + if (remaining <= 0 || this.costExhausted() || this.opts.signal?.aborted) + return { runs: 0, findings: [] } const allocations = this.allocate(Math.min(this.perRoundBudget, remaining)) const newFindings: Finding[] = [] let runsThisStep = 0 for (const alloc of allocations) { - if (this.runsUsed >= this.opts.budget || this.opts.signal?.aborted) break + if (this.runsUsed >= this.opts.budget || this.costExhausted() || this.opts.signal?.aborted) + break const cell = this.cellById.get(alloc.cellId) if (!cell) continue const cap = Math.min(alloc.count, this.opts.budget - this.runsUsed) @@ -166,10 +221,16 @@ export class BehaviorExplorer { await pMap( toEval, async (scenario) => { - if (this.runsUsed >= this.opts.budget || this.opts.signal?.aborted) return + if ( + this.runsUsed >= this.opts.budget || + this.costExhausted() || + this.opts.signal?.aborted + ) + return const ev = await this.opts.evaluate(scenario, cell) this.runsUsed++ runsThisStep++ + this.recordRunCost(scenario, cell, ev) const interest = this.objective.interest(ev, this.objectiveContext()) this.log.push({ cell, ev, interest, scenarioId: this.opts.scenarioId(scenario) }) this.opts.onProgress?.({ type: 'evaluated', cell, scenario, evaluation: ev }) @@ -215,9 +276,14 @@ export class BehaviorExplorer { return { runs: runsThisStep, findings: newFindings } } - /** Loop `step()` until budget is spent, the signal aborts, or no progress is made. */ + /** Loop `step()` until the run or dollar budget is spent, the signal aborts, + * or no progress is made. 
*/ async run(): Promise> { - while (this.runsUsed < this.opts.budget && !this.opts.signal?.aborted) { + while ( + this.runsUsed < this.opts.budget && + !this.costExhausted() && + !this.opts.signal?.aborted + ) { const { runs } = await this.step() if (runs === 0) break } @@ -243,6 +309,9 @@ export class BehaviorExplorer { findings: this._findings, candidateFindings: this.candidateFindings, runsUsed: this.runsUsed, + cost: this.opts.costOf + ? { costUsd: this.spentKnownUsd, costUnknownRuns: this.costUnknownRuns } + : undefined, }) } } diff --git a/src/fuzz/index.ts b/src/fuzz/index.ts index 70effff..7ccca1d 100644 --- a/src/fuzz/index.ts +++ b/src/fuzz/index.ts @@ -38,6 +38,7 @@ export type { ObjectiveContext, ProposeContext, Proposer, + RunCost, SpaceAxis, ValidityGates, } from './types' diff --git a/src/fuzz/types.ts b/src/fuzz/types.ts index b66192d..2cb23c9 100644 --- a/src/fuzz/types.ts +++ b/src/fuzz/types.ts @@ -18,6 +18,7 @@ * never a parallel score shape. */ +import type { CostChannel, CostLedger } from '../cost-ledger' import type { AdversarialMutation } from '../rl/adversarial' import type { DefaultVerdict } from '../verdict' @@ -177,9 +178,27 @@ export interface CapsuleData { candidateFindings: number verifiedFindings: number meanRobustness: number + /** Known dollars spent on this exploration's runs. Present only when cost + * tracking was wired (`costOf`) — absent means "not tracked", never $0. */ + costUsd?: number + /** Runs whose cost was unknown (`costOf` returned null) — counted apart, + * never folded into `costUsd` as a fabricated $0. */ + costUnknownRuns?: number } } +// ── cost governance ─────────────────────────────────────────────────────────── + +/** + * Known cost of one evaluated run. `model` attributes the spend in the ledger's + * per-model rollup; absent, the entry is labeled `unattributed` (the dollars are + * real either way — recorded as `actualCostUsd`, never an estimate). 
+ */ +export interface RunCost { + usd: number + model?: string +} + // ── engine options + events ─────────────────────────────────────────────────── export type ExploreEvent = @@ -225,6 +244,30 @@ export interface ExploreOptions { onProgress?: (event: ExploreEvent) => void /** Deterministic seed. Default 1. */ seed?: number + /** + * Cost of one evaluated run — consumer-supplied; the explorer cannot know + * token usage. Return null when the cost is unknown: the run is COUNTED in + * `stats.costUnknownRuns`, never folded into the total as $0. Required by + * every other cost option (`costBudgetUsd` / `ledger` / `onCost`). + */ + costOf?: (scenario: S, cell: Cell, ev: Evaluation) => RunCost | null + /** + * Hard dollar ceiling on accumulated KNOWN cost (same semantics as the + * control-runtime `budget.maxCostUsd`: nonnegative finite, the session stops + * once spent ≥ ceiling; no new evaluation starts after that). Unknown-cost + * runs do not consume budget — they are reported separately, so the ceiling + * is honest about what it can see. + */ + costBudgetUsd?: number + /** + * Sink for per-run cost entries — each known `costOf` result is recorded + * with channel 'agent' and `actualCostUsd` (token axes are zero: the + * explorer only sees dollars). Pass the program's shared `CostLedger` so + * `costReport` stamps fuzz spend alongside judge/analyst spend. + */ + ledger?: CostLedger + /** Observer fired for every known-cost run recorded. */ + onCost?: (entry: { usd: number; channel: CostChannel }) => void } export type { AdversarialMutation, DefaultVerdict } diff --git a/src/index.ts b/src/index.ts index 1ee48b2..15e6a02 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1318,4 +1318,11 @@ export { // `AgentProfile` exported above from `./agent-profile`. 
export * as profile from './profile/index' +// ── Cost governance — model seating chart + program cost report ───────── + +export type { CostReport, ModelCostRollup } from './cost-report' +export { attachCostToReport, costReport } from './cost-report' +export type { ModelSeats, SeatName, SeatPresetName } from './model-seats' +export { resolveSeat, SeatUnsetError, seatPresets } from './model-seats' + // Ax RLM trace analyst — subpath: /traces (re-exported alongside trace store). diff --git a/src/model-seats.test.ts b/src/model-seats.test.ts new file mode 100644 index 0000000..8ac9123 --- /dev/null +++ b/src/model-seats.test.ts @@ -0,0 +1,105 @@ +import { describe, expect, it } from 'vitest' +import { ConfigError, ValidationError } from './errors' +import { assertCrossFamily } from './judge-families' +import { isModelPriced } from './metrics' +import { type ModelSeats, resolveSeat, SeatUnsetError, seatPresets } from './model-seats' + +const seats: ModelSeats = { + worker: 'kimi-k2.6', + judges: ['kimi-k2.6', 'deepseek-v4-pro'], +} + +describe('resolveSeat', () => { + it('returns a set single-model seat', () => { + expect(resolveSeat(seats, 'worker')).toBe('kimi-k2.6') + }) + + it('returns a copy of the judges list — mutating it never edits the chart', () => { + const judges = resolveSeat(seats, 'judges') + expect(judges).toEqual(['kimi-k2.6', 'deepseek-v4-pro']) + judges.push('extra') + expect(seats.judges).toEqual(['kimi-k2.6', 'deepseek-v4-pro']) + }) + + it('throws SeatUnsetError (code config, names the seat) when unset with no fallback', () => { + expect(() => resolveSeat(seats, 'analyst')).toThrow(SeatUnsetError) + try { + resolveSeat(seats, 'analyst') + expect.unreachable('resolveSeat must throw') + } catch (err) { + expect(err).toBeInstanceOf(SeatUnsetError) + expect(err).toBeInstanceOf(ConfigError) + expect((err as SeatUnsetError).seat).toBe('analyst') + expect((err as SeatUnsetError).code).toBe('config') + expect((err as 
SeatUnsetError).message).toContain("'analyst'") + } + }) + + it('returns the explicit fallback when the seat is unset', () => { + expect(resolveSeat(seats, 'reflection', 'gpt-4.1-mini')).toBe('gpt-4.1-mini') + }) + + it('wraps a fallback for the judges seat into a one-model panel', () => { + expect(resolveSeat({}, 'judges', 'deepseek-v4-pro')).toEqual(['deepseek-v4-pro']) + }) + + it('treats a blank string and an empty judges array as unset', () => { + expect(() => resolveSeat({ worker: ' ' }, 'worker')).toThrow(SeatUnsetError) + expect(() => resolveSeat({ judges: [] }, 'judges')).toThrow(SeatUnsetError) + expect(resolveSeat({ worker: '' }, 'worker', 'kimi-k2.6')).toBe('kimi-k2.6') + }) + + it('fails loud on malformed seats — blank judge entry, wrong runtime types', () => { + expect(() => resolveSeat({ judges: ['kimi-k2.6', ' '] }, 'judges')).toThrow(ValidationError) + expect(() => resolveSeat({ judges: 'kimi-k2.6' as unknown as string[] }, 'judges')).toThrow( + ValidationError, + ) + expect(() => resolveSeat({ worker: ['kimi-k2.6'] as unknown as string }, 'worker')).toThrow( + ValidationError, + ) + }) + + it('rejects a blank fallback — it cannot stand in for a model id', () => { + expect(() => resolveSeat({}, 'worker', '')).toThrow(ValidationError) + }) +}) + +describe('seatPresets', () => { + it('economy fills every seat with the fleet-policy ids', () => { + const economy = seatPresets.economy + expect(economy.worker).toBe('kimi-k2.6') + expect(economy.judges).toEqual(['kimi-k2.6', 'deepseek-v4-pro', 'gpt-4.1-mini']) + expect(economy.analyst).toBe('gpt-4.1-mini') + expect(economy.reflection).toBe('gpt-4.1-mini') + expect(economy.verifier).toBe('deepseek-v4-pro') + }) + + it('economy judges pass assertCrossFamily as-is', () => { + const families = assertCrossFamily(resolveSeat(seatPresets.economy, 'judges')) + expect(families.length).toBeGreaterThanOrEqual(3) + }) + + it('every economy id is priced — the preset never produces a costUnknown axis', () => { + const 
economy = seatPresets.economy + const ids = [ + economy.worker, + economy.analyst, + economy.reflection, + economy.verifier, + ...(economy.judges ?? []), + ] + for (const id of ids) { + expect(id).toBeDefined() + expect(isModelPriced(id as string)).toBe(true) + } + }) + + it('frontier is deliberately empty — every seat fails loud until the caller supplies entitled ids', () => { + const seatNames = ['worker', 'judges', 'analyst', 'reflection', 'verifier'] as const + for (const seat of seatNames) { + expect(() => resolveSeat(seatPresets.frontier, seat)).toThrow(SeatUnsetError) + } + const filled = { ...seatPresets.frontier, worker: 'my-frontier-id' } + expect(resolveSeat(filled, 'worker')).toBe('my-frontier-id') + }) +}) diff --git a/src/model-seats.ts b/src/model-seats.ts new file mode 100644 index 0000000..6dab223 --- /dev/null +++ b/src/model-seats.ts @@ -0,0 +1,128 @@ +/** + * ModelSeats — the program's model seating chart. + * + * One object names which model fills each role in an eval program: the worker + * under evaluation, the judge panel, the analyst, the reflection/driver model, + * and the verifier. Re-tiering an entire program (economy ↔ frontier) is one + * swapped object instead of a hunt through call sites. + * + * Wiring points — consumers thread seats; this module implements none of them + * (those files belong to other surfaces): + * - `judges` → `ensembleJudge({ models: seats.judges, … })` (src/judge-panel.ts) + * and the `JudgeConfig`s handed to `makeEvalTools({ judges })` + * (src/eval-tools.ts). + * - `reflection` → `selfImprove({ llm: { model: seats.reflection } })` — the + * `gepaDriver` reflection model (src/contract/self-improve.ts); + * same seat for any custom `ImprovementDriver`'s LLM. + * - `worker` → the dispatch model the agent itself calls — the model an + * `AgentProfile` declares. + * - `analyst` → the LLM behind `analyzeRuns` / analyst-registry kinds. + * - `verifier` → completion-verifier / objective-checker model. 
+ * - campaign cells thread `judges` + driver models the same way; that wiring + * lands with the campaign surface, not here. + * + * `resolveSeat` is the only read path: an unset seat with no explicit fallback + * throws — a model id is a budget decision, never a silent default. + */ + +import { ConfigError, ValidationError } from './errors' + +export interface ModelSeats { + /** The model under evaluation — what the agent itself dispatches with. */ + worker?: string + /** Judge-panel model ids — thread into `ensembleJudge({ models })`. */ + judges?: string[] + /** Analyst model — `analyzeRuns` / analyst-registry LLM calls. */ + analyst?: string + /** Reflection/driver model — `gepaDriver` mutation proposals. */ + reflection?: string + /** Verifier model — completion/objective checking. */ + verifier?: string +} + +export type SeatName = keyof ModelSeats + +export type SeatPresetName = keyof typeof seatPresets + +/** + * Tier presets — plain data, swap or spread freely. + * + * `economy` uses the fleet-policy ids: every id resolves through the + * substrate's family pricing (no costUnknown axis) and the judge trio spans + * three provider families (moonshot / deepseek / openai), so it passes + * `assertCrossFamily` as-is. + * + * `frontier` is deliberately EMPTY: entitled frontier ids vary per router + * account, and a hardcoded claude/gpt-5 id 401s on keys that lack it. Supply + * your own: `{ ...seatPresets.frontier, worker: 'my-frontier-id', … }` — + * `resolveSeat` throws on every seat you haven't filled. + */ +export const seatPresets: Record<'economy' | 'frontier', ModelSeats> = { + economy: { + worker: 'kimi-k2.6', + judges: ['kimi-k2.6', 'deepseek-v4-pro', 'gpt-4.1-mini'], + analyst: 'gpt-4.1-mini', + reflection: 'gpt-4.1-mini', + verifier: 'deepseek-v4-pro', + }, + frontier: {}, +} + +/** Thrown by `resolveSeat` when a seat is unset and no fallback was given. 
*/ +export class SeatUnsetError extends ConfigError { + constructor(public readonly seat: SeatName) { + super( + `ModelSeats: seat '${seat}' is unset and no fallback was given — ` + + 'name a model explicitly (a model id is a budget decision, never a silent default)', + ) + } +} + +/** + * Read one seat. Blank strings and empty arrays count as unset (env-var + * plumbing produces them); malformed values (non-string seat, non-array or + * blank-entry `judges`) throw `ValidationError`. When the seat is unset, an + * explicit `fallback` is returned (`[fallback]` for `judges` — a one-model + * panel); without one, `SeatUnsetError`. + */ +export function resolveSeat(seats: ModelSeats, seat: 'judges', fallback?: string): string[] +export function resolveSeat( + seats: ModelSeats, + seat: Exclude, + fallback?: string, +): string +export function resolveSeat(seats: ModelSeats, seat: SeatName, fallback?: string): string | string[] +export function resolveSeat( + seats: ModelSeats, + seat: SeatName, + fallback?: string, +): string | string[] { + const value = seats[seat] + if (seat === 'judges') { + if (value !== undefined && !Array.isArray(value)) { + throw new ValidationError(`ModelSeats: seat 'judges' must be a string[], got ${typeof value}`) + } + const models = Array.isArray(value) ? 
value : [] + if (models.length > 0) { + const blank = models.findIndex((m) => typeof m !== 'string' || m.trim() === '') + if (blank >= 0) { + throw new ValidationError( + `ModelSeats: judges[${blank}] is blank — every panel model must be a non-empty id`, + ) + } + return [...models] + } + } else { + if (value !== undefined && typeof value !== 'string') { + throw new ValidationError(`ModelSeats: seat '${seat}' must be a string, got ${typeof value}`) + } + if (typeof value === 'string' && value.trim() !== '') return value + } + if (fallback !== undefined) { + if (fallback.trim() === '') { + throw new ValidationError(`ModelSeats: fallback for seat '${seat}' is blank`) + } + return seat === 'judges' ? [fallback] : fallback + } + throw new SeatUnsetError(seat) +}