Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions src/cost-report.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import { describe, expect, it } from 'vitest'
import { CostLedger } from './cost-ledger'
import { attachCostToReport, costReport } from './cost-report'
import { ValidationError } from './errors'

/**
 * Shared fixture: two `gpt-4o` entries (one per channel) plus one entry for a
 * model id that has no pricing.
 */
function buildLedger(): CostLedger {
  const fixtures: Parameters<CostLedger['record']>[0][] = [
    // gpt-4o: 0.0025 in + 0.01 out per 1k
    { model: 'gpt-4o', channel: 'agent', usage: { inputTokens: 1000, outputTokens: 1000 } },
    { model: 'gpt-4o', channel: 'judge', usage: { inputTokens: 2000, outputTokens: 0 } },
    // Unpriced model — costUnknown, the $0 is a lower bound, not a measured zero.
    { model: 'made-up-zzz', channel: 'judge', usage: { inputTokens: 1000, outputTokens: 1000 } },
  ]

  const ledger = new CostLedger()
  for (const fixture of fixtures) {
    ledger.record(fixture)
  }
  return ledger
}

// Projection tests for `costReport`: per-channel, total, and per-model views
// over the fixture ledger, plus the unpriced / override / empty edge cases.
describe('costReport', () => {
  it('projects per-channel, total, and per-model rollups from the ledger', () => {
    const report = costReport(buildLedger())

    expect(report.perChannel.map((c) => c.channel)).toEqual(['agent', 'judge'])
    const judge = report.perChannel.find((c) => c.channel === 'judge')
    expect(judge?.calls).toBe(2)
    expect(judge?.unpricedCalls).toBe(1)

    expect(report.total.usd).toBeCloseTo(0.0125 + 0.005, 6)
    expect(report.total.unknownEntries).toBe(1)

    expect(report.perModel.map((m) => m.model)).toEqual(['gpt-4o', 'made-up-zzz'])
    // `usd` is a sum of floating-point entry costs, so it is compared with a
    // tolerance (like the total above) instead of by exact equality; the
    // remaining fields are exact values and are matched structurally.
    const gpt4o = report.perModel[0]
    expect(gpt4o).toMatchObject({ model: 'gpt-4o', entries: 2, unpriced: false })
    expect(gpt4o?.usd).toBeCloseTo(0.0125 + 0.005, 6)
  })

  it('flags an unpriced model unpriced:true — its $0 is never a measured zero', () => {
    const report = costReport(buildLedger())
    const unpriced = report.perModel.find((m) => m.model === 'made-up-zzz')
    expect(unpriced).toEqual({ model: 'made-up-zzz', usd: 0, entries: 1, unpriced: true })
  })

  it('an actualCostUsd override clears unpriced — observed dollars are real', () => {
    const ledger = new CostLedger()
    ledger.record({
      model: 'made-up-zzz',
      channel: 'agent',
      usage: { inputTokens: 100, outputTokens: 100 },
      actualCostUsd: 0.42,
    })
    const report = costReport(ledger)
    // A single entry means no summation, so exact equality on `usd` is safe here.
    expect(report.perModel[0]).toEqual({
      model: 'made-up-zzz',
      usd: 0.42,
      entries: 1,
      unpriced: false,
    })
    expect(report.total.unknownEntries).toBe(0)
  })

  it('an empty ledger projects to zeros, never throws', () => {
    const report = costReport(new CostLedger())
    expect(report).toEqual({
      perChannel: [],
      total: { usd: 0, unknownEntries: 0 },
      perModel: [],
    })
  })
})

// `attachCostToReport` must add the projection under `cost`, keep the
// report's own fields, and refuse a report that already has a `cost` key.
describe('attachCostToReport', () => {
  it('stamps the projection under cost and preserves the report fields', () => {
    const { verdict, lift, cost } = attachCostToReport(
      { verdict: 'ship', lift: 0.04 },
      buildLedger(),
    )
    expect(verdict).toBe('ship')
    expect(lift).toBe(0.04)
    expect(cost.total.unknownEntries).toBe(1)
    expect(cost.perModel).toHaveLength(2)
  })

  it('refuses to overwrite an existing cost stamp', () => {
    const restamp = () => attachCostToReport({ cost: 'already-stamped' }, new CostLedger())
    expect(restamp).toThrow(ValidationError)
  })
})
79 changes: 79 additions & 0 deletions src/cost-report.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/**
* Program cost report — a thin projection over `CostLedger.summary()` that
* adds the per-model rollup the summary lacks, plus `attachCostToReport`, the
* one way every artifact (capsules, campaign results, diagnose reports) gets
* its cost stamp.
*
* Honesty contract carried through from the ledger: `total.unknownEntries`
* and `perModel[].unpriced` surface the costUnknown axis — a $0 from an
* unpriced model is a lower bound, never a measured zero.
*/

import type { ChannelRollup, CostLedger } from './cost-ledger'
import { ValidationError } from './errors'

/** Cost rollup for one model id, aggregated over the ledger entries that name it. */
export interface ModelCostRollup {
/** Model id the entries were recorded under. */
model: string
/** Sum of the entries' `costUsd`. */
usd: number
/** Number of ledger entries counted for this model. */
entries: number
/** ≥1 entry for this model was costUnknown — `usd` is a lower bound. An
* `actualCostUsd` override clears the flag for that entry (the dollars are
* observed, even when the model has no pricing). */
unpriced: boolean
}

/** Cost projection of a ledger: per-channel rollups, an overall total, and per-model rollups. */
export interface CostReport {
/** Per-channel breakdown — `CostLedgerSummary.byChannel` verbatim. */
perChannel: ChannelRollup[]
/** Ledger-wide totals. */
total: {
/** Total dollars as reported by the ledger summary (`totalCostUsd`). */
usd: number
/** Entries whose cost was unknown — non-zero means `usd` is a lower bound. */
unknownEntries: number
}
/** Per-model spend, sorted by model id. */
perModel: ModelCostRollup[]
}

/**
 * Project a ledger into the program cost report. Pure — no I/O, no clock.
 *
 * @param ledger - Source ledger; only its `summary()` and `list()` are called.
 * @returns The summary's per-channel rollups (same array, not copied), the
 *   summary total with the count of unpriced calls summed across channels,
 *   and one rollup per distinct model id. `perModel` is ordered by model id
 *   using UTF-16 code-unit comparison, so the order is identical on every
 *   host regardless of its locale settings.
 */
export function costReport(ledger: CostLedger): CostReport {
  const summary = ledger.summary()
  const perModel = new Map<string, ModelCostRollup>()
  for (const entry of ledger.list()) {
    let roll = perModel.get(entry.model)
    if (roll === undefined) {
      // First entry seen for this model: register the rollup once, then
      // accumulate into it in place.
      roll = { model: entry.model, usd: 0, entries: 0, unpriced: false }
      perModel.set(entry.model, roll)
    }
    roll.usd += entry.costUsd
    roll.entries += 1
    // One unknown-cost entry is enough to make the model's `usd` a lower bound.
    if (entry.costUnknown) roll.unpriced = true
  }
  return {
    perChannel: summary.byChannel,
    total: {
      usd: summary.totalCostUsd,
      unknownEntries: summary.byChannel.reduce((sum, c) => sum + c.unpricedCalls, 0),
    },
    // Plain `<` / `>` on strings compares code units. `localeCompare` without
    // an explicit locale follows the host's default collation, which would
    // make the order of a stamped artifact depend on where it was produced.
    perModel: [...perModel.values()].sort((a, b) =>
      a.model < b.model ? -1 : a.model > b.model ? 1 : 0,
    ),
  }
}

/**
 * Return a shallow copy of `report` with the ledger's cost projection added
 * under the `cost` key. Generic over the report shape so capsules, campaign
 * results, and diagnose reports are all stamped through this one function.
 *
 * @param report - Report-shaped object to stamp; it must not already have a
 *   `cost` key.
 * @param ledger - Ledger projected via `costReport`.
 * @returns A new object holding the report's spread fields plus `cost`.
 * @throws ValidationError when `'cost' in report` is true — silently
 *   overwriting an existing stamp would corrupt the artifact's provenance.
 */
export function attachCostToReport<R extends object>(
  report: R,
  ledger: CostLedger,
): R & { cost: CostReport } {
  const alreadyStamped = 'cost' in report
  if (!alreadyStamped) {
    return { ...report, cost: costReport(ledger) }
  }
  throw new ValidationError(
    "attachCostToReport: report already has a 'cost' key — refusing to overwrite an existing stamp",
  )
}
17 changes: 17 additions & 0 deletions src/fuzz/capsule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ export interface BuildCapsuleInput<S> {
findings: Finding<S>[]
candidateFindings: number
runsUsed: number
/** Known-dollar / unknown-run split — present only when cost tracking was
* wired; the capsule never fabricates a $0 total. */
cost?: { costUsd: number; costUnknownRuns: number }
}

export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
Expand All @@ -47,6 +50,9 @@ export function buildCapsule<S>(input: BuildCapsuleInput<S>): CapsuleData<S> {
candidateFindings: input.candidateFindings,
verifiedFindings: input.findings.length,
meanRobustness,
...(input.cost
? { costUsd: input.cost.costUsd, costUnknownRuns: input.cost.costUnknownRuns }
: {}),
},
}
}
Expand Down Expand Up @@ -172,6 +178,16 @@ export function renderCapsuleHtml<S>(
'#5ad17a',
)
: ''
// Cost KPI only when tracking was wired — an untracked run never shows $0.
// Unpriced runs are named in the label (amber): the total is a lower bound.
const cost =
s.costUsd !== undefined
? kpi(
s.costUnknownRuns ? `cost · ${s.costUnknownRuns} runs unpriced` : 'cost',
`$${s.costUsd.toFixed(2)}`,
s.costUnknownRuns ? '#e5c07b' : '#e6e6e6',
)
: ''
const stamp = opts.generatedAt ?? capsule.generatedAt ?? ''

return `<!doctype html><html><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
Expand Down Expand Up @@ -218,6 +234,7 @@ ${kpi('mean robustness', pct(s.meanRobustness), s.meanRobustness < 0.6 ? '#e58a9
${kpi('verified findings', String(s.verifiedFindings), s.verifiedFindings > 0 ? '#e58a96' : '#5ad17a')}
${kpi('cells covered', `${s.cellsCovered}/${s.cellsTotal}`)}
${kpi('scenarios run', String(s.totalRuns))}
${cost}
${lift}
</div>
<h2>Coverage map</h2>
Expand Down
155 changes: 155 additions & 0 deletions src/fuzz/explorer-cost.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import { describe, expect, it } from 'vitest'
import { CostLedger } from '../cost-ledger'
import { costReport } from '../cost-report'
import { ValidationError } from '../errors'
import { renderCapsuleHtml } from './capsule'
import { BehaviorExplorer } from './explorer'
import type { BehaviorSpace, ExploreOptions } from './types'

// One axis with a single value, so the behavior space has exactly one cell.
const space: BehaviorSpace = { axes: [{ name: 'difficulty', values: ['easy'] }] }

/**
 * Build explorer options for these tests: fixed defaults with `overrides`
 * spread on top.
 *
 * Single-cell space + uniform allocation + concurrency 1 → the evaluation
 * (and costOf) order is fully deterministic.
 */
function makeOpts(overrides: Partial<ExploreOptions<string>>): ExploreOptions<string> {
  // Counter shared by every proposer call made through this options object,
  // so proposed scenario ids keep increasing across batches.
  let nextId = 0
  const defaults: ExploreOptions<string> = {
    target: 'cost-target',
    space,
    proposer: ({ count }) => Array.from({ length: count }, () => `p-${nextId++}`),
    evaluate: async () => ({ valid: true, score: 0.9 }),
    seedsFor: () => ['seed-0'],
    scenarioId: (scenario) => scenario,
    allocation: 'uniform',
    budget: 50,
    seed: 3,
  }
  return { ...defaults, ...overrides }
}

// Exercises the explorer's cost options (costOf, costBudgetUsd, onCost,
// ledger) through the capsule stats, callbacks, and ledger entries they produce.
describe('BehaviorExplorer cost budget', () => {
it('stops the loop when accumulated known cost reaches costBudgetUsd', async () => {
// Every run reports $1, so the $3 budget is expected to be reached after three runs.
const explorer = new BehaviorExplorer(
makeOpts({ costOf: () => ({ usd: 1 }), costBudgetUsd: 3 }),
)
const capsule = await explorer.run()
expect(capsule.stats.totalRuns).toBe(3)
expect(capsule.stats.costUsd).toBe(3)
expect(capsule.stats.costUnknownRuns).toBe(0)
})

it('a zero budget stops before any evaluation — same >= semantics as control-runtime', async () => {
const explorer = new BehaviorExplorer(
makeOpts({ costOf: () => ({ usd: 1 }), costBudgetUsd: 0 }),
)
const capsule = await explorer.run()
// No run is expected at all, and the cost stat is still present as 0.
expect(capsule.stats.totalRuns).toBe(0)
expect(capsule.stats.costUsd).toBe(0)
})

it('counts unknown-cost runs separately — never as $0, never against the budget', async () => {
let call = 0
// Collects every onCost notification in the order it was delivered.
const onCost: Array<{ usd: number; channel: string }> = []
const explorer = new BehaviorExplorer(
makeOpts({
budget: 4,
// Runs 1 and 3 cost $1; runs 2 and 4 have unknown cost.
costOf: () => (call++ % 2 === 0 ? { usd: 1 } : null),
costBudgetUsd: 10,
onCost: (e) => onCost.push(e),
}),
)
const capsule = await explorer.run()
expect(capsule.stats.totalRuns).toBe(4)
expect(capsule.stats.costUsd).toBe(2)
expect(capsule.stats.costUnknownRuns).toBe(2)
// Only the two known-cost runs are expected to produce an onCost event.
expect(onCost).toEqual([
{ usd: 1, channel: 'agent' },
{ usd: 1, channel: 'agent' },
])
})

it('records known costs into the supplied ledger with channel agent + actualCostUsd', async () => {
const ledger = new CostLedger()
const explorer = new BehaviorExplorer(
makeOpts({ budget: 2, costOf: () => ({ usd: 0.5, model: 'gpt-4o' }), ledger }),
)
await explorer.run()
const entries = ledger.list()
// One ledger entry per run (budget: 2), each carrying the reported dollars.
expect(entries).toHaveLength(2)
for (const entry of entries) {
expect(entry.channel).toBe('agent')
expect(entry.actualCostUsd).toBe(0.5)
expect(entry.costUnknown).toBe(false)
expect(entry.model).toBe('gpt-4o')
expect(entry.tags?.target).toBe('cost-target')
}
const report = costReport(ledger)
expect(report.perModel).toEqual([{ model: 'gpt-4o', usd: 1, entries: 2, unpriced: false }])
})

it('labels ledger entries unattributed when costOf names no model', async () => {
const ledger = new CostLedger()
const explorer = new BehaviorExplorer(
makeOpts({ budget: 1, costOf: () => ({ usd: 0.25 }), ledger }),
)
await explorer.run()
expect(ledger.list().map((e) => e.model)).toEqual(['unattributed'])
})

it('rejects negative, NaN, and infinite budgets loudly', () => {
const costOf = () => ({ usd: 1 })
// Each invalid budget is expected to be rejected at construction time.
for (const bad of [-1, Number.NaN, Number.POSITIVE_INFINITY]) {
expect(() => new BehaviorExplorer(makeOpts({ costOf, costBudgetUsd: bad }))).toThrow(
/costBudgetUsd must be a nonnegative finite number/,
)
}
})

it('rejects cost options without costOf — the explorer cannot know run cost', () => {
// Each cost option on its own, with no costOf, is expected to fail construction.
expect(() => new BehaviorExplorer(makeOpts({ costBudgetUsd: 3 }))).toThrow(ValidationError)
expect(() => new BehaviorExplorer(makeOpts({ ledger: new CostLedger() }))).toThrow(
ValidationError,
)
expect(() => new BehaviorExplorer(makeOpts({ onCost: () => {} }))).toThrow(ValidationError)
})

it('rejects a fabricated costOf number loudly — null is the only unknown', async () => {
// Construction happens outside the expectation: the failure is expected
// from run(), once costOf has actually returned the NaN.
const explorer = new BehaviorExplorer(
makeOpts({ budget: 1, costOf: () => ({ usd: Number.NaN }) }),
)
await expect(explorer.run()).rejects.toThrow(/costOf returned an invalid usd/)
})

it('omits cost stats entirely when cost tracking is not wired — absent, never $0', async () => {
const capsule = await new BehaviorExplorer(makeOpts({ budget: 2 })).run()
expect(capsule.stats.costUsd).toBeUndefined()
expect(capsule.stats.costUnknownRuns).toBeUndefined()
})
})

// Checks the cost KPI in the rendered capsule HTML: shown with the dollar
// total when cost was tracked, labelled with the unpriced-run count when some
// runs had unknown cost, and absent when cost was not tracked.
describe('renderCapsuleHtml cost KPI', () => {
  it('shows the known-dollar KPI when cost tracking was wired', async () => {
    const capsule = await new BehaviorExplorer(
      makeOpts({ costOf: () => ({ usd: 1 }), costBudgetUsd: 3 }),
    ).run()
    const html = renderCapsuleHtml(capsule)
    expect(html).toContain('$3.00')
    expect(html).not.toContain('runs unpriced')
  })

  it('names the unpriced-run count next to the total — the dollar figure is a lower bound', async () => {
    // Even-numbered calls (0, 2, …) report $1; odd-numbered calls report unknown.
    let runIndex = 0
    const alternatingCost = () => {
      const known = runIndex % 2 === 0
      runIndex += 1
      return known ? { usd: 1 } : null
    }
    const capsule = await new BehaviorExplorer(
      makeOpts({ budget: 4, costOf: alternatingCost }),
    ).run()
    const html = renderCapsuleHtml(capsule)
    expect(html).toContain('$2.00')
    expect(html).toContain('2 runs unpriced')
  })

  it('shows no dollar KPI at all when cost was not tracked', async () => {
    const capsule = await new BehaviorExplorer(makeOpts({ budget: 2 })).run()
    const html = renderCapsuleHtml(capsule)
    // No "$" followed by a digit anywhere in the page.
    expect(html).not.toMatch(/\$\d/)
    expect(html).not.toContain('runs unpriced')
  })
})
Loading
Loading