Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion bench/src/ablation-grid.mts
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,14 @@ async function main(): Promise<void> {
return { environment: surface, tasks: loadSlice }
})()

const worker = { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 4), temperature: 0.7 }
const worker = {
routerBaseUrl,
routerKey,
model: workerModel,
innerTurns: Number(process.env.INNER_TURNS ?? 4),
temperature: 0.7,
...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
}
const gammaPrompt = cells.some((c) => c.gamma) ? readFileSync(must('PROMPT_ARTIFACT'), 'utf8').trim() : undefined

// The prompt each cell carries: original | γ artifact, then κ on top of whichever.
Expand Down
1 change: 1 addition & 0 deletions bench/src/flywheel-evolve.mts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ async function main(): Promise<void> {
model: workerModel,
innerTurns: Number(process.env.INNER_TURNS ?? 4),
temperature: 0.7,
...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
},
author: {
chat: createChatClient({ transport: 'router', apiKey: routerKey, baseUrl: routerBaseUrl, defaultModel: authorModel }),
Expand Down
26 changes: 16 additions & 10 deletions bench/src/steering-modes.mts
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ improve the outcome, whether a FRESH restart is more promising, or whether furth
is waste. Respond with EXACTLY one line in this format and nothing else:
VERDICT: CONTINUE|RESTART|STOP confidence=<0.0-1.0> reason=<one short clause>`

const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget, shot, critique }) => {
const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget, shot, consult }) => {
let handle = await surface.open(task)
const progression: number[] = []
let messages: Msg[] | undefined
Expand All @@ -194,13 +194,16 @@ const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget
progression.push(out.score)
if (out.score >= 1) break
if (shots === budget - 1) break
const verdictText = await critique(out.messages)
// The RAW channel — the findings protocol strips verdict formats; consult() does not.
const verdictText = await consult(out.messages, beliefInstruction)
completions += 1
if (!verdictText) break // analyst says complete
if (!verdictText) break // analyst went down
const m = verdictText.match(/VERDICT:\s*(CONTINUE|RESTART|STOP)/i)
const decision = m?.[1]?.toUpperCase()
if (!decision) parseFailures += 1
if (decision === 'STOP') break
// Calibration floor: an uncertain STOP is not obeyed — spend the budget.
const conf = Number.parseFloat(verdictText.match(/confidence=([\d.]+)/i)?.[1] ?? '1')
if (decision === 'STOP' && conf >= 0.7) break
if (decision === 'RESTART') {
await surface.close(handle)
handle = await surface.open(task)
Expand Down Expand Up @@ -234,14 +237,17 @@ async function main(): Promise<void> {
{ name: 'refine', strategy: refine },
{ name: 'structural', strategy: structural },
{ name: 'contrastive', strategy: contrastive },
{ name: 'belief', strategy: belief, analystInstruction: beliefInstruction },
{ name: 'belief', strategy: belief },
]
const armFilter = process.env.ARMS?.split(',').map((a) => a.trim())
const selected = armFilter ? arms.filter((a) => armFilter.includes(a.name)) : arms
if (selected.length < 2) throw new Error(`ARMS must include refine + at least one mode (got: ${process.env.ARMS})`)

console.error(
`=== STEERING MODES · ${arms.map((a) => a.name).join(' vs ')} · AIME[${offset},${offset + n}) · budget=${budget} · worker=${workerModel} ===`,
`=== STEERING MODES · ${selected.map((a) => a.name).join(' vs ')} · AIME[${offset},${offset + n}) · budget=${budget} · worker=${workerModel} ===`,
)
const reports: Record<string, BenchmarkReport> = {}
for (const arm of arms) {
for (const arm of selected) {
const waterfall = createWaterfallCollector()
const report = await runBenchmark({
environment,
Expand All @@ -252,7 +258,7 @@ async function main(): Promise<void> {
model: workerModel,
innerTurns: Number(process.env.INNER_TURNS ?? 2),
temperature: 0.7,
...(arm.analystInstruction ? { analystInstruction: arm.analystInstruction } : {}),
...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
},
strategies: [arm.strategy],
budget,
Expand Down Expand Up @@ -282,7 +288,7 @@ async function main(): Promise<void> {
const verdicts: Record<string, PromotionVerdict> = {}
const incumbentReport = reports.refine
if (!incumbentReport) throw new Error('refine arm missing')
for (const arm of arms) {
for (const arm of selected) {
if (arm.name === 'refine') continue
const candidate = reports[arm.name]
if (!candidate) continue
Expand Down Expand Up @@ -321,7 +327,7 @@ async function main(): Promise<void> {
models: { worker: workerModel, analyst: workerModel },
domain: `aime[${offset},${offset + n})`,
budget,
arms: arms.map((a) => a.name),
arms: selected.map((a) => a.name),
reports,
verdicts,
},
Expand Down
5 changes: 5 additions & 0 deletions src/runtime/strategy-author.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
instruction (or null when it judges the work complete). Costs ~1 completion.

consult(messages, instruction): Promise<string | null>
The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
trajectory verbatim (no reformatting) — use it when you need a specific reply format
(a decision, a prediction). Costs ~1 completion.

surface.open(task) / surface.close(handle)
Open a persistent artifact you manage yourself (remember to close in a finally).
close is idempotent — closing an already-closed handle is a safe no-op.
Expand Down
99 changes: 91 additions & 8 deletions src/runtime/strategy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ export interface AgenticOptions {
routerKey: string
model: string
temperature?: number
/** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
* budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
maxTokens?: number
/** Turns the agent may take within ONE shot before the driver intervenes. */
innerTurns?: number
/** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
Expand Down Expand Up @@ -116,6 +119,9 @@ interface ShotTask {
steer?: string // analyst-derived steer injected before this shot (depth)
persona?: ShotPersona // role override — multi-agent loops give each shot its own hat
tools?: string[] // restrict THIS shot to these domain tools (names); unknown names throw
/** analyst leaf only: a RAW instruction — the analyst answers it over the trajectory
* directly (no findings schema). The verdict-capable channel. */
rawInstruction?: string
}

interface ShotOut {
Expand Down Expand Up @@ -158,6 +164,7 @@ async function runShot(
tools,
tool_choice: 'auto',
temperature: opts.temperature ?? 0.7,
...(opts.maxTokens ? { max_tokens: opts.maxTokens } : {}),
}),
})
if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`)
Expand Down Expand Up @@ -211,12 +218,10 @@ interface AnalyzeOut {
tokens: { input: number; output: number }
}

async function analyze(
task: AgenticTask,
messages: Msg[],
opts: AgenticOptions,
): Promise<AnalyzeOut> {
const trajectory = messages
/** The firewall's input shape: the trajectory as compacted text — calls, results,
* assistant text. NEVER scores, NEVER check internals. Shared by both analyst channels. */
function compactTrajectory(messages: Msg[]): string {
return messages
.filter((m) => m.role === 'assistant' || m.role === 'tool')
.map((m) => {
if (m.role === 'tool') return `RESULT ${String(m.content).slice(0, 280)}`
Expand All @@ -227,6 +232,64 @@ async function analyze(
})
.join('\n')
.slice(0, 7000)
}

/** The RAW analyst channel: the firewalled critic answers `instruction` over the
* trajectory directly — no findings schema, no recommended-action extraction. The
* channel for verdict-shaped steering (budget controllers, calibrated predictions)
* whose output format the findings protocol would strip. Same firewall as analyze():
* trajectory in, never scores. */
async function consultAnalyst(
task: AgenticTask,
messages: Msg[],
instruction: string,
opts: AgenticOptions,
): Promise<AnalyzeOut> {
const trajectory = compactTrajectory(messages)
const analystModel = opts.analystModel ?? opts.model
const chat = createChatClient({
transport: 'router',
apiKey: opts.routerKey,
baseUrl: opts.routerBaseUrl,
defaultModel: analystModel,
})
const res = await chat.chat({
model: analystModel,
temperature: 0.2,
maxTokens: 1024,
messages: [
{ role: 'system', content: instruction },
{
role: 'user',
content: `TASK: ${task.userPrompt.slice(0, 1500)}\n\nTRAJECTORY:\n${trajectory}`,
},
],
})
const usage = (
res as {
usage?: {
promptTokens?: number
prompt_tokens?: number
completionTokens?: number
completion_tokens?: number
}
}
).usage
return {
steer: res.content.trim(),
tokens: {
input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
output: usage?.completionTokens ?? usage?.completion_tokens ?? 0,
},
}
}

async function analyze(
task: AgenticTask,
messages: Msg[],
opts: AgenticOptions,
): Promise<AnalyzeOut> {
const trajectory = compactTrajectory(messages)
const analystModel = opts.analystModel ?? opts.model
const inner = createChatClient({
transport: 'router',
Expand Down Expand Up @@ -381,8 +444,10 @@ function analystExecutor(opts: AgenticOptions): Executor<unknown> {
return {
runtime: 'agentic-analyst',
async execute(task: unknown): Promise<ExecutorResult<unknown>> {
const t = task as { task: AgenticTask; messages: Msg[] }
const { steer, tokens } = await analyze(t.task, t.messages, opts)
const t = task as { task: AgenticTask; messages: Msg[]; rawInstruction?: string }
const { steer, tokens } = t.rawInstruction
? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts)
: await analyze(t.task, t.messages, opts)
const analystModel = opts.analystModel ?? opts.model
artifact = {
outRef: `analyst:${steer.length}`,
Expand Down Expand Up @@ -664,6 +729,11 @@ export interface StrategyCtx {
shot(spec?: ShotSpec): Promise<ShotResult | null>
/** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
critique(messages: Msg[]): Promise<string | null>
/** The RAW analyst channel: the firewalled critic answers `instruction` over the
* trajectory verbatim — no findings extraction, so verdict-shaped formats
* (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
* trajectory in, never scores. Null when the analyst went down. */
consult(messages: Msg[], instruction: string): Promise<string | null>
/** The tools THIS artifact's task actually offers (names + descriptions only — never
* the implementations). Tool sets vary per task on heterogeneous domains; a strategy
* that restricts shots MUST select from this list, never from hardcoded names. */
Expand Down Expand Up @@ -756,6 +826,19 @@ export function defineStrategy(
const findings = settled.out as unknown as string
return /^\s*COMPLETE\b/i.test(findings) ? null : findings
},
async consult(messages, instruction) {
const child = leaf(`analyst:${seq}`, 'analyst')
seq += 1
const res = scope.spawn(
child,
{ task, messages, rawInstruction: instruction },
{ budget: perChild(1), label: child.name },
)
if (!res.ok) return null
const settled = await drainOne(scope)
if (settled.kind === 'down') return null
return settled.out as unknown as string
},
}
const r = await run(ctx)
// Override the body's self-reported score/resolved with the harness-verified
Expand Down
37 changes: 33 additions & 4 deletions tests/loops/strategy-suite.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,18 @@ function stubRouter(): CapturedChatRequest[] {
'fetch',
vi.fn(async (_url: string, init?: { body?: string }) => {
captured.push(JSON.parse(init?.body ?? '{}') as CapturedChatRequest)
const body = {
choices: [{ message: { content: 'DONE' } }],
usage: { prompt_tokens: 10, completion_tokens: 5 },
}
// Both response-reading styles: runShot uses json(); agent-eval's llm-client
// reads text() — a stub missing either silently downs the analyst leaf.
return {
ok: true,
json: async () => ({
choices: [{ message: { content: 'DONE' } }],
usage: { prompt_tokens: 10, completion_tokens: 5 },
}),
status: 200,
headers: { get: () => 'application/json' },
json: async () => body,
text: async () => JSON.stringify(body),
}
}),
)
Expand Down Expand Up @@ -631,3 +637,26 @@ describe('promotionGate non-inferiority', () => {
expect(a.promoted).toBe(true)
})
})

// ── The raw analyst channel (verdict-shaped steering survives) ─────────────────────

describe('consult', () => {
it('the instruction reaches the analyst verbatim and the raw reply returns intact', async () => {
const captured = stubRouter()
const surface = fixtureSurface(() => ({ passes: 0, total: 1 }))
let reply: string | null = null
const controller = defineStrategy('controller', async ({ shot, consult }) => {
const out = await shot()
if (out)
reply = await consult(out.messages, 'Reply with EXACTLY: VERDICT: STOP confidence=0.9')
return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 }
})
await runAgentic({ surface, task, ...worker, strategy: controller, budget: 2 })
// The consult call is the SECOND router request; its system prompt is the raw instruction.
const consultReq = captured[1] as { messages?: Array<{ role: string; content: string }> }
expect(consultReq?.messages?.[0]?.role).toBe('system')
expect(consultReq?.messages?.[0]?.content).toContain('VERDICT: STOP')
// The stubbed model replies 'DONE'; consult returns it verbatim (no findings filter).
expect(reply).toBe('DONE')
})
})
Loading