diff --git a/bench/src/ablation-grid.mts b/bench/src/ablation-grid.mts index 5e40a27..a46c5fd 100644 --- a/bench/src/ablation-grid.mts +++ b/bench/src/ablation-grid.mts @@ -162,7 +162,14 @@ async function main(): Promise { return { environment: surface, tasks: loadSlice } })() - const worker = { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 4), temperature: 0.7 } + const worker = { + routerBaseUrl, + routerKey, + model: workerModel, + innerTurns: Number(process.env.INNER_TURNS ?? 4), + temperature: 0.7, + ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}), + } const gammaPrompt = cells.some((c) => c.gamma) ? readFileSync(must('PROMPT_ARTIFACT'), 'utf8').trim() : undefined // The prompt each cell carries: original | γ artifact, then κ on top of whichever. diff --git a/bench/src/flywheel-evolve.mts b/bench/src/flywheel-evolve.mts index 1c08173..9b4087c 100644 --- a/bench/src/flywheel-evolve.mts +++ b/bench/src/flywheel-evolve.mts @@ -71,6 +71,7 @@ async function main(): Promise { model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 4), temperature: 0.7, + ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}), }, author: { chat: createChatClient({ transport: 'router', apiKey: routerKey, baseUrl: routerBaseUrl, defaultModel: authorModel }), diff --git a/bench/src/steering-modes.mts b/bench/src/steering-modes.mts index 498d1ff..38fb707 100644 --- a/bench/src/steering-modes.mts +++ b/bench/src/steering-modes.mts @@ -179,7 +179,7 @@ improve the outcome, whether a FRESH restart is more promising, or whether furth is waste. Respond with EXACTLY one line in this format and nothing else: VERDICT: CONTINUE|RESTART|STOP confidence=<0.0-1.0> reason=` -const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget, shot, critique }) => { +const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget, shot, consult }) => { let handle = await surface.open(task) const progression: number[] = [] let messages: Msg[] | undefined @@ -194,13 +194,16 @@ const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget progression.push(out.score) if (out.score >= 1) break if (shots === budget - 1) break - const verdictText = await critique(out.messages) + // The RAW channel — the findings protocol strips verdict formats; consult() does not. + const verdictText = await consult(out.messages, beliefInstruction) completions += 1 - if (!verdictText) break // analyst says complete + if (!verdictText) break // analyst went down const m = verdictText.match(/VERDICT:\s*(CONTINUE|RESTART|STOP)/i) const decision = m?.[1]?.toUpperCase() if (!decision) parseFailures += 1 - if (decision === 'STOP') break + // Calibration floor: an uncertain STOP is not obeyed — spend the budget. + const conf = Number.parseFloat(verdictText.match(/confidence=([\d.]+)/i)?.[1] ?? '1') + if (decision === 'STOP' && conf >= 0.7) break if (decision === 'RESTART') { await surface.close(handle) handle = await surface.open(task) @@ -234,14 +237,17 @@ async function main(): Promise { { name: 'refine', strategy: refine }, { name: 'structural', strategy: structural }, { name: 'contrastive', strategy: contrastive }, - { name: 'belief', strategy: belief, analystInstruction: beliefInstruction }, + { name: 'belief', strategy: belief }, ] + const armFilter = process.env.ARMS?.split(',').map((a) => a.trim()) + const selected = armFilter ? arms.filter((a) => armFilter.includes(a.name)) : arms + if (selected.length < 2) throw new Error(`ARMS must include refine + at least one mode (got: ${process.env.ARMS})`) console.error( - `=== STEERING MODES · ${arms.map((a) => a.name).join(' vs ')} · AIME[${offset},${offset + n}) · budget=${budget} · worker=${workerModel} ===`, + `=== STEERING MODES · ${selected.map((a) => a.name).join(' vs ')} · AIME[${offset},${offset + n}) · budget=${budget} · worker=${workerModel} ===`, ) const reports: Record = {} - for (const arm of arms) { + for (const arm of selected) { const waterfall = createWaterfallCollector() const report = await runBenchmark({ environment, @@ -252,7 +258,7 @@ async function main(): Promise { model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 2), temperature: 0.7, - ...(arm.analystInstruction ? { analystInstruction: arm.analystInstruction } : {}), + ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}), }, strategies: [arm.strategy], budget, @@ -282,7 +288,7 @@ async function main(): Promise { const verdicts: Record = {} const incumbentReport = reports.refine if (!incumbentReport) throw new Error('refine arm missing') - for (const arm of arms) { + for (const arm of selected) { if (arm.name === 'refine') continue const candidate = reports[arm.name] if (!candidate) continue @@ -321,7 +327,7 @@ async function main(): Promise { models: { worker: workerModel, analyst: workerModel }, domain: `aime[${offset},${offset + n})`, budget, - arms: arms.map((a) => a.name), + arms: selected.map((a) => a.name), reports, verdicts, }, diff --git a/src/runtime/strategy-author.ts b/src/runtime/strategy-author.ts index 54f277b..71e8a41 100644 --- a/src/runtime/strategy-author.ts +++ b/src/runtime/strategy-author.ts @@ -43,6 +43,11 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective instruction (or null when it judges the work complete). Costs ~1 completion. + consult(messages, instruction): Promise + The RAW analyst channel: the same firewalled critic answers YOUR instruction over the + trajectory verbatim (no reformatting) — use it when you need a specific reply format + (a decision, a prediction). Costs ~1 completion. + surface.open(task) / surface.close(handle) Open a persistent artifact you manage yourself (remember to close in a finally). close is idempotent — closing an already-closed handle is a safe no-op. diff --git a/src/runtime/strategy.ts b/src/runtime/strategy.ts index 752498e..026fa59 100644 --- a/src/runtime/strategy.ts +++ b/src/runtime/strategy.ts @@ -85,6 +85,9 @@ export interface AgenticOptions { routerKey: string model: string temperature?: number + /** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded + * budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */ + maxTokens?: number /** Turns the agent may take within ONE shot before the driver intervenes. */ innerTurns?: number /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a @@ -116,6 +119,9 @@ interface ShotTask { steer?: string // analyst-derived steer injected before this shot (depth) persona?: ShotPersona // role override — multi-agent loops give each shot its own hat tools?: string[] // restrict THIS shot to these domain tools (names); unknown names throw + /** analyst leaf only: a RAW instruction — the analyst answers it over the trajectory + * directly (no findings schema). The verdict-capable channel. */ + rawInstruction?: string } interface ShotOut { @@ -158,6 +164,7 @@ async function runShot( tools, tool_choice: 'auto', temperature: opts.temperature ?? 0.7, + ...(opts.maxTokens ? { max_tokens: opts.maxTokens } : {}), }), }) if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`) @@ -211,12 +218,10 @@ interface AnalyzeOut { tokens: { input: number; output: number } } -async function analyze( - task: AgenticTask, - messages: Msg[], - opts: AgenticOptions, -): Promise { - const trajectory = messages +/** The firewall's input shape: the trajectory as compacted text — calls, results, + * assistant text. NEVER scores, NEVER check internals. Shared by both analyst channels. */ +function compactTrajectory(messages: Msg[]): string { + return messages .filter((m) => m.role === 'assistant' || m.role === 'tool') .map((m) => { if (m.role === 'tool') return `RESULT ${String(m.content).slice(0, 280)}` @@ -227,6 +232,64 @@ async function analyze( }) .join('\n') .slice(0, 7000) +} + +/** The RAW analyst channel: the firewalled critic answers `instruction` over the + * trajectory directly — no findings schema, no recommended-action extraction. The + * channel for verdict-shaped steering (budget controllers, calibrated predictions) + * whose output format the findings protocol would strip. Same firewall as analyze(): + * trajectory in, never scores. */ +async function consultAnalyst( + task: AgenticTask, + messages: Msg[], + instruction: string, + opts: AgenticOptions, +): Promise { + const trajectory = compactTrajectory(messages) + const analystModel = opts.analystModel ?? opts.model + const chat = createChatClient({ + transport: 'router', + apiKey: opts.routerKey, + baseUrl: opts.routerBaseUrl, + defaultModel: analystModel, + }) + const res = await chat.chat({ + model: analystModel, + temperature: 0.2, + maxTokens: 1024, + messages: [ + { role: 'system', content: instruction }, + { + role: 'user', + content: `TASK: ${task.userPrompt.slice(0, 1500)}\n\nTRAJECTORY:\n${trajectory}`, + }, + ], + }) + const usage = ( + res as { + usage?: { + promptTokens?: number + prompt_tokens?: number + completionTokens?: number + completion_tokens?: number + } + } + ).usage + return { + steer: res.content.trim(), + tokens: { + input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0, + output: usage?.completionTokens ?? usage?.completion_tokens ?? 0, + }, + } +} + +async function analyze( + task: AgenticTask, + messages: Msg[], + opts: AgenticOptions, +): Promise { + const trajectory = compactTrajectory(messages) const analystModel = opts.analystModel ?? opts.model const inner = createChatClient({ transport: 'router', @@ -381,8 +444,10 @@ function analystExecutor(opts: AgenticOptions): Executor { return { runtime: 'agentic-analyst', async execute(task: unknown): Promise> { - const t = task as { task: AgenticTask; messages: Msg[] } - const { steer, tokens } = await analyze(t.task, t.messages, opts) + const t = task as { task: AgenticTask; messages: Msg[]; rawInstruction?: string } + const { steer, tokens } = t.rawInstruction + ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts) + : await analyze(t.task, t.messages, opts) const analystModel = opts.analystModel ?? opts.model artifact = { outRef: `analyst:${steer.length}`, @@ -664,6 +729,11 @@ export interface StrategyCtx { shot(spec?: ShotSpec): Promise /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */ critique(messages: Msg[]): Promise + /** The RAW analyst channel: the firewalled critic answers `instruction` over the + * trajectory verbatim — no findings extraction, so verdict-shaped formats + * (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall: + * trajectory in, never scores. Null when the analyst went down. */ + consult(messages: Msg[], instruction: string): Promise /** The tools THIS artifact's task actually offers (names + descriptions only — never * the implementations). Tool sets vary per task on heterogeneous domains; a strategy * that restricts shots MUST select from this list, never from hardcoded names. */ @@ -756,6 +826,19 @@ export function defineStrategy( const findings = settled.out as unknown as string return /^\s*COMPLETE\b/i.test(findings) ? null : findings }, + async consult(messages, instruction) { + const child = leaf(`analyst:${seq}`, 'analyst') + seq += 1 + const res = scope.spawn( + child, + { task, messages, rawInstruction: instruction }, + { budget: perChild(1), label: child.name }, + ) + if (!res.ok) return null + const settled = await drainOne(scope) + if (settled.kind === 'down') return null + return settled.out as unknown as string + }, } const r = await run(ctx) // Override the body's self-reported score/resolved with the harness-verified diff --git a/tests/loops/strategy-suite.test.ts b/tests/loops/strategy-suite.test.ts index fbc10e5..df24060 100644 --- a/tests/loops/strategy-suite.test.ts +++ b/tests/loops/strategy-suite.test.ts @@ -76,12 +76,18 @@ function stubRouter(): CapturedChatRequest[] { 'fetch', vi.fn(async (_url: string, init?: { body?: string }) => { captured.push(JSON.parse(init?.body ?? '{}') as CapturedChatRequest) + const body = { + choices: [{ message: { content: 'DONE' } }], + usage: { prompt_tokens: 10, completion_tokens: 5 }, + } + // Both response-reading styles: runShot uses json(); agent-eval's llm-client + // reads text() — a stub missing either silently downs the analyst leaf. return { ok: true, - json: async () => ({ - choices: [{ message: { content: 'DONE' } }], - usage: { prompt_tokens: 10, completion_tokens: 5 }, - }), + status: 200, + headers: { get: () => 'application/json' }, + json: async () => body, + text: async () => JSON.stringify(body), } }), ) @@ -631,3 +637,26 @@ describe('promotionGate non-inferiority', () => { expect(a.promoted).toBe(true) }) }) + +// ── The raw analyst channel (verdict-shaped steering survives) ───────────────────── + +describe('consult', () => { + it('the instruction reaches the analyst verbatim and the raw reply returns intact', async () => { + const captured = stubRouter() + const surface = fixtureSurface(() => ({ passes: 0, total: 1 })) + let reply: string | null = null + const controller = defineStrategy('controller', async ({ shot, consult }) => { + const out = await shot() + if (out) + reply = await consult(out.messages, 'Reply with EXACTLY: VERDICT: STOP confidence=0.9') + return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 } + }) + await runAgentic({ surface, task, ...worker, strategy: controller, budget: 2 }) + // The consult call is the SECOND router request; its system prompt is the raw instruction. + const consultReq = captured[1] as { messages?: Array<{ role: string; content: string }> } + expect(consultReq?.messages?.[0]?.role).toBe('system') + expect(consultReq?.messages?.[0]?.content).toContain('VERDICT: STOP') + // The stubbed model replies 'DONE'; consult returns it verbatim (no findings filter). + expect(reply).toBe('DONE') + }) +})