tangle-network · drewstone · Jun 11, 2026 · Jun 11, 2026
diff --git a/bench/src/ablation-grid.mts b/bench/src/ablation-grid.mts
@@ -162,7 +162,14 @@ async function main(): Promise<void> {
           return { environment: surface, tasks: loadSlice }
         })()
 
-  const worker = { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 4), temperature: 0.7 }
+  const worker = {
+    routerBaseUrl,
+    routerKey,
+    model: workerModel,
+    innerTurns: Number(process.env.INNER_TURNS ?? 4),
+    temperature: 0.7,
+    ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
+  }
   const gammaPrompt = cells.some((c) => c.gamma) ? readFileSync(must('PROMPT_ARTIFACT'), 'utf8').trim() : undefined
 
   // The prompt each cell carries: original | γ artifact, then κ on top of whichever.

diff --git a/bench/src/flywheel-evolve.mts b/bench/src/flywheel-evolve.mts
@@ -71,6 +71,7 @@ async function main(): Promise<void> {
       model: workerModel,
       innerTurns: Number(process.env.INNER_TURNS ?? 4),
       temperature: 0.7,
+      ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
     },
     author: {
       chat: createChatClient({ transport: 'router', apiKey: routerKey, baseUrl: routerBaseUrl, defaultModel: authorModel }),

diff --git a/bench/src/steering-modes.mts b/bench/src/steering-modes.mts
@@ -179,7 +179,7 @@ improve the outcome, whether a FRESH restart is more promising, or whether furth
 is waste. Respond with EXACTLY one line in this format and nothing else:
 VERDICT: CONTINUE|RESTART|STOP confidence=<0.0-1.0> reason=<one short clause>`
 
-const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget, shot, critique }) => {
+const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget, shot, consult }) => {
   let handle = await surface.open(task)
   const progression: number[] = []
   let messages: Msg[] | undefined
@@ -194,13 +194,16 @@ const belief: Strategy = defineStrategy('belief', async ({ surface, task, budget
       progression.push(out.score)
       if (out.score >= 1) break
       if (shots === budget - 1) break
-      const verdictText = await critique(out.messages)
+      // The RAW channel — the findings protocol strips verdict formats; consult() does not.
+      const verdictText = await consult(out.messages, beliefInstruction)
       completions += 1
-      if (!verdictText) break // analyst says complete
+      if (!verdictText) break // analyst went down
       const m = verdictText.match(/VERDICT:\s*(CONTINUE|RESTART|STOP)/i)
       const decision = m?.[1]?.toUpperCase()
       if (!decision) parseFailures += 1
-      if (decision === 'STOP') break
+      // Calibration floor: an uncertain STOP is not obeyed — spend the budget.
+      const conf = Number.parseFloat(verdictText.match(/confidence=([\d.]+)/i)?.[1] ?? '1')
+      if (decision === 'STOP' && conf >= 0.7) break
       if (decision === 'RESTART') {
         await surface.close(handle)
         handle = await surface.open(task)
@@ -234,14 +237,17 @@ async function main(): Promise<void> {
     { name: 'refine', strategy: refine },
     { name: 'structural', strategy: structural },
     { name: 'contrastive', strategy: contrastive },
-    { name: 'belief', strategy: belief, analystInstruction: beliefInstruction },
+    { name: 'belief', strategy: belief },
   ]
+  const armFilter = process.env.ARMS?.split(',').map((a) => a.trim())
+  const selected = armFilter ? arms.filter((a) => armFilter.includes(a.name)) : arms
+  if (selected.length < 2) throw new Error(`ARMS must include refine + at least one mode (got: ${process.env.ARMS})`)
 
   console.error(
-    `=== STEERING MODES · ${arms.map((a) => a.name).join(' vs ')} · AIME[${offset},${offset + n}) · budget=${budget} · worker=${workerModel} ===`,
+    `=== STEERING MODES · ${selected.map((a) => a.name).join(' vs ')} · AIME[${offset},${offset + n}) · budget=${budget} · worker=${workerModel} ===`,
   )
   const reports: Record<string, BenchmarkReport> = {}
-  for (const arm of arms) {
+  for (const arm of selected) {
     const waterfall = createWaterfallCollector()
     const report = await runBenchmark({
       environment,
@@ -252,7 +258,7 @@ async function main(): Promise<void> {
         model: workerModel,
         innerTurns: Number(process.env.INNER_TURNS ?? 2),
         temperature: 0.7,
-        ...(arm.analystInstruction ? { analystInstruction: arm.analystInstruction } : {}),
+        ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
       },
       strategies: [arm.strategy],
       budget,
@@ -282,7 +288,7 @@ async function main(): Promise<void> {
   const verdicts: Record<string, PromotionVerdict> = {}
   const incumbentReport = reports.refine
   if (!incumbentReport) throw new Error('refine arm missing')
-  for (const arm of arms) {
+  for (const arm of selected) {
     if (arm.name === 'refine') continue
     const candidate = reports[arm.name]
     if (!candidate) continue
@@ -321,7 +327,7 @@ async function main(): Promise<void> {
         models: { worker: workerModel, analyst: workerModel },
         domain: `aime[${offset},${offset + n})`,
         budget,
-        arms: arms.map((a) => a.name),
+        arms: selected.map((a) => a.name),
         reports,
         verdicts,
       },

diff --git a/src/runtime/strategy-author.ts b/src/runtime/strategy-author.ts
@@ -43,6 +43,11 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
     A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
     instruction (or null when it judges the work complete). Costs ~1 completion.
 
+  consult(messages, instruction): Promise<string | null>
+    The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
+    trajectory verbatim (no reformatting) — use it when you need a specific reply format
+    (a decision, a prediction). Costs ~1 completion.
+
   surface.open(task) / surface.close(handle)
     Open a persistent artifact you manage yourself (remember to close in a finally).
     close is idempotent — closing an already-closed handle is a safe no-op.

diff --git a/src/runtime/strategy.ts b/src/runtime/strategy.ts
@@ -85,6 +85,9 @@ export interface AgenticOptions {
   routerKey: string
   model: string
   temperature?: number
+  /** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
+   *  budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
+  maxTokens?: number
   /** Turns the agent may take within ONE shot before the driver intervenes. */
   innerTurns?: number
   /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
@@ -116,6 +119,9 @@ interface ShotTask {
   steer?: string // analyst-derived steer injected before this shot (depth)
   persona?: ShotPersona // role override — multi-agent loops give each shot its own hat
   tools?: string[] // restrict THIS shot to these domain tools (names); unknown names throw
+  /** analyst leaf only: a RAW instruction — the analyst answers it over the trajectory
+   *  directly (no findings schema). The verdict-capable channel. */
+  rawInstruction?: string
 }
 
 interface ShotOut {
@@ -158,6 +164,7 @@ async function runShot(
         tools,
         tool_choice: 'auto',
         temperature: opts.temperature ?? 0.7,
+        ...(opts.maxTokens ? { max_tokens: opts.maxTokens } : {}),
       }),
     })
     if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`)
@@ -211,12 +218,10 @@ interface AnalyzeOut {
   tokens: { input: number; output: number }
 }
 
-async function analyze(
-  task: AgenticTask,
-  messages: Msg[],
-  opts: AgenticOptions,
-): Promise<AnalyzeOut> {
-  const trajectory = messages
+/** The firewall's input shape: the trajectory as compacted text — calls, results,
+ *  assistant text. NEVER scores, NEVER check internals. Shared by both analyst channels. */
+function compactTrajectory(messages: Msg[]): string {
+  return messages
     .filter((m) => m.role === 'assistant' || m.role === 'tool')
     .map((m) => {
       if (m.role === 'tool') return `RESULT ${String(m.content).slice(0, 280)}`
@@ -227,6 +232,64 @@ async function analyze(
     })
     .join('\n')
     .slice(0, 7000)
+}
+
+/** The RAW analyst channel: the firewalled critic answers `instruction` over the
+ *  trajectory directly — no findings schema, no recommended-action extraction. The
+ *  channel for verdict-shaped steering (budget controllers, calibrated predictions)
+ *  whose output format the findings protocol would strip. Same firewall as analyze():
+ *  trajectory in, never scores. */
+async function consultAnalyst(
+  task: AgenticTask,
+  messages: Msg[],
+  instruction: string,
+  opts: AgenticOptions,
+): Promise<AnalyzeOut> {
+  const trajectory = compactTrajectory(messages)
+  const analystModel = opts.analystModel ?? opts.model
+  const chat = createChatClient({
+    transport: 'router',
+    apiKey: opts.routerKey,
+    baseUrl: opts.routerBaseUrl,
+    defaultModel: analystModel,
+  })
+  const res = await chat.chat({
+    model: analystModel,
+    temperature: 0.2,
+    maxTokens: 1024,
+    messages: [
+      { role: 'system', content: instruction },
+      {
+        role: 'user',
+        content: `TASK: ${task.userPrompt.slice(0, 1500)}\n\nTRAJECTORY:\n${trajectory}`,
+      },
+    ],
+  })
+  const usage = (
+    res as {
+      usage?: {
+        promptTokens?: number
+        prompt_tokens?: number
+        completionTokens?: number
+        completion_tokens?: number
+      }
+    }
+  ).usage
+  return {
+    steer: res.content.trim(),
+    tokens: {
+      input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
+      output: usage?.completionTokens ?? usage?.completion_tokens ?? 0,
+    },
+  }
+}
+
+async function analyze(
+  task: AgenticTask,
+  messages: Msg[],
+  opts: AgenticOptions,
+): Promise<AnalyzeOut> {
+  const trajectory = compactTrajectory(messages)
   const analystModel = opts.analystModel ?? opts.model
   const inner = createChatClient({
     transport: 'router',
@@ -381,8 +444,10 @@ function analystExecutor(opts: AgenticOptions): Executor<unknown> {
   return {
     runtime: 'agentic-analyst',
     async execute(task: unknown): Promise<ExecutorResult<unknown>> {
-      const t = task as { task: AgenticTask; messages: Msg[] }
-      const { steer, tokens } = await analyze(t.task, t.messages, opts)
+      const t = task as { task: AgenticTask; messages: Msg[]; rawInstruction?: string }
+      const { steer, tokens } = t.rawInstruction
+        ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts)
+        : await analyze(t.task, t.messages, opts)
       const analystModel = opts.analystModel ?? opts.model
       artifact = {
         outRef: `analyst:${steer.length}`,
@@ -664,6 +729,11 @@ export interface StrategyCtx {
   shot(spec?: ShotSpec): Promise<ShotResult | null>
   /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
   critique(messages: Msg[]): Promise<string | null>
+  /** The RAW analyst channel: the firewalled critic answers `instruction` over the
+   *  trajectory verbatim — no findings extraction, so verdict-shaped formats
+   *  (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
+   *  trajectory in, never scores. Null when the analyst went down. */
+  consult(messages: Msg[], instruction: string): Promise<string | null>
   /** The tools THIS artifact's task actually offers (names + descriptions only — never
    *  the implementations). Tool sets vary per task on heterogeneous domains; a strategy
    *  that restricts shots MUST select from this list, never from hardcoded names. */
@@ -756,6 +826,19 @@ export function defineStrategy(
             const findings = settled.out as unknown as string
             return /^\s*COMPLETE\b/i.test(findings) ? null : findings
           },
+          async consult(messages, instruction) {
+            const child = leaf(`analyst:${seq}`, 'analyst')
+            seq += 1
+            const res = scope.spawn(
+              child,
+              { task, messages, rawInstruction: instruction },
+              { budget: perChild(1), label: child.name },
+            )
+            if (!res.ok) return null
+            const settled = await drainOne(scope)
+            if (settled.kind === 'down') return null
+            return settled.out as unknown as string
+          },
         }
         const r = await run(ctx)
         // Override the body's self-reported score/resolved with the harness-verified

diff --git a/tests/loops/strategy-suite.test.ts b/tests/loops/strategy-suite.test.ts
@@ -76,12 +76,18 @@ function stubRouter(): CapturedChatRequest[] {
     'fetch',
     vi.fn(async (_url: string, init?: { body?: string }) => {
       captured.push(JSON.parse(init?.body ?? '{}') as CapturedChatRequest)
+      const body = {
+        choices: [{ message: { content: 'DONE' } }],
+        usage: { prompt_tokens: 10, completion_tokens: 5 },
+      }
+      // Both response-reading styles: runShot uses json(); agent-eval's llm-client
+      // reads text() — a stub missing either silently downs the analyst leaf.
       return {
         ok: true,
-        json: async () => ({
-          choices: [{ message: { content: 'DONE' } }],
-          usage: { prompt_tokens: 10, completion_tokens: 5 },
-        }),
+        status: 200,
+        headers: { get: () => 'application/json' },
+        json: async () => body,
+        text: async () => JSON.stringify(body),
       }
     }),
   )
@@ -631,3 +637,26 @@ describe('promotionGate non-inferiority', () => {
     expect(a.promoted).toBe(true)
   })
 })
+
+// ── The raw analyst channel (verdict-shaped steering survives) ─────────────────────
+
+describe('consult', () => {
+  it('the instruction reaches the analyst verbatim and the raw reply returns intact', async () => {
+    const captured = stubRouter()
+    const surface = fixtureSurface(() => ({ passes: 0, total: 1 }))
+    let reply: string | null = null
+    const controller = defineStrategy('controller', async ({ shot, consult }) => {
+      const out = await shot()
+      if (out)
+        reply = await consult(out.messages, 'Reply with EXACTLY: VERDICT: STOP confidence=0.9')
+      return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 }
+    })
+    await runAgentic({ surface, task, ...worker, strategy: controller, budget: 2 })
+    // The consult call is the SECOND router request; its system prompt is the raw instruction.
+    const consultReq = captured[1] as { messages?: Array<{ role: string; content: string }> }
+    expect(consultReq?.messages?.[0]?.role).toBe('system')
+    expect(consultReq?.messages?.[0]?.content).toContain('VERDICT: STOP')
+    // The stubbed model replies 'DONE'; consult returns it verbatim (no findings filter).
+    expect(reply).toBe('DONE')
+  })
+})