TransformerOptimus · adithyn7 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/crates/agent/src/tool/codebase_graph.rs b/crates/agent/src/tool/codebase_graph.rs
@@ -7,9 +7,6 @@ use crate::context_engine::ContextEngineApi;
 use crate::error::ToolError;
 use super::{Tool, ToolContext, ToolResult};
 
-/// Max rows rendered per graph section (callers / deps / related) under bench policy.
-const GRAPH_SECTION_CAP: usize = 50;
-
 pub struct CodebaseGraphTool {
     context_engine: Arc<dyn ContextEngineApi>,
 }
@@ -77,12 +74,9 @@ impl Tool for CodebaseGraphTool {
         }
 
         // Bench policy: cap each rendered section so a high-degree node doesn't dump
-        // hundreds of rows into the context. No-op (unbounded) under the app policy.
-        let section_cap = if ctx.policy.codebase_result_caps {
-            GRAPH_SECTION_CAP
-        } else {
-            usize::MAX
-        };
+        // hundreds of rows into the context. No-op (unbounded) under the app policy
+        // when `codebase_graph_section_cap` is None.
+        let section_cap = ctx.policy.codebase_graph_section_cap.unwrap_or(usize::MAX);
 
         log::info!(
             "[codebase_graph] query={:?}, function_name={:?}, file_path={:?}, query_type={:?}",

diff --git a/crates/agent/src/tool/codebase_search.rs b/crates/agent/src/tool/codebase_search.rs
@@ -10,18 +10,13 @@ use crate::context_engine::{ContextEngineApi, SearchResultItem};
 use crate::error::ToolError;
 use super::{Tool, ToolContext, ToolResult};
 
-/// Default result count when the model omits `limit` (bench policy only).
-const DEFAULT_SEARCH_LIMIT: u32 = 20;
-/// Max bytes of chunk content rendered per result (bench policy only).
-const MAX_CHUNK_BYTES: usize = 2048;
-
-/// Truncate a chunk's content to `MAX_CHUNK_BYTES` on a UTF-8 boundary, appending a
+/// Truncate a chunk's content to `max_bytes` on a UTF-8 boundary, appending a
 /// marker when clipped. Keeps oversized index payloads from ballooning the context.
-fn cap_chunk_content(content: &str) -> String {
-    if content.len() <= MAX_CHUNK_BYTES {
+fn cap_chunk_content(content: &str, max_bytes: usize) -> String {
+    if content.len() <= max_bytes {
         return content.to_string();
     }
-    let mut end = MAX_CHUNK_BYTES;
+    let mut end = max_bytes;
     while !content.is_char_boundary(end) {
         end -= 1;
     }
@@ -89,12 +84,15 @@ impl Tool for CodebaseSearchTool {
         let strategy = args.get("strategy").and_then(|v| v.as_str());
         let mut limit = args.get("limit").and_then(|v| v.as_u64().map(|n| n as u32));
 
-        // Bench policy: when the model omits `limit`, default to 20 so the engine
-        // doesn't return a huge result set that balloons the conversation. The model
-        // can still page by raising the existing `limit` param. No-op under app policy.
-        let caps = ctx.policy.codebase_result_caps;
-        if caps && limit.is_none() {
-            limit = Some(DEFAULT_SEARCH_LIMIT);
+        // Bench policy: when the model omits `limit`, default to the policy value so
+        // the engine doesn't return a huge result set that balloons the conversation.
+        // NOTE(review): results are later truncated to this same value, so a larger
+        // model `limit` is clipped. No-op under app policy (codebase_search_limit = None).
+        let policy_limit = ctx.policy.codebase_search_limit;
+        if let Some(l) = policy_limit {
+            if limit.is_none() {
+                limit = Some(l);
+            }
         }
 
         log::info!(
@@ -158,12 +156,12 @@ impl Tool for CodebaseSearchTool {
         // Bench policy: cap result count (defensive — the engine may ignore `limit`)
         // and cap each chunk's content so oversized payloads don't inflate every
         // subsequent turn's context. No-op under the default (app) policy.
-        if caps {
-            if let Some(l) = limit {
-                results.truncate(l as usize);
-            }
+        if let Some(l) = policy_limit {
+            results.truncate(l as usize);
+        }
+        if let Some(max_bytes) = ctx.policy.codebase_search_chunk_bytes {
             for item in results.iter_mut() {
-                item.content = cap_chunk_content(&item.content);
+                item.content = cap_chunk_content(&item.content, max_bytes);
             }
         }
 

diff --git a/crates/agent/src/tool/mod.rs b/crates/agent/src/tool/mod.rs
@@ -97,8 +97,16 @@ pub struct ToolPolicy {
     pub search_timeout_ms: Option<u64>,
     /// Skip the default ignore-dir list in grep/glob. `false` = no extra ignores (app default).
     pub search_default_ignores: bool,
-    /// Apply codebase_search/graph result + content caps. `false` = no caps (app default).
-    pub codebase_result_caps: bool,
+    /// Default `limit` for codebase_search when the model omits it; the same value
+    /// also bounds the final result count. `None` = pass through, no default,
+    /// no truncation (app default).
+    pub codebase_search_limit: Option<u32>,
+    /// Cap per-chunk content bytes in codebase_search results (UTF-8 boundary +
+    /// `…[truncated]` marker). `None` = no cap (app default).
+    pub codebase_search_chunk_bytes: Option<usize>,
+    /// Cap rows rendered per codebase_graph section (callers / deps / related);
+    /// truncated sections append `… and N more`. `None` = no cap (app default).
+    pub codebase_graph_section_cap: Option<usize>,
 }
 
 impl Default for ToolPolicy {
@@ -108,20 +116,25 @@ impl Default for ToolPolicy {
             bash_timeout_ceiling_ms: None,
             search_timeout_ms: None,
             search_default_ignores: false,
-            codebase_result_caps: false,
+            codebase_search_limit: None,
+            codebase_search_chunk_bytes: None,
+            codebase_graph_section_cap: None,
         }
     }
 }
 
 impl ToolPolicy {
     /// Strict policy for the eval harness: bash clamped to 300s, grep/glob walled at
-    /// 60s and skipping build/vendor dirs, codebase_* results capped.
+    /// 60s and skipping build/vendor dirs, codebase_search ≤20 results × ≤2KB/chunk,
+    /// codebase_graph ≤50 rows per section.
     pub fn bench() -> Self {
         Self {
             bash_timeout_ceiling_ms: Some(300_000),
             search_timeout_ms: Some(60_000),
             search_default_ignores: true,
-            codebase_result_caps: true,
+            codebase_search_limit: Some(20),
+            codebase_search_chunk_bytes: Some(2048),
+            codebase_graph_section_cap: Some(50),
         }
     }
 }

diff --git a/crates/bench-runner/Cargo.toml b/crates/bench-runner/Cargo.toml
@@ -13,4 +13,4 @@ git-ops = { path = "../git-ops" }
 tokio = { version = "1", features = ["full"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
-clap = { version = "4", features = ["derive"] }
+clap = { version = "4", features = ["derive", "env"] }
diff --git a/crates/bench-runner/src/main.rs b/crates/bench-runner/src/main.rs
@@ -68,6 +68,43 @@ struct Cli {
     ce_machine_id: String,
     #[arg(long, default_value = "")]
     ce_auth_token: String,
+
+    // ──────────────── LLM transport / retry tunables ────────────────
+    // Defaults match the v0.1.6/v0.1.7 frozen-policy values exactly; overrides are
+    // for tuning the bench binary without rebuilding (e.g. raising the header
+    // timeout for slow Kimi cold starts). Each accepts a CLI flag or env var.
+    /// Header-arrival timeout (ms). 0 disables. Default 15000.
+    #[arg(long, env = "SUPERCODER_LLM_HEADER_TIMEOUT_MS", default_value_t = 15_000)]
+    llm_header_timeout_ms: u64,
+    /// Max LLM retry attempts after the initial call. Default 3.
+    #[arg(long, env = "SUPERCODER_LLM_MAX_RETRIES", default_value_t = 3)]
+    llm_max_retries: u32,
+    /// Initial backoff between LLM retries (ms). Default 1000.
+    #[arg(long, env = "SUPERCODER_LLM_RETRY_INITIAL_MS", default_value_t = 1_000)]
+    llm_retry_initial_ms: u64,
+    /// Backoff multiplier between LLM retries. Default 2.0.
+    #[arg(long, env = "SUPERCODER_LLM_RETRY_MULTIPLIER", default_value_t = 2.0)]
+    llm_retry_multiplier: f64,
+    /// Backoff cap between LLM retries (ms). Default 30000.
+    #[arg(long, env = "SUPERCODER_LLM_RETRY_MAX_MS", default_value_t = 30_000)]
+    llm_retry_max_ms: u64,
+
+    // ──────────────── Tool tunables (bench ToolPolicy overrides) ────
+    /// Bash timeout ceiling (ms). Default 300000 (5 min).
+    #[arg(long, env = "SUPERCODER_BASH_TIMEOUT_MS", default_value_t = 300_000)]
+    bash_timeout_ms: u64,
+    /// grep/glob wall timeout (ms). Default 60000.
+    #[arg(long, env = "SUPERCODER_SEARCH_TIMEOUT_MS", default_value_t = 60_000)]
+    search_timeout_ms: u64,
+    /// Default limit for codebase_search results when the model omits `limit`. Default 20.
+    #[arg(long, env = "SUPERCODER_CODEBASE_SEARCH_LIMIT", default_value_t = 20)]
+    codebase_search_limit: u32,
+    /// Per-chunk content cap for codebase_search results (bytes). Default 2048.
+    #[arg(long, env = "SUPERCODER_CODEBASE_SEARCH_CHUNK_BYTES", default_value_t = 2048)]
+    codebase_search_chunk_bytes: usize,
+    /// Per-section render cap for codebase_graph. Default 50.
+    #[arg(long, env = "SUPERCODER_CODEBASE_GRAPH_SECTION_CAP", default_value_t = 50)]
+    codebase_graph_section_cap: usize,
 }
 
 #[derive(Copy, Clone, Debug, ValueEnum)]
@@ -202,19 +239,50 @@ async fn run(cli: &Cli, spec: &TaskSpec) -> RunResult {
         thinking: None,
         disable_cache_control: false,
         // Strict LLM transport policy: HTTP/1.1, no connection pooling, 15s
-        // header-arrival timeout. Matches opencode's working transport shape (which
-        // has zero decode deaths on the same router that gave bench-runner 26 on
-        // long Kimi runs). Part of the frozen harness identity. See LlmPolicy::bench().
-        policy: agent::llm::LlmPolicy::bench(),
+        // header-arrival timeout (CLI/env-tunable). Matches opencode's working
+        // transport shape (which has zero decode deaths on the same router that gave
+        // bench-runner 26 on long Kimi runs). http1_only + no_pool stay hardcoded
+        // (they ARE the transport fix); the header timeout is exposed since slow
+        // upstreams may legitimately exceed it.
+        policy: {
+            let mut p = agent::llm::LlmPolicy::bench();
+            p.header_timeout_ms = if cli.llm_header_timeout_ms == 0 {
+                None
+            } else {
+                Some(cli.llm_header_timeout_ms)
+            };
+            p
+        },
     };
 
     let mut config = AgentConfig::new(llm, spec.working_dir.clone());
     config.mode = ToolMode::Coding;
     config.max_iterations = spec.max_iterations;
+
+    // Tune LLM retry behavior (CLI/env). Defaults match RetryConfig::default() (3
+    // retries, 1s→2s→4s, cap 30s) — set --llm-retry-multiplier higher and
+    // --llm-retry-max-ms wider when the upstream needs longer recovery windows
+    // (e.g. proxy circuit-breaker open).
+    config.retry_config = agent::agent::config::RetryConfig {
+        max_retries: cli.llm_max_retries,
+        initial_delay: std::time::Duration::from_millis(cli.llm_retry_initial_ms),
+        multiplier: cli.llm_retry_multiplier,
+        max_delay: std::time::Duration::from_millis(cli.llm_retry_max_ms),
+    };
+
     // Strict tool policy is part of the frozen harness identity: bash timeout ceiling,
-    // grep/glob ignore-list + wall timeout, codebase_* result caps. Always on (the app
-    // keeps the permissive default). See ToolPolicy::bench().
-    config.tool_policy = agent::tool::ToolPolicy::bench();
+    // grep/glob ignore-list + wall timeout, codebase_* result caps. The per-knob
+    // values are CLI/env-tunable; defaults match ToolPolicy::bench() exactly. The
+    // `search_default_ignores` flag stays hardcoded (it's the ignore-list behavior,
+    // not a tuning value).
+    config.tool_policy = agent::tool::ToolPolicy {
+        bash_timeout_ceiling_ms: Some(cli.bash_timeout_ms),
+        search_timeout_ms: Some(cli.search_timeout_ms),
+        search_default_ignores: true,
+        codebase_search_limit: Some(cli.codebase_search_limit),
+        codebase_search_chunk_bytes: Some(cli.codebase_search_chunk_bytes),
+        codebase_graph_section_cap: Some(cli.codebase_graph_section_cap),
+    };
     // Everything else (system_prompt, checkpoint_dir, skills, subagents, …) stays at the
     // AgentConfig::new defaults of None — headless.
 
@@ -404,3 +472,61 @@ async fn extract_patch(working_dir: &PathBuf, base_commit: &str) -> Result<Strin
         .map_err(|e| e.to_string())?;
     Ok(diff.stdout)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::Cli;
+    use clap::Parser;
+
+    /// Parses `args` appended to the minimal argv (`--model test`); panics on a parse error.
+    fn parse(args: &[&str]) -> Cli {
+        let mut full = vec!["bench-runner", "--model", "test"];
+        full.extend_from_slice(args);
+        Cli::try_parse_from(full).expect("parse")
+    }
+
+    #[test]
+    fn cli_defaults_match_frozen_policy() {
+        // Defaults must NOT change behavior vs the hardcoded v0.1.6 LlmPolicy::bench()
+        // and ToolPolicy::bench() values, or the eval team's previous runs stop being
+        // reproducible. Assumes no SUPERCODER_* env override is set while testing.
+        let cli = parse(&[]);
+        assert_eq!(cli.llm_header_timeout_ms, 15_000);
+        assert_eq!(cli.llm_max_retries, 3);
+        assert_eq!(cli.llm_retry_initial_ms, 1_000);
+        assert_eq!(cli.llm_retry_multiplier, 2.0);
+        assert_eq!(cli.llm_retry_max_ms, 30_000);
+        assert_eq!(cli.bash_timeout_ms, 300_000);
+        assert_eq!(cli.search_timeout_ms, 60_000);
+        assert_eq!(cli.codebase_search_limit, 20);
+        assert_eq!(cli.codebase_search_chunk_bytes, 2048);
+        assert_eq!(cli.codebase_graph_section_cap, 50);
+    }
+
+    #[test]
+    fn cli_flag_overrides() {
+        let cli = parse(&[
+            "--llm-header-timeout-ms", "90000",
+            "--llm-retry-multiplier", "3.0",
+            "--llm-retry-max-ms", "60000",
+            "--bash-timeout-ms", "600000",
+            "--codebase-search-limit", "10",
+        ]);
+        assert_eq!(cli.llm_header_timeout_ms, 90_000);
+        assert_eq!(cli.llm_retry_multiplier, 3.0);
+        assert_eq!(cli.llm_retry_max_ms, 60_000);
+        assert_eq!(cli.bash_timeout_ms, 600_000);
+        assert_eq!(cli.codebase_search_limit, 10);
+        // Flags not passed above must still carry their default values.
+        assert_eq!(cli.llm_max_retries, 3);
+        assert_eq!(cli.codebase_search_chunk_bytes, 2048);
+    }
+
+    #[test]
+    fn header_timeout_zero_means_disabled() {
+        // Per the CLI doc, 0 disables the header timeout. This test only pins that 0
+        // parses; the run path maps it to LlmPolicy::header_timeout_ms = None.
+        let cli = parse(&["--llm-header-timeout-ms", "0"]);
+        assert_eq!(cli.llm_header_timeout_ms, 0);
+    }
+}