From 52405390e4c31af3b439f26bbc0cba0c2c9c8439 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Thu, 11 Jun 2026 06:35:38 -0600
Subject: [PATCH] =?UTF-8?q?fix(bench):=20WORKER=5FTEMPERATURE=20knob=20?=
 =?UTF-8?q?=E2=80=94=20kimi-class=20models=20400=20on=20temperature=20?=
 =?UTF-8?q?=E2=89=A0=201?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kimi matrix row's zeros were a broken channel both times: the model
rejects runShot's hardcoded temperature 0.7 with a 400 (probe-verified:
'only 1 is allowed for this model'), downing every rollout. The temperature
is now per-run settable on all three runners; kimi re-runs with
WORKER_TEMPERATURE=1.
---
 bench/run-followups.sh        | 17 +++++++++++++++++
 bench/src/ablation-grid.mts   |  3 ++-
 bench/src/flywheel-evolve.mts |  2 +-
 bench/src/steering-modes.mts  |  2 +-
 4 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100755 bench/run-followups.sh

diff --git a/bench/run-followups.sh b/bench/run-followups.sh
new file mode 100755
index 00000000..8ab9fdb1
--- /dev/null
+++ b/bench/run-followups.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Sequential gym-free follow-ups: kimi matrix row (worker cap + WORKER_TEMPERATURE=1, since kimi 400s on
+# any other temperature), glm matrix row (OOM victim, plain re-run), then the belief re-run on the consult channel.
+set -uo pipefail
+cd "$(dirname "$0")"
+echo "=== matrix re-run: moonshotai/kimi-k2.6 (WORKER_MAX_TOKENS=8192, WORKER_TEMPERATURE=1) ==="
+ENV=aime CELLS=base,steer,compress,steer+compress N=20 HOLDOUT=12 HOLDOUT_OFFSET=4 BUDGET=3 INNER_TURNS=2 CONCURRENCY=3 \
+  KAPPA=llm-50 WORKER_MODEL=moonshotai/kimi-k2.6 WORKER_MAX_TOKENS=8192 WORKER_TEMPERATURE=1 \
+  OUT=/tmp/matrix-moonshotai-kimi-k2-6.json npx tsx src/ablation-grid.mts 2>&1 | tail -10
+echo "=== matrix re-run: zai/glm-4.7 ==="
+ENV=aime CELLS=base,steer,compress,steer+compress N=20 HOLDOUT=12 HOLDOUT_OFFSET=4 BUDGET=3 INNER_TURNS=2 CONCURRENCY=3 \
+  KAPPA=llm-50 WORKER_MODEL=zai/glm-4.7 \
+  OUT=/tmp/matrix-zai-glm-4-7.json npx tsx src/ablation-grid.mts 2>&1 | tail -10
+echo "=== belief re-run (consult channel, confidence floor) ==="
+ARMS=sample,refine,belief N=16 OFFSET=40 BUDGET=3 INNER_TURNS=2 CONCURRENCY=4 \
+  WORKER_MODEL=deepseek-v4-flash OUT=/tmp/steering-belief2.json npx tsx src/steering-modes.mts 2>&1 | tail -14
+echo "FOLLOWUPS COMPLETE"
diff --git a/bench/src/ablation-grid.mts b/bench/src/ablation-grid.mts
index a46c5fd5..df087e68 100644
--- a/bench/src/ablation-grid.mts
+++ b/bench/src/ablation-grid.mts
@@ -167,7 +167,8 @@ async function main(): Promise {
     routerKey,
     model: workerModel,
     innerTurns: Number(process.env.INNER_TURNS ?? 4),
-    temperature: 0.7,
+    // kimi-class models 400 on any temperature ≠ 1 — the knob must be per-run settable.
+    temperature: Number(process.env.WORKER_TEMPERATURE ?? 0.7),
     ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
   }
   const gammaPrompt = cells.some((c) => c.gamma) ? readFileSync(must('PROMPT_ARTIFACT'), 'utf8').trim() : undefined
diff --git a/bench/src/flywheel-evolve.mts b/bench/src/flywheel-evolve.mts
index 9b4087cc..e1c6ea05 100644
--- a/bench/src/flywheel-evolve.mts
+++ b/bench/src/flywheel-evolve.mts
@@ -70,7 +70,7 @@ async function main(): Promise {
       routerKey,
       model: workerModel,
       innerTurns: Number(process.env.INNER_TURNS ?? 4),
-      temperature: 0.7,
+      temperature: Number(process.env.WORKER_TEMPERATURE ?? 0.7),
       ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
     },
     author: {
diff --git a/bench/src/steering-modes.mts b/bench/src/steering-modes.mts
index 38fb7070..6c0e6a24 100644
--- a/bench/src/steering-modes.mts
+++ b/bench/src/steering-modes.mts
@@ -257,7 +257,7 @@ async function main(): Promise {
       routerKey,
       model: workerModel,
       innerTurns: Number(process.env.INNER_TURNS ?? 2),
-      temperature: 0.7,
+      temperature: Number(process.env.WORKER_TEMPERATURE ?? 0.7),
       ...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
     },
     strategies: [arm.strategy],