Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions bench/run-followups.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Sequential gym-free follow-ups: kimi matrix row (with the worker cap), glm matrix row
# (OOM victim, plain re-run), then the belief re-run on the consult channel.
#
# Every step is attempted even when an earlier one fails; failures are recorded and
# reflected in the exit status (0 = all steps succeeded, 1 = at least one step failed).
# Results are written to the /tmp/*.json paths given via OUT for each step.
set -uo pipefail

# The src/*.mts paths below are relative to this script's directory. Without `set -e`
# a failed cd would otherwise let every step run from the wrong working directory.
cd -- "$(dirname -- "$0")" || { echo "run-followups: cannot cd to script directory" >&2; exit 1; }

# Flipped to 1 by any step whose pipeline fails (pipefail surfaces the npx status
# through the trailing `tail`).
failed=0

echo "=== matrix re-run: moonshotai/kimi-k2.6 (WORKER_MAX_TOKENS=8192) ==="
# WORKER_TEMPERATURE=1: ablation-grid.mts notes that kimi-class models answer 400 on any
# temperature other than 1, so the 0.7 default must be overridden for this row.
ENV=aime CELLS=base,steer,compress,steer+compress N=20 HOLDOUT=12 HOLDOUT_OFFSET=4 BUDGET=3 INNER_TURNS=2 CONCURRENCY=3 \
KAPPA=llm-50 WORKER_MODEL=moonshotai/kimi-k2.6 WORKER_MAX_TOKENS=8192 WORKER_TEMPERATURE=1 \
OUT=/tmp/matrix-moonshotai-kimi-k2-6.json npx tsx src/ablation-grid.mts 2>&1 | tail -10 \
  || { echo "run-followups: kimi matrix re-run failed" >&2; failed=1; }

echo "=== matrix re-run: zai/glm-4.7 ==="
ENV=aime CELLS=base,steer,compress,steer+compress N=20 HOLDOUT=12 HOLDOUT_OFFSET=4 BUDGET=3 INNER_TURNS=2 CONCURRENCY=3 \
KAPPA=llm-50 WORKER_MODEL=zai/glm-4.7 \
OUT=/tmp/matrix-zai-glm-4-7.json npx tsx src/ablation-grid.mts 2>&1 | tail -10 \
  || { echo "run-followups: glm matrix re-run failed" >&2; failed=1; }

echo "=== belief re-run (consult channel, confidence floor) ==="
ARMS=sample,refine,belief N=16 OFFSET=40 BUDGET=3 INNER_TURNS=2 CONCURRENCY=4 \
WORKER_MODEL=deepseek-v4-flash OUT=/tmp/steering-belief2.json npx tsx src/steering-modes.mts 2>&1 | tail -14 \
  || { echo "run-followups: belief re-run failed" >&2; failed=1; }

echo "FOLLOWUPS COMPLETE"
# Propagate any recorded step failure to the caller instead of always exiting 0.
exit "$failed"
3 changes: 2 additions & 1 deletion bench/src/ablation-grid.mts
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ async function main(): Promise<void> {
routerKey,
model: workerModel,
innerTurns: Number(process.env.INNER_TURNS ?? 4),
temperature: 0.7,
// kimi-class models 400 on any temperature ≠ 1 — the knob must be per-run settable.
temperature: Number(process.env.WORKER_TEMPERATURE ?? 0.7),
...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
}
const gammaPrompt = cells.some((c) => c.gamma) ? readFileSync(must('PROMPT_ARTIFACT'), 'utf8').trim() : undefined
Expand Down
2 changes: 1 addition & 1 deletion bench/src/flywheel-evolve.mts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ async function main(): Promise<void> {
routerKey,
model: workerModel,
innerTurns: Number(process.env.INNER_TURNS ?? 4),
temperature: 0.7,
temperature: Number(process.env.WORKER_TEMPERATURE ?? 0.7),
...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
},
author: {
Expand Down
2 changes: 1 addition & 1 deletion bench/src/steering-modes.mts
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ async function main(): Promise<void> {
routerKey,
model: workerModel,
innerTurns: Number(process.env.INNER_TURNS ?? 2),
temperature: 0.7,
temperature: Number(process.env.WORKER_TEMPERATURE ?? 0.7),
...(process.env.WORKER_MAX_TOKENS ? { maxTokens: Number(process.env.WORKER_MAX_TOKENS) } : {}),
},
strategies: [arm.strategy],
Expand Down
Loading