Skip to content

Commit efa6f38

Browse files
bradleyshep, cursoragent, clockwork-labs-bot, clockwork-labs-bot
authored
LLM benchmark tool updates (#4413)
# Description of Changes

LLM benchmark updates for local development:

- **Local SDK paths**: Templates use relative paths to workspace crates (`crates/bindings`, `crates/bindings-csharp`, `crates/bindings-typescript`) instead of published packages, so the bench runs against local SDK changes.
- **NODEJS_DIR support**: On Windows (e.g. nvm4w), if `pnpm` is not on PATH, the bench uses `NODEJS_DIR` to locate `pnpm` and prepends it to PATH for subprocesses.
- **Refactor**: Extracted `relative_to_workspace()` in `templates.rs` and removed noisy `NODEJS_DIR` logging in `publishers.rs`.
- **Benchmark results**: Updated `docs/llms/llm-comparison-details.json` and `docs/llms/llm-comparison-summary.json`.

# API and ABI breaking changes

None.

# Expected complexity level and risk

**2** — Local-only changes to the benchmark tool. Templates now require local SDKs to be built (especially TypeScript: `pnpm build` in `crates/bindings-typescript`). No impact on published SDKs or runtime.

# Testing

- [ ] Run `cargo llm run --lang rust --modes docs --providers openai` from repo root
- [ ] Run TypeScript benchmarks with `pnpm build` in `crates/bindings-typescript` first
- [ ] On Windows with nvm4w, set `NODEJS_DIR` if `pnpm` is not on PATH and run TypeScript benchmarks

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: clockwork-labs-bot <bot@clockworklabs.com>
Co-authored-by: clockwork-labs-bot <clockwork-labs-bot@users.noreply.github.com>
1 parent 14f7991 commit efa6f38

87 files changed

Lines changed: 932 additions & 222 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/DEVELOP.md

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,25 @@
1-
# DEVELOP.md
1+
# DEVELOP.md
22

33
This document explains how to configure the environment, run the LLM benchmark tool, and work with the benchmark suite.
44

55
---
66

77
## Table of Contents
88

9-
1. [Quick Checks & Fixes](#quick-checks-fixes)
10-
2. [Environment Variables](#environment-variables)
11-
3. [Benchmark Suite](#benchmark-suite)
12-
4. [Context Construction](#context-construction)
13-
5. [Troubleshooting](#troubleshooting)
9+
1. [Prerequisites](#prerequisites)
10+
2. [Quick Checks & Fixes](#quick-checks-fixes)
11+
3. [Environment Variables](#environment-variables)
12+
4. [Benchmark Suite](#benchmark-suite)
13+
5. [Context Construction](#context-construction)
14+
6. [Troubleshooting](#troubleshooting)
15+
---
16+
17+
## Prerequisites
18+
19+
- **Run from repo root** — `cargo llm` and related commands must be run from the workspace root (this repo).
20+
- **TypeScript benchmarks** — Run `pnpm build` in `crates/bindings-typescript` first. Rust and C# use local crates that are built as part of the workspace.
21+
- **Windows (nvm4w)** — If `pnpm` is not found when running TypeScript benchmarks, set `NODEJS_DIR` to your Node.js bin directory (e.g. `C:\nvm\v20.10.0`).
22+
1423
---
1524

1625
## Quick Checks & Fixes

tools/xtask-llm-benchmark/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ serde_json.workspace = true
1919
blake3.workspace = true
2020
clap.workspace = true
2121
chrono = { version = "0.4", features = ["clock", "serde"] }
22+
dotenvy = "0.15"
2223
async-trait = "0.1.89"
2324
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
2425
urlencoding = "2.1.3"

tools/xtask-llm-benchmark/src/bench/publishers.rs

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@ use crate::bench::utils::sanitize_db_name;
22
use anyhow::{bail, Result};
33
use regex::Regex;
44
use std::borrow::Cow;
5+
use std::env;
56
use std::fs;
6-
use std::path::Path;
7+
use std::path::{Path, PathBuf};
78
use std::process::Command;
89
use std::sync::LazyLock;
910

@@ -248,14 +249,63 @@ impl Publisher for TypeScriptPublisher {
248249
Self::ensure_package_json(source)?;
249250
let db = sanitize_db_name(module_name);
250251

251-
// Install dependencies (--ignore-workspace to avoid parent workspace interference)
252-
run(
253-
Command::new("pnpm")
254-
.arg("install")
255-
.arg("--ignore-workspace")
256-
.current_dir(source),
257-
"pnpm install (typescript)",
258-
)?;
252+
// Install dependencies (--ignore-workspace to avoid parent workspace interference).
253+
// If NODEJS_DIR is set (e.g. nvm4w on Windows), use full path to pnpm so spawn finds it.
254+
let pnpm_exe = env::var("NODEJS_DIR")
255+
.ok()
256+
.map(|s| s.trim().trim_matches('"').trim().to_string())
257+
.filter(|s| !s.is_empty())
258+
.map(PathBuf::from)
259+
.and_then(|dir| {
260+
#[cfg(windows)]
261+
{
262+
let pnpm_cmd = dir.join("pnpm.cmd");
263+
let pnpm_exe_path = dir.join("pnpm.exe");
264+
if pnpm_cmd.is_file() {
265+
eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.cmd)", dir.display());
266+
Some(pnpm_cmd)
267+
} else if pnpm_exe_path.is_file() {
268+
eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.exe)", dir.display());
269+
Some(pnpm_exe_path)
270+
} else {
271+
eprintln!(
272+
"[pnpm] NODEJS_DIR set to {} but pnpm.cmd/pnpm.exe not found there, using PATH",
273+
dir.display()
274+
);
275+
None
276+
}
277+
}
278+
#[cfg(not(windows))]
279+
{
280+
let pnpm = dir.join("pnpm");
281+
if pnpm.is_file() {
282+
eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm)", dir.display());
283+
Some(pnpm)
284+
} else {
285+
eprintln!(
286+
"[pnpm] NODEJS_DIR set to {} but pnpm not found there, using PATH",
287+
dir.display()
288+
);
289+
None
290+
}
291+
}
292+
});
293+
let mut pnpm_cmd = match &pnpm_exe {
294+
Some(p) => Command::new(p),
295+
None => Command::new("pnpm"),
296+
};
297+
pnpm_cmd.arg("install").arg("--ignore-workspace").current_dir(source);
298+
// When using NODEJS_DIR, prepend it to PATH so pnpm.cmd can find node.
299+
if let Some(ref dir) = pnpm_exe {
300+
if let Some(parent) = dir.parent() {
301+
let mut paths: Vec<PathBuf> = env::split_paths(&env::var("PATH").unwrap_or_default()).collect();
302+
paths.insert(0, parent.to_path_buf());
303+
if let Ok(new_path) = env::join_paths(paths) {
304+
pnpm_cmd.env("PATH", new_path);
305+
}
306+
}
307+
}
308+
run(&mut pnpm_cmd, "pnpm install (typescript)")?;
259309

260310
// Publish (spacetime CLI handles TypeScript compilation internally)
261311
run(

tools/xtask-llm-benchmark/src/bench/runner.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,7 @@ fn load_golden_source(task: &TaskPaths, lang: Lang) -> Result<String> {
690690
}
691691

692692
// "1" | "01" | "001" | "t_001" -> "t_001"
693+
// "t_000_empty_reducers" | "t_001_basic_tables" -> accepted as-is (full task dir name)
693694
fn normalize_task_selector(raw: &str) -> Result<String> {
694695
let s = raw.trim().to_ascii_lowercase();
695696
if s.is_empty() {
@@ -700,6 +701,12 @@ fn normalize_task_selector(raw: &str) -> Result<String> {
700701
let n: u32 = rest.parse()?;
701702
return Ok(format!("t_{:03}", n));
702703
}
704+
// Full task dir name: t_000_empty_reducers, t_001_basic_tables, etc.
705+
if rest.chars().next().is_some_and(|c| c.is_ascii_digit())
706+
&& rest.chars().all(|c| c.is_ascii_alphanumeric() || c == '_')
707+
{
708+
return Ok(s);
709+
}
703710
bail!("invalid task selector: {raw}");
704711
}
705712
if s.chars().all(|c| c.is_ascii_digit()) {

tools/xtask-llm-benchmark/src/bench/templates.rs

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,31 @@ fn tmpl_root() -> PathBuf {
4343
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src").join("templates")
4444
}
4545

46+
/// Workspace root (public/) for local SDK paths.
47+
fn workspace_root() -> PathBuf {
48+
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
49+
.ancestors()
50+
.nth(2)
51+
.expect("xtask-llm-benchmark is under public/tools/xtask-llm-benchmark")
52+
.to_path_buf()
53+
}
54+
55+
/// Relative path from materialized root to a workspace subpath (e.g. "crates/bindings").
56+
/// Avoids Windows canonical paths (//?/D:/...) which can break Cargo/MSBuild/pnpm.
57+
fn relative_to_workspace(root: &Path, ws_subpath: &str) -> Result<String> {
58+
let ws = workspace_root()
59+
.canonicalize()
60+
.with_context(|| "workspace root not found")?;
61+
let root_canon = root
62+
.canonicalize()
63+
.with_context(|| format!("materialized root not found: {}", root.display()))?;
64+
let root_rel = root_canon
65+
.strip_prefix(&ws)
66+
.with_context(|| format!("materialized dir {:?} not under workspace {:?}", root_canon, ws))?;
67+
let ups = root_rel.components().count();
68+
Ok(std::iter::repeat_n("..", ups).collect::<Vec<_>>().join("/") + "/" + ws_subpath)
69+
}
70+
4671
fn copy_tree_with_templates(src: &Path, dst: &Path) -> Result<()> {
4772
fn recurse(from: &Path, to: &Path) -> Result<()> {
4873
fs::create_dir_all(to)?;
@@ -98,7 +123,19 @@ fn inject_rust(root: &Path, llm_code: &str) -> anyhow::Result<()> {
98123
}
99124
contents.push_str(&cleaned);
100125
}
101-
fs::write(&lib, contents).with_context(|| format!("write {}", lib.display()))
126+
fs::write(&lib, contents).with_context(|| format!("write {}", lib.display()))?;
127+
128+
let relative = relative_to_workspace(root, "crates/bindings")?;
129+
let sdk_path = workspace_root().join("crates/bindings");
130+
if !sdk_path.is_dir() {
131+
bail!("local Rust SDK not found at {}", sdk_path.display());
132+
}
133+
let replacement = format!(r#"{{ path = "{}" }}"#, relative);
134+
let cargo_toml = root.join("Cargo.toml");
135+
let mut toml = fs::read_to_string(&cargo_toml).with_context(|| format!("read {}", cargo_toml.display()))?;
136+
toml = toml.replace("{SPACETIME_RUST_SDK_PATH}", &replacement);
137+
fs::write(&cargo_toml, toml).with_context(|| format!("write {}", cargo_toml.display()))?;
138+
Ok(())
102139
}
103140

104141
fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> {
@@ -116,7 +153,23 @@ fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> {
116153
}
117154
contents.push_str(&cleaned);
118155
}
119-
fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))
156+
fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))?;
157+
158+
let base_rel = relative_to_workspace(root, "crates/bindings-csharp")?;
159+
let runtime_csproj = workspace_root().join("crates/bindings-csharp/Runtime/Runtime.csproj");
160+
if !runtime_csproj.is_file() {
161+
bail!("local C# Runtime not found at {}", runtime_csproj.display());
162+
}
163+
let runtime_ref = format!("{}/Runtime/Runtime.csproj", base_rel);
164+
let runtime_dir = format!("{}/Runtime", base_rel);
165+
let codegen_ref = format!("{}/Codegen/Codegen.csproj", base_rel);
166+
let csproj_path = root.join("StdbModule.csproj");
167+
let mut csproj = fs::read_to_string(&csproj_path).with_context(|| format!("read {}", csproj_path.display()))?;
168+
csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_DIR}", &runtime_dir);
169+
csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_REF}", &runtime_ref);
170+
csproj = csproj.replace("{SPACETIME_CSHARP_CODEGEN_REF}", &codegen_ref);
171+
fs::write(&csproj_path, csproj).with_context(|| format!("write {}", csproj_path.display()))?;
172+
Ok(())
120173
}
121174

122175
fn inject_typescript(root: &Path, llm_code: &str) -> anyhow::Result<()> {
@@ -134,7 +187,26 @@ fn inject_typescript(root: &Path, llm_code: &str) -> anyhow::Result<()> {
134187
}
135188
contents.push_str(&cleaned);
136189
}
137-
fs::write(&lib, contents).with_context(|| format!("write {}", lib.display()))
190+
fs::write(&lib, contents).with_context(|| format!("write {}", lib.display()))?;
191+
192+
let relative = relative_to_workspace(root, "crates/bindings-typescript")?;
193+
let sdk_path = workspace_root().join("crates/bindings-typescript");
194+
if !sdk_path.is_dir() {
195+
bail!("local TypeScript SDK not found at {}", sdk_path.display());
196+
}
197+
let dist_server = sdk_path.join("dist/server/index.mjs");
198+
if !dist_server.is_file() {
199+
bail!(
200+
"local TypeScript SDK at {} is not built (missing dist/server). Run: pnpm build (in crates/bindings-typescript)",
201+
sdk_path.display()
202+
);
203+
}
204+
let replacement = format!("file:{}", relative);
205+
let package_json = root.join("package.json");
206+
let mut pkg = fs::read_to_string(&package_json).with_context(|| format!("read {}", package_json.display()))?;
207+
pkg = pkg.replace("{SPACETIME_TS_SDK_REF}", &replacement);
208+
fs::write(&package_json, pkg).with_context(|| format!("write {}", package_json.display()))?;
209+
Ok(())
138210
}
139211

140212
/// Remove leading/trailing Markdown fences like ```rust ... ``` or ~~~

tools/xtask-llm-benchmark/src/benchmarks/basics/t_000_empty_reducers/answers/csharp.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
public static partial class Module
44
{
5+
[Table(Accessor = "EmptyTable")]
6+
public partial struct EmptyTable
7+
{
8+
[PrimaryKey] public int Id;
9+
}
10+
511
[Reducer]
612
public static void EmptyReducer_NoArgs(ReducerContext ctx) { }
713

tools/xtask-llm-benchmark/src/benchmarks/basics/t_000_empty_reducers/answers/rust.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
1-
use spacetimedb::{reducer, ReducerContext};
1+
use spacetimedb::{reducer, table, ReducerContext};
2+
3+
#[table(accessor = empty_table)]
4+
pub struct EmptyTable {
5+
#[primary_key]
6+
pub id: i32,
7+
}
28

39
#[reducer]
410
pub fn empty_reducer_no_args(ctx: &ReducerContext) -> Result<(), String> {

tools/xtask-llm-benchmark/src/benchmarks/basics/t_000_empty_reducers/answers/typescript.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
import { schema, t } from 'spacetimedb/server';
1+
import { schema, table, t } from 'spacetimedb/server';
22

3-
const spacetimedb = schema({});
3+
const emptyTable = table({ name: 'empty_table' }, { id: t.i32().primaryKey() });
4+
5+
const spacetimedb = schema({ emptyTable });
46
export default spacetimedb;
57

68
export const emptyReducerNoArgs = spacetimedb.reducer({}, ctx => {

tools/xtask-llm-benchmark/src/benchmarks/basics/t_000_empty_reducers/tasks/csharp.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
1-
Write a SpacetimeDB backend module in C# that defines only these five empty reducers.
1+
Write a SpacetimeDB backend module in C# that defines a table and these five empty reducers.
2+
3+
TABLES
4+
- EmptyTable
5+
- Struct: EmptyTable
6+
- Fields:
7+
- Id: int (primary key)
28

39
REDUCERS
410
- EmptyReducer_NoArgs: no arguments, returns void, empty body

0 commit comments

Comments
 (0)