diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 824c9c2..4be4dad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,6 +33,10 @@ jobs:
dotnet build examples/McpToolAgent/McpToolAgent.csproj --no-restore --configuration Release
- name: Test
run: dotnet test SharpClawCode.sln --no-build --configuration Release --collect:"XPlat Code Coverage" --results-directory ./coverage
+ - name: Agent scenario harness
+ run: |
+ dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj --no-build --configuration Release -- test run
+ dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj --no-build --configuration Release -- test gates
- name: Upload coverage
if: matrix.os == 'ubuntu-latest'
uses: actions/upload-artifact@v4
diff --git a/Directory.Packages.props b/Directory.Packages.props
index 3027c60..ea920d0 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -26,6 +26,7 @@
+
diff --git a/SharpClawCode.sln b/SharpClawCode.sln
index 41659ce..7487f23 100644
--- a/SharpClawCode.sln
+++ b/SharpClawCode.sln
@@ -65,6 +65,14 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MinimalConsoleAgent", "exam
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WorkerServiceHost", "examples\WorkerServiceHost\WorkerServiceHost.csproj", "{2E8A9F4F-8161-4E49-9F04-533D972C11CB}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpClaw.Testing.Abstractions", "src\SharpClaw.Testing.Abstractions\SharpClaw.Testing.Abstractions.csproj", "{A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpClaw.Testing.Harness", "src\SharpClaw.Testing.Harness\SharpClaw.Testing.Harness.csproj", "{A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpClaw.Testing.Cli", "src\SharpClaw.Testing.Cli\SharpClaw.Testing.Cli.csproj", "{425E2495-940F-46A6-9F3E-ED05301504BD}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpClaw.Testing.Xunit", "src\SharpClaw.Testing.Xunit\SharpClaw.Testing.Xunit.csproj", "{C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -411,6 +419,54 @@ Global
{2E8A9F4F-8161-4E49-9F04-533D972C11CB}.Release|x64.Build.0 = Release|Any CPU
{2E8A9F4F-8161-4E49-9F04-533D972C11CB}.Release|x86.ActiveCfg = Release|Any CPU
{2E8A9F4F-8161-4E49-9F04-533D972C11CB}.Release|x86.Build.0 = Release|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Debug|x64.Build.0 = Debug|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Debug|x86.Build.0 = Debug|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Release|Any CPU.Build.0 = Release|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Release|x64.ActiveCfg = Release|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Release|x64.Build.0 = Release|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Release|x86.ActiveCfg = Release|Any CPU
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF}.Release|x86.Build.0 = Release|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Debug|x64.Build.0 = Debug|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Debug|x86.Build.0 = Debug|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Release|Any CPU.Build.0 = Release|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Release|x64.ActiveCfg = Release|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Release|x64.Build.0 = Release|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Release|x86.ActiveCfg = Release|Any CPU
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E}.Release|x86.Build.0 = Release|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Debug|x64.Build.0 = Debug|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Debug|x86.Build.0 = Debug|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Release|Any CPU.Build.0 = Release|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Release|x64.ActiveCfg = Release|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Release|x64.Build.0 = Release|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Release|x86.ActiveCfg = Release|Any CPU
+ {425E2495-940F-46A6-9F3E-ED05301504BD}.Release|x86.Build.0 = Release|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Debug|x64.Build.0 = Debug|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Debug|x86.Build.0 = Debug|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Release|Any CPU.Build.0 = Release|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Release|x64.ActiveCfg = Release|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Release|x64.Build.0 = Release|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Release|x86.ActiveCfg = Release|Any CPU
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -444,5 +500,9 @@ Global
{963C636F-2096-45B1-8101-B8345967F197} = {B36A84DF-456D-A817-6EDD-3EC3E7F6E11F}
{7BA2E64A-B330-4783-9330-AEF46B91929A} = {B36A84DF-456D-A817-6EDD-3EC3E7F6E11F}
{2E8A9F4F-8161-4E49-9F04-533D972C11CB} = {B36A84DF-456D-A817-6EDD-3EC3E7F6E11F}
+ {A4E45F2B-9118-41EC-8AF2-08EBF0F9B3EF} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B}
+ {A78CD9D6-54CF-422C-B5D8-B3BC4D99323E} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B}
+ {425E2495-940F-46A6-9F3E-ED05301504BD} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B}
+ {C45BBEA7-5970-40BB-AE6D-B8F09D1E2EE1} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B}
EndGlobalSection
EndGlobal
diff --git a/docs/testing.md b/docs/testing.md
index e5174c3..acaf5d2 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -8,6 +8,7 @@
| **SharpClaw.Code.IntegrationTests** | Runtime + provider flows with real composition |
| **SharpClaw.Code.MockProvider** | **`DeterministicMockModelProvider`**, **`AddDeterministicMockModelProvider`**, **`ParityMetadataKeys`**, **`ParityProviderScenario`** |
| **SharpClaw.Code.ParityHarness** | End-to-end scenarios over real **`AddSharpClawRuntime`** + mock LLM |
+| **SharpClaw.Testing.\*** | JSON scenario contracts, oracle runner, CLI commands, and xUnit adapter for explicit agent testing |
Run all tests:
@@ -15,6 +16,13 @@ Run all tests:
dotnet test SharpClawCode.sln
```
+Run the explicit agent scenario harness:
+
+```bash
+dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj -- test run
+dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj -- test gates
+```
+
Build the example hosts as part of normal validation:
```bash
@@ -59,6 +67,10 @@ Stable scenario **ids** are listed in **`ParityScenarioIds`** (e.g. `streaming_t
**Note:** Many scenarios exercise **`IToolExecutor`** directly rather than going through the LLM agent loop (which matches current **`AgentFrameworkBridge`** behavior).
+## Agent scenario harness
+
+The scenario harness lives in **`SharpClaw.Testing.Abstractions`**, **`SharpClaw.Testing.Harness`**, **`SharpClaw.Testing.Cli`**, and **`SharpClaw.Testing.Xunit`**. Scenario files live in **`tests/agent-scenarios`** and use JSON with explicit oracles. See **`docs/testing/agent-testing-harness.md`** for the contract, CLI usage, xUnit adapter, and gate model.
+
## CI
-CI restores and builds the full solution, explicitly builds every example host project, and then runs `dotnet test` on the solution. Parity tests use temp directories under **`Path.GetTempPath()`** and avoid network.
+CI restores and builds the full solution, explicitly builds every example host project, runs `dotnet test`, and then runs the explicit agent scenario harness through `sharpclaw test run` followed by `sharpclaw test gates`. Parity tests use temp directories under **`Path.GetTempPath()`** and avoid network access.
diff --git a/docs/testing/agent-testing-harness.md b/docs/testing/agent-testing-harness.md
new file mode 100644
index 0000000..5e7b205
--- /dev/null
+++ b/docs/testing/agent-testing-harness.md
@@ -0,0 +1,140 @@
+# Agent Testing Harness
+
+## Purpose
+
+The agent testing harness is a disciplined scenario runner for SharpClaw agent behavior. It is not a generic AI test generator. Each scenario declares its prompt, trace source, and risk level, along with the explicit oracles that must pass.
+
+Every run produces a structured trace and evaluates that trace against named oracles. The first implementation uses a `scripted` executor so that the model, loader, trace writer, report writer, gates, CLI, and xUnit adapter can stabilize before the harness is wired to the live runtime/gateway.
+
+## Scenario Format
+
+Scenarios live under `tests/agent-scenarios` as JSON files:
+
+```json
+{
+ "id": "basic-tool-call",
+ "risk": "Low",
+ "input": {
+ "prompt": "Read the project README.",
+ "executor": "scripted",
+ "scriptedTrace": [
+ {
+ "kind": "ToolCall",
+ "toolCall": {
+ "toolName": "read_file",
+ "argumentsJson": "{\"path\":\"README.md\"}"
+ }
+ }
+ ],
+ "scriptedFinalAnswer": "README starts with SharpClaw Code."
+ },
+ "expected": {
+ "oracles": [
+ { "type": "ToolCalled", "toolName": "read_file" },
+ { "type": "FinalAnswerContains", "text": "SharpClaw Code" }
+ ]
+ }
+}
+```
+
+The JSON contracts are defined in `SharpClaw.Testing.Abstractions` and serialized with `System.Text.Json`. The shape avoids polymorphic JSON that relies on heavy runtime reflection: `TraceStep` has explicit optional payloads such as `toolCall`, `toolResult`, and `stateChange`.
+
+## Oracle Model
+
+Built-in oracles:
+
+- `ToolCalled`
+- `ToolNotCalled`
+- `FinalAnswerContains`
+- `MaxToolCalls`
+- `StateEquals`
+- `ApprovalRequired`
+- `NoUnsafeTool`
+
+Failed oracles include a clear message plus expected and actual summaries. Scenarios with no explicit oracles fail the explicit-oracle gate.
+
+## CLI Usage
+
+Initialize example scenarios:
+
+```bash
+dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj -- test init
+```
+
+Run scenarios, write traces, evaluate oracles, and generate `docs/testing/test-run-report.md`:
+
+```bash
+dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj -- test run
+```
+
+Regenerate the Markdown report from the latest results file:
+
+```bash
+dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj -- test report
+```
+
+Run gate checks:
+
+```bash
+dotnet run --project src/SharpClaw.Code.Cli/SharpClaw.Code.Cli.csproj -- test gates
+```
+
+Defaults:
+
+- Scenarios: `tests/agent-scenarios`
+- Markdown report: `docs/testing/test-run-report.md`
+- Machine-readable results: `artifacts/testing/test-run-results.json`
+- Trace files: `artifacts/testing/traces`
+
+## xUnit Usage
+
+`SharpClaw.Testing.Xunit` exposes data and assertion helpers:
+
+```csharp
+public static IEnumerable