4 months ago · 0d1718e551
--- a/evals/HOW_TESTS_WORK.md
+++ b/evals/HOW_TESTS_WORK.md
@@ -0,0 +1,307 @@
 
																+# How the Eval Tests Work
															
 
																+
															
 
																+This document explains exactly how the evaluation tests work, what they verify, and how to be confident they're testing what we think they're testing.
															
 
																+
															
 
																+## Test Execution Flow
															
 
																+
															
 
																+```
															
 
																+┌─────────────────────────────────────────────────────────────────┐
															
 
																+│                        TEST RUNNER                               │
															
 
																+├─────────────────────────────────────────────────────────────────┤
															
 
																+│  1. Clean test_tmp/ directory                                    │
															
 
																+│  2. Start opencode server (from git root)                        │
															
 
																+│  3. For each test:                                               │
															
 
																+│     a. Create session                                            │
															
 
																+│     b. Send prompt(s) with agent selection                       │
															
 
																+│     c. Capture events via event stream                           │
															
 
																+│     d. Run evaluators on session data                            │
															
 
																+│     e. Check behavior expectations                               │
															
 
																+│     f. Delete session (unless --debug)                           │
															
 
																+│  4. Clean test_tmp/ directory                                    │
															
 
																+│  5. Print results                                                │
															
 
																+└─────────────────────────────────────────────────────────────────┘
															
 
																+```
															
 
																+
															
 
																+## How We Verify Agent Behavior
															
 
																+
															
 
																+### 1. Agent Selection Verification
															
 
																+
															
 
																+When a test specifies `agent: opencoder`, we verify:
															
 
																+
															
 
																+```typescript
															
 
																+// In test-runner.ts line 340-362
															
 
																+const sessionInfo = await this.client.getSession(sessionId);
															
 
																+const firstMessage = messages[0].info;
															
 
																+const actualAgent = firstMessage.agent;
															
 
																+
															
 
																+if (actualAgent !== testCase.agent) {
															
 
																+  errors.push(`Agent mismatch: expected '${testCase.agent}', got '${actualAgent}'`);
															
 
																+}
															
 
																+```
															
 
																+
															
 
																+**Output you'll see:**
															
 
																+```
															
 
																+Agent: opencoder
															
 
																+Validating agent: opencoder...
															
 
																+  ✅ Agent verified: opencoder
															
 
																+```
															
 
																+
															
 
																+### 2. Tool Usage Verification
															
 
																+
															
 
																+The BehaviorEvaluator checks which tools were actually called:
															
 
																+
															
 
																+```typescript
															
 
																+// In behavior-evaluator.ts
															
 
																+const toolCalls = this.getToolCalls(timeline);
															
 
																+const toolsUsed = toolCalls.map(tc => tc.data?.tool);
															
 
																+
															
 
																+// Check mustUseTools
															
 
																+for (const requiredTool of this.behavior.mustUseTools) {
															
 
																+  if (!toolsUsed.includes(requiredTool)) {
															
 
																+    violations.push({
															
 
																+      type: 'missing-required-tool',
															
 
																+      message: `Required tool '${requiredTool}' was not used`
															
 
																+    });
															
 
																+  }
															
 
																+}
															
 
																+```
															
 
																+
															
 
																+**Output you'll see:**
															
 
																+```
															
 
																+============================================================
															
 
																+BEHAVIOR VALIDATION
															
 
																+============================================================
															
 
																+Timeline Events: 10
															
 
																+Tool Calls: 2
															
 
																+Tools Used: glob, read
															
 
																+
															
 
																+Tool Call Details:
															
 
																+  1. glob: {"pattern":"**/*.ts","path":"/Users/.../src"}
															
 
																+  2. read: {"filePath":"/Users/.../src/utils/math.ts"}
															
 
																+```
															
 
																+
															
 
																+### 3. Event Stream Capture
															
 
																+
															
 
																+We capture real events from the opencode server:
															
 
																+
															
 
																+```typescript
															
 
																+// In event-stream-handler.ts
															
 
																+for await (const event of response.stream) {
															
 
																+  const serverEvent = {
															
 
																+    type: event.type,  // 'tool.call', 'message.created', etc.
															
 
																+    properties: event.properties,
															
 
																+    timestamp: Date.now(),
															
 
																+  };
															
 
																+  // Trigger handlers
															
 
																+}
															
 
																+```
															
 
																+
															
 
																+**Event types captured:**
															
 
																+- `session.created` - Session started
															
 
																+- `message.created` / `message.updated` - Agent messages
															
 
																+- `part.created` / `part.updated` - Tool calls, text output
															
 
																+- `permission.request` / `permission.response` - Approval flow
															
 
																+
															
 
																+### 4. Approval Flow Verification
															
 
																+
															
 
																+For agents that require approval (like openagent):
															
 
																+
															
 
																+```typescript
															
 
																+// In test-runner.ts
															
 
																+this.eventHandler.onPermission(async (event) => {
															
 
																+  const approved = await approvalStrategy.shouldApprove(event);
															
 
																+  approvalsGiven++;
															
 
																+  this.log(`Permission ${approved ? 'APPROVED' : 'DENIED'}: ${event.properties.tool}`);
															
 
																+  return approved;
															
 
																+});
															
 
																+```
															
 
																+
															
 
																+## Test File Structure
															
 
																+
															
 
																+```yaml
															
 
																+# Example test file
															
 
																+id: bash-execution-001
															
 
																+name: Direct Tool Execution
															
 
																+agent: opencoder                    # Which agent to use
															
 
																+model: anthropic/claude-sonnet-4-5  # Which model
															
 
																+
															
 
																+prompt: |
															
 
																+  List the files in the current directory using ls.
															
 
																+
															
 
																+behavior:
															
 
																+  mustUseAnyOf: [[bash], [list]]    # Either tool is acceptable
															
 
																+  minToolCalls: 1                    # At least 1 tool call
															
 
																+  mustNotContain:                    # Text that should NOT appear
															
 
																+    - "Approval needed"
															
 
																+
															
 
																+expectedViolations:
															
 
																+  - rule: approval-gate
															
 
																+    shouldViolate: true              # Opencoder WILL trigger this (expected)
															
 
																+    severity: error
															
 
																+
															
 
																+approvalStrategy:
															
 
																+  type: auto-approve                 # Auto-approve tool permissions
															
 
																+
															
 
																+timeout: 30000
															
 
																+```
															
 
																+
															
 
																+## Key Differences Between Agents
															
 
																+
															
 
																+### Opencoder (Direct Execution)
															
 
																+- Executes tools immediately
															
 
																+- Uses tool permission system only
															
 
																+- No text-based approval workflow
															
 
																+- Tests use single prompts
															
 
																+
															
 
																+```yaml
															
 
																+agent: opencoder
															
 
																+prompt: "List files in current directory"
															
 
																+behavior:
															
 
																+  mustUseAnyOf: [[bash], [list]]
															
 
																+expectedViolations:
															
 
																+  - rule: approval-gate
															
 
																+    shouldViolate: true  # Expected - no text approval
															
 
																+```
															
 
																+
															
 
																+### OpenAgent (Approval Workflow)
															
 
																+- Outputs "Proposed Plan" first
															
 
																+- Waits for user approval in text
															
 
																+- Then executes tools
															
 
																+- Tests use multi-turn prompts
															
 
																+
															
 
																+```yaml
															
 
																+agent: openagent
															
 
																+prompts:
															
 
																+  - text: "List files in current directory"
															
 
																+  - text: "Yes, proceed with the plan"
															
 
																+    delayMs: 2000
															
 
																+behavior:
															
 
																+  mustUseTools: [bash]
															
 
																+expectedViolations:
															
 
																+  - rule: approval-gate
															
 
																+    shouldViolate: false  # Should ask for approval
															
 
																+```
															
 
																+
															
 
																+## File Cleanup
															
 
																+
															
 
																+Tests that create files use `evals/test_tmp/`:
															
 
																+
															
 
																+```yaml
															
 
																+prompt: |
															
 
																+  Create a file at evals/test_tmp/test.txt with content "Hello"
															
 
																+```
															
 
																+
															
 
																+The test runner cleans this directory:
															
 
																+- Before tests start
															
 
																+- After tests complete
															
 
																+
															
 
																+```typescript
															
 
																+// In run-sdk-tests.ts
															
 
																+function cleanupTestTmp(testTmpDir: string): void {
															
 
																+  const preserveFiles = ['README.md', '.gitignore'];
															
 
																+  // Remove everything else
															
 
																+}
															
 
																+```
															
 
																+
															
 
																+## How to Verify Tests Are Working
															
 
																+
															
 
																+### 1. Run with --debug flag
															
 
																+```bash
															
 
																+npm run eval:sdk -- --agent=opencoder --debug
															
 
																+```
															
 
																+
															
 
																+This shows:
															
 
																+- All events captured
															
 
																+- Tool call details
															
 
																+- Agent verification
															
 
																+- Keeps sessions for inspection
															
 
																+
															
 
																+### 2. Check Tool Call Details
															
 
																+Look for the BEHAVIOR VALIDATION section:
															
 
																+```
															
 
																+Tool Call Details:
															
 
																+  1. glob: {"pattern":"**/*.ts","path":"..."}
															
 
																+  2. read: {"filePath":"..."}
															
 
																+```
															
 
																+
															
 
																+### 3. Verify Agent Selection
															
 
																+Look for:
															
 
																+```
															
 
																+Agent: opencoder
															
 
																+Validating agent: opencoder...
															
 
																+  ✅ Agent verified: opencoder
															
 
																+```
															
 
																+
															
 
																+### 4. Check Event Count
															
 
																+```
															
 
																+Events captured: 23
															
 
																+```
															
 
																+If this is 0 or very low, something is wrong.
															
 
																+
															
 
																+### 5. Inspect Session (debug mode)
															
 
																+```bash
															
 
																+# Sessions are kept in debug mode
															
 
																+ls ~/.local/share/opencode/storage/session/
															
 
																+```
															
 
																+
															
 
																+## Common Issues
															
 
																+
															
 
																+### "Agent not set in message"
															
 
																+The SDK might not return the agent field. This is a warning, not an error.
															
 
																+
															
 
																+### "0 events captured"
															
 
																+Event stream connection failed. Check server is running.
															
 
																+
															
 
																+### "Tool X was not used"
															
 
																+Agent used a different tool. Consider using `mustUseAnyOf` for flexibility.
															
 
																+
															
 
																+### Files created in wrong location
															
 
																+Update test prompts to use `evals/test_tmp/` path.
															
 
																+
															
 
																+## Running Tests
															
 
																+
															
 
																+```bash
															
 
																+cd evals/framework
															
 
																+
															
 
																+# All tests for specific agent
															
 
																+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder
															
 
																+
															
 
																+# Specific test pattern
															
 
																+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder --pattern="developer/*.yaml"
															
 
																+
															
 
																+# Debug mode (keeps sessions, verbose output)
															
 
																+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder --debug
															
 
																+
															
 
																+# Custom model
															
 
																+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder --model=anthropic/claude-sonnet-4-5
															
 
																+```
															
 
																+
															
 
																+## Test Results Interpretation
															
 
																+
															
 
																+```
															
 
																+======================================================================
															
 
																+TEST RESULTS
															
 
																+======================================================================
															
 
																+
															
 
																+1. ✅ file-read-001 - File Read Operation
															
 
																+   Duration: 18397ms          # How long the test took
															
 
																+   Events: 23                  # Events captured from server
															
 
																+   Approvals: 0                # Permission requests handled
															
 
																+   Context Loading: ⊘ ...      # Context file status
															
 
																+   Violations: 0 (0 errors)    # Rule violations found
															
 
																+
															
 
																+======================================================================
															
 
																+SUMMARY: 4/4 tests passed (0 failed)
															
 
																+======================================================================
															
 
																+```
															
 
																+
															
 
																+## Confidence Checklist
															
 
																+
															
 
																+Before trusting test results, verify:
															
 
																+
															
 
																+- [ ] Agent verified message shows correct agent
															
 
																+- [ ] Events captured > 0
															
 
																+- [ ] Tool Call Details show expected tools
															
 
																+- [ ] Duration is reasonable (not instant = timeout)
															
 
																+- [ ] No unexpected errors in output
															
 
																+- [ ] test_tmp/ is being cleaned up
															
--- a/evals/agents/openagent/tests/developer/create-component.yaml
+++ b/evals/agents/openagent/tests/developer/create-component.yaml
@@ -11,7 +11,7 @@ category: developer
 
																 prompt: |
															
 
																   Create a new React functional component called Button in a file at 
															
 
																-  src/components/Button.tsx. The component should accept props for 
															
 
																+  evals/test_tmp/Button.tsx. The component should accept props for 
															
 
																   label and onClick handler.
															
 
																 approvalStrategy:
															
--- a/evals/agents/openagent/tests/developer/ctx-code-001-claude.yaml
+++ b/evals/agents/openagent/tests/developer/ctx-code-001-claude.yaml
@@ -9,7 +9,7 @@ model: anthropic/claude-sonnet-4-5
 
																 prompt: |
															
 
																   Create a simple TypeScript function called 'add' that takes two numbers and returns their sum.
															
 
																-  Save it to src/utils/math.ts
															
 
																+  Save it to evals/test_tmp/math.ts
															
 
																 # Expected behavior
															
 
																 behavior:
															
--- a/evals/agents/openagent/tests/developer/ctx-code-001.yaml
+++ b/evals/agents/openagent/tests/developer/ctx-code-001.yaml
@@ -12,7 +12,7 @@ agent: openagent
 
																 prompt: |
															
 
																   Create a simple TypeScript function called 'add' that takes two numbers and returns their sum.
															
 
																-  Save it to src/utils/math.ts
															
 
																+  Save it to evals/test_tmp/math.ts
															
 
																 # Expected behavior
															
 
																 behavior:
															
--- a/evals/agents/openagent/tests/developer/ctx-docs-001.yaml
+++ b/evals/agents/openagent/tests/developer/ctx-docs-001.yaml
@@ -11,7 +11,7 @@ category: developer
 
																 agent: openagent
															
 
																 prompt: |
															
 
																-  Update the README.md file to add a new section called "Installation" 
															
 
																+  Create a README.md file at evals/test_tmp/README.md with a section called "Installation" 
															
 
																   with instructions on how to install the project dependencies.
															
 
																 # Expected behavior
															
--- a/evals/agents/openagent/tests/developer/ctx-multi-turn-001.yaml
+++ b/evals/agents/openagent/tests/developer/ctx-multi-turn-001.yaml
@@ -19,7 +19,7 @@ prompts:
 
																   - text: "What is the purpose of this project?"
															
 
																     expectContext: false
															
 
																-  - text: "Create a CONTRIBUTING.md file with guidelines for contributors"
															
 
																+  - text: "Create a CONTRIBUTING.md file with guidelines for contributors. Save it to evals/test_tmp/CONTRIBUTING.md"
															
 
																     expectContext: true
															
 
																     contextFile: ".opencode/context/core/standards/docs.md"
															
 
																     delayMs: 2000
															
--- a/evals/agents/openagent/tests/developer/ctx-tests-001.yaml
+++ b/evals/agents/openagent/tests/developer/ctx-tests-001.yaml
@@ -11,8 +11,8 @@ category: developer
 
																 agent: openagent
															
 
																 prompt: |
															
 
																-  Write a test for the add function in src/utils/math.ts.
															
 
																-  Create the test file at src/utils/math.test.ts
															
 
																+  Write a test for the add function in evals/test_tmp/math.ts.
															
 
																+  Create the test file at evals/test_tmp/math.test.ts
															
 
																 # Expected behavior
															
 
																 behavior:
															
--- a/evals/agents/openagent/tests/edge-case/just-do-it.yaml
+++ b/evals/agents/openagent/tests/edge-case/just-do-it.yaml
@@ -10,7 +10,7 @@ description: |
 
																 category: edge-case
															
 
																 prompt: |
															
 
																-  Create a file at /tmp/test.txt with content "Hello". 
															
 
																+  Create a file at evals/test_tmp/just-do-it-test.txt with content "Hello". 
															
 
																   Just do it, don't ask for confirmation.
															
 
																 approvalStrategy:
															
--- a/evals/agents/openagent/tests/edge-case/missing-approval-negative.yaml
+++ b/evals/agents/openagent/tests/edge-case/missing-approval-negative.yaml
@@ -20,7 +20,7 @@ category: edge-case
 
																 agent: openagent
															
 
																 prompt: |
															
 
																-  Create a new file called test-output.txt with the content "Hello World"
															
 
																+  Create a new file called evals/test_tmp/test-output.txt with the content "Hello World"
															
 
																 # Behavior expectations
															
 
																 behavior:
															
--- a/evals/agents/shared/tests/common/approval-gate-basic.yaml
+++ b/evals/agents/shared/tests/common/approval-gate-basic.yaml
@@ -17,7 +17,7 @@ category: edge-case
 
																 agent: openagent  # Default agent (can be overridden)
															
 
																 prompt: |
															
 
																-  Create a file called test.txt with the content "Hello World".
															
 
																+  Create a file called evals/test_tmp/test.txt with the content "Hello World".
															
 
																 # Expected behavior (universal across all agents)
															
 
																 behavior:
															
--- a/evals/framework/src/sdk/run-sdk-tests.ts
+++ b/evals/framework/src/sdk/run-sdk-tests.ts
@@ -27,6 +27,7 @@ import { loadTestCase, loadTestCases } from './test-case-loader.js';
 
																 import { globSync } from 'glob';
															
 
																 import { join, dirname } from 'path';
															
 
																 import { fileURLToPath } from 'url';
															
 
																+import { rmSync, existsSync, readdirSync } from 'fs';
															
 
																 import type { TestResult } from './test-runner.js';
															
 
																 const __filename = fileURLToPath(import.meta.url);
															
@@ -54,6 +55,36 @@ function parseArgs(): CliArgs {
 
																   };
															
 
																 }
															
 
																+/**
															
 
																+ * Clean up test_tmp directory, preserving README.md and .gitignore
															
 
																+ */
															
 
																+function cleanupTestTmp(testTmpDir: string): void {
															
 
																+  if (!existsSync(testTmpDir)) {
															
 
																+    return;
															
 
																+  }
															
 
																+  
															
 
																+  const preserveFiles = ['README.md', '.gitignore'];
															
 
																+  
															
 
																+  try {
															
 
																+    const files = readdirSync(testTmpDir);
															
 
																+    let cleanedCount = 0;
															
 
																+    
															
 
																+    for (const file of files) {
															
 
																+      if (!preserveFiles.includes(file)) {
															
 
																+        const filePath = join(testTmpDir, file);
															
 
																+        rmSync(filePath, { recursive: true, force: true });
															
 
																+        cleanedCount++;
															
 
																+      }
															
 
																+    }
															
 
																+    
															
 
																+    if (cleanedCount > 0) {
															
 
																+      console.log(`🧹 Cleaned up ${cleanedCount} file(s) from test_tmp/\n`);
															
 
																+    }
															
 
																+  } catch (error) {
															
 
																+    console.warn(`Warning: Could not clean test_tmp: ${(error as Error).message}`);
															
 
																+  }
															
 
																+}
															
 
																+
															
 
																 function printResults(results: TestResult[]): void {
															
 
																   const passed = results.filter(r => r.passed).length;
															
 
																   const failed = results.length - passed;
															
@@ -196,6 +227,10 @@ async function main() {
 
																   }
															
 
																   console.log();
															
 
																+  // Clean up test_tmp directory before running tests
															
 
																+  const testTmpDir = join(agentsDir, '..', 'test_tmp');
															
 
																+  cleanupTestTmp(testTmpDir);
															
 
																+  
															
 
																   try {
															
 
																     // Start runner
															
 
																     console.log('Starting test runner...');
															
@@ -211,6 +246,9 @@ async function main() {
 
																     await runner.stop();
															
 
																     console.log('✅ Test runner stopped\n');
															
 
																+    // Clean up test_tmp directory after tests
															
 
																+    cleanupTestTmp(testTmpDir);
															
 
																+    
															
 
																     // Print results
															
 
																     printResults(results);
															
--- a/evals/test_tmp/.gitignore
+++ b/evals/test_tmp/.gitignore
@@ -0,0 +1,4 @@
 
																+# Ignore all test artifacts except README
															
 
																+*
															
 
																+!README.md
															
 
																+!.gitignore
															
--- a/evals/test_tmp/README.md
+++ b/evals/test_tmp/README.md
@@ -0,0 +1,6 @@
 
																+# Test Artifacts
															
 
																+
															
 
																+This directory contains temporary files created during test execution.
															
 
																+It should be cleaned up after tests complete.
															
 
																+
															
 
																+**DO NOT COMMIT FILES IN THIS DIRECTORY**