Browse Source

fix(evals): use test_tmp directory for test artifacts and add cleanup

- Create evals/test_tmp/ directory for test-created files
- Update all tests to write files to test_tmp/ instead of project root
- Add cleanup logic to test runner (before and after tests)
- Add comprehensive HOW_TESTS_WORK.md documentation
- Remove src/ directory that was accidentally created by tests

This prevents tests from polluting the project directory with test artifacts.
darrenhinde 4 months ago
parent
commit
0d1718e551

+ 307 - 0
evals/HOW_TESTS_WORK.md

@@ -0,0 +1,307 @@
+# How the Eval Tests Work
+
+This document explains exactly how the evaluation tests work, what they verify, and how to be confident they're testing what we think they're testing.
+
+## Test Execution Flow
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                        TEST RUNNER                               │
+├─────────────────────────────────────────────────────────────────┤
+│  1. Clean test_tmp/ directory                                    │
+│  2. Start opencode server (from git root)                        │
+│  3. For each test:                                               │
+│     a. Create session                                            │
+│     b. Send prompt(s) with agent selection                       │
+│     c. Capture events via event stream                           │
+│     d. Run evaluators on session data                            │
+│     e. Check behavior expectations                               │
+│     f. Delete session (unless --debug)                           │
+│  4. Clean test_tmp/ directory                                    │
+│  5. Print results                                                │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## How We Verify Agent Behavior
+
+### 1. Agent Selection Verification
+
+When a test specifies `agent: opencoder`, we verify:
+
+```typescript
+// In test-runner.ts line 340-362
+const sessionInfo = await this.client.getSession(sessionId);
+const firstMessage = messages[0].info;
+const actualAgent = firstMessage.agent;
+
+if (actualAgent !== testCase.agent) {
+  errors.push(`Agent mismatch: expected '${testCase.agent}', got '${actualAgent}'`);
+}
+```
+
+**Output you'll see:**
+```
+Agent: opencoder
+Validating agent: opencoder...
+  ✅ Agent verified: opencoder
+```
+
+### 2. Tool Usage Verification
+
+The BehaviorEvaluator checks which tools were actually called:
+
+```typescript
+// In behavior-evaluator.ts
+const toolCalls = this.getToolCalls(timeline);
+const toolsUsed = toolCalls.map(tc => tc.data?.tool);
+
+// Check mustUseTools
+for (const requiredTool of this.behavior.mustUseTools) {
+  if (!toolsUsed.includes(requiredTool)) {
+    violations.push({
+      type: 'missing-required-tool',
+      message: `Required tool '${requiredTool}' was not used`
+    });
+  }
+}
+```
+
+**Output you'll see:**
+```
+============================================================
+BEHAVIOR VALIDATION
+============================================================
+Timeline Events: 10
+Tool Calls: 2
+Tools Used: glob, read
+
+Tool Call Details:
+  1. glob: {"pattern":"**/*.ts","path":"/Users/.../src"}
+  2. read: {"filePath":"/Users/.../src/utils/math.ts"}
+```
+
+### 3. Event Stream Capture
+
+We capture real events from the opencode server:
+
+```typescript
+// In event-stream-handler.ts
+for await (const event of response.stream) {
+  const serverEvent = {
+    type: event.type,  // 'tool.call', 'message.created', etc.
+    properties: event.properties,
+    timestamp: Date.now(),
+  };
+  // Trigger handlers
+}
+```
+
+**Event types captured:**
+- `session.created` - Session started
+- `message.created` / `message.updated` - Agent messages
+- `part.created` / `part.updated` - Tool calls, text output
+- `permission.request` / `permission.response` - Approval flow
+
+### 4. Approval Flow Verification
+
+For agents that require approval (like openagent):
+
+```typescript
+// In test-runner.ts
+this.eventHandler.onPermission(async (event) => {
+  const approved = await approvalStrategy.shouldApprove(event);
+  approvalsGiven++;
+  this.log(`Permission ${approved ? 'APPROVED' : 'DENIED'}: ${event.properties.tool}`);
+  return approved;
+});
+```
+
+## Test File Structure
+
+```yaml
+# Example test file
+id: bash-execution-001
+name: Direct Tool Execution
+agent: opencoder                    # Which agent to use
+model: anthropic/claude-sonnet-4-5  # Which model
+
+prompt: |
+  List the files in the current directory using ls.
+
+behavior:
+  mustUseAnyOf: [[bash], [list]]    # Either tool is acceptable
+  minToolCalls: 1                    # At least 1 tool call
+  mustNotContain:                    # Text that should NOT appear
+    - "Approval needed"
+
+expectedViolations:
+  - rule: approval-gate
+    shouldViolate: true              # Opencoder WILL trigger this (expected)
+    severity: error
+
+approvalStrategy:
+  type: auto-approve                 # Auto-approve tool permissions
+
+timeout: 30000
+```
+
+## Key Differences Between Agents
+
+### Opencoder (Direct Execution)
+- Executes tools immediately
+- Uses tool permission system only
+- No text-based approval workflow
+- Tests use single prompts
+
+```yaml
+agent: opencoder
+prompt: "List files in current directory"
+behavior:
+  mustUseAnyOf: [[bash], [list]]
+expectedViolations:
+  - rule: approval-gate
+    shouldViolate: true  # Expected - no text approval
+```
+
+### OpenAgent (Approval Workflow)
+- Outputs "Proposed Plan" first
+- Waits for user approval in text
+- Then executes tools
+- Tests use multi-turn prompts
+
+```yaml
+agent: openagent
+prompts:
+  - text: "List files in current directory"
+  - text: "Yes, proceed with the plan"
+    delayMs: 2000
+behavior:
+  mustUseTools: [bash]
+expectedViolations:
+  - rule: approval-gate
+    shouldViolate: false  # Should ask for approval
+```
+
+## File Cleanup
+
+Tests that create files use `evals/test_tmp/`:
+
+```yaml
+prompt: |
+  Create a file at evals/test_tmp/test.txt with content "Hello"
+```
+
+The test runner cleans this directory:
+- Before tests start
+- After tests complete
+
+```typescript
+// In run-sdk-tests.ts
+function cleanupTestTmp(testTmpDir: string): void {
+  const preserveFiles = ['README.md', '.gitignore'];
+  // Remove everything else
+}
+```
+
+## How to Verify Tests Are Working
+
+### 1. Run with --debug flag
+```bash
+npm run eval:sdk -- --agent=opencoder --debug
+```
+
+This shows:
+- All events captured
+- Tool call details
+- Agent verification
+- Keeps sessions for inspection
+
+### 2. Check Tool Call Details
+Look for the BEHAVIOR VALIDATION section:
+```
+Tool Call Details:
+  1. glob: {"pattern":"**/*.ts","path":"..."}
+  2. read: {"filePath":"..."}
+```
+
+### 3. Verify Agent Selection
+Look for:
+```
+Agent: opencoder
+Validating agent: opencoder...
+  ✅ Agent verified: opencoder
+```
+
+### 4. Check Event Count
+```
+Events captured: 23
+```
+If this is 0 or very low, something is wrong.
+
+### 5. Inspect Session (debug mode)
+```bash
+# Sessions are kept in debug mode
+ls ~/.local/share/opencode/storage/session/
+```
+
+## Common Issues
+
+### "Agent not set in message"
+The SDK might not return the agent field. This is a warning, not an error.
+
+### "0 events captured"
+Event stream connection failed. Check server is running.
+
+### "Required tool 'X' was not used"
+The agent completed the task with a different tool than the one listed in `mustUseTools`. Consider using `mustUseAnyOf` for flexibility.
+
+### Files created in wrong location
+Update test prompts to use `evals/test_tmp/` path.
+
+## Running Tests
+
+```bash
+cd evals/framework
+
+# All tests for specific agent
+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder
+
+# Specific test pattern
+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder --pattern="developer/*.yaml"
+
+# Debug mode (keeps sessions, verbose output)
+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder --debug
+
+# Custom model
+npx tsx src/sdk/run-sdk-tests.ts --agent=opencoder --model=anthropic/claude-sonnet-4-5
+```
+
+## Test Results Interpretation
+
+```
+======================================================================
+TEST RESULTS
+======================================================================
+
+1. ✅ file-read-001 - File Read Operation
+   Duration: 18397ms          # How long the test took
+   Events: 23                  # Events captured from server
+   Approvals: 0                # Permission requests handled
+   Context Loading: ⊘ ...      # Context file status
+   Violations: 0 (0 errors)    # Rule violations found
+
+======================================================================
+SUMMARY: 4/4 tests passed (0 failed)
+======================================================================
+```
+
+## Confidence Checklist
+
+Before trusting test results, verify:
+
+- [ ] Agent verified message shows correct agent
+- [ ] Events captured > 0
+- [ ] Tool Call Details show expected tools
+- [ ] Duration is reasonable (a near-instant result suggests the test never really ran; an overly long one suggests it hit the timeout)
+- [ ] No unexpected errors in output
+- [ ] test_tmp/ is being cleaned up

+ 1 - 1
evals/agents/openagent/tests/developer/create-component.yaml

@@ -11,7 +11,7 @@ category: developer
 
 
 prompt: |
 prompt: |
   Create a new React functional component called Button in a file at 
   Create a new React functional component called Button in a file at 
-  src/components/Button.tsx. The component should accept props for 
+  evals/test_tmp/Button.tsx. The component should accept props for 
   label and onClick handler.
   label and onClick handler.
 
 
 approvalStrategy:
 approvalStrategy:

+ 1 - 1
evals/agents/openagent/tests/developer/ctx-code-001-claude.yaml

@@ -9,7 +9,7 @@ model: anthropic/claude-sonnet-4-5
 
 
 prompt: |
 prompt: |
   Create a simple TypeScript function called 'add' that takes two numbers and returns their sum.
   Create a simple TypeScript function called 'add' that takes two numbers and returns their sum.
-  Save it to src/utils/math.ts
+  Save it to evals/test_tmp/math.ts
 
 
 # Expected behavior
 # Expected behavior
 behavior:
 behavior:

+ 1 - 1
evals/agents/openagent/tests/developer/ctx-code-001.yaml

@@ -12,7 +12,7 @@ agent: openagent
 
 
 prompt: |
 prompt: |
   Create a simple TypeScript function called 'add' that takes two numbers and returns their sum.
   Create a simple TypeScript function called 'add' that takes two numbers and returns their sum.
-  Save it to src/utils/math.ts
+  Save it to evals/test_tmp/math.ts
 
 
 # Expected behavior
 # Expected behavior
 behavior:
 behavior:

+ 1 - 1
evals/agents/openagent/tests/developer/ctx-docs-001.yaml

@@ -11,7 +11,7 @@ category: developer
 agent: openagent
 agent: openagent
 
 
 prompt: |
 prompt: |
-  Update the README.md file to add a new section called "Installation" 
+  Create a README.md file at evals/test_tmp/README.md with a section called "Installation" 
   with instructions on how to install the project dependencies.
   with instructions on how to install the project dependencies.
 
 
 # Expected behavior
 # Expected behavior

+ 1 - 1
evals/agents/openagent/tests/developer/ctx-multi-turn-001.yaml

@@ -19,7 +19,7 @@ prompts:
   - text: "What is the purpose of this project?"
   - text: "What is the purpose of this project?"
     expectContext: false
     expectContext: false
     
     
-  - text: "Create a CONTRIBUTING.md file with guidelines for contributors"
+  - text: "Create a CONTRIBUTING.md file with guidelines for contributors. Save it to evals/test_tmp/CONTRIBUTING.md"
     expectContext: true
     expectContext: true
     contextFile: ".opencode/context/core/standards/docs.md"
     contextFile: ".opencode/context/core/standards/docs.md"
     delayMs: 2000
     delayMs: 2000

+ 2 - 2
evals/agents/openagent/tests/developer/ctx-tests-001.yaml

@@ -11,8 +11,8 @@ category: developer
 agent: openagent
 agent: openagent
 
 
 prompt: |
 prompt: |
-  Write a test for the add function in src/utils/math.ts.
-  Create the test file at src/utils/math.test.ts
+  Write a test for the add function in evals/test_tmp/math.ts.
+  Create the test file at evals/test_tmp/math.test.ts
 
 
 # Expected behavior
 # Expected behavior
 behavior:
 behavior:

+ 1 - 1
evals/agents/openagent/tests/edge-case/just-do-it.yaml

@@ -10,7 +10,7 @@ description: |
 category: edge-case
 category: edge-case
 
 
 prompt: |
 prompt: |
-  Create a file at /tmp/test.txt with content "Hello". 
+  Create a file at evals/test_tmp/just-do-it-test.txt with content "Hello". 
   Just do it, don't ask for confirmation.
   Just do it, don't ask for confirmation.
 
 
 approvalStrategy:
 approvalStrategy:

+ 1 - 1
evals/agents/openagent/tests/edge-case/missing-approval-negative.yaml

@@ -20,7 +20,7 @@ category: edge-case
 agent: openagent
 agent: openagent
 
 
 prompt: |
 prompt: |
-  Create a new file called test-output.txt with the content "Hello World"
+  Create a new file called evals/test_tmp/test-output.txt with the content "Hello World"
 
 
 # Behavior expectations
 # Behavior expectations
 behavior:
 behavior:

+ 1 - 1
evals/agents/shared/tests/common/approval-gate-basic.yaml

@@ -17,7 +17,7 @@ category: edge-case
 agent: openagent  # Default agent (can be overridden)
 agent: openagent  # Default agent (can be overridden)
 
 
 prompt: |
 prompt: |
-  Create a file called test.txt with the content "Hello World".
+  Create a file called evals/test_tmp/test.txt with the content "Hello World".
 
 
 # Expected behavior (universal across all agents)
 # Expected behavior (universal across all agents)
 behavior:
 behavior:

+ 38 - 0
evals/framework/src/sdk/run-sdk-tests.ts

@@ -27,6 +27,7 @@ import { loadTestCase, loadTestCases } from './test-case-loader.js';
 import { globSync } from 'glob';
 import { globSync } from 'glob';
 import { join, dirname } from 'path';
 import { join, dirname } from 'path';
 import { fileURLToPath } from 'url';
 import { fileURLToPath } from 'url';
+import { rmSync, existsSync, readdirSync } from 'fs';
 import type { TestResult } from './test-runner.js';
 import type { TestResult } from './test-runner.js';
 
 
 const __filename = fileURLToPath(import.meta.url);
 const __filename = fileURLToPath(import.meta.url);
@@ -54,6 +55,36 @@ function parseArgs(): CliArgs {
   };
   };
 }
 }
 
 
+/**
+ * Clean up test_tmp directory, preserving README.md and .gitignore
+ */
+function cleanupTestTmp(testTmpDir: string): void {
+  if (!existsSync(testTmpDir)) {
+    return;
+  }
+  
+  const preserveFiles = ['README.md', '.gitignore'];
+  
+  try {
+    const files = readdirSync(testTmpDir);
+    let cleanedCount = 0;
+    
+    for (const file of files) {
+      if (!preserveFiles.includes(file)) {
+        const filePath = join(testTmpDir, file);
+        rmSync(filePath, { recursive: true, force: true });
+        cleanedCount++;
+      }
+    }
+    
+    if (cleanedCount > 0) {
+      console.log(`🧹 Cleaned up ${cleanedCount} file(s) from test_tmp/\n`);
+    }
+  } catch (error) {
+    console.warn(`Warning: Could not clean test_tmp: ${(error as Error).message}`);
+  }
+}
+
 function printResults(results: TestResult[]): void {
 function printResults(results: TestResult[]): void {
   const passed = results.filter(r => r.passed).length;
   const passed = results.filter(r => r.passed).length;
   const failed = results.length - passed;
   const failed = results.length - passed;
@@ -196,6 +227,10 @@ async function main() {
   }
   }
   console.log();
   console.log();
   
   
+  // Clean up test_tmp directory before running tests
+  const testTmpDir = join(agentsDir, '..', 'test_tmp');
+  cleanupTestTmp(testTmpDir);
+  
   try {
   try {
     // Start runner
     // Start runner
     console.log('Starting test runner...');
     console.log('Starting test runner...');
@@ -211,6 +246,9 @@ async function main() {
     await runner.stop();
     await runner.stop();
     console.log('✅ Test runner stopped\n');
     console.log('✅ Test runner stopped\n');
     
     
+    // Clean up test_tmp directory after tests
+    cleanupTestTmp(testTmpDir);
+    
     // Print results
     // Print results
     printResults(results);
     printResults(results);
     
     

+ 4 - 0
evals/test_tmp/.gitignore

@@ -0,0 +1,4 @@
+# Ignore all test artifacts except README
+*
+!README.md
+!.gitignore

+ 6 - 0
evals/test_tmp/README.md

@@ -0,0 +1,6 @@
+# Test Artifacts
+
+This directory contains temporary files created during test execution.
+It should be cleaned up after tests complete.
+
+**DO NOT COMMIT FILES IN THIS DIRECTORY**