4 months ago · aebd68e046
--- a/.github/workflows/test-agents.yml
+++ b/.github/workflows/test-agents.yml
@@ -0,0 +1,128 @@
 
				+# Workflow: runs the agent smoke-test jobs defined below and reports their results.
				+name: Test Agents
				+
				+on:
				+  # Pull requests targeting main or dev, but only when agent definitions,
				+  # the eval suite, or this workflow file itself are touched.
				+  pull_request:
				+    branches:
				+      - main
				+      - dev
				+    paths:
				+      - '.opencode/agent/**'
				+      - 'evals/**'
				+      - '.github/workflows/test-agents.yml'
				+  # Every push to main (no path filter is applied here).
				+  push:
				+    branches:
				+      - main
				+  # Manual trigger.
				+  workflow_dispatch:
			
 
				+
			
 
				+jobs:
			
 
				+  # Installs and builds the eval framework, runs the OpenAgent CI smoke test,
				+  # then archives the contents of evals/results/ as the "openagent-results" artifact.
				+  test-openagent:
				+    name: Test OpenAgent
				+    runs-on: ubuntu-latest
				+    timeout-minutes: 10
				+
				+    steps:
				+      - name: Checkout code
				+        uses: actions/checkout@v4
				+
				+      # npm cache is keyed on the framework's lockfile.
				+      - name: Setup Node.js
				+        uses: actions/setup-node@v4
				+        with:
				+          node-version: '20'
				+          cache: 'npm'
				+          cache-dependency-path: 'evals/framework/package-lock.json'
				+
				+      - name: Install dependencies
				+        working-directory: evals/framework
				+        run: npm ci
				+
				+      - name: Build framework
				+        working-directory: evals/framework
				+        run: npm run build
				+
				+      # Runs from the repository root, where the test:ci:* scripts are defined.
				+      - name: Run OpenAgent smoke test
				+        run: npm run test:ci:openagent
				+        env:
				+          CI: true
				+
				+      # always(): upload results even when an earlier step failed.
				+      - name: Upload test results
				+        if: always()
				+        uses: actions/upload-artifact@v4
				+        with:
				+          name: openagent-results
				+          path: evals/results/
				+          retention-days: 30
			
 
				+
			
 
				+  # Installs and builds the eval framework, runs the OpenCoder CI smoke test,
				+  # then archives the contents of evals/results/ as the "opencoder-results" artifact.
				+  test-opencoder:
				+    name: Test OpenCoder
				+    runs-on: ubuntu-latest
				+    timeout-minutes: 10
				+
				+    steps:
				+      - name: Checkout code
				+        uses: actions/checkout@v4
				+
				+      # npm cache is keyed on the framework's lockfile.
				+      - name: Setup Node.js
				+        uses: actions/setup-node@v4
				+        with:
				+          node-version: '20'
				+          cache: 'npm'
				+          cache-dependency-path: 'evals/framework/package-lock.json'
				+
				+      - name: Install dependencies
				+        working-directory: evals/framework
				+        run: npm ci
				+
				+      - name: Build framework
				+        working-directory: evals/framework
				+        run: npm run build
				+
				+      # Runs from the repository root, where the test:ci:* scripts are defined.
				+      - name: Run OpenCoder smoke test
				+        run: npm run test:ci:opencoder
				+        env:
				+          CI: true
				+
				+      # always(): upload results even when an earlier step failed.
				+      - name: Upload test results
				+        if: always()
				+        uses: actions/upload-artifact@v4
				+        with:
				+          name: opencoder-results
				+          path: evals/results/
				+          retention-days: 30
			
 
				+
			
 
				+  # Collects the result artifacts from both test jobs and writes a Markdown
				+  # summary to the run's step summary. Runs even when a test job failed.
				+  report-results:
				+    name: Report Test Results
				+    runs-on: ubuntu-latest
				+    needs: [test-openagent, test-opencoder]
				+    if: always()
				+
				+    steps:
				+      # continue-on-error: a test job may not have produced an artifact;
				+      # that case is reported in the summary rather than failing this job.
				+      - name: Download OpenAgent results
				+        uses: actions/download-artifact@v4
				+        with:
				+          name: openagent-results
				+          path: results/openagent
				+        continue-on-error: true
				+
				+      - name: Download OpenCoder results
				+        uses: actions/download-artifact@v4
				+        with:
				+          name: opencoder-results
				+          path: results/opencoder
				+        continue-on-error: true
				+
				+      - name: Display results summary
				+        run: |
				+          # Append one section per agent to the step summary.
				+          #   $1 - section title, $2 - path to that agent's latest.json
				+          summarize() {
				+            local title="$1" file="$2"
				+            {
				+              echo "### ${title}"
				+              if [ -f "$file" ]; then
				+                # A malformed file is reported instead of aborting the whole report.
				+                jq -r '"- Passed: \(.passed)\n- Failed: \(.failed)\n- Total: \(.total)"' "$file" \
				+                  || echo "- Results file could not be parsed"
				+              else
				+                # Make the absence explicit rather than silently omitting the section.
				+                echo "- No results file found"
				+              fi
				+              echo ""
				+            } >> "$GITHUB_STEP_SUMMARY"
				+          }
				+
				+          {
				+            echo "## Test Results Summary"
				+            echo ""
				+          } >> "$GITHUB_STEP_SUMMARY"
				+
				+          summarize "OpenAgent" results/openagent/latest.json
				+          summarize "OpenCoder" results/opencoder/latest.json
			
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,111 @@
 
				+# Changelog
			
 
				+
			
 
				+All notable changes to this project will be documented in this file.
			
 
				+
			
 
				+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
			
 
				+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
			
 
				+
			
 
				+## [0.1.0-alpha.1] - 2025-11-26
			
 
				+
			
 
				+### Added
			
 
				+
			
 
				+#### SDK-Based Evaluation Framework
			
 
				+- Complete test execution framework using OpenCode SDK
			
 
				+- Support for openagent and opencoder testing
			
 
				+- Real agent testing with session management
			
 
				+- Smart timeout system with activity monitoring
			
 
				+- Multi-turn conversation support
			
 
				+
			
 
				+#### Modular Architecture
			
 
				+- Refactored test-runner.ts (884 lines → 4 focused modules):
			
 
				+  - `test-runner.ts` (411 lines): Thin orchestrator
			
 
				+  - `test-executor.ts` (392 lines): Core execution logic
			
 
				+  - `result-validator.ts` (253 lines): Validation logic
			
 
				+  - `event-logger.ts` (128 lines): Logging utilities
			
 
				+- Improved Single Responsibility Principle compliance
			
 
				+- Enhanced testability through dependency injection
			
 
				+
			
 
				+#### Test Infrastructure
			
 
				+- 20+ test cases across multiple categories:
			
 
				+  - OpenAgent: Developer (12), Context Loading (5), Business (2), Edge Cases (3)
			
 
				+  - OpenCoder: Developer (4)
			
 
				+- BehaviorEvaluator for validating expected agent actions
			
 
				+- Comprehensive evaluators: approval-gate, context-loading, delegation, tool-usage
			
 
				+
			
 
				+#### Interactive Results Dashboard
			
 
				+- Real-time test results visualization
			
 
				+- Filtering by agent, category, status
			
 
				+- Detailed violation tracking
			
 
				+- CSV export functionality
			
 
				+- Historical results tracking
			
 
				+- One-command deployment (`./serve.sh`)
			
 
				+
			
 
				+#### Documentation
			
 
				+- ARCHITECTURE.md: Comprehensive system review (456 lines)
			
 
				+- GETTING_STARTED.md: Quick start guide (435 lines)
			
 
				+- SDK_EVAL_README.md: Complete SDK guide (298 lines)
			
 
				+- Test design guide and architecture overview
			
 
				+- Documentation cleanup (removed 3 outdated files)
			
 
				+
			
 
				+#### Script Organization
			
 
				+- Organized 12 scripts into logical directories:
			
 
				+  - `scripts/debug/`: Session debugging tools (4 files)
			
 
				+  - `scripts/test/`: Test execution scripts (6 files)
			
 
				+  - `scripts/utils/`: Utility scripts (2 files)
			
 
				+- Comprehensive scripts/README.md with usage examples
			
 
				+
			
 
				+#### Monorepo Structure
			
 
				+- Root package.json with convenient npm scripts
			
 
				+- Easy agent selection (openagent, opencoder)
			
 
				+- Easy model selection (grok, claude, gpt-4)
			
 
				+- Quick dashboard access from root
			
 
				+- No folder navigation required
			
 
				+
			
 
				+#### CI/CD
			
 
				+- GitHub Actions workflow for automated testing
			
 
				+- Pre-merge validation for agent changes
			
 
				+- Fast smoke tests for both agents
			
 
				+- Automated test result reporting
			
 
				+
			
 
				+#### Agent Improvements
			
 
				+- Enhanced openagent with better context loading
			
 
				+- New opencoder agent with test suite
			
 
				+- Improved subagent invocation patterns
			
 
				+- Ultra-compact context index system
			
 
				+
			
 
				+### Changed
			
 
				+- Reorganized evaluation framework structure
			
 
				+- Improved test case schema with behavior expectations
			
 
				+- Enhanced context loading detection
			
 
				+
			
 
				+### Removed
			
 
				+- Outdated documentation files (TESTING_CONFIDENCE.md, TEST_REVIEW.md, SESSION_STORAGE_FIX.md)
			
 
				+- Redundant test files
			
 
				+
			
 
				+### Fixed
			
 
				+- Context loading evaluator detection accuracy
			
 
				+- Multi-turn prompt handling
			
 
				+- Test artifact cleanup
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Version Format
			
 
				+
			
 
				+```
			
 
				+v0.1.0-alpha.1
			
 
				+ │ │ │ │     │
			
 
				+ │ │ │ │     └─ Build/Iteration number
			
 
				+ │ │ │ └─────── Release stage (alpha, beta, rc)
			
 
				+ │ │ └───────── Patch version
			
 
				+ │ └─────────── Minor version
			
 
				+ └───────────── Major version (0 = pre-release)
			
 
				+```
			
 
				+
			
 
				+### Version Progression
			
 
				+
			
 
				+- **Alpha** (`v0.x.0-alpha.N`): Early development, unstable
			
 
				+- **Beta** (`v0.x.0-beta.N`): Feature complete, testing
			
 
				+- **RC** (`v0.x.0-rc.N`): Release candidate, stable
			
 
				+- **Stable** (`v1.x.x`): Production ready
			
 
				+
			
 
				+[0.1.0-alpha.1]: https://github.com/darrenhinde/OpenAgents/releases/tag/v0.1.0-alpha.1
			
--- a/QUICK_START.md
+++ b/QUICK_START.md
@@ -0,0 +1,303 @@
 
				+# 🚀 OpenCode Agents - Quick Start
			
 
				+
			
 
				+![Version](https://img.shields.io/badge/version-0.1.0--alpha.1-blue)
			
 
				+
			
 
				+## 📋 Available Agents
			
 
				+
			
 
				+- **openagent** - Full-featured development agent (22+ tests)
			
 
				+  - Developer tests: Code, docs, tests, delegation
			
 
				+  - Context loading tests: Standards, patterns, workflows
			
 
				+  - Business tests: Conversations, data analysis
			
 
				+  - Edge cases: Approval gates, negative tests
			
 
				+
			
 
				+- **opencoder** - Specialized coding agent (4+ tests)
			
 
				+  - Developer tests: Bash execution, file operations
			
 
				+  - Multi-tool workflows
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🧪 Running Tests
			
 
				+
			
 
				+### Test All Agents
			
 
				+```bash
			
 
				+npm test                              # All agents, all tests (default)
			
 
				+npm run test:all                      # Explicit all agents
			
 
				+```
			
 
				+
			
 
				+### Test Specific Agent
			
 
				+```bash
			
 
				+npm run test:openagent                # OpenAgent only
			
 
				+npm run test:opencoder                # OpenCoder only
			
 
				+```
			
 
				+
			
 
				+### Test with Different Models
			
 
				+
			
 
				+#### OpenAgent
			
 
				+```bash
			
 
				+npm run test:openagent:grok           # Grok (free tier, fast)
			
 
				+npm run test:openagent:claude         # Claude 3.5 Sonnet (best quality)
			
 
				+npm run test:openagent:gpt4           # GPT-4 Turbo (OpenAI)
			
 
				+```
			
 
				+
			
 
				+#### OpenCoder
			
 
				+```bash
			
 
				+npm run test:opencoder:grok           # Grok (free tier, fast)
			
 
				+npm run test:opencoder:claude         # Claude 3.5 Sonnet (best quality)
			
 
				+npm run test:opencoder:gpt4           # GPT-4 Turbo (OpenAI)
			
 
				+```
			
 
				+
			
 
				+#### All Agents
			
 
				+```bash
			
 
				+npm run test:all:grok                 # All agents with Grok
			
 
				+npm run test:all:claude               # All agents with Claude
			
 
				+npm run test:all:gpt4                 # All agents with GPT-4
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🎯 Test Specific Categories
			
 
				+
			
 
				+### OpenAgent Categories
			
 
				+```bash
			
 
				+npm run test:openagent:developer      # Developer tests (code, docs, tests)
			
 
				+npm run test:openagent:context        # Context loading tests
			
 
				+npm run test:openagent:business       # Business/conversation tests
			
 
				+```
			
 
				+
			
 
				+### OpenCoder Categories
			
 
				+```bash
			
 
				+npm run test:opencoder:developer      # Developer tests
			
 
				+npm run test:opencoder:bash           # Bash execution tests
			
 
				+```
			
 
				+
			
 
				+### Custom Patterns
			
 
				+```bash
			
 
				+npm run test:pattern -- "developer/*.yaml"              # All developer tests
			
 
				+npm run test:pattern -- "context-loading/*.yaml"        # Context tests
			
 
				+npm run test:pattern -- "edge-case/*.yaml"              # Edge cases
			
 
				+npm run test:openagent -- --pattern="developer/ctx-*"   # OpenAgent context tests
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 📊 View Results
			
 
				+
			
 
				+### Dashboard (Recommended)
			
 
				+```bash
			
 
				+npm run dashboard                     # Launch interactive dashboard
			
 
				+npm run dashboard:open                # Launch and auto-open browser
			
 
				+```
			
 
				+
			
 
				+The dashboard provides:
			
 
				+- ✅ Real-time test results visualization
			
 
				+- ✅ Filter by agent, category, status
			
 
				+- ✅ Detailed violation tracking
			
 
				+- ✅ CSV export functionality
			
 
				+- ✅ Historical results tracking
			
 
				+
			
 
				+### Command Line
			
 
				+```bash
			
 
				+npm run results:openagent             # Recent OpenAgent results
			
 
				+npm run results:opencoder             # Recent OpenCoder results
			
 
				+npm run results:latest                # Latest test summary (JSON)
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🐛 Debug Mode
			
 
				+
			
 
				+```bash
			
 
				+npm run test:debug                    # Run with debug output
			
 
				+npm run test:openagent -- --debug     # Debug OpenAgent tests
			
 
				+npm run test:opencoder -- --debug     # Debug OpenCoder tests
			
 
				+```
			
 
				+
			
 
				+Debug mode shows:
			
 
				+- Detailed event logging
			
 
				+- Tool call details
			
 
				+- Session information
			
 
				+- Evaluation progress
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🔧 Development
			
 
				+
			
 
				+```bash
			
 
				+npm run dev:setup                     # Install dependencies
			
 
				+npm run dev:build                     # Build framework
			
 
				+npm run dev:test                      # Run unit tests
			
 
				+npm run dev:clean                     # Clean and reinstall
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 📈 Version Management
			
 
				+
			
 
				+```bash
			
 
				+npm run version                       # Show current version
			
 
				+npm run version:bump alpha            # Bump alpha version
			
 
				+npm run version:bump beta             # Bump to beta
			
 
				+npm run version:bump rc               # Bump to release candidate
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 📁 Test Structure
			
 
				+
			
 
				+```
			
 
				+evals/agents/
			
 
				+├── openagent/tests/
			
 
				+│   ├── developer/          # Code, docs, tests (12 tests)
			
 
				+│   │   ├── ctx-code-001.yaml
			
 
				+│   │   ├── ctx-docs-001.yaml
			
 
				+│   │   ├── ctx-tests-001.yaml
			
 
				+│   │   ├── ctx-delegation-001.yaml
			
 
				+│   │   └── ...
			
 
				+│   ├── context-loading/    # Context loading (5 tests)
			
 
				+│   │   ├── ctx-simple-coding-standards.yaml
			
 
				+│   │   ├── ctx-simple-documentation-format.yaml
			
 
				+│   │   └── ...
			
 
				+│   ├── business/           # Conversations (2 tests)
			
 
				+│   │   ├── conv-simple-001.yaml
			
 
				+│   │   └── data-analysis.yaml
			
 
				+│   └── edge-case/          # Edge cases (3 tests)
			
 
				+│       ├── just-do-it.yaml
			
 
				+│       ├── missing-approval-negative.yaml
			
 
				+│       └── no-approval-negative.yaml
			
 
				+│
			
 
				+└── opencoder/tests/
			
 
				+    └── developer/          # Bash, file ops (4 tests)
			
 
				+        ├── bash-execution-001.yaml
			
 
				+        ├── file-read-001.yaml
			
 
				+        ├── multi-tool-001.yaml
			
 
				+        └── simple-bash-test.yaml
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 💡 Common Workflows
			
 
				+
			
 
				+### Quick Test (Free Tier)
			
 
				+```bash
			
 
				+npm run test:openagent:grok           # Fast, free
			
 
				+npm run test:opencoder:grok           # Fast, free
			
 
				+```
			
 
				+
			
 
				+### Quality Test (Best Model)
			
 
				+```bash
			
 
				+npm run test:openagent:claude         # Best quality
			
 
				+npm run test:opencoder:claude         # Best quality
			
 
				+```
			
 
				+
			
 
				+### Full Test Suite
			
 
				+```bash
			
 
				+npm run test:all:claude               # All agents, best model
			
 
				+```
			
 
				+
			
 
				+### Continuous Development
			
 
				+```bash
			
 
				+# 1. Run tests in debug mode
			
 
				+npm run test:openagent:developer -- --debug
			
 
				+
			
 
				+# 2. View results in dashboard
			
 
				+npm run dashboard:open
			
 
				+
			
 
				+# 3. Iterate on agent prompts
			
 
				+# Edit .opencode/agent/openagent.md
			
 
				+
			
 
				+# 4. Re-run tests
			
 
				+npm run test:openagent:developer
			
 
				+```
			
 
				+
			
 
				+### CI/CD Smoke Tests
			
 
				+```bash
			
 
				+npm run test:ci                       # Fast smoke tests for both agents
			
 
				+npm run test:ci:openagent             # OpenAgent smoke test
			
 
				+npm run test:ci:opencoder             # OpenCoder smoke test
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🎯 Test Results
			
 
				+
			
 
				+After running tests, results are saved to:
			
 
				+- `evals/results/latest.json` - Latest test run
			
 
				+- `evals/results/history/YYYY-MM/DD-HHMMSS-{agent}.json` - Historical results
			
 
				+
			
 
				+View in dashboard: `npm run dashboard:open`
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🔍 Understanding Test Results
			
 
				+
			
 
				+### Test Status
			
 
				+- ✅ **PASSED** - All checks passed, no violations
			
 
				+- ❌ **FAILED** - Test failed (execution error or violations)
			
 
				+
			
 
				+### Evaluators
			
 
				+Tests are evaluated by multiple evaluators:
			
 
				+- **approval-gate** - Checks if agent requested approval when required
			
 
				+- **context-loading** - Validates context files were loaded before execution
			
 
				+- **delegation** - Checks if agent delegated to subagents appropriately
			
 
				+- **tool-usage** - Validates correct tool usage
			
 
				+- **behavior** - Checks if agent performed expected actions
			
 
				+
			
 
				+### Violations
			
 
				+- **Error** - Critical issues that cause test failure
			
 
				+- **Warning** - Non-critical issues
			
 
				+- **Info** - Informational messages
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 📚 Additional Resources
			
 
				+
			
 
				+- [README.md](README.md) - Project overview
			
 
				+- [evals/GETTING_STARTED.md](evals/GETTING_STARTED.md) - Detailed evaluation guide
			
 
				+- [evals/ARCHITECTURE.md](evals/ARCHITECTURE.md) - System architecture
			
 
				+- [evals/framework/SDK_EVAL_README.md](evals/framework/SDK_EVAL_README.md) - SDK documentation
			
 
				+- [CHANGELOG.md](CHANGELOG.md) - Version history
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🆘 Troubleshooting
			
 
				+
			
 
				+### Tests not running?
			
 
				+```bash
			
 
				+# Ensure dependencies are installed
			
 
				+npm run dev:setup
			
 
				+
			
 
				+# Build the framework
			
 
				+npm run dev:build
			
 
				+```
			
 
				+
			
 
				+### Dashboard not loading?
			
 
				+```bash
			
 
				+# Check if results exist
			
 
				+ls -la evals/results/
			
 
				+
			
 
				+# Try launching manually
			
 
				+cd evals/results && ./serve.sh
			
 
				+```
			
 
				+
			
 
				+### Version mismatch?
			
 
				+```bash
			
 
				+# Check current version
			
 
				+npm run version
			
 
				+
			
 
				+# Sync VERSION file with package.json
			
 
				+node -p "require('./package.json').version" > VERSION
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 🎉 Getting Help
			
 
				+
			
 
				+- Check [evals/GETTING_STARTED.md](evals/GETTING_STARTED.md) for detailed guides
			
 
				+- Review test examples in `evals/agents/*/tests/`
			
 
				+- Run tests in debug mode: `npm run test:debug`
			
 
				+- View results dashboard: `npm run dashboard:open`
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**Current Version:** 0.1.0-alpha.1  
			
 
				+**Last Updated:** 2025-11-26
			
--- a/VERSION
+++ b/VERSION
@@ -0,0 +1 @@
 
				+0.1.0-alpha.1
			
--- a/package.json
+++ b/package.json
@@ -0,0 +1,64 @@
 
				+{
				+  "name": "opencode-agents",
				+  "version": "0.1.0-alpha.1",
				+  "description": "OpenCode agent evaluation framework and test suites",
				+  "private": true,
				+  "workspaces": [
				+    "evals/framework"
				+  ],
				+  "scripts": {
				+    "test": "npm run test:all",
				+    "test:all": "cd evals/framework && npm run eval:sdk",
				+    "test:openagent": "cd evals/framework && npm run eval:sdk -- --agent=openagent",
				+    "test:opencoder": "cd evals/framework && npm run eval:sdk -- --agent=opencoder",
				+    "test:openagent:grok": "npm run test:openagent -- --model=opencode/grok-code-fast",
				+    "test:openagent:claude": "npm run test:openagent -- --model=anthropic/claude-3-5-sonnet-20241022",
				+    "test:openagent:gpt4": "npm run test:openagent -- --model=openai/gpt-4-turbo",
				+    "test:opencoder:grok": "npm run test:opencoder -- --model=opencode/grok-code-fast",
				+    "test:opencoder:claude": "npm run test:opencoder -- --model=anthropic/claude-3-5-sonnet-20241022",
				+    "test:opencoder:gpt4": "npm run test:opencoder -- --model=openai/gpt-4-turbo",
				+    "test:all:grok": "npm run test:all -- --model=opencode/grok-code-fast",
				+    "test:all:claude": "npm run test:all -- --model=anthropic/claude-3-5-sonnet-20241022",
				+    "test:all:gpt4": "npm run test:all -- --model=openai/gpt-4-turbo",
				+    "test:pattern": "cd evals/framework && npm run eval:sdk -- --pattern",
				+    "test:debug": "cd evals/framework && npm run eval:sdk -- --debug",
				+    "test:openagent:developer": "npm run test:openagent -- --pattern='developer/*.yaml'",
				+    "test:openagent:context": "npm run test:openagent -- --pattern='context-loading/*.yaml'",
				+    "test:openagent:business": "npm run test:openagent -- --pattern='business/*.yaml'",
				+    "test:opencoder:developer": "npm run test:opencoder -- --pattern='developer/*.yaml'",
				+    "test:opencoder:bash": "npm run test:opencoder -- --pattern='developer/bash-*.yaml'",
				+    "test:ci": "npm run test:ci:openagent && npm run test:ci:opencoder",
				+    "test:ci:openagent": "npm run test:openagent -- --pattern='developer/ctx-code-001.yaml' --no-evaluators",
				+    "test:ci:opencoder": "npm run test:opencoder -- --pattern='developer/simple-bash-test.yaml' --no-evaluators",
				+    "dashboard": "cd evals/results && ./serve.sh",
				+    "dashboard:open": "bash scripts/dashboard.sh 8000 true",
				+    "results:openagent": "echo 'OpenAgent results:' && ls -lh evals/results/history/*openagent*.json evals/results/history/*/*openagent*.json 2>/dev/null | tail -5 | grep . || echo 'No results yet'",
				+    "results:opencoder": "echo 'OpenCoder results:' && ls -lh evals/results/history/*opencoder*.json evals/results/history/*/*opencoder*.json 2>/dev/null | tail -5 | grep . || echo 'No results yet'",
				+    "results:latest": "jq '.agent, .passed, .failed' evals/results/latest.json 2>/dev/null || echo 'No results yet'",
				+    "version": "cat VERSION",
				+    "version:bump": "./scripts/bump-version.sh",
				+    "version:bump:patch": "npm version patch --no-git-tag-version && node -p \"require('./package.json').version\" > VERSION",
				+    "version:bump:minor": "npm version minor --no-git-tag-version && node -p \"require('./package.json').version\" > VERSION",
				+    "version:bump:major": "npm version major --no-git-tag-version && node -p \"require('./package.json').version\" > VERSION",
				+    "version:bump:alpha": "npm version prerelease --preid=alpha --no-git-tag-version && node -p \"require('./package.json').version\" > VERSION",
				+    "version:bump:beta": "npm version prerelease --preid=beta --no-git-tag-version && node -p \"require('./package.json').version\" > VERSION",
				+    "version:bump:rc": "npm version prerelease --preid=rc --no-git-tag-version && node -p \"require('./package.json').version\" > VERSION",
				+    "dev:setup": "cd evals/framework && npm install",
				+    "dev:build": "cd evals/framework && npm run build",
				+    "dev:test": "cd evals/framework && npm test",
				+    "dev:clean": "cd evals/framework && rm -rf dist node_modules && npm install"
				+  },
				+  "keywords": [
				+    "opencode",
				+    "agents",
				+    "evaluation",
				+    "testing",
				+    "ai"
				+  ],
				+  "author": "Darren Hinde",
				+  "license": "MIT",
				+  "repository": {
				+    "type": "git",
				+    "url": "https://github.com/darrenhinde/OpenAgents.git"
				+  }
				+}
			
--- a/scripts/bump-version.sh
+++ b/scripts/bump-version.sh
@@ -0,0 +1,73 @@
 
				+#!/bin/bash
			
 
				+# Version bump script
			
 
				+# Usage: ./scripts/bump-version.sh [alpha|beta|rc|patch|minor|major]
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+STAGE=${1:-alpha}
			
 
				+
			
 
				+# Colors
			
 
				+GREEN='\033[0;32m'
			
 
				+BLUE='\033[0;34m'
			
 
				+YELLOW='\033[1;33m'
			
 
				+RED='\033[0;31m'
			
 
				+NC='\033[0m'
			
 
				+
			
 
				+# Get current version
			
 
				+CURRENT_VERSION=$(cat VERSION 2>/dev/null || echo "0.1.0-alpha.1")
			
 
				+
			
 
				+echo -e "${BLUE}📦 Version Bump Tool${NC}"
			
 
				+echo -e "${BLUE}====================${NC}"
			
 
				+echo ""
			
 
				+echo -e "Current version: ${YELLOW}${CURRENT_VERSION}${NC}"
			
 
				+echo -e "Bump type:       ${YELLOW}${STAGE}${NC}"
			
 
				+echo ""
			
 
				+
			
 
				+# Navigate to root
			
 
				+cd "$(dirname "$0")/.." || exit 1
			
 
				+
			
 
				+# Bump version in package.json
			
 
				+case "$STAGE" in
			
 
				+  alpha)
			
 
				+    npm run version:bump:alpha
			
 
				+    ;;
			
 
				+  beta)
			
 
				+    npm run version:bump:beta
			
 
				+    ;;
			
 
				+  rc)
			
 
				+    npm run version:bump:rc
			
 
				+    ;;
			
 
				+  patch)
			
 
				+    npm run version:bump:patch
			
 
				+    ;;
			
 
				+  minor)
			
 
				+    npm run version:bump:minor
			
 
				+    ;;
			
 
				+  major)
			
 
				+    npm run version:bump:major
			
 
				+    ;;
			
 
				+  *)
			
 
				+    echo -e "${RED}❌ Invalid stage: $STAGE${NC}"
			
 
				+    echo -e "${YELLOW}Valid options: alpha, beta, rc, patch, minor, major${NC}"
			
 
				+    exit 1
			
 
				+    ;;
			
 
				+esac
			
 
				+
			
 
				+# Get new version
			
 
				+NEW_VERSION=$(cat VERSION)
			
 
				+
			
 
				+echo ""
			
 
				+echo -e "${GREEN}✅ Version bumped!${NC}"
			
 
				+echo -e "New version: ${GREEN}${NEW_VERSION}${NC}"
			
 
				+echo ""
			
 
				+
			
 
				+# Prompt for changelog update
			
 
				+echo -e "${YELLOW}📝 Don't forget to update CHANGELOG.md!${NC}"
			
 
				+echo ""
			
 
				+echo -e "Next steps:"
			
 
				+echo -e "  1. Update CHANGELOG.md with changes"
			
 
				+echo -e "  2. Commit: ${BLUE}git add VERSION package.json CHANGELOG.md${NC}"
			
 
				+echo -e "  3. Commit: ${BLUE}git commit -m \"chore: bump version to v${NEW_VERSION}\"${NC}"
			
 
				+echo -e "  4. Tag:    ${BLUE}git tag v${NEW_VERSION}${NC}"
			
 
				+echo -e "  5. Push:   ${BLUE}git push origin main --tags${NC}"
			
 
				+echo ""
			
--- a/scripts/dashboard.sh
+++ b/scripts/dashboard.sh
@@ -0,0 +1,58 @@
 
				+#!/bin/bash
			
 
				+# Enhanced dashboard launcher with auto-open
			
 
				+# Usage: ./scripts/dashboard.sh [port] [auto-open]
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+PORT=${1:-8000}
			
 
				+AUTO_OPEN=${2:-true}
			
 
				+
			
 
				+# Colors
			
 
				+GREEN='\033[0;32m'
			
 
				+BLUE='\033[0;34m'
			
 
				+YELLOW='\033[1;33m'
			
 
				+NC='\033[0m'
			
 
				+
			
 
				+echo -e "${BLUE}🚀 Starting OpenCode Agents Dashboard...${NC}"
			
 
				+echo -e "${BLUE}📊 Results directory: evals/results${NC}"
			
 
				+echo -e "${BLUE}🌐 URL: http://localhost:$PORT${NC}"
			
 
				+echo ""
			
 
				+
			
 
				+# Navigate to results directory
			
 
				+cd "$(dirname "$0")/../evals/results" || exit 1
			
 
				+
			
 
				+# Check if results exist
			
 
				+if [ ! -f "latest.json" ]; then
			
 
				+  echo -e "${YELLOW}⚠️  No test results found yet.${NC}"
			
 
				+  echo -e "${YELLOW}   Run tests first: npm test${NC}"
			
 
				+  echo ""
			
 
				+fi
			
 
				+
			
 
				+# Start server in background
			
 
				+./serve.sh "$PORT" &
			
 
				+SERVER_PID=$!
			
 
				+
			
 
				+# Wait for server to start
			
 
				+sleep 2
			
 
				+
			
 
				+# Auto-open browser
			
 
				+if [ "$AUTO_OPEN" = "true" ]; then
			
 
				+  echo -e "${GREEN}🌐 Opening browser...${NC}"
			
 
				+  if command -v open &> /dev/null; then
			
 
				+    open "http://localhost:$PORT"
			
 
				+  elif command -v xdg-open &> /dev/null; then
			
 
				+    xdg-open "http://localhost:$PORT"
			
 
				+  elif command -v start &> /dev/null; then
			
 
				+    start "http://localhost:$PORT"
			
 
				+  else
			
 
				+    echo -e "${YELLOW}⚠️  Could not auto-open browser. Please visit: http://localhost:$PORT${NC}"
			
 
				+  fi
			
 
				+fi
			
 
				+
			
 
				+echo ""
			
 
				+echo -e "${GREEN}✅ Dashboard running (PID: $SERVER_PID)${NC}"
			
 
				+echo -e "${YELLOW}Press Ctrl+C to stop${NC}"
			
 
				+echo ""
			
 
				+
			
 
				+# Wait for Ctrl+C
			
 
				+wait $SERVER_PID
			
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -0,0 +1,60 @@
 
				+#!/bin/bash
			
 
				+# Advanced test runner with multi-agent support
			
 
				+# Usage: ./scripts/test.sh [agent] [model] [options]
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+# Colors
			
 
				+GREEN='\033[0;32m'
			
 
				+BLUE='\033[0;34m'
			
 
				+YELLOW='\033[1;33m'
			
 
				+RED='\033[0;31m'
			
 
				+NC='\033[0m' # No Color
			
 
				+
			
 
				+# Defaults
			
 
				+AGENT=${1:-all}
			
 
				+MODEL=${2:-opencode/grok-code-fast}
			
 
				+shift 2 2>/dev/null || true
			
 
				+EXTRA_ARGS="$@"
			
 
				+
			
 
				+echo -e "${BLUE}🧪 OpenCode Agents Test Runner${NC}"
			
 
				+echo -e "${BLUE}================================${NC}"
			
 
				+echo ""
			
 
				+echo -e "Agent:  ${GREEN}${AGENT}${NC}"
			
 
				+echo -e "Model:  ${GREEN}${MODEL}${NC}"
			
 
				+if [ -n "$EXTRA_ARGS" ]; then
			
 
				+  echo -e "Extra:  ${YELLOW}${EXTRA_ARGS}${NC}"
			
 
				+fi
			
 
				+echo ""
			
 
				+
			
 
				+# Navigate to framework directory
			
 
				+cd "$(dirname "$0")/../evals/framework" || exit 1
			
 
				+
			
 
				+# Check if dependencies are installed
			
 
				+if [ ! -d "node_modules" ]; then
			
 
				+  echo -e "${YELLOW}⚠️  Dependencies not installed. Running npm install...${NC}"
			
 
				+  npm install
			
 
				+  echo ""
			
 
				+fi
			
 
				+
			
 
				+# Run tests
			
 
				+if [ "$AGENT" = "all" ]; then
			
 
				+  echo -e "${YELLOW}Running tests for ALL agents...${NC}"
			
 
				+  npm run eval:sdk -- --model="$MODEL" $EXTRA_ARGS
			
 
				+else
			
 
				+  echo -e "${YELLOW}Running tests for ${AGENT}...${NC}"
			
 
				+  npm run eval:sdk -- --agent="$AGENT" --model="$MODEL" $EXTRA_ARGS
			
 
				+fi
			
 
				+
			
 
				+EXIT_CODE=$?
			
 
				+
			
 
				+echo ""
			
 
				+if [ $EXIT_CODE -eq 0 ]; then
			
 
				+  echo -e "${GREEN}✅ Tests complete!${NC}"
			
 
				+else
			
 
				+  echo -e "${RED}❌ Tests failed with exit code ${EXIT_CODE}${NC}"
			
 
				+fi
			
 
				+echo -e "${BLUE}View results: npm run dashboard${NC}"
			
 
				+echo ""
			
 
				+
			
 
				+exit $EXIT_CODE