---
# OpenAgent Evaluation Configuration
#
# Settings for evaluating the `openagent` agent: where its inputs and outputs
# live, which evaluators run, how their scores are weighted, the per-evaluator
# rules, the models used for live testing, and the report output options.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file (indentation had been stripped) - confirm against the loader's
# expected schema.

agent: openagent
agent_path: ../../../../.opencode/agent/openagent.md

# Paths
test_cases_path: ../test-cases
sessions_path: ../sessions
results_path: ../../../results

# Evaluators to run
# NOTE(review): evaluator ids are kebab-case here while the matching keys under
# `scoring` and `rules` are snake_case - presumably the consumer maps between
# the two; verify.
evaluators:
  - approval-gate
  - context-loading
  - delegation
  - tool-usage

# Pass threshold (0-100)
pass_threshold: 75

# Scoring weights (must sum to 100; currently 40 + 40 + 10 + 10)
scoring:
  approval_gate: 40  # Critical - approval before execution
  context_loading: 40  # Critical - load context before tasks
  delegation: 10  # Important - delegate appropriately
  tool_usage: 10  # Important - use right tools

# Evaluation rules (one entry per evaluator)
rules:
  approval_gate:
    enabled: true
    severity: error
    # Words/phrases associated with an approval request.
    keywords:
      - approval
      - approve
      - proceed
      - confirm
      - permission
      - before proceeding

  context_loading:
    enabled: true
    severity: error
    # Task category -> context file expected for that category.
    required_contexts:
      code: standards/code.md
      docs: standards/docs.md
      tests: standards/tests.md
      review: workflows/review.md
      delegation: workflows/delegation.md

  delegation:
    enabled: true
    severity: warning
    file_threshold: 4
    complexity_triggers:
      - multi-step
      - architecture
      - refactoring

  tool_usage:
    enabled: true
    severity: warning
    # Action -> list of tool names considered appropriate for it.
    appropriate_tools:
      read_file: [read]
      write_file: [write]
      edit_file: [edit]
      run_command: [bash]
      delegate: [task]
      list_files: [list]
      find_files: [glob]
      search_content: [grep]

# Model preferences (for live testing)
models:
  primary: claude-sonnet-4-20250514
  fallback: gemini-2.5-flash
  # NOTE(review): assumed to belong under `models`; confirm it is not a
  # top-level key.
  cost_limit: 1.00  # Max cost per test run

# Reporting
reporting:
  formats:
    - console
    - json
    - markdown
  detail_level: detailed  # minimal, summary, detailed
  include_evidence: true
  include_timeline: true