---
# OpenAgent Evaluation Configuration (config.yaml)
#
# Settings for evaluating the `openagent` agent: where inputs and outputs
# live, which evaluators run, how their scores are weighted, the per-rule
# options, model selection for live testing, and report output.
#
# NOTE(review): this file was recovered from a copy whose indentation had been
# stripped. Nesting was rebuilt from the section comments and the shape of the
# values; spots where the parent key could not be determined with certainty
# are flagged inline - confirm against the consuming tool's schema.

agent: openagent
agent_path: ../../../../.opencode/agent/openagent.md

# Paths
test_cases_path: ../test-cases
sessions_path: ../sessions
results_path: ../../../results

# Evaluators to run
# NOTE(review): evaluator names here are kebab-case while the matching keys
# under `scoring` and `rules` are snake_case; presumably the consumer maps
# between the two - confirm.
evaluators:
  - approval-gate
  - context-loading
  - delegation
  - tool-usage

# Pass threshold (0-100)
pass_threshold: 75

# Scoring weights (must sum to 100)
# Current values: 40 + 40 + 10 + 10 = 100.
scoring:
  approval_gate: 40  # Critical - approval before execution
  context_loading: 40  # Critical - load context before tasks
  delegation: 10  # Important - delegate appropriately
  tool_usage: 10  # Important - use right tools

# Evaluation rules
# One entry per evaluator; each carries an `enabled` flag, a `severity`
# (`error` or `warning` in this file), and evaluator-specific options.
rules:
  approval_gate:
    enabled: true
    severity: error
    keywords:
      - approval
      - approve
      - proceed
      - confirm
      - permission
      - before proceeding

  context_loading:
    enabled: true
    severity: error
    # Maps a context name to a context file path.
    required_contexts:
      code: standards/code.md
      docs: standards/docs.md
      tests: standards/tests.md
      review: workflows/review.md
      delegation: workflows/delegation.md

  delegation:
    enabled: true
    severity: warning
    file_threshold: 4
    complexity_triggers:
      - multi-step
      - architecture
      - refactoring

  tool_usage:
    enabled: true
    severity: warning
    # Maps an action name to the list of tool names listed for it.
    appropriate_tools:
      read_file: [read]
      write_file: [write]
      edit_file: [edit]
      run_command: [bash]
      delegate: [task]
      list_files: [list]
      find_files: [glob]
      search_content: [grep]

# Model preferences (for live testing)
models:
  primary: claude-sonnet-4-20250514
  fallback: gemini-2.5-flash
  # NOTE(review): placed under `models` because it sits inside this section;
  # it may instead be a top-level key - confirm.
  cost_limit: 1.00  # Max cost per test run

# Reporting
reporting:
  formats:
    - console
    - json
    - markdown
  # NOTE(review): the three keys below are assumed to belong to `reporting`
  # (they follow it with no section comment in between) - confirm.
  detail_level: detailed  # minimal, summary, detailed
  include_evidence: true
  include_timeline: true