core-tests.json 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. {
  2. "name": "OpenAgent Core Test Suite",
  3. "description": "Minimal set of tests providing maximum coverage of critical OpenAgent functionality",
  4. "version": "1.0.0",
  5. "agent": "openagent",
  6. "totalTests": 7,
  7. "estimatedRuntime": "5-8 minutes",
  8. "coverage": {
  9. "approvalGate": true,
  10. "contextLoading": true,
  11. "stopOnFailure": true,
  12. "delegation": true,
  13. "toolUsage": true,
  14. "multiTurn": true,
  15. "subagents": true
  16. },
  17. "tests": [
  18. {
  19. "id": 1,
  20. "name": "Approval Gate",
  21. "path": "01-critical-rules/approval-gate/05-approval-before-execution-positive.yaml",
  22. "category": "critical-rules",
  23. "priority": "critical",
  24. "estimatedTime": "30-60s",
  25. "description": "Validates the approval-before-execution workflow - the most critical safety rule"
  26. },
  27. {
  28. "id": 2,
  29. "name": "Context Loading (Simple)",
  30. "path": "01-critical-rules/context-loading/01-code-task.yaml",
  31. "category": "critical-rules",
  32. "priority": "critical",
  33. "estimatedTime": "60-90s",
  34. "description": "Validates context loading for code tasks - most common use case"
  35. },
  36. {
  37. "id": 3,
  38. "name": "Context Loading (Multi-Turn)",
  39. "path": "01-critical-rules/context-loading/09-multi-standards-to-docs.yaml",
  40. "category": "critical-rules",
  41. "priority": "high",
  42. "estimatedTime": "120-180s",
  43. "description": "Validates multi-turn context loading with multiple context files"
  44. },
  45. {
  46. "id": 4,
  47. "name": "Stop on Failure",
  48. "path": "01-critical-rules/stop-on-failure/02-stop-and-report-positive.yaml",
  49. "category": "critical-rules",
  50. "priority": "critical",
  51. "estimatedTime": "60-90s",
  52. "description": "Validates agent stops and reports errors instead of auto-fixing"
  53. },
  54. {
  55. "id": 5,
  56. "name": "Simple Task (No Delegation)",
  57. "path": "08-delegation/simple-task-direct.yaml",
  58. "category": "delegation",
  59. "priority": "high",
  60. "estimatedTime": "30-60s",
  61. "description": "Validates agent handles simple tasks directly without unnecessary delegation"
  62. },
  63. {
  64. "id": 6,
  65. "name": "Subagent Delegation",
  66. "path": "06-integration/medium/04-subagent-verification.yaml",
  67. "category": "integration",
  68. "priority": "high",
  69. "estimatedTime": "90-120s",
  70. "description": "Validates subagent delegation and execution for appropriate tasks"
  71. },
  72. {
  73. "id": 7,
  74. "name": "Tool Usage",
  75. "path": "09-tool-usage/dedicated-tools-usage.yaml",
  76. "category": "tool-usage",
  77. "priority": "medium",
  78. "estimatedTime": "30-60s",
  79. "description": "Validates that the agent uses dedicated tools (read/grep) instead of bash anti-patterns"
  80. }
  81. ],
  82. "rationale": {
  83. "why7Tests": "These 7 tests provide ~85% coverage of critical functionality with ~90% fewer tests than the full suite (7 vs. 71)",
  84. "coverageBreakdown": {
  85. "criticalSafetyRules": "4/4 rules covered (approval, context, stop-on-failure, report-first)",
  86. "delegationLogic": "2 tests cover both simple (no delegation) and complex (delegation) scenarios",
  87. "toolUsage": "1 test ensures proper tool usage patterns",
  88. "multiTurn": "1 test validates complex multi-turn conversations with context"
  89. },
  90. "useCases": [
  91. "Quick validation when updating OpenAgent prompt",
  92. "Pre-commit hooks for fast feedback",
  93. "CI/CD pull request validation",
  94. "Development iteration cycles"
  95. ]
  96. },
  97. "usage": {
  98. "npm": {
  99. "root": "npm run test:core",
  100. "openagent": "npm run test:openagent:core",
  101. "withModel": "npm run test:openagent:core -- --model=anthropic/claude-sonnet-4-5"
  102. },
  103. "script": {
  104. "basic": "./scripts/testing/test.sh openagent --core",
  105. "withModel": "./scripts/testing/test.sh openagent opencode/grok-code-fast --core"
  106. },
  107. "direct": {
  108. "basic": "cd evals/framework && npm run eval:sdk:core",
  109. "withAgent": "cd evals/framework && npm run eval:sdk:core -- --agent=openagent"
  110. }
  111. },
  112. "comparison": {
  113. "fullSuite": {
  114. "tests": 71,
  115. "runtime": "40-80 minutes",
  116. "coverage": "100%"
  117. },
  118. "coreSuite": {
  119. "tests": 7,
  120. "runtime": "5-8 minutes",
  121. "coverage": "~85%"
  122. },
  123. "savings": {
  124. "tests": "90% fewer tests",
  125. "time": "85-90% faster",
  126. "tokens": "~90% reduction"
  127. }
  128. }
  129. }