core-tests.json 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. {
  2. "name": "OpenAgent Core Test Suite",
  3. "description": "Minimal set of tests providing maximum coverage of critical OpenAgent functionality",
  4. "version": "1.0.0",
  5. "totalTests": 7,
  6. "estimatedRuntime": "5-8 minutes",
  7. "coverage": {
  8. "approvalGate": true,
  9. "contextLoading": true,
  10. "stopOnFailure": true,
  11. "delegation": true,
  12. "toolUsage": true,
  13. "multiTurn": true,
  14. "subagents": true
  15. },
  16. "tests": [
  17. {
  18. "id": 1,
  19. "name": "Approval Gate",
  20. "path": "01-critical-rules/approval-gate/05-approval-before-execution-positive.yaml",
  21. "category": "critical-rules",
  22. "priority": "critical",
  23. "estimatedTime": "30-60s",
  24. "description": "Validates the approval-before-execution workflow - the most critical safety rule"
  25. },
  26. {
  27. "id": 2,
  28. "name": "Context Loading (Simple)",
  29. "path": "01-critical-rules/context-loading/01-code-task.yaml",
  30. "category": "critical-rules",
  31. "priority": "critical",
  32. "estimatedTime": "60-90s",
  33. "description": "Validates context loading for code tasks - the most common use case"
  34. },
  35. {
  36. "id": 3,
  37. "name": "Context Loading (Multi-Turn)",
  38. "path": "01-critical-rules/context-loading/09-multi-standards-to-docs.yaml",
  39. "category": "critical-rules",
  40. "priority": "high",
  41. "estimatedTime": "120-180s",
  42. "description": "Validates multi-turn context loading with multiple context files"
  43. },
  44. {
  45. "id": 4,
  46. "name": "Stop on Failure",
  47. "path": "01-critical-rules/stop-on-failure/02-stop-and-report-positive.yaml",
  48. "category": "critical-rules",
  49. "priority": "critical",
  50. "estimatedTime": "60-90s",
  51. "description": "Validates agent stops and reports errors instead of auto-fixing"
  52. },
  53. {
  54. "id": 5,
  55. "name": "Simple Task (No Delegation)",
  56. "path": "08-delegation/simple-task-direct.yaml",
  57. "category": "delegation",
  58. "priority": "high",
  59. "estimatedTime": "30-60s",
  60. "description": "Validates agent handles simple tasks directly without unnecessary delegation"
  61. },
  62. {
  63. "id": 6,
  64. "name": "Subagent Delegation",
  65. "path": "06-integration/medium/04-subagent-verification.yaml",
  66. "category": "integration",
  67. "priority": "high",
  68. "estimatedTime": "90-120s",
  69. "description": "Validates subagent delegation and execution for appropriate tasks"
  70. },
  71. {
  72. "id": 7,
  73. "name": "Tool Usage",
  74. "path": "09-tool-usage/dedicated-tools-usage.yaml",
  75. "category": "tool-usage",
  76. "priority": "medium",
  77. "estimatedTime": "30-60s",
  78. "description": "Validates agent uses proper tools (read/grep) instead of bash antipatterns"
  79. }
  80. ],
  81. "rationale": {
  82. "why7Tests": "These 7 tests provide ~85% coverage of critical functionality with 90% fewer tests than the full suite",
  83. "coverageBreakdown": {
  84. "criticalSafetyRules": "4/4 rules covered (approval, context, stop-on-failure, report-first)",
  85. "delegationLogic": "2 tests cover both simple (no delegation) and complex (delegation) scenarios",
  86. "toolUsage": "1 test ensures proper tool usage patterns",
  87. "multiTurn": "1 test validates complex multi-turn conversations with context"
  88. },
  89. "useCases": [
  90. "Quick validation when updating OpenAgent prompt",
  91. "Pre-commit hooks for fast feedback",
  92. "CI/CD pull request validation",
  93. "Development iteration cycles"
  94. ]
  95. },
  96. "usage": {
  97. "npm": {
  98. "root": "npm run test:core",
  99. "openagent": "npm run test:openagent:core",
  100. "withModel": "npm run test:openagent:core -- --model=anthropic/claude-sonnet-4-5"
  101. },
  102. "script": {
  103. "basic": "./scripts/test.sh openagent --core",
  104. "withModel": "./scripts/test.sh openagent opencode/grok-code-fast --core"
  105. },
  106. "direct": {
  107. "basic": "cd evals/framework && npm run eval:sdk:core",
  108. "withAgent": "cd evals/framework && npm run eval:sdk:core -- --agent=openagent"
  109. }
  110. },
  111. "comparison": {
  112. "fullSuite": {
  113. "tests": 71,
  114. "runtime": "40-80 minutes",
  115. "coverage": "100%"
  116. },
  117. "coreSuite": {
  118. "tests": 7,
  119. "runtime": "5-8 minutes",
  120. "coverage": "~85%"
  121. },
  122. "savings": {
  123. "tests": "90% fewer tests",
  124. "time": "85-90% faster",
  125. "tokens": "~90% reduction"
  126. }
  127. }
  128. }