run-tests.js 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. /**
  2. * OpenAgent Synthetic Test Runner
  3. *
  4. * Loads synthetic test sessions, runs evaluators, compares actual vs expected results
  5. */
  6. const fs = require('fs');
  7. const path = require('path');
  8. // Import framework from evals/framework
  9. const {
  10. ApprovalGateEvaluator,
  11. ContextLoadingEvaluator,
  12. DelegationEvaluator,
  13. ToolUsageEvaluator
  14. } = require('../../framework/dist');
  /**
   * Build a minimal stand-in session record for a synthetic test.
   *
   * The returned object carries an id and title derived from the test id,
   * a fixed version string, and creation/update timestamps taken from the
   * current clock.
   *
   * @param {string} testId - Identifier of the synthetic test case.
   * @returns {{id: string, version: string, title: string,
   *            time: {created: number, updated: number}}} Mock session info.
   */
  function createMockSessionInfo(testId) {
    const id = `synthetic_${testId}`;
    const title = `Synthetic Test: ${testId}`;
    const created = Date.now();
    const updated = Date.now();

    return {
      id,
      version: '1.0',
      title,
      time: { created, updated },
    };
  }
  /**
   * Read a JSON fixture from disk and parse it.
   *
   * Any read or parse failure is rethrown with the offending file path in the
   * message, so a broken fixture can be located without a debugger.
   *
   * @param {string} filePath - Path of the JSON file to load.
   * @returns {*} The parsed JSON value.
   * @throws {Error} When the file cannot be read or is not valid JSON.
   */
  function readJsonFixture(filePath) {
    try {
      return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    } catch (error) {
      throw new Error(
        `Failed to load test fixture ${filePath}: ${error.message}`,
        { cause: error }
      );
    }
  }

  /**
   * Discover synthetic test cases laid out as
   * `<testsDir>/<category>/<testName>/{timeline.json,expected.json}`.
   *
   * Non-directory entries are ignored at both levels, and a test directory is
   * skipped unless it contains both JSON files.
   *
   * @param {string} testsDir - Root directory holding the category folders.
   * @returns {Array<{id: string, category: string, timeline: *, expected: *}>}
   *   One entry per test directory that has both fixtures.
   * @throws {Error} When a directory cannot be read or a fixture is malformed.
   */
  function loadTestCases(testsDir) {
    const testCases = [];

    for (const category of fs.readdirSync(testsDir)) {
      const categoryPath = path.join(testsDir, category);
      if (!fs.statSync(categoryPath).isDirectory()) continue;

      for (const testName of fs.readdirSync(categoryPath)) {
        const testPath = path.join(categoryPath, testName);
        if (!fs.statSync(testPath).isDirectory()) continue;

        const timelinePath = path.join(testPath, 'timeline.json');
        const expectedPath = path.join(testPath, 'expected.json');
        // A test is only runnable when both the input and the expectation exist.
        if (!fs.existsSync(timelinePath) || !fs.existsSync(expectedPath)) continue;

        testCases.push({
          id: testName,
          category,
          timeline: readJsonFixture(timelinePath),
          expected: readJsonFixture(expectedPath),
        });
      }
    }

    return testCases;
  }
  /**
   * Compare an evaluator's actual result against the expectation recorded for
   * it and describe every mismatch.
   *
   * Checks, in order: the `passed` flag, the `score`, the number of violations
   * (against `expected.violation_count`), and that each expected violation has
   * an actual violation with the same `type` and `severity`.
   *
   * @param {{passed: boolean, score: number, violations: Array<{type: string, severity: string}>}} actual
   *   Result produced by the evaluator.
   * @param {{passed: boolean, score: number, violation_count: number,
   *          violations?: Array<{type: string, severity: string}>}|undefined|null} expected
   *   Expected result; may be absent when no expectation was recorded.
   * @param {string} evaluatorName - Evaluator name, used in the message when
   *   no expectation exists.
   * @returns {string[]} Human-readable mismatch messages; empty when the
   *   result matches.
   */
  function compareResults(actual, expected, evaluatorName) {
    // Without an expectation there is nothing to compare against; report that
    // as a mismatch instead of failing on a property access of undefined.
    if (expected == null) {
      return [` ✗ No expected results defined for ${evaluatorName}`];
    }

    const issues = [];

    if (actual.passed !== expected.passed) {
      issues.push(` ✗ Passed mismatch: got ${actual.passed}, expected ${expected.passed}`);
    }

    if (actual.score !== expected.score) {
      issues.push(` ✗ Score mismatch: got ${actual.score}, expected ${expected.score}`);
    }

    if (actual.violations.length !== expected.violation_count) {
      issues.push(` ✗ Violation count: got ${actual.violations.length}, expected ${expected.violation_count}`);
    }

    // Each expected violation must be matched by type and severity; an absent
    // or empty expected list imposes no per-violation requirement.
    for (const expectedViolation of expected.violations || []) {
      const found = actual.violations.some(
        (v) => v.type === expectedViolation.type && v.severity === expectedViolation.severity
      );
      if (!found) {
        issues.push(` ✗ Missing violation: ${expectedViolation.type} (${expectedViolation.severity})`);
      }
    }

    return issues;
  }
  /**
   * Execute every evaluator against one synthetic test case, print a report
   * to the console, and collect mismatches against the expected results.
   *
   * @param {{id: string, category: string, timeline: *, expected: Object}} testCase
   *   Test case whose `expected.expected_results` is keyed by evaluator name.
   * @returns {Promise<{id: string, passed: boolean, issues: string[], results: Object}>}
   *   Overall verdict, the issue messages prefixed with the evaluator name,
   *   and the raw evaluator results keyed by evaluator name.
   */
  async function runTest(testCase) {
    const { id, category, expected: expectation } = testCase;
    const heavyRule = '='.repeat(80);

    console.log(`\n${heavyRule}`);
    console.log(`TEST: ${id}`);
    console.log(`Category: ${category}`);
    console.log(`Description: ${expectation.description}`);
    console.log(heavyRule);

    const sessionInfo = createMockSessionInfo(id);
    const { timeline } = testCase;

    // Ordered (name, instance) pairs; the name doubles as the lookup key into
    // the expected results.
    const evaluatorEntries = [
      ['ApprovalGateEvaluator', new ApprovalGateEvaluator()],
      ['ContextLoadingEvaluator', new ContextLoadingEvaluator()],
      ['DelegationEvaluator', new DelegationEvaluator()],
      ['ToolUsageEvaluator', new ToolUsageEvaluator()],
    ];

    const results = {};
    const collectedIssues = [];

    for (const [evaluatorName, instance] of evaluatorEntries) {
      console.log(`\nRunning ${evaluatorName}...`);
      const outcome = await instance.evaluate(timeline, sessionInfo);
      const expectedOutcome = expectation.expected_results[evaluatorName];
      results[evaluatorName] = outcome;

      // Report what the evaluator actually produced.
      const statusLabel = outcome.passed ? '✓ PASS' : '✗ FAIL';
      console.log(` Status: ${statusLabel}`);
      console.log(` Score: ${outcome.score}/100`);
      console.log(` Violations: ${outcome.violations.length}`);
      for (const violation of outcome.violations) {
        console.log(` - [${violation.severity.toUpperCase()}] ${violation.type}: ${violation.message}`);
      }

      // Check the outcome against the recorded expectation.
      const mismatches = compareResults(outcome, expectedOutcome, evaluatorName);
      if (mismatches.length === 0) {
        console.log(` ✅ Matches expected behavior`);
        continue;
      }

      console.log(`\n ❌ ISSUES FOUND:`);
      for (const mismatch of mismatches) {
        console.log(mismatch);
        collectedIssues.push(`${evaluatorName}: ${mismatch}`);
      }
    }

    // The test passes only when no evaluator reported a mismatch.
    const testPassed = collectedIssues.length === 0;
    console.log(`\n${'─'.repeat(80)}`);
    console.log(`TEST RESULT: ${testPassed ? '✅ PASS' : '❌ FAIL'}`);
    if (!testPassed) {
      console.log(`\nIssues (${collectedIssues.length}):`);
      for (const entry of collectedIssues) {
        console.log(` ${entry}`);
      }
    }

    return {
      id,
      passed: testPassed,
      issues: collectedIssues,
      results,
    };
  }
  /**
   * Entry point: load every synthetic test case under `./tests` (relative to
   * this script), run them one after another, print a summary, and exit the
   * process with status 1 when any test failed and 0 otherwise.
   *
   * @returns {Promise<void>} Does not resolve normally; the process exits.
   */
  async function main() {
    const banner = '='.repeat(80);

    console.log(banner);
    console.log('OPENAGENT SYNTHETIC TEST SUITE');
    console.log(banner);

    const testsDir = path.join(__dirname, 'tests');
    const testCases = loadTestCases(testsDir);

    console.log(`\nFound ${testCases.length} test cases:\n`);
    testCases.forEach((tc, idx) => {
      console.log(` ${idx + 1}. ${tc.category}/${tc.id}`);
    });
    if (testCases.length === 0) {
      // An empty suite usually means a wrong directory, so make it visible.
      console.warn(`Warning: no test cases found under ${testsDir}`);
    }

    // Run sequentially so the console output of different tests never interleaves.
    const testResults = [];
    for (const testCase of testCases) {
      testResults.push(await runTest(testCase));
    }

    console.log('\n\n' + banner);
    console.log('TEST SUMMARY');
    console.log(banner);

    const totalCount = testResults.length;
    const passedCount = testResults.filter((r) => r.passed).length;
    const failedCount = totalCount - passedCount;
    // Avoid 0/0 -> NaN in the percentages when no test cases were discovered.
    const passRate = totalCount === 0 ? 0 : Math.round((passedCount / totalCount) * 100);
    const failRate = totalCount === 0 ? 0 : 100 - passRate;

    console.log(`\nTotal Tests: ${totalCount}`);
    console.log(`Passed: ${passedCount} (${passRate}%)`);
    console.log(`Failed: ${failedCount} (${failRate}%)`);

    console.log(`\nTest Results:`);
    testResults.forEach((result) => {
      const status = result.passed ? '✅' : '❌';
      console.log(` ${status} ${result.id}`);
      if (!result.passed) {
        console.log(` Issues: ${result.issues.length}`);
      }
    });

    if (failedCount > 0) {
      console.log(`\n${banner}`);
      console.log('FAILED TESTS - DETAILED ISSUES');
      console.log(banner);
      testResults
        .filter((r) => !r.passed)
        .forEach((result) => {
          console.log(`\n${result.id}:`);
          result.issues.forEach((issue) => console.log(` ${issue}`));
        });
    }

    console.log('\n' + banner);
    console.log(`FINAL RESULT: ${failedCount === 0 ? '✅ ALL TESTS PASSED' : '❌ SOME TESTS FAILED'}`);
    console.log(banner);

    // A non-zero exit status lets CI or a calling script detect failures.
    process.exit(failedCount > 0 ? 1 : 0);
  }
  // Start the runner. Any rejection that escapes main() is logged and turned
  // into a non-zero exit status.
  main().catch((fatalError) => {
    console.error('Error running tests:', fatalError);
    process.exit(1);
  });