logic855
/
OpenAgentsControl
mirror of https://github.com/darrenhinde/OpenAgentsControl.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
							/**
 * OpenAgent Synthetic Test Runner
 * 
 * Loads synthetic test sessions, runs evaluators, compares actual vs expected results
 */

const fs = require('fs');
const path = require('path');

// Import framework from evals/framework
const {
  ApprovalGateEvaluator,
  ContextLoadingEvaluator,
  DelegationEvaluator,
  ToolUsageEvaluator
} = require('../../framework/dist');

// Mock SessionInfo for synthetic tests
function createMockSessionInfo(testId) {
  return {
    id: `synthetic_${testId}`,
    version: '1.0',
    title: `Synthetic Test: ${testId}`,
    time: {
      created: Date.now(),
      updated: Date.now()
    }
  };
}

// Load test cases
function loadTestCases(testsDir) {
  const testCases = [];
  const categories = fs.readdirSync(testsDir);
  
  for (const category of categories) {
    const categoryPath = path.join(testsDir, category);
    if (!fs.statSync(categoryPath).isDirectory()) continue;
    
    const tests = fs.readdirSync(categoryPath);
    for (const testName of tests) {
      const testPath = path.join(categoryPath, testName);
      if (!fs.statSync(testPath).isDirectory()) continue;
      
      const timelinePath = path.join(testPath, 'timeline.json');
      const expectedPath = path.join(testPath, 'expected.json');
      
      if (fs.existsSync(timelinePath) && fs.existsSync(expectedPath)) {
        testCases.push({
          id: testName,
          category,
          timeline: JSON.parse(fs.readFileSync(timelinePath, 'utf-8')),
          expected: JSON.parse(fs.readFileSync(expectedPath, 'utf-8'))
        });
      }
    }
  }
  
  return testCases;
}

// Compare actual vs expected
function compareResults(actual, expected, evaluatorName) {
  const issues = [];
  
  // Check passed
  if (actual.passed !== expected.passed) {
    issues.push(`  ✗ Passed mismatch: got ${actual.passed}, expected ${expected.passed}`);
  }
  
  // Check score
  if (actual.score !== expected.score) {
    issues.push(`  ✗ Score mismatch: got ${actual.score}, expected ${expected.score}`);
  }
  
  // Check violation count
  if (actual.violations.length !== expected.violation_count) {
    issues.push(`  ✗ Violation count: got ${actual.violations.length}, expected ${expected.violation_count}`);
  }
  
  // Check violation types (if violations exist)
  if (expected.violations && expected.violations.length > 0) {
    for (const expectedViolation of expected.violations) {
      const found = actual.violations.some(v => 
        v.type === expectedViolation.type && 
        v.severity === expectedViolation.severity
      );
      if (!found) {
        issues.push(`  ✗ Missing violation: ${expectedViolation.type} (${expectedViolation.severity})`);
      }
    }
  }
  
  return issues;
}

// Run single test
async function runTest(testCase) {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`TEST: ${testCase.id}`);
  console.log(`Category: ${testCase.category}`);
  console.log(`Description: ${testCase.expected.description}`);
  console.log('='.repeat(80));
  
  const sessionInfo = createMockSessionInfo(testCase.id);
  const timeline = testCase.timeline;
  
  // Create evaluators
  const evaluators = {
    ApprovalGateEvaluator: new ApprovalGateEvaluator(),
    ContextLoadingEvaluator: new ContextLoadingEvaluator(),
    DelegationEvaluator: new DelegationEvaluator(),
    ToolUsageEvaluator: new ToolUsageEvaluator()
  };
  
  const results = {};
  const allIssues = [];
  
  // Run each evaluator
  for (const [name, evaluator] of Object.entries(evaluators)) {
    console.log(`\nRunning ${name}...`);
    const actual = await evaluator.evaluate(timeline, sessionInfo);
    const expected = testCase.expected.expected_results[name];
    
    results[name] = actual;
    
    // Display actual results
    console.log(`  Status: ${actual.passed ? '✓ PASS' : '✗ FAIL'}`);
    console.log(`  Score: ${actual.score}/100`);
    console.log(`  Violations: ${actual.violations.length}`);
    
    if (actual.violations.length > 0) {
      actual.violations.forEach(v => {
        console.log(`    - [${v.severity.toUpperCase()}] ${v.type}: ${v.message}`);
      });
    }
    
    // Compare with expected
    const issues = compareResults(actual, expected, name);
    if (issues.length > 0) {
      console.log(`\n  ❌ ISSUES FOUND:`);
      issues.forEach(issue => console.log(issue));
      allIssues.push(...issues.map(i => `${name}: ${i}`));
    } else {
      console.log(`  ✅ Matches expected behavior`);
    }
  }
  
  // Overall test result
  const testPassed = allIssues.length === 0;
  console.log(`\n${'─'.repeat(80)}`);
  console.log(`TEST RESULT: ${testPassed ? '✅ PASS' : '❌ FAIL'}`);
  if (!testPassed) {
    console.log(`\nIssues (${allIssues.length}):`);
    allIssues.forEach(issue => console.log(`  ${issue}`));
  }
  
  return {
    id: testCase.id,
    passed: testPassed,
    issues: allIssues,
    results
  };
}

// Main
async function main() {
  console.log('='.repeat(80));
  console.log('OPENAGENT SYNTHETIC TEST SUITE');
  console.log('='.repeat(80));
  
  const testsDir = path.join(__dirname, 'tests');
  const testCases = loadTestCases(testsDir);
  
  console.log(`\nFound ${testCases.length} test cases:\n`);
  testCases.forEach((tc, idx) => {
    console.log(`  ${idx + 1}. ${tc.category}/${tc.id}`);
  });
  
  // Run all tests
  const testResults = [];
  for (const testCase of testCases) {
    const result = await runTest(testCase);
    testResults.push(result);
  }
  
  // Summary
  console.log('\n\n' + '='.repeat(80));
  console.log('TEST SUMMARY');
  console.log('='.repeat(80));
  
  const passedCount = testResults.filter(r => r.passed).length;
  const failedCount = testResults.length - passedCount;
  const passRate = Math.round((passedCount / testResults.length) * 100);
  
  console.log(`\nTotal Tests: ${testResults.length}`);
  console.log(`Passed: ${passedCount} (${passRate}%)`);
  console.log(`Failed: ${failedCount} (${100 - passRate}%)`);
  
  console.log(`\nTest Results:`);
  testResults.forEach((result, idx) => {
    const status = result.passed ? '✅' : '❌';
    console.log(`  ${status} ${result.id}`);
    if (!result.passed) {
      console.log(`     Issues: ${result.issues.length}`);
    }
  });
  
  if (failedCount > 0) {
    console.log(`\n${'='.repeat(80)}`);
    console.log('FAILED TESTS - DETAILED ISSUES');
    console.log('='.repeat(80));
    
    testResults.filter(r => !r.passed).forEach(result => {
      console.log(`\n${result.id}:`);
      result.issues.forEach(issue => console.log(`  ${issue}`));
    });
  }
  
  console.log('\n' + '='.repeat(80));
  console.log(`FINAL RESULT: ${failedCount === 0 ? '✅ ALL TESTS PASSED' : '❌ SOME TESTS FAILED'}`);
  console.log('='.repeat(80));
  
  process.exit(failedCount > 0 ? 1 : 0);
}

main().catch(error => {
  console.error('Error running tests:', error);
  process.exit(1);
});