4 months ago · 12d087b55f
--- a/evals/framework/src/sdk/event-logger.ts
+++ b/evals/framework/src/sdk/event-logger.ts
@@ -0,0 +1,128 @@
 
				+/**
			
 
				+ * EventLogger - Event logging utilities
			
 
				+ * 
			
 
				+ * Handles logging of server events with meaningful details.
			
 
				+ * Extracted from test-runner.ts for better modularity.
			
 
				+ */
			
 
				+
			
 
				+import type { ServerEvent } from './event-stream-handler.js';
			
 
				+
			
 
				+/**
			
 
				+ * Log event with meaningful details
			
 
				+ * 
			
 
				+ * Event properties structure varies by type:
			
 
				+ * - session.created/updated: { id, title, ... }
			
 
				+ * - message.updated: { id, sessionID, role, ... }
			
 
				+ * - part.updated: { id, messageID, type, tool?, input?, output?, ... }
			
 
				+ */
			
 
				+export function logEvent(event: ServerEvent): void {
			
 
				+  const props = event.properties || {};
			
 
				+  
			
 
				+  switch (event.type) {
			
 
				+    case 'session.created':
			
 
				+      console.log(`📋 Session created`);
			
 
				+      break;
			
 
				+      
			
 
				+    case 'session.updated':
			
 
				+      // Session updates are frequent but not very informative
			
 
				+      // Skip logging unless there's something specific
			
 
				+      break;
			
 
				+      
			
 
				+    case 'message.created':
			
 
				+      console.log(`💬 New message (${props.role || 'assistant'})`);
			
 
				+      break;
			
 
				+      
			
 
				+    case 'message.updated':
			
 
				+      // Message updates happen frequently during streaming
			
 
				+      // Only log role changes or completion
			
 
				+      if (props.role === 'user') {
			
 
				+        console.log(`👤 User message received`);
			
 
				+      }
			
 
				+      // Skip assistant message updates (too noisy)
			
 
				+      break;
			
 
				+      
			
 
				+    case 'part.created':
			
 
				+    case 'part.updated':
			
 
				+      logPartEvent(props);
			
 
				+      break;
			
 
				+      
			
 
				+    case 'permission.request':
			
 
				+      console.log(`🔐 Permission requested: ${props.tool || 'unknown'}`);
			
 
				+      break;
			
 
				+      
			
 
				+    case 'permission.response':
			
 
				+      console.log(`🔐 Permission ${props.response === 'once' || props.approved ? 'granted' : 'denied'}`);
			
 
				+      break;
			
 
				+      
			
 
				+    case 'tool.call':
			
 
				+      console.log(`🔧 Tool call: ${props.tool || props.name || 'unknown'}`);
			
 
				+      break;
			
 
				+      
			
 
				+    case 'tool.result':
			
 
				+      const success = props.error ? '❌' : '✅';
			
 
				+      console.log(`${success} Tool result: ${props.tool || 'unknown'}`);
			
 
				+      break;
			
 
				+      
			
 
				+    default:
			
 
				+      // Skip unknown events to reduce noise
			
 
				+      break;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Log part events (tools, text, etc.)
			
 
				+ */
			
 
				+function logPartEvent(props: any): void {
			
 
				+  if (props.type === 'tool') {
			
 
				+    const toolName = props.tool || 'unknown';
			
 
				+    const status = props.state?.status || props.status || '';
			
 
				+    
			
 
				+    // Only log when tool starts or completes
			
 
				+    if (status === 'running' || status === 'pending') {
			
 
				+      console.log(`🔧 Tool: ${toolName} (starting)`);
			
 
				+      
			
 
				+      // Show tool input preview
			
 
				+      const input = props.state?.input || props.input || {};
			
 
				+      if (input.command) {
			
 
				+        const cmd = input.command.substring(0, 70);
			
 
				+        console.log(`   └─ ${cmd}${input.command.length > 70 ? '...' : ''}`);
			
 
				+      } else if (input.filePath) {
			
 
				+        console.log(`   └─ ${input.filePath}`);
			
 
				+      } else if (input.pattern) {
			
 
				+        console.log(`   └─ pattern: ${input.pattern}`);
			
 
				+      }
			
 
				+    } else if (status === 'completed') {
			
 
				+      console.log(`✅ Tool: ${toolName} (completed)`);
			
 
				+    } else if (status === 'error') {
			
 
				+      console.log(`❌ Tool: ${toolName} (error)`);
			
 
				+    }
			
 
				+  } else if (props.type === 'text') {
			
 
				+    // Text parts - show preview of assistant response
			
 
				+    const text = props.text || '';
			
 
				+    if (text.length > 0) {
			
 
				+      const preview = text.substring(0, 100).replace(/\n/g, ' ');
			
 
				+      console.log(`📝 ${preview}${text.length > 100 ? '...' : ''}`);
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Create a logger that respects debug mode
			
 
				+ */
			
 
				+export function createLogger(debug: boolean): {
			
 
				+  log: (message: string) => void;
			
 
				+  logEvent: (event: ServerEvent) => void;
			
 
				+} {
			
 
				+  return {
			
 
				+    log: (message: string) => {
			
 
				+      if (debug || message.includes('PASSED') || message.includes('FAILED')) {
			
 
				+        console.log(message);
			
 
				+      }
			
 
				+    },
			
 
				+    logEvent: (event: ServerEvent) => {
			
 
				+      if (debug) {
			
 
				+        logEvent(event);
			
 
				+      }
			
 
				+    },
			
 
				+  };
			
 
				+}
			
--- a/evals/framework/src/sdk/index.ts
+++ b/evals/framework/src/sdk/index.ts
@@ -20,3 +20,22 @@ export { AutoApproveStrategy } from './approval/auto-approve-strategy.js';
 
				 export { AutoDenyStrategy } from './approval/auto-deny-strategy.js';
			
 
				 export { SmartApprovalStrategy } from './approval/smart-approval-strategy.js';
			
 
				 export type { SmartApprovalConfig } from './approval/smart-approval-strategy.js';
			
 
				+
			
 
				+// Test execution (modular components)
			
 
				+export { TestRunner } from './test-runner.js';
			
 
				+export type { TestRunnerConfig, TestResult } from './test-runner.js';
			
 
				+
			
 
				+export { TestExecutor } from './test-executor.js';
			
 
				+export type { ExecutionConfig, ExecutionResult, ExecutionLogger } from './test-executor.js';
			
 
				+
			
 
				+export { ResultValidator } from './result-validator.js';
			
 
				+export type { ValidationLogger } from './result-validator.js';
			
 
				+
			
 
				+export { logEvent, createLogger } from './event-logger.js';
			
 
				+
			
 
				+// Test case loading
			
 
				+export { loadTestCase, loadTestCases } from './test-case-loader.js';
			
 
				+export type { TestCase, BehaviorExpectation } from './test-case-schema.js';
			
 
				+
			
 
				+// Result saving
			
 
				+export { ResultSaver } from './result-saver.js';
			
--- a/evals/framework/src/sdk/result-validator.ts
+++ b/evals/framework/src/sdk/result-validator.ts
@@ -0,0 +1,253 @@
 
				+/**
			
 
				+ * ResultValidator - Test result validation logic
			
 
				+ * 
			
 
				+ * Handles validation of test results against expected outcomes:
			
 
				+ * - Behavior expectations (mustUseTools, etc.)
			
 
				+ * - Expected violations (positive/negative tests)
			
 
				+ * - Legacy expected format (deprecated)
			
 
				+ * - Default pass/fail logic
			
 
				+ * 
			
 
				+ * Extracted from test-runner.ts for better modularity.
			
 
				+ */
			
 
				+
			
 
				+import type { TestCase } from './test-case-schema.js';
			
 
				+import type { ServerEvent } from './event-stream-handler.js';
			
 
				+import type { AggregatedResult } from '../evaluators/evaluator-runner.js';
			
 
				+
			
 
				+/**
			
 
				+ * Logger interface for dependency injection
			
 
				+ */
			
 
				+export interface ValidationLogger {
			
 
				+  log(message: string): void;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * ResultValidator handles test result validation
			
 
				+ */
			
 
				+export class ResultValidator {
			
 
				+  constructor(private readonly logger: ValidationLogger) {}
			
 
				+
			
 
				+  /**
			
 
				+   * Evaluate if test result matches expected outcome
			
 
				+   * 
			
 
				+   * Evaluation priority:
			
 
				+   * 1. Check for execution errors
			
 
				+   * 2. Check behavior expectations (if defined)
			
 
				+   * 3. Check expected violations (if defined)
			
 
				+   * 4. Check deprecated expected format (if defined)
			
 
				+   * 5. Default: pass if no errors
			
 
				+   */
			
 
				+  validate(
			
 
				+    testCase: TestCase,
			
 
				+    events: ServerEvent[],
			
 
				+    errors: string[],
			
 
				+    evaluation?: AggregatedResult
			
 
				+  ): boolean {
			
 
				+    // Support both old and new schema
			
 
				+    const expected = testCase.expected;
			
 
				+    const behavior = testCase.behavior;
			
 
				+    const expectedViolations = testCase.expectedViolations;
			
 
				+
			
 
				+    // If there were execution errors and test expects to pass, it fails
			
 
				+    if (errors.length > 0 && expected?.pass !== false) {
			
 
				+      this.logger.log(`Test failed due to execution errors: ${errors.join(', ')}`);
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+    // =========================================================================
			
 
				+    // Check behavior evaluator results FIRST (most important)
			
 
				+    // =========================================================================
			
 
				+    if (behavior && evaluation) {
			
 
				+      if (!this.checkBehaviorExpectations(evaluation)) {
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // =========================================================================
			
 
				+    // Check expected violations (new format)
			
 
				+    // =========================================================================
			
 
				+    const expectedViolationTypes = new Set<string>();
			
 
				+    
			
 
				+    if (expectedViolations && evaluation) {
			
 
				+      const violationResult = this.checkExpectedViolations(expectedViolations, evaluation, expectedViolationTypes);
			
 
				+      if (!violationResult) {
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // =========================================================================
			
 
				+    // Check deprecated expected format
			
 
				+    // =========================================================================
			
 
				+    if (expected) {
			
 
				+      const legacyResult = this.checkLegacyExpected(expected, events, errors, evaluation);
			
 
				+      if (legacyResult !== null) {
			
 
				+        return legacyResult;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // =========================================================================
			
 
				+    // Default: pass if no errors and no unexpected error-level violations
			
 
				+    // =========================================================================
			
 
				+    if (evaluation && evaluation.violationsBySeverity.error > 0) {
			
 
				+      // Filter out expected violations
			
 
				+      const unexpectedErrors = evaluation.allViolations.filter(v => 
			
 
				+        v.severity === 'error' && !expectedViolationTypes.has(v.type)
			
 
				+      );
			
 
				+      
			
 
				+      if (unexpectedErrors.length > 0) {
			
 
				+        this.logger.log(`Test failed: ${unexpectedErrors.length} unexpected error-level violations`);
			
 
				+        unexpectedErrors.forEach(v => this.logger.log(`  - ${v.type}: ${v.message}`));
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    return errors.length === 0;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Check behavior evaluator results
			
 
				+   */
			
 
				+  private checkBehaviorExpectations(evaluation: AggregatedResult): boolean {
			
 
				+    // Find the behavior evaluator result
			
 
				+    const behaviorResult = evaluation.evaluatorResults.find(r => r.evaluator === 'behavior');
			
 
				+    
			
 
				+    if (behaviorResult) {
			
 
				+      // Check if behavior evaluator passed
			
 
				+      if (!behaviorResult.passed) {
			
 
				+        this.logger.log(`Behavior validation failed: ${behaviorResult.violations.length} violations`);
			
 
				+        behaviorResult.violations.forEach(v => {
			
 
				+          this.logger.log(`  - [${v.severity}] ${v.type}: ${v.message}`);
			
 
				+        });
			
 
				+        return false;
			
 
				+      }
			
 
				+      
			
 
				+      // Check for error-level violations from behavior evaluator
			
 
				+      const behaviorErrors = behaviorResult.violations.filter(v => v.severity === 'error');
			
 
				+      if (behaviorErrors.length > 0) {
			
 
				+        this.logger.log(`Behavior validation has ${behaviorErrors.length} error-level violations`);
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+    
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Check expected violations (new format)
			
 
				+   */
			
 
				+  private checkExpectedViolations(
			
 
				+    expectedViolations: TestCase['expectedViolations'],
			
 
				+    evaluation: AggregatedResult,
			
 
				+    expectedViolationTypes: Set<string>
			
 
				+  ): boolean {
			
 
				+    if (!expectedViolations) return true;
			
 
				+
			
 
				+    for (const expectedViolation of expectedViolations) {
			
 
				+      // Map rule names to violation type patterns
			
 
				+      const rulePatterns: Record<string, string[]> = {
			
 
				+        'approval-gate': ['approval', 'missing-approval'],
			
 
				+        'context-loading': ['context', 'no-context-loaded', 'missing-context'],
			
 
				+        'delegation': ['delegation', 'missing-delegation'],
			
 
				+        'tool-usage': ['tool', 'suboptimal-tool'],
			
 
				+        'stop-on-failure': ['stop', 'failure'],
			
 
				+        'confirm-cleanup': ['cleanup', 'confirm'],
			
 
				+      };
			
 
				+
			
 
				+      const patterns = rulePatterns[expectedViolation.rule] || [expectedViolation.rule];
			
 
				+      
			
 
				+      const actualViolations = evaluation.allViolations.filter(v => 
			
 
				+        patterns.some(pattern => v.type.toLowerCase().includes(pattern.toLowerCase()))
			
 
				+      );
			
 
				+
			
 
				+      if (expectedViolation.shouldViolate) {
			
 
				+        // Negative test: Should have violation
			
 
				+        if (actualViolations.length === 0) {
			
 
				+          this.logger.log(`Expected ${expectedViolation.rule} violation but none found`);
			
 
				+          return false;
			
 
				+        }
			
 
				+        this.logger.log(`✓ Expected violation '${expectedViolation.rule}' found`);
			
 
				+        // Mark these violations as expected so we don't fail on them later
			
 
				+        actualViolations.forEach(v => expectedViolationTypes.add(v.type));
			
 
				+      } else {
			
 
				+        // Positive test: Should NOT have violation
			
 
				+        if (actualViolations.length > 0) {
			
 
				+          this.logger.log(`Unexpected ${expectedViolation.rule} violation found: ${actualViolations[0].message}`);
			
 
				+          return false;
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Check legacy expected format (deprecated)
			
 
				+   * Returns null if no decision made, true/false otherwise
			
 
				+   */
			
 
				+  private checkLegacyExpected(
			
 
				+    expected: NonNullable<TestCase['expected']>,
			
 
				+    events: ServerEvent[],
			
 
				+    errors: string[],
			
 
				+    evaluation?: AggregatedResult
			
 
				+  ): boolean | null {
			
 
				+    // Check minimum messages (deprecated)
			
 
				+    if (expected.minMessages !== undefined) {
			
 
				+      const messageEvents = events.filter(e => e.type.includes('message'));
			
 
				+      if (messageEvents.length < expected.minMessages) {
			
 
				+        this.logger.log(`Expected at least ${expected.minMessages} messages, got ${messageEvents.length}`);
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Check maximum messages (deprecated)
			
 
				+    if (expected.maxMessages !== undefined) {
			
 
				+      const messageEvents = events.filter(e => e.type.includes('message'));
			
 
				+      if (messageEvents.length > expected.maxMessages) {
			
 
				+        this.logger.log(`Expected at most ${expected.maxMessages} messages, got ${messageEvents.length}`);
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Check expected violations (deprecated format)
			
 
				+    if (expected.violations && evaluation) {
			
 
				+      const expectedViolationTypes = expected.violations.map(v => v.rule);
			
 
				+      const actualViolationTypes = evaluation.allViolations.map(v => {
			
 
				+        if (v.type.includes('approval')) return 'approval-gate' as const;
			
 
				+        if (v.type.includes('context')) return 'context-loading' as const;
			
 
				+        if (v.type.includes('delegation')) return 'delegation' as const;
			
 
				+        if (v.type.includes('tool')) return 'tool-usage' as const;
			
 
				+        return 'unknown' as const;
			
 
				+      });
			
 
				+
			
 
				+      for (const expectedType of expectedViolationTypes) {
			
 
				+        if (['approval-gate', 'context-loading', 'delegation', 'tool-usage'].includes(expectedType)) {
			
 
				+          if (!actualViolationTypes.includes(expectedType as any)) {
			
 
				+            this.logger.log(`Expected violation '${expectedType}' not found`);
			
 
				+            return false;
			
 
				+          }
			
 
				+        }
			
 
				+      }
			
 
				+
			
 
				+      if (!expected.pass && evaluation.totalViolations === 0) {
			
 
				+        this.logger.log('Expected violations but none found');
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // If test expects to pass, check no critical violations
			
 
				+    if (expected.pass && evaluation) {
			
 
				+      if (evaluation.violationsBySeverity.error > 0) {
			
 
				+        this.logger.log(`Expected pass but found ${evaluation.violationsBySeverity.error} error-level violations`);
			
 
				+        return false;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Use expected.pass if specified
			
 
				+    if (expected.pass !== undefined) {
			
 
				+      return expected.pass ? errors.length === 0 : true;
			
 
				+    }
			
 
				+
			
 
				+    // No decision made by legacy checks
			
 
				+    return null;
			
 
				+  }
			
 
				+}
			
--- a/evals/framework/src/sdk/test-executor.ts
+++ b/evals/framework/src/sdk/test-executor.ts
@@ -0,0 +1,392 @@
 
				+/**
			
 
				+ * TestExecutor - Core test execution logic
			
 
				+ * 
			
 
				+ * Handles the actual execution of test cases:
			
 
				+ * - Session creation and management
			
 
				+ * - Prompt sending (single and multi-turn)
			
 
				+ * - Event handling and collection
			
 
				+ * - Timeout management (simple and smart)
			
 
				+ * 
			
 
				+ * Extracted from test-runner.ts for better modularity.
			
 
				+ */
			
 
				+
			
 
				+import { ClientManager } from './client-manager.js';
			
 
				+import { EventStreamHandler } from './event-stream-handler.js';
			
 
				+import type { TestCase } from './test-case-schema.js';
			
 
				+import type { ApprovalStrategy } from './approval/approval-strategy.js';
			
 
				+import type { ServerEvent } from './event-stream-handler.js';
			
 
				+
			
 
				+/**
			
 
				+ * Configuration for test execution
			
 
				+ */
			
 
				+export interface ExecutionConfig {
			
 
				+  /** Default timeout for tests (ms) */
			
 
				+  defaultTimeout: number;
			
 
				+  /** Project path for working directory */
			
 
				+  projectPath: string;
			
 
				+  /** Default model to use */
			
 
				+  defaultModel: string;
			
 
				+  /** Enable debug logging */
			
 
				+  debug: boolean;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Result of test execution (before evaluation)
			
 
				+ */
			
 
				+export interface ExecutionResult {
			
 
				+  /** Session ID created for this test */
			
 
				+  sessionId: string;
			
 
				+  /** Events captured during test */
			
 
				+  events: ServerEvent[];
			
 
				+  /** Errors encountered during execution */
			
 
				+  errors: string[];
			
 
				+  /** Number of approvals given */
			
 
				+  approvalsGiven: number;
			
 
				+  /** Duration of execution (ms) */
			
 
				+  duration: number;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Logger interface for dependency injection
			
 
				+ */
			
 
				+export interface ExecutionLogger {
			
 
				+  log(message: string): void;
			
 
				+  logEvent(event: ServerEvent): void;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * TestExecutor handles the core test execution logic
			
 
				+ */
			
 
				+export class TestExecutor {
			
 
				+  constructor(
			
 
				+    private readonly client: ClientManager,
			
 
				+    private readonly eventHandler: EventStreamHandler,
			
 
				+    private readonly config: ExecutionConfig,
			
 
				+    private readonly logger: ExecutionLogger
			
 
				+  ) {}
			
 
				+
			
 
				+  /**
			
 
				+   * Execute a single test case
			
 
				+   */
			
 
				+  async execute(
			
 
				+    testCase: TestCase,
			
 
				+    approvalStrategy: ApprovalStrategy
			
 
				+  ): Promise<ExecutionResult> {
			
 
				+    const startTime = Date.now();
			
 
				+    const errors: string[] = [];
			
 
				+    const events: ServerEvent[] = [];
			
 
				+    let sessionId = '';
			
 
				+    let approvalsGiven = 0;
			
 
				+
			
 
				+    try {
			
 
				+      this.logger.log(`\n${'='.repeat(60)}`);
			
 
				+      this.logger.log(`Running test: ${testCase.id} - ${testCase.name}`);
			
 
				+      this.logger.log(`${'='.repeat(60)}`);
			
 
				+      this.logger.log(`Approval strategy: ${approvalStrategy.describe()}`);
			
 
				+
			
 
				+      // Setup event handler
			
 
				+      this.eventHandler.removeAllHandlers();
			
 
				+      
			
 
				+      this.eventHandler.onAny((event) => {
			
 
				+        events.push(event);
			
 
				+        if (this.config.debug) {
			
 
				+          this.logger.logEvent(event);
			
 
				+        }
			
 
				+      });
			
 
				+
			
 
				+      this.eventHandler.onPermission(async (event) => {
			
 
				+        const approved = await approvalStrategy.shouldApprove(event);
			
 
				+        approvalsGiven++;
			
 
				+        this.logger.log(`Permission ${approved ? 'APPROVED' : 'DENIED'}: ${event.properties.tool || 'unknown'}`);
			
 
				+        return approved;
			
 
				+      });
			
 
				+
			
 
				+      // Start event listener in background
			
 
				+      const evtHandler = this.eventHandler;
			
 
				+      this.eventHandler.startListening().catch(err => {
			
 
				+        if (evtHandler.listening()) {
			
 
				+          errors.push(`Event stream error: ${err.message}`);
			
 
				+        }
			
 
				+      });
			
 
				+
			
 
				+      // Wait for event handler to connect
			
 
				+      await this.sleep(2000);
			
 
				+
			
 
				+      // Create session
			
 
				+      this.logger.log('Creating session...');
			
 
				+      const session = await this.client.createSession({
			
 
				+        title: testCase.name,
			
 
				+      });
			
 
				+      sessionId = session.id;
			
 
				+      this.logger.log(`Session created: ${sessionId}`);
			
 
				+
			
 
				+      // Send prompt(s)
			
 
				+      await this.sendPrompts(testCase, sessionId, errors);
			
 
				+
			
 
				+      // Give time for final events to arrive
			
 
				+      await this.sleep(3000);
			
 
				+
			
 
				+      // Stop event handler
			
 
				+      this.eventHandler.stopListening();
			
 
				+
			
 
				+      // Validate agent if specified
			
 
				+      if (testCase.agent) {
			
 
				+        await this.validateAgent(testCase, sessionId, errors);
			
 
				+      }
			
 
				+
			
 
				+      const duration = Date.now() - startTime;
			
 
				+
			
 
				+      return {
			
 
				+        sessionId,
			
 
				+        events,
			
 
				+        errors,
			
 
				+        approvalsGiven,
			
 
				+        duration,
			
 
				+      };
			
 
				+    } catch (error) {
			
 
				+      const duration = Date.now() - startTime;
			
 
				+      errors.push(`Test execution failed: ${(error as Error).message}`);
			
 
				+
			
 
				+      this.logger.log(`\nTest FAILED with exception`);
			
 
				+      this.logger.log(`Error: ${(error as Error).message}`);
			
 
				+
			
 
				+      return {
			
 
				+        sessionId,
			
 
				+        events,
			
 
				+        errors,
			
 
				+        approvalsGiven,
			
 
				+        duration,
			
 
				+      };
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Send prompts for a test case (single or multi-turn)
			
 
				+   */
			
 
				+  private async sendPrompts(
			
 
				+    testCase: TestCase,
			
 
				+    sessionId: string,
			
 
				+    errors: string[]
			
 
				+  ): Promise<void> {
			
 
				+    const timeout = testCase.timeout || this.config.defaultTimeout;
			
 
				+    const modelToUse = testCase.model || this.config.defaultModel;
			
 
				+    const agentToUse = testCase.agent || 'openagent';
			
 
				+    
			
 
				+    this.logger.log(`Agent: ${agentToUse}`);
			
 
				+    this.logger.log(`Model: ${modelToUse}`);
			
 
				+    
			
 
				+    // Check if multi-message test
			
 
				+    if (testCase.prompts && testCase.prompts.length > 0) {
			
 
				+      await this.sendMultiTurnPrompts(testCase, sessionId, timeout, modelToUse, agentToUse);
			
 
				+    } else {
			
 
				+      await this.sendSinglePrompt(testCase, sessionId, timeout, modelToUse, agentToUse);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Send multiple prompts for multi-turn tests
			
 
				+   */
			
 
				+  private async sendMultiTurnPrompts(
			
 
				+    testCase: TestCase,
			
 
				+    sessionId: string,
			
 
				+    timeout: number,
			
 
				+    modelToUse: string,
			
 
				+    agentToUse: string
			
 
				+  ): Promise<void> {
			
 
				+    this.logger.log(`Sending ${testCase.prompts!.length} prompts (multi-turn)...`);
			
 
				+    this.logger.log(`Using smart timeout: ${timeout}ms per prompt, max ${timeout * 2}ms absolute`);
			
 
				+    
			
 
				+    for (let i = 0; i < testCase.prompts!.length; i++) {
			
 
				+      const msg = testCase.prompts![i];
			
 
				+      this.logger.log(`\nPrompt ${i + 1}/${testCase.prompts!.length}:`);
			
 
				+      this.logger.log(`  Text: ${msg.text.substring(0, 100)}${msg.text.length > 100 ? '...' : ''}`);
			
 
				+      if (msg.expectContext) {
			
 
				+        this.logger.log(`  Expects context: ${msg.contextFile || 'yes'}`);
			
 
				+      }
			
 
				+      
			
 
				+      // Add delay if specified
			
 
				+      if (msg.delayMs && i > 0) {
			
 
				+        this.logger.log(`  Waiting ${msg.delayMs}ms before sending...`);
			
 
				+        await this.sleep(msg.delayMs);
			
 
				+      }
			
 
				+      
			
 
				+      const promptPromise = this.client.sendPrompt(sessionId, {
			
 
				+        text: msg.text,
			
 
				+        agent: agentToUse,
			
 
				+        model: modelToUse ? this.parseModel(modelToUse) : undefined,
			
 
				+        directory: this.config.projectPath,
			
 
				+      });
			
 
				+      
			
 
				+      await this.withSmartTimeout(
			
 
				+        promptPromise,
			
 
				+        timeout,
			
 
				+        timeout * 2,
			
 
				+        `Prompt ${i + 1} execution timed out`
			
 
				+      );
			
 
				+      this.logger.log(`  Completed`);
			
 
				+      
			
 
				+      // Small delay between messages
			
 
				+      if (i < testCase.prompts!.length - 1) {
			
 
				+        await this.sleep(1000);
			
 
				+      }
			
 
				+    }
			
 
				+    
			
 
				+    this.logger.log('\nAll prompts completed');
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Send a single prompt
			
 
				+   */
			
 
				+  private async sendSinglePrompt(
			
 
				+    testCase: TestCase,
			
 
				+    sessionId: string,
			
 
				+    timeout: number,
			
 
				+    modelToUse: string,
			
 
				+    agentToUse: string
			
 
				+  ): Promise<void> {
			
 
				+    this.logger.log('Sending prompt...');
			
 
				+    this.logger.log(`Prompt: ${testCase.prompt!.substring(0, 100)}${testCase.prompt!.length > 100 ? '...' : ''}`);
			
 
				+    
			
 
				+    const promptPromise = this.client.sendPrompt(sessionId, {
			
 
				+      text: testCase.prompt!,
			
 
				+      agent: agentToUse,
			
 
				+      model: modelToUse ? this.parseModel(modelToUse) : undefined,
			
 
				+      directory: this.config.projectPath,
			
 
				+    });
			
 
				+
			
 
				+    await this.withTimeout(promptPromise, timeout, 'Prompt execution timed out');
			
 
				+    this.logger.log('Prompt completed');
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Validate that the correct agent was used
			
 
				+   */
			
 
				+  private async validateAgent(
			
 
				+    testCase: TestCase,
			
 
				+    sessionId: string,
			
 
				+    errors: string[]
			
 
				+  ): Promise<void> {
			
 
				+    this.logger.log(`Validating agent: ${testCase.agent}...`);
			
 
				+    try {
			
 
				+      const sessionInfo = await this.client.getSession(sessionId);
			
 
				+      const messages = sessionInfo.messages;
			
 
				+      
			
 
				+      if (messages && messages.length > 0) {
			
 
				+        const firstMessage = messages[0].info as any;
			
 
				+        const actualAgent = firstMessage.agent;
			
 
				+        
			
 
				+        if (actualAgent && actualAgent !== testCase.agent) {
			
 
				+          errors.push(`Agent mismatch: expected '${testCase.agent}', got '${actualAgent}'`);
			
 
				+          this.logger.log(`  ❌ Agent mismatch: expected '${testCase.agent}', got '${actualAgent}'`);
			
 
				+        } else if (actualAgent) {
			
 
				+          this.logger.log(`  ✅ Agent verified: ${actualAgent}`);
			
 
				+        } else {
			
 
				+          this.logger.log(`  ⚠️  Agent not set in message`);
			
 
				+        }
			
 
				+      }
			
 
				+    } catch (error) {
			
 
				+      this.logger.log(`  Warning: Could not validate agent: ${(error as Error).message}`);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Parse model string (provider/model format)
			
 
				+   */
			
 
				+  private parseModel(model: string): { providerID: string; modelID: string } {
			
 
				+    const [providerID, modelID] = model.split('/');
			
 
				+    if (!providerID || !modelID) {
			
 
				+      throw new Error(`Invalid model format: ${model}. Expected provider/model`);
			
 
				+    }
			
 
				+    return { providerID, modelID };
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Sleep for ms
			
 
				+   */
			
 
				+  private sleep(ms: number): Promise<void> {
			
 
				+    return new Promise(resolve => setTimeout(resolve, ms));
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Run promise with timeout
			
 
				+   */
			
 
				+  private async withTimeout<T>(promise: Promise<T>, timeoutMs: number, message: string): Promise<T> {
			
 
				+    return Promise.race([
			
 
				+      promise,
			
 
				+      new Promise<T>((_, reject) =>
			
 
				+        setTimeout(() => reject(new Error(message)), timeoutMs)
			
 
				+      ),
			
 
				+    ]);
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Run promise with smart timeout that monitors activity
			
 
				+   * - Checks if events are still coming in
			
 
				+   * - Extends timeout if activity detected
			
 
				+   * - Has absolute maximum timeout
			
 
				+   */
			
 
				+  private async withSmartTimeout<T>(
			
 
				+    promise: Promise<T>,
			
 
				+    baseTimeoutMs: number,
			
 
				+    maxTimeoutMs: number,
			
 
				+    message: string
			
 
				+  ): Promise<T> {
			
 
				+    const startTime = Date.now();
			
 
				+    let lastActivityTime = startTime;
			
 
				+    let isActive = true;
			
 
				+
			
 
				+    // Monitor event activity
			
 
				+    const activityMonitor = setInterval(() => {
			
 
				+      const now = Date.now();
			
 
				+      const timeSinceLastActivity = now - lastActivityTime;
			
 
				+      const totalTime = now - startTime;
			
 
				+
			
 
				+      // Check if we've exceeded absolute max timeout
			
 
				+      if (totalTime > maxTimeoutMs) {
			
 
				+        isActive = false;
			
 
				+        clearInterval(activityMonitor);
			
 
				+        return;
			
 
				+      }
			
 
				+
			
 
				+      // If no activity for baseTimeout, consider it stalled
			
 
				+      if (timeSinceLastActivity > baseTimeoutMs) {
			
 
				+        isActive = false;
			
 
				+        clearInterval(activityMonitor);
			
 
				+      }
			
 
				+    }, 1000);
			
 
				+
			
 
				+    // Update last activity time when events arrive
			
 
				+    this.eventHandler.onAny(() => {
			
 
				+      lastActivityTime = Date.now();
			
 
				+    });
			
 
				+
			
 
				+    try {
			
 
				+      const result = await Promise.race([
			
 
				+        promise,
			
 
				+        new Promise<T>((_, reject) => {
			
 
				+          const checkTimeout = setInterval(() => {
			
 
				+            const now = Date.now();
			
 
				+            const totalTime = now - startTime;
			
 
				+            const timeSinceActivity = now - lastActivityTime;
			
 
				+
			
 
				+            if (totalTime > maxTimeoutMs) {
			
 
				+              clearInterval(checkTimeout);
			
 
				+              clearInterval(activityMonitor);
			
 
				+              reject(new Error(`${message} (absolute max timeout: ${maxTimeoutMs}ms)`));
			
 
				+            } else if (timeSinceActivity > baseTimeoutMs && !isActive) {
			
 
				+              clearInterval(checkTimeout);
			
 
				+              clearInterval(activityMonitor);
			
 
				+              reject(new Error(`${message} (no activity for ${baseTimeoutMs}ms)`));
			
 
				+            }
			
 
				+          }, 1000);
			
 
				+        })
			
 
				+      ]);
			
 
				+
			
 
				+      clearInterval(activityMonitor);
			
 
				+      return result;
			
 
				+    } catch (error) {
			
 
				+      clearInterval(activityMonitor);
			
 
				+      throw error;
			
 
				+    }
			
 
				+  }
			
 
				+}
			
--- a/evals/framework/src/sdk/test-runner.ts
+++ b/evals/framework/src/sdk/test-runner.ts
@@ -1,3 +1,18 @@
 
				+/**
			
 
				+ * TestRunner - Orchestrates test execution
			
 
				+ * 
			
 
				+ * This is a thin orchestrator that coordinates:
			
 
				+ * - Server lifecycle management
			
 
				+ * - Test execution via TestExecutor
			
 
				+ * - Result validation via ResultValidator
			
 
				+ * - Evaluator management
			
 
				+ * 
			
 
				+ * The actual execution and validation logic has been extracted to:
			
 
				+ * - test-executor.ts - Core test execution
			
 
				+ * - result-validator.ts - Result validation
			
 
				+ * - event-logger.ts - Event logging utilities
			
 
				+ */
			
 
				+
			
 
				 import { ServerManager } from './server-manager.js';
			
 
				 import { ClientManager } from './client-manager.js';
			
 
				 import { EventStreamHandler } from './event-stream-handler.js';
			
@@ -12,6 +27,9 @@ import { ContextLoadingEvaluator } from '../evaluators/context-loading-evaluator
 
				 import { DelegationEvaluator } from '../evaluators/delegation-evaluator.js';
			
 
				 import { ToolUsageEvaluator } from '../evaluators/tool-usage-evaluator.js';
			
 
				 import { BehaviorEvaluator } from '../evaluators/behavior-evaluator.js';
			
 
				+import { TestExecutor } from './test-executor.js';
			
 
				+import { ResultValidator } from './result-validator.js';
			
 
				+import { createLogger } from './event-logger.js';
			
 
				 import type { TestCase } from './test-case-schema.js';
			
 
				 import type { ApprovalStrategy } from './approval/approval-strategy.js';
			
 
				 import type { ServerEvent } from './event-stream-handler.js';
			
@@ -119,12 +137,18 @@ export interface TestResult {
 
				   evaluation?: AggregatedResult;
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+ * TestRunner orchestrates the test execution process
			
 
				+ */
			
 
				 export class TestRunner {
			
 
				   private server: ServerManager;
			
 
				   private client: ClientManager | null = null;
			
 
				   private eventHandler: EventStreamHandler | null = null;
			
 
				   private config: Required<TestRunnerConfig>;
			
 
				   private evaluatorRunner: EvaluatorRunner | null = null;
			
 
				+  private executor: TestExecutor | null = null;
			
 
				+  private validator: ResultValidator;
			
 
				+  private logger: ReturnType<typeof createLogger>;
			
 
				 
			
 
				   constructor(config: TestRunnerConfig = {}) {
			
 
				     // Find git root for agent detection
			
@@ -136,62 +160,85 @@ export class TestRunner {
 
				       defaultTimeout: config.defaultTimeout || 60000,
			
 
				       projectPath: config.projectPath || gitRoot,
			
 
				       runEvaluators: config.runEvaluators ?? true,
			
 
				-      defaultModel: config.defaultModel || 'opencode/grok-code', // Free tier default (fixed model name)
			
 
				+      defaultModel: config.defaultModel || 'opencode/grok-code',
			
 
				     };
			
 
				 
			
 
				+    // Create logger
			
 
				+    this.logger = createLogger(this.config.debug);
			
 
				+
			
 
				+    // Create validator
			
 
				+    this.validator = new ResultValidator(this.logger);
			
 
				+
			
 
				     // Start server from git root with default agent
			
 
				-    // Note: Individual tests can override the agent per-session
			
 
				     this.server = new ServerManager({
			
 
				       port: this.config.port,
			
 
				       timeout: 10000,
			
 
				-      cwd: gitRoot, // CRITICAL: Start server from git root to detect agent
			
 
				-      debug: this.config.debug, // Pass debug flag to server
			
 
				-      agent: 'openagent', // Default agent for all tests
			
 
				+      cwd: gitRoot,
			
 
				+      debug: this.config.debug,
			
 
				+      agent: 'openagent',
			
 
				     });
			
 
				 
			
 
				     if (this.config.debug) {
			
 
				       console.log(`[TestRunner] Git root: ${gitRoot}`);
			
 
				       console.log(`[TestRunner] Server will start from: ${gitRoot} with agent: openagent`);
			
 
				     }
			
 
				-
			
 
				-    // Note: Evaluators will be setup in start() after SDK client is available
			
 
				   }
			
 
				 
			
 
				   /**
			
 
				    * Start the test runner (starts opencode server)
			
 
				    */
			
 
				   async start(): Promise<void> {
			
 
				-    this.log('Starting opencode server...');
			
 
				+    this.logger.log('Starting opencode server...');
			
 
				     const { url } = await this.server.start();
			
 
				-    this.log(`Server started at ${url}`);
			
 
				+    this.logger.log(`Server started at ${url}`);
			
 
				 
			
 
				     this.client = new ClientManager({ baseUrl: url });
			
 
				     this.eventHandler = new EventStreamHandler(url);
			
 
				 
			
 
				-    // Setup evaluators now that SDK client is available
			
 
				+    // Create executor
			
 
				+    this.executor = new TestExecutor(
			
 
				+      this.client,
			
 
				+      this.eventHandler,
			
 
				+      {
			
 
				+        defaultTimeout: this.config.defaultTimeout,
			
 
				+        projectPath: this.config.projectPath,
			
 
				+        defaultModel: this.config.defaultModel,
			
 
				+        debug: this.config.debug,
			
 
				+      },
			
 
				+      this.logger
			
 
				+    );
			
 
				+
			
 
				+    // Setup evaluators
			
 
				     if (this.config.runEvaluators && this.client) {
			
 
				-      const sessionStoragePath = join(homedir(), '.local', 'share', 'opencode');
			
 
				-      
			
 
				-      // Create SessionReader with SDK client for reliable session retrieval
			
 
				-      const sdkClient = this.client.getClient();
			
 
				-      const sessionReader = new SessionReader(sdkClient, sessionStoragePath);
			
 
				-      const timelineBuilder = new TimelineBuilder(sessionReader);
			
 
				-
			
 
				-      this.evaluatorRunner = new EvaluatorRunner({
			
 
				-        sessionReader,
			
 
				-        timelineBuilder,
			
 
				-        sdkClient,
			
 
				-        evaluators: [
			
 
				-          new ApprovalGateEvaluator(),
			
 
				-          new ContextLoadingEvaluator(),
			
 
				-          new DelegationEvaluator(),
			
 
				-          new ToolUsageEvaluator(),
			
 
				-        ],
			
 
				-      });
			
 
				-
			
 
				-      if (this.config.debug) {
			
 
				-        this.log('[TestRunner] Evaluators initialized with SDK client');
			
 
				-      }
			
 
				+      this.setupEvaluators();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Setup evaluators with SDK client
			
 
				+   */
			
 
				+  private setupEvaluators(): void {
			
 
				+    if (!this.client) return;
			
 
				+
			
 
				+    const sessionStoragePath = join(homedir(), '.local', 'share', 'opencode');
			
 
				+    const sdkClient = this.client.getClient();
			
 
				+    const sessionReader = new SessionReader(sdkClient, sessionStoragePath);
			
 
				+    const timelineBuilder = new TimelineBuilder(sessionReader);
			
 
				+
			
 
				+    this.evaluatorRunner = new EvaluatorRunner({
			
 
				+      sessionReader,
			
 
				+      timelineBuilder,
			
 
				+      sdkClient,
			
 
				+      evaluators: [
			
 
				+        new ApprovalGateEvaluator(),
			
 
				+        new ContextLoadingEvaluator(),
			
 
				+        new DelegationEvaluator(),
			
 
				+        new ToolUsageEvaluator(),
			
 
				+      ],
			
 
				+    });
			
 
				+
			
 
				+    if (this.config.debug) {
			
 
				+      this.logger.log('[TestRunner] Evaluators initialized with SDK client');
			
 
				     }
			
 
				   }
			
 
				 
			
@@ -199,242 +246,116 @@ export class TestRunner {
 
				    * Stop the test runner (stops server)
			
 
				    */
			
 
				   async stop(): Promise<void> {
			
 
				-    this.log('Stopping event handler...');
			
 
				+    this.logger.log('Stopping event handler...');
			
 
				     if (this.eventHandler) {
			
 
				       this.eventHandler.stopListening();
			
 
				       this.eventHandler = null;
			
 
				     }
			
 
				 
			
 
				-    this.log('Stopping server...');
			
 
				+    this.logger.log('Stopping server...');
			
 
				     await this.server.stop();
			
 
				     this.client = null;
			
 
				+    this.executor = null;
			
 
				   }
			
 
				 
			
 
				   /**
			
 
				    * Run a single test case
			
 
				    */
			
 
				   async runTest(testCase: TestCase): Promise<TestResult> {
			
 
				-    if (!this.client || !this.eventHandler) {
			
 
				+    if (!this.client || !this.eventHandler || !this.executor) {
			
 
				       throw new Error('Test runner not started. Call start() first.');
			
 
				     }
			
 
				 
			
 
				-    const startTime = Date.now();
			
 
				-    const errors: string[] = [];
			
 
				-    const events: ServerEvent[] = [];
			
 
				-    let sessionId = '';
			
 
				-    let approvalsGiven = 0;
			
 
				-
			
 
				-    try {
			
 
				-      this.log(`\n${'='.repeat(60)}`);
			
 
				-      this.log(`Running test: ${testCase.id} - ${testCase.name}`);
			
 
				-      this.log(`${'='.repeat(60)}`);
			
 
				+    // Create approval strategy
			
 
				+    const approvalStrategy = this.createApprovalStrategy(testCase);
			
 
				 
			
 
				-      // Create approval strategy
			
 
				-      const approvalStrategy = this.createApprovalStrategy(testCase);
			
 
				-      this.log(`Approval strategy: ${approvalStrategy.describe()}`);
			
 
				+    // Execute test
			
 
				+    const executionResult = await this.executor.execute(testCase, approvalStrategy);
			
 
				 
			
 
				-      // Setup event handler
			
 
				-      this.eventHandler.removeAllHandlers();
			
 
				-      
			
 
				-      this.eventHandler.onAny((event) => {
			
 
				-        events.push(event);
			
 
				-        if (this.config.debug) {
			
 
				-          this.logEvent(event);
			
 
				-        }
			
 
				-      });
			
 
				-
			
 
				-      this.eventHandler.onPermission(async (event) => {
			
 
				-        const approved = await approvalStrategy.shouldApprove(event);
			
 
				-        approvalsGiven++;
			
 
				-        this.log(`Permission ${approved ? 'APPROVED' : 'DENIED'}: ${event.properties.tool || 'unknown'}`);
			
 
				-        return approved;
			
 
				-      });
			
 
				-
			
 
				-      // Start event listener in background
			
 
				-      const evtHandler = this.eventHandler;
			
 
				-      this.eventHandler.startListening().catch(err => {
			
 
				-        if (evtHandler.listening()) {
			
 
				-          errors.push(`Event stream error: ${err.message}`);
			
 
				-        }
			
 
				-      });
			
 
				-
			
 
				-      // Wait for event handler to connect
			
 
				-      await this.sleep(2000);
			
 
				-
			
 
				-      // Create session (agent selection happens in sendPrompt, not here)
			
 
				-      this.log('Creating session...');
			
 
				-      const session = await this.client.createSession({
			
 
				-        title: testCase.name,
			
 
				-      });
			
 
				-      sessionId = session.id;
			
 
				-      this.log(`Session created: ${sessionId}`);
			
 
				-
			
 
				-      // Send prompt(s) with agent selection
			
 
				-      const timeout = testCase.timeout || this.config.defaultTimeout;
			
 
				-      const modelToUse = testCase.model || this.config.defaultModel;
			
 
				-      const agentToUse = testCase.agent || 'openagent'; // Default to openagent
			
 
				-      
			
 
				-      this.log(`Agent: ${agentToUse}`);
			
 
				-      this.log(`Model: ${modelToUse}`);
			
 
				-      
			
 
				-      // Check if multi-message test
			
 
				-      if (testCase.prompts && testCase.prompts.length > 0) {
			
 
				-        this.log(`Sending ${testCase.prompts.length} prompts (multi-turn)...`);
			
 
				-        
			
 
				-        for (let i = 0; i < testCase.prompts.length; i++) {
			
 
				-          const msg = testCase.prompts[i];
			
 
				-          this.log(`\nPrompt ${i + 1}/${testCase.prompts.length}:`);
			
 
				-          this.log(`  Text: ${msg.text.substring(0, 100)}${msg.text.length > 100 ? '...' : ''}`);
			
 
				-          if (msg.expectContext) {
			
 
				-            this.log(`  Expects context: ${msg.contextFile || 'yes'}`);
			
 
				-          }
			
 
				-          
			
 
				-          // Add delay if specified
			
 
				-          if (msg.delayMs && i > 0) {
			
 
				-            this.log(`  Waiting ${msg.delayMs}ms before sending...`);
			
 
				-            await this.sleep(msg.delayMs);
			
 
				-          }
			
 
				-          
			
 
				-          const promptPromise = this.client.sendPrompt(sessionId, {
			
 
				-            text: msg.text,
			
 
				-            agent: agentToUse, // ✅ Agent selection happens here!
			
 
				-            model: modelToUse ? this.parseModel(modelToUse) : undefined,
			
 
				-            directory: this.config.projectPath, // Pass working directory
			
 
				-          });
			
 
				-          
			
 
				-          await this.withTimeout(promptPromise, timeout, `Prompt ${i + 1} execution timed out`);
			
 
				-          this.log(`  Completed`);
			
 
				-          
			
 
				-          // Small delay between messages
			
 
				-          if (i < testCase.prompts.length - 1) {
			
 
				-            await this.sleep(1000);
			
 
				-          }
			
 
				-        }
			
 
				-        
			
 
				-        this.log('\nAll prompts completed');
			
 
				-      } else {
			
 
				-        // Single message test
			
 
				-        this.log('Sending prompt...');
			
 
				-        this.log(`Prompt: ${testCase.prompt!.substring(0, 100)}${testCase.prompt!.length > 100 ? '...' : ''}`);
			
 
				-        
			
 
				-        const promptPromise = this.client.sendPrompt(sessionId, {
			
 
				-          text: testCase.prompt!,
			
 
				-          agent: agentToUse, // ✅ Agent selection happens here!
			
 
				-          model: modelToUse ? this.parseModel(modelToUse) : undefined,
			
 
				-          directory: this.config.projectPath, // Pass working directory
			
 
				-        });
			
 
				-
			
 
				-        await this.withTimeout(promptPromise, timeout, 'Prompt execution timed out');
			
 
				-        this.log('Prompt completed');
			
 
				-      }
			
 
				-
			
 
				-      // Give time for final events to arrive
			
 
				-      await this.sleep(3000);
			
 
				+    // Run evaluators if enabled
			
 
				+    let evaluation: AggregatedResult | undefined;
			
 
				+    if (this.config.runEvaluators && this.evaluatorRunner && executionResult.sessionId) {
			
 
				+      evaluation = await this.runEvaluators(testCase, executionResult.sessionId);
			
 
				+    }
			
 
				 
			
 
				-      // Stop event handler
			
 
				-      this.eventHandler.stopListening();
			
 
				+    // Validate result
			
 
				+    const passed = this.validator.validate(
			
 
				+      testCase,
			
 
				+      executionResult.events,
			
 
				+      executionResult.errors,
			
 
				+      evaluation
			
 
				+    );
			
 
				+
			
 
				+    // Log summary
			
 
				+    this.logTestSummary(passed, executionResult, evaluation);
			
 
				+
			
 
				+    return {
			
 
				+      testCase,
			
 
				+      sessionId: executionResult.sessionId,
			
 
				+      passed,
			
 
				+      errors: executionResult.errors,
			
 
				+      events: executionResult.events,
			
 
				+      duration: executionResult.duration,
			
 
				+      approvalsGiven: executionResult.approvalsGiven,
			
 
				+      evaluation,
			
 
				+    };
			
 
				+  }
			
 
				 
			
 
				-      const duration = Date.now() - startTime;
			
 
				+  /**
			
 
				+   * Run evaluators for a test
			
 
				+   */
			
 
				+  private async runEvaluators(
			
 
				+    testCase: TestCase,
			
 
				+    sessionId: string
			
 
				+  ): Promise<AggregatedResult | undefined> {
			
 
				+    if (!this.evaluatorRunner) return undefined;
			
 
				 
			
 
				-      // Validate agent is correct
			
 
				-      if (testCase.agent) {
			
 
				-        this.log(`Validating agent: ${testCase.agent}...`);
			
 
				-        try {
			
 
				-          const sessionInfo = await this.client.getSession(sessionId);
			
 
				-          const messages = sessionInfo.messages;
			
 
				-          
			
 
				-          if (messages && messages.length > 0) {
			
 
				-            const firstMessage = messages[0].info as any; // SDK types may not include agent field
			
 
				-            const actualAgent = firstMessage.agent;
			
 
				-            
			
 
				-            if (actualAgent && actualAgent !== testCase.agent) {
			
 
				-              errors.push(`Agent mismatch: expected '${testCase.agent}', got '${actualAgent}'`);
			
 
				-              this.log(`  ❌ Agent mismatch: expected '${testCase.agent}', got '${actualAgent}'`);
			
 
				-            } else if (actualAgent) {
			
 
				-              this.log(`  ✅ Agent verified: ${actualAgent}`);
			
 
				-            } else {
			
 
				-              this.log(`  ⚠️  Agent not set in message`);
			
 
				-            }
			
 
				-          }
			
 
				-        } catch (error) {
			
 
				-          this.log(`  Warning: Could not validate agent: ${(error as Error).message}`);
			
 
				-        }
			
 
				+    this.logger.log('Running evaluators...');
			
 
				+    
			
 
				+    // Add behavior evaluator if test case has behavior expectations
			
 
				+    if (testCase.behavior) {
			
 
				+      this.logger.log('Adding behavior evaluator for test expectations...');
			
 
				+      const behaviorEvaluator = new BehaviorEvaluator(testCase.behavior);
			
 
				+      this.evaluatorRunner.register(behaviorEvaluator);
			
 
				+    }
			
 
				+    
			
 
				+    try {
			
 
				+      const evaluation = await this.evaluatorRunner.runAll(sessionId);
			
 
				+      this.logger.log(`Evaluators completed: ${evaluation.totalViolations} violations found`);
			
 
				+      
			
 
				+      if (evaluation && evaluation.totalViolations > 0) {
			
 
				+        this.logger.log(`  Errors: ${evaluation.violationsBySeverity.error}`);
			
 
				+        this.logger.log(`  Warnings: ${evaluation.violationsBySeverity.warning}`);
			
 
				       }
			
 
				-
			
 
				-      // Run evaluators if enabled
			
 
				-      let evaluation: AggregatedResult | undefined;
			
 
				-      if (this.config.runEvaluators && this.evaluatorRunner) {
			
 
				-        this.log('Running evaluators...');
			
 
				-        
			
 
				-        // Add behavior evaluator if test case has behavior expectations
			
 
				-        if (testCase.behavior) {
			
 
				-          this.log('Adding behavior evaluator for test expectations...');
			
 
				-          const behaviorEvaluator = new BehaviorEvaluator(testCase.behavior);
			
 
				-          this.evaluatorRunner.register(behaviorEvaluator);
			
 
				-        }
			
 
				-        
			
 
				-        // No need to wait for disk writes - we're using SDK client directly!
			
 
				-        // The SDK has the session data in memory and can return it immediately.
			
 
				-        
			
 
				-        try {
			
 
				-          evaluation = await this.evaluatorRunner.runAll(sessionId);
			
 
				-          this.log(`Evaluators completed: ${evaluation.totalViolations} violations found`);
			
 
				-          
			
 
				-          if (evaluation && evaluation.totalViolations > 0) {
			
 
				-            this.log(`  Errors: ${evaluation.violationsBySeverity.error}`);
			
 
				-            this.log(`  Warnings: ${evaluation.violationsBySeverity.warning}`);
			
 
				-          }
			
 
				-          
			
 
				-          // Clean up behavior evaluator after use
			
 
				-          if (testCase.behavior) {
			
 
				-            this.evaluatorRunner.unregister('behavior');
			
 
				-          }
			
 
				-        } catch (error) {
			
 
				-          this.log(`Warning: Evaluators failed: ${(error as Error).message}`);
			
 
				-          errors.push(`Evaluator error: ${(error as Error).message}`);
			
 
				-        }
			
 
				+      
			
 
				+      // Clean up behavior evaluator after use
			
 
				+      if (testCase.behavior) {
			
 
				+        this.evaluatorRunner.unregister('behavior');
			
 
				       }
			
 
				 
			
 
				-      // Determine if test passed
			
 
				-      const passed = this.evaluateResult(testCase, events, errors, evaluation);
			
 
				-
			
 
				-      this.log(`\nTest ${passed ? 'PASSED' : 'FAILED'}`);
			
 
				-      this.log(`Duration: ${duration}ms`);
			
 
				-      this.log(`Events captured: ${events.length}`);
			
 
				-      this.log(`Approvals given: ${approvalsGiven}`);
			
 
				-      this.log(`Errors: ${errors.length}`);
			
 
				-
			
 
				-      return {
			
 
				-        testCase,
			
 
				-        sessionId,
			
 
				-        passed,
			
 
				-        errors,
			
 
				-        events,
			
 
				-        duration,
			
 
				-        approvalsGiven,
			
 
				-        evaluation,
			
 
				-      };
			
 
				+      return evaluation;
			
 
				     } catch (error) {
			
 
				-      const duration = Date.now() - startTime;
			
 
				-      errors.push(`Test execution failed: ${(error as Error).message}`);
			
 
				-
			
 
				-      this.log(`\nTest FAILED with exception`);
			
 
				-      this.log(`Error: ${(error as Error).message}`);
			
 
				-
			
 
				-      return {
			
 
				-        testCase,
			
 
				-        sessionId,
			
 
				-        passed: false,
			
 
				-        errors,
			
 
				-        events,
			
 
				-        duration,
			
 
				-        approvalsGiven,
			
 
				-        evaluation: undefined,
			
 
				-      };
			
 
				+      this.logger.log(`Warning: Evaluators failed: ${(error as Error).message}`);
			
 
				+      return undefined;
			
 
				     }
			
 
				   }
			
 
				 
			
 
				   /**
			
 
				+   * Log test summary
			
 
				+   */
			
 
				+  private logTestSummary(
			
 
				+    passed: boolean,
			
 
				+    executionResult: { duration: number; events: ServerEvent[]; approvalsGiven: number; errors: string[] },
			
 
				+    evaluation?: AggregatedResult
			
 
				+  ): void {
			
 
				+    this.logger.log(`\nTest ${passed ? 'PASSED' : 'FAILED'}`);
			
 
				+    this.logger.log(`Duration: ${executionResult.duration}ms`);
			
 
				+    this.logger.log(`Events captured: ${executionResult.events.length}`);
			
 
				+    this.logger.log(`Approvals given: ${executionResult.approvalsGiven}`);
			
 
				+    this.logger.log(`Errors: ${executionResult.errors.length}`);
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				    * Run multiple test cases
			
 
				    */
			
 
				   async runTests(testCases: TestCase[]): Promise<TestResult[]> {
			
@@ -448,12 +369,12 @@ export class TestRunner {
 
				       if (this.client && result.sessionId && !this.config.debug) {
			
 
				         try {
			
 
				           await this.client.deleteSession(result.sessionId);
			
 
				-          this.log(`Cleaned up session: ${result.sessionId}\n`);
			
 
				+          this.logger.log(`Cleaned up session: ${result.sessionId}\n`);
			
 
				         } catch (error) {
			
 
				-          this.log(`Failed to clean up session: ${(error as Error).message}\n`);
			
 
				+          this.logger.log(`Failed to clean up session: ${(error as Error).message}\n`);
			
 
				         }
			
 
				       } else if (this.config.debug) {
			
 
				-        this.log(`Debug mode: Keeping session ${result.sessionId} for inspection\n`);
			
 
				+        this.logger.log(`Debug mode: Keeping session ${result.sessionId} for inspection\n`);
			
 
				       }
			
 
				     }
			
 
				 
			
@@ -487,313 +408,4 @@ export class TestRunner {
 
				         throw new Error(`Unknown approval strategy: ${(strategy as any).type}`);
			
 
				     }
			
 
				   }
			
 
				-
			
 
				-  /**
			
 
				-   * Evaluate if test result matches expected outcome
			
 
				-   * 
			
 
				-   * Evaluation priority:
			
 
				-   * 1. Check for execution errors
			
 
				-   * 2. Check behavior expectations (if defined)
			
 
				-   * 3. Check expected violations (if defined)
			
 
				-   * 4. Check deprecated expected format (if defined)
			
 
				-   * 5. Default: pass if no errors
			
 
				-   */
			
 
				-  private evaluateResult(
			
 
				-    testCase: TestCase,
			
 
				-    events: ServerEvent[],
			
 
				-    errors: string[],
			
 
				-    evaluation?: AggregatedResult
			
 
				-  ): boolean {
			
 
				-    // Support both old and new schema
			
 
				-    const expected = testCase.expected;
			
 
				-    const behavior = testCase.behavior;
			
 
				-    const expectedViolations = testCase.expectedViolations;
			
 
				-
			
 
				-    // If there were execution errors and test expects to pass, it fails
			
 
				-    if (errors.length > 0 && expected?.pass !== false) {
			
 
				-      this.log(`Test failed due to execution errors: ${errors.join(', ')}`);
			
 
				-      return false;
			
 
				-    }
			
 
				-
			
 
				-    // =========================================================================
			
 
				-    // NEW: Check behavior evaluator results FIRST (most important)
			
 
				-    // =========================================================================
			
 
				-    if (behavior && evaluation) {
			
 
				-      // Find the behavior evaluator result
			
 
				-      const behaviorResult = evaluation.evaluatorResults.find(r => r.evaluator === 'behavior');
			
 
				-      
			
 
				-      if (behaviorResult) {
			
 
				-        // Check if behavior evaluator passed
			
 
				-        if (!behaviorResult.passed) {
			
 
				-          this.log(`Behavior validation failed: ${behaviorResult.violations.length} violations`);
			
 
				-          behaviorResult.violations.forEach(v => {
			
 
				-            this.log(`  - [${v.severity}] ${v.type}: ${v.message}`);
			
 
				-          });
			
 
				-          return false;
			
 
				-        }
			
 
				-        
			
 
				-        // Check for error-level violations from behavior evaluator
			
 
				-        const behaviorErrors = behaviorResult.violations.filter(v => v.severity === 'error');
			
 
				-        if (behaviorErrors.length > 0) {
			
 
				-          this.log(`Behavior validation has ${behaviorErrors.length} error-level violations`);
			
 
				-          return false;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    // =========================================================================
			
 
				-    // Check expected violations (new format)
			
 
				-    // =========================================================================
			
 
				-    // Track which violations were expected so we don't fail on them later
			
 
				-    const expectedViolationTypes = new Set<string>();
			
 
				-    
			
 
				-    if (expectedViolations && evaluation) {
			
 
				-      for (const expectedViolation of expectedViolations) {
			
 
				-        // Map rule names to violation type patterns
			
 
				-        const rulePatterns: Record<string, string[]> = {
			
 
				-          'approval-gate': ['approval', 'missing-approval'],
			
 
				-          'context-loading': ['context', 'no-context-loaded', 'missing-context'],
			
 
				-          'delegation': ['delegation', 'missing-delegation'],
			
 
				-          'tool-usage': ['tool', 'suboptimal-tool'],
			
 
				-          'stop-on-failure': ['stop', 'failure'],
			
 
				-          'confirm-cleanup': ['cleanup', 'confirm'],
			
 
				-        };
			
 
				-
			
 
				-        const patterns = rulePatterns[expectedViolation.rule] || [expectedViolation.rule];
			
 
				-        
			
 
				-        const actualViolations = evaluation.allViolations.filter(v => 
			
 
				-          patterns.some(pattern => v.type.toLowerCase().includes(pattern.toLowerCase()))
			
 
				-        );
			
 
				-
			
 
				-        if (expectedViolation.shouldViolate) {
			
 
				-          // Negative test: Should have violation
			
 
				-          if (actualViolations.length === 0) {
			
 
				-            this.log(`Expected ${expectedViolation.rule} violation but none found`);
			
 
				-            return false;
			
 
				-          }
			
 
				-          this.log(`✓ Expected violation '${expectedViolation.rule}' found`);
			
 
				-          // Mark these violations as expected so we don't fail on them later
			
 
				-          actualViolations.forEach(v => expectedViolationTypes.add(v.type));
			
 
				-        } else {
			
 
				-          // Positive test: Should NOT have violation
			
 
				-          if (actualViolations.length > 0) {
			
 
				-            this.log(`Unexpected ${expectedViolation.rule} violation found: ${actualViolations[0].message}`);
			
 
				-            return false;
			
 
				-          }
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    // =========================================================================
			
 
				-    // Check deprecated expected format
			
 
				-    // =========================================================================
			
 
				-    if (expected) {
			
 
				-      // Check minimum messages (deprecated)
			
 
				-      if (expected.minMessages !== undefined) {
			
 
				-        const messageEvents = events.filter(e => e.type.includes('message'));
			
 
				-        if (messageEvents.length < expected.minMessages) {
			
 
				-          this.log(`Expected at least ${expected.minMessages} messages, got ${messageEvents.length}`);
			
 
				-          return false;
			
 
				-        }
			
 
				-      }
			
 
				-
			
 
				-      // Check maximum messages (deprecated)
			
 
				-      if (expected.maxMessages !== undefined) {
			
 
				-        const messageEvents = events.filter(e => e.type.includes('message'));
			
 
				-        if (messageEvents.length > expected.maxMessages) {
			
 
				-          this.log(`Expected at most ${expected.maxMessages} messages, got ${messageEvents.length}`);
			
 
				-          return false;
			
 
				-        }
			
 
				-      }
			
 
				-
			
 
				-      // Check expected violations (deprecated format)
			
 
				-      if (expected.violations && evaluation) {
			
 
				-        const expectedViolationTypes = expected.violations.map(v => v.rule);
			
 
				-        const actualViolationTypes = evaluation.allViolations.map(v => {
			
 
				-          if (v.type.includes('approval')) return 'approval-gate' as const;
			
 
				-          if (v.type.includes('context')) return 'context-loading' as const;
			
 
				-          if (v.type.includes('delegation')) return 'delegation' as const;
			
 
				-          if (v.type.includes('tool')) return 'tool-usage' as const;
			
 
				-          return 'unknown' as const;
			
 
				-        });
			
 
				-
			
 
				-        for (const expectedType of expectedViolationTypes) {
			
 
				-          if (['approval-gate', 'context-loading', 'delegation', 'tool-usage'].includes(expectedType)) {
			
 
				-            if (!actualViolationTypes.includes(expectedType as any)) {
			
 
				-              this.log(`Expected violation '${expectedType}' not found`);
			
 
				-              return false;
			
 
				-            }
			
 
				-          }
			
 
				-        }
			
 
				-
			
 
				-        if (!expected.pass && evaluation.totalViolations === 0) {
			
 
				-          this.log('Expected violations but none found');
			
 
				-          return false;
			
 
				-        }
			
 
				-      }
			
 
				-
			
 
				-      // If test expects to pass, check no critical violations
			
 
				-      if (expected.pass && evaluation) {
			
 
				-        if (evaluation.violationsBySeverity.error > 0) {
			
 
				-          this.log(`Expected pass but found ${evaluation.violationsBySeverity.error} error-level violations`);
			
 
				-          return false;
			
 
				-        }
			
 
				-      }
			
 
				-
			
 
				-      // Use expected.pass if specified
			
 
				-      if (expected.pass !== undefined) {
			
 
				-        return expected.pass ? errors.length === 0 : true;
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    // =========================================================================
			
 
				-    // Default: pass if no errors and no unexpected error-level violations
			
 
				-    // =========================================================================
			
 
				-    if (evaluation && evaluation.violationsBySeverity.error > 0) {
			
 
				-      // Filter out expected violations
			
 
				-      const unexpectedErrors = evaluation.allViolations.filter(v => 
			
 
				-        v.severity === 'error' && !expectedViolationTypes.has(v.type)
			
 
				-      );
			
 
				-      
			
 
				-      if (unexpectedErrors.length > 0) {
			
 
				-        this.log(`Test failed: ${unexpectedErrors.length} unexpected error-level violations`);
			
 
				-        unexpectedErrors.forEach(v => this.log(`  - ${v.type}: ${v.message}`));
			
 
				-        return false;
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    return errors.length === 0;
			
 
				-  }
			
 
				-
			
 
				-  /**
			
 
				-   * Parse model string (provider/model format)
			
 
				-   */
			
 
				-  private parseModel(model: string): { providerID: string; modelID: string } {
			
 
				-    const [providerID, modelID] = model.split('/');
			
 
				-    if (!providerID || !modelID) {
			
 
				-      throw new Error(`Invalid model format: ${model}. Expected provider/model`);
			
 
				-    }
			
 
				-    return { providerID, modelID };
			
 
				-  }
			
 
				-
			
 
				-  /**
			
 
				-   * Sleep for ms
			
 
				-   */
			
 
				-  private sleep(ms: number): Promise<void> {
			
 
				-    return new Promise(resolve => setTimeout(resolve, ms));
			
 
				-  }
			
 
				-
			
 
				-  /**
			
 
				-   * Run promise with timeout
			
 
				-   */
			
 
				-  private async withTimeout<T>(promise: Promise<T>, timeoutMs: number, message: string): Promise<T> {
			
 
				-    return Promise.race([
			
 
				-      promise,
			
 
				-      new Promise<T>((_, reject) =>
			
 
				-        setTimeout(() => reject(new Error(message)), timeoutMs)
			
 
				-      ),
			
 
				-    ]);
			
 
				-  }
			
 
				-
			
 
				-  /**
			
 
				-   * Log message
			
 
				-   */
			
 
				-  private log(message: string): void {
			
 
				-    if (this.config.debug || message.includes('PASSED') || message.includes('FAILED')) {
			
 
				-      console.log(message);
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-  /**
			
 
				-   * Log event with meaningful details
			
 
				-   * 
			
 
				-   * Event properties structure varies by type:
			
 
				-   * - session.created/updated: { id, title, ... }
			
 
				-   * - message.updated: { id, sessionID, role, ... }
			
 
				-   * - part.updated: { id, messageID, type, tool?, input?, output?, ... }
			
 
				-   */
			
 
				-  private logEvent(event: ServerEvent): void {
			
 
				-    const props = event.properties || {};
			
 
				-    
			
 
				-    switch (event.type) {
			
 
				-      case 'session.created':
			
 
				-        console.log(`📋 Session created`);
			
 
				-        break;
			
 
				-        
			
 
				-      case 'session.updated':
			
 
				-        // Session updates are frequent but not very informative
			
 
				-        // Skip logging unless there's something specific
			
 
				-        break;
			
 
				-        
			
 
				-      case 'message.created':
			
 
				-        console.log(`💬 New message (${props.role || 'assistant'})`);
			
 
				-        break;
			
 
				-        
			
 
				-      case 'message.updated':
			
 
				-        // Message updates happen frequently during streaming
			
 
				-        // Only log role changes or completion
			
 
				-        if (props.role === 'user') {
			
 
				-          console.log(`👤 User message received`);
			
 
				-        }
			
 
				-        // Skip assistant message updates (too noisy)
			
 
				-        break;
			
 
				-        
			
 
				-      case 'part.created':
			
 
				-      case 'part.updated':
			
 
				-        // Parts contain the actual content - tools, text, etc.
			
 
				-        if (props.type === 'tool') {
			
 
				-          const toolName = props.tool || 'unknown';
			
 
				-          const status = props.state?.status || props.status || '';
			
 
				-          
			
 
				-          // Only log when tool starts or completes
			
 
				-          if (status === 'running' || status === 'pending') {
			
 
				-            console.log(`🔧 Tool: ${toolName} (starting)`);
			
 
				-            
			
 
				-            // Show tool input preview
			
 
				-            const input = props.state?.input || props.input || {};
			
 
				-            if (input.command) {
			
 
				-              const cmd = input.command.substring(0, 70);
			
 
				-              console.log(`   └─ ${cmd}${input.command.length > 70 ? '...' : ''}`);
			
 
				-            } else if (input.filePath) {
			
 
				-              console.log(`   └─ ${input.filePath}`);
			
 
				-            } else if (input.pattern) {
			
 
				-              console.log(`   └─ pattern: ${input.pattern}`);
			
 
				-            }
			
 
				-          } else if (status === 'completed') {
			
 
				-            console.log(`✅ Tool: ${toolName} (completed)`);
			
 
				-          } else if (status === 'error') {
			
 
				-            console.log(`❌ Tool: ${toolName} (error)`);
			
 
				-          }
			
 
				-        } else if (props.type === 'text') {
			
 
				-          // Text parts - show preview of assistant response
			
 
				-          const text = props.text || '';
			
 
				-          if (text.length > 0) {
			
 
				-            const preview = text.substring(0, 100).replace(/\n/g, ' ');
			
 
				-            console.log(`📝 ${preview}${text.length > 100 ? '...' : ''}`);
			
 
				-          }
			
 
				-        }
			
 
				-        break;
			
 
				-        
			
 
				-      case 'permission.request':
			
 
				-        console.log(`🔐 Permission requested: ${props.tool || 'unknown'}`);
			
 
				-        break;
			
 
				-        
			
 
				-      case 'permission.response':
			
 
				-        console.log(`🔐 Permission ${props.response === 'once' || props.approved ? 'granted' : 'denied'}`);
			
 
				-        break;
			
 
				-        
			
 
				-      case 'tool.call':
			
 
				-        console.log(`🔧 Tool call: ${props.tool || props.name || 'unknown'}`);
			
 
				-        break;
			
 
				-        
			
 
				-      case 'tool.result':
			
 
				-        const success = props.error ? '❌' : '✅';
			
 
				-        console.log(`${success} Tool result: ${props.tool || 'unknown'}`);
			
 
				-        break;
			
 
				-        
			
 
				-      default:
			
 
				-        // Skip unknown events to reduce noise
			
 
				-        break;
			
 
				-    }
			
 
				-  }
			
 
				 }