| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- #!/bin/bash
- # Demo script for enhanced eval framework features
- # Shows:
- # 1. Enhanced approval detection with confidence levels
- # 2. --show-failures flag for debugging failed tests
- echo "=========================================="
- echo "Enhanced Eval Framework Features Demo"
- echo "=========================================="
- echo ""
- echo "Feature 1: Enhanced Approval Detection"
- echo "---------------------------------------"
- echo "✅ High confidence patterns:"
- echo " - 'approval needed before proceeding'"
- echo " - 'please confirm before'"
- echo " - 'ready to proceed?'"
- echo ""
- echo "✅ Medium confidence patterns:"
- echo " - 'would you like me to'"
- echo " - 'should I proceed'"
- echo " - 'is this okay?'"
- echo ""
- echo "✅ Low confidence patterns (with false positive filtering):"
- echo " - 'may I' (but NOT 'may I help you')"
- echo " - 'can I' (but NOT 'can I assist you')"
- echo ""
- echo "✅ Captures:"
- echo " - Approval text (the actual sentence)"
- echo " - What is being approved (extracted from plan)"
- echo " - Confidence level (high/medium/low)"
- echo ""
- echo "Feature 2: --show-failures Flag"
- echo "--------------------------------"
- echo "Usage: npm run eval:sdk -- --agent=openagent --show-failures"
- echo ""
- echo "When a test fails, automatically shows:"
- echo " - Full session timeline"
- echo " - All messages (user + assistant)"
- echo " - All tool calls with inputs/outputs"
- echo " - Timestamps (relative to session start)"
- echo " - Violations highlighted"
- echo ""
- echo "Feature 3: --test-id Flag"
- echo "-------------------------"
- echo "Usage: npm run eval:sdk -- --agent=openagent --test-id=approval-gate-basic"
- echo ""
- echo "Run a specific test by ID for faster iteration"
- echo ""
- echo "=========================================="
- echo "Running Unit Tests"
- echo "=========================================="
- echo ""
- # Run the approval detection unit tests
- npm test -- src/evaluators/__tests__/approval-detection.test.ts --run
- echo ""
- echo "=========================================="
- echo "Demo Complete!"
- echo "=========================================="
- echo ""
- echo "To try the --show-failures flag:"
- echo " npm run eval:sdk -- --agent=openagent --test-id=YOUR_TEST_ID --show-failures"
- echo ""
|