#!/bin/bash
#
# demo-enhanced-features.sh
#
# Demo script for enhanced eval framework features.
# Shows:
#   1. Enhanced approval detection with confidence levels
#   2. --show-failures flag for debugging failed tests
#   3. --test-id flag for running a single test by ID
#
# After printing the feature overview, runs the approval detection unit
# tests through `npm test`.
# NOTE(review): presumably meant to be run from the eval framework package
# directory so npm can find the right package.json — confirm.
#
# Exit status: 0 when the unit tests pass, otherwise the exit status
# reported by `npm test`.

# -u catches typos in variable names. -e is deliberately NOT enabled: a
# failing test run must still reach the closing banner and usage hint, and
# its status is captured and returned explicitly in main().
set -uo pipefail

readonly BANNER_RULE="=========================================="

#######################################
# Print a heading framed by two horizontal rules.
# Globals:   BANNER_RULE (read)
# Arguments: $1 - heading text
# Outputs:   three lines on stdout
#######################################
banner() {
  printf '%s\n' "${BANNER_RULE}" "$1" "${BANNER_RULE}"
}

#######################################
# Print the static description of the three demonstrated features.
# Outputs:   feature overview on stdout
#######################################
describe_features() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Feature 1: Enhanced Approval Detection
---------------------------------------
✅ High confidence patterns:
 - 'approval needed before proceeding'
 - 'please confirm before'
 - 'ready to proceed?'

✅ Medium confidence patterns:
 - 'would you like me to'
 - 'should I proceed'
 - 'is this okay?'

✅ Low confidence patterns (with false positive filtering):
 - 'may I' (but NOT 'may I help you')
 - 'can I' (but NOT 'can I assist you')

✅ Captures:
 - Approval text (the actual sentence)
 - What is being approved (extracted from plan)
 - Confidence level (high/medium/low)

Feature 2: --show-failures Flag
--------------------------------
Usage: npm run eval:sdk -- --agent=openagent --show-failures

When a test fails, automatically shows:
 - Full session timeline
 - All messages (user + assistant)
 - All tool calls with inputs/outputs
 - Timestamps (relative to session start)
 - Violations highlighted

Feature 3: --test-id Flag
-------------------------
Usage: npm run eval:sdk -- --agent=openagent --test-id=approval-gate-basic

Run a specific test by ID for faster iteration

EOF
}

#######################################
# Run the approval detection unit tests.
# Returns:   the exit status of `npm test`
#######################################
run_unit_tests() {
  npm test -- src/evaluators/__tests__/approval-detection.test.ts --run
}

#######################################
# Print the demo, run the unit tests, and report the outcome.
# Outputs:   demo text and test output on stdout
# Returns:   0 if the unit tests passed, otherwise their exit status
#######################################
main() {
  local status=0

  banner "Enhanced Eval Framework Features Demo"
  echo ""
  describe_features

  banner "Running Unit Tests"
  echo ""
  # Capture the status instead of aborting so the closing section below is
  # always printed.
  run_unit_tests || status=$?
  echo ""

  if (( status == 0 )); then
    banner "Demo Complete!"
  else
    # Do not claim success when the test run failed.
    banner "Demo Finished: Unit Tests FAILED (exit ${status})"
  fi
  echo ""
  echo "To try the --show-failures flag:"
  echo " npm run eval:sdk -- --agent=openagent --test-id=YOUR_TEST_ID --show-failures"
  echo ""

  return "${status}"
}

main "$@"