# run-evaluations.yml
---
# Runs the agent evaluation suite in evals/framework: on demand (with
# overridable agent/pattern/model/seed/timeout), nightly, and on pushes to main
# that touch the evaluation code. Results are uploaded as an artifact and
# summarised on the run page.
name: Run Evaluations

on:
  workflow_dispatch:
    inputs:
      agent:
        description: 'Agent to test'
        required: false
        default: 'openagent'
        type: choice
        options:
          - openagent
          - opencoder
          - system-builder
      pattern:
        description: 'Test pattern (glob)'
        required: false
        default: '**/golden/*.yaml'
        type: string
      model:
        description: 'Model to use (provider/model)'
        required: false
        default: 'opencode/grok-code'
        type: string
      seed:
        description: 'Seed for reproducible randomness'
        required: false
        default: 'ci-evaluation-seed'
        type: string
      timeout:
        description: 'Test timeout in milliseconds'
        required: false
        default: '120000'
        type: string
  schedule:
    # Run daily at 2 AM UTC
    - cron: '0 2 * * *'
  push:
    branches: [main]
    paths:
      - 'evals/framework/**'
      - 'evals/agents/**'
      # NOTE(review): GitHub only loads workflow files that sit directly in
      # .github/workflows/, so this filter matches only if a non-workflow
      # "evals" directory exists there - confirm it is the intended path.
      - '.github/workflows/evals/**'

# Least privilege: the visible steps only need to read the repository.
permissions:
  contents: read

jobs:
  run-evaluations:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'
          cache-dependency-path: evals/framework/package-lock.json

      - name: Install dependencies
        working-directory: evals/framework
        run: npm ci

      - name: Build framework
        working-directory: evals/framework
        run: npm run build

      - name: Install OpenCode CLI
        run: |
          # Install OpenCode CLI if available
          # This step may need to be adjusted based on OpenCode installation method
          echo "OpenCode CLI installation would go here"

      - name: Run evaluations
        working-directory: evals/framework
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          OPENCODE_SESSION_STORAGE: /tmp/opencode-sessions
          # Dispatch inputs are handed to the script through the environment
          # rather than interpolated into the shell text, so a crafted input
          # value cannot inject commands. On schedule/push events the inputs
          # are empty and the `||` fallbacks apply.
          DISPATCH_AGENT: ${{ github.event.inputs.agent || 'openagent' }}
          DISPATCH_PATTERN: ${{ github.event.inputs.pattern || '**/golden/*.yaml' }}
          DISPATCH_MODEL: ${{ github.event.inputs.model || 'opencode/grok-code' }}
          DISPATCH_SEED: ${{ github.event.inputs.seed || 'ci-evaluation-seed' }}
          DISPATCH_TIMEOUT: ${{ github.event.inputs.timeout || '120000' }}
        run: |
          # Create isolated session directory
          mkdir -p "$OPENCODE_SESSION_STORAGE"

          # Run evaluations with isolation and seeding. Every value is quoted
          # so globs and spaces reach the runner unexpanded.
          npm run eval:sdk -- \
            --agent="$DISPATCH_AGENT" \
            --pattern="$DISPATCH_PATTERN" \
            --model="$DISPATCH_MODEL" \
            --seed="$DISPATCH_SEED" \
            --timeout="$DISPATCH_TIMEOUT" \
            --isolate-environment \
            --debug

      - name: Upload results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: evaluation-results
          path: |
            evals/results/
            evals/test_tmp/
          retention-days: 30

      - name: Generate summary
        if: always()
        run: |
          if [ -f "evals/results/latest.json" ]; then
            {
              echo "## Evaluation Results"
              echo ""
              # Extract summary from latest results.
              # The script is single-quoted so bash leaves the JavaScript
              # template literals (backticks and ${...}) untouched; JS strings
              # therefore use double quotes.
              node -e '
                const results = require("./evals/results/latest.json");
                const passed = results.filter(r => r.passed).length;
                const total = results.length;
                const failed = total - passed;
                console.log(`| ✅ ${passed}/${total} tests passed`);
                console.log(`| ❌ ${failed} failures`);
                console.log("");
                if (failed > 0) {
                  console.log("### Failed Tests");
                  results.filter(r => !r.passed).forEach(r => {
                    // Tolerate a failed entry that carries no errors array.
                    console.log(`- ${r.testCase.id}: ${(r.errors || []).join(", ")}`);
                  });
                }
              '
            } >> "$GITHUB_STEP_SUMMARY"
          else
            {
              echo "## No Results Generated"
              echo "Evaluation run may have failed to complete."
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Cleanup
        if: always()
        run: |
          # Clean up temporary session storage
          rm -rf /tmp/opencode-sessions