---
# Runs the evaluation suite in evals/framework against a selected agent.
#
# Triggers:
#   - manual dispatch, with the inputs declared below
#   - a daily schedule
#   - pushes to main that touch the evaluation paths listed under `push`
#
# Results are uploaded as an artifact and summarized in the job summary,
# whether or not the evaluation step succeeds.
name: Run Evaluations

on:
  workflow_dispatch:
    inputs:
      agent:
        description: 'Agent to test'
        required: false
        default: 'openagent'
        type: choice
        options:
          - openagent
          - opencoder
          - system-builder
      pattern:
        description: 'Test pattern (glob)'
        required: false
        default: '**/golden/*.yaml'
        type: string
      model:
        description: 'Model to use (provider/model)'
        required: false
        default: 'opencode/grok-code'
        type: string
      seed:
        description: 'Seed for reproducible randomness'
        required: false
        default: 'ci-evaluation-seed'
        type: string
      timeout:
        description: 'Test timeout in milliseconds'
        required: false
        default: '120000'
        type: string
  schedule:
    # Run daily at 2 AM UTC
    - cron: '0 2 * * *'
  push:
    branches: [main]
    paths:
      - 'evals/framework/**'
      - 'evals/agents/**'
      # NOTE(review): workflow files normally sit directly in
      # .github/workflows/, so this pattern may never match. Confirm whether
      # the path of this workflow file was intended instead.
      - '.github/workflows/evals/**'

jobs:
  run-evaluations:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'
          cache-dependency-path: evals/framework/package-lock.json

      - name: Install dependencies
        working-directory: evals/framework
        run: npm ci

      - name: Build framework
        working-directory: evals/framework
        run: npm run build

      # Placeholder: this step only prints a message and installs nothing.
      - name: Install OpenCode CLI
        run: |
          # Install OpenCode CLI if available
          # This step may need to be adjusted based on OpenCode installation method
          echo "OpenCode CLI installation would go here"

      - name: Run evaluations
        working-directory: evals/framework
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          OPENCODE_SESSION_STORAGE: /tmp/opencode-sessions
          # Inputs only exist for workflow_dispatch runs; schedule and push
          # runs fall back to the defaults after `||`. The values are handed
          # over through the environment instead of being interpolated into
          # the script, so their contents are never parsed as shell code.
          EVAL_AGENT: ${{ github.event.inputs.agent || 'openagent' }}
          EVAL_PATTERN: ${{ github.event.inputs.pattern || '**/golden/*.yaml' }}
          EVAL_MODEL: ${{ github.event.inputs.model || 'opencode/grok-code' }}
          EVAL_SEED: ${{ github.event.inputs.seed || 'ci-evaluation-seed' }}
          EVAL_TIMEOUT: ${{ github.event.inputs.timeout || '120000' }}
        run: |
          # Create isolated session directory
          mkdir -p "$OPENCODE_SESSION_STORAGE"

          # Run evaluations with isolation and seeding.
          # Every value is double-quoted so globs and spaces stay literal.
          npm run eval:sdk -- \
            --agent="$EVAL_AGENT" \
            --pattern="$EVAL_PATTERN" \
            --model="$EVAL_MODEL" \
            --seed="$EVAL_SEED" \
            --timeout="$EVAL_TIMEOUT" \
            --isolate-environment \
            --debug

      - name: Upload results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: evaluation-results
          path: |
            evals/results/
            evals/test_tmp/
          retention-days: 30

      - name: Generate summary
        if: always()
        run: |
          if [ -f "evals/results/latest.json" ]; then
            echo "## Evaluation Results" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"

            # Extract summary from latest results.
            # The script is single-quoted so the shell leaves the JavaScript
            # template literals (${...} and backticks) untouched; string
            # literals inside it therefore use double quotes.
            node -e '
              const results = require("./evals/results/latest.json");
              const passed = results.filter(r => r.passed).length;
              const total = results.length;
              const failed = total - passed;

              console.log(`| ✅ ${passed}/${total} tests passed`);
              console.log(`| ❌ ${failed} failures`);
              console.log("");

              if (failed > 0) {
                console.log("### Failed Tests");
                results.filter(r => !r.passed).forEach(r => {
                  console.log(`- ${r.testCase.id}: ${r.errors.join(", ")}`);
                });
              }
            ' >> "$GITHUB_STEP_SUMMARY"
          else
            echo "## No Results Generated" >> "$GITHUB_STEP_SUMMARY"
            echo "Evaluation run may have failed to complete." >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Cleanup
        if: always()
        run: |
          # Clean up temporary session storage
          rm -rf /tmp/opencode-sessions