Quick reference for debugging and validating opencoder tests.
cd evals/framework
# Run single test with debug
npm run eval:sdk -- --agent=opencoder --pattern="planning/*.yaml" --debug
# Run all opencoder tests with debug
npm run eval:sdk -- --agent=opencoder --debug
# Run specific category
npm run eval:sdk -- --agent=opencoder --pattern="context-loading/*.yaml" --debug
# 1. Run test with --debug flag
# 2. Copy session ID from output (e.g., "Session created: ses_4ff9f7975ffeWYqM564A5ooo4y")
# 3. View conversation:
./scripts/debug/show-test-conversation.sh ses_4ff9f7975ffeWYqM564A5ooo4y
# Latest results (summary only)
cat evals/results/latest.json | jq '.'
# Historical results
ls -lt evals/results/history/2025-12/
# View specific result
cat evals/results/history/2025-12/08-235037-opencoder.json | jq '.'
# Session messages
~/.local/share/opencode/storage/message/ses_XXXXX/*.json
# Message parts (tool calls, text, results)
~/.local/share/opencode/storage/part/msg_XXXXX/*.json
# List all sessions (most recent first)
ls -lt ~/.local/share/opencode/storage/message/ | head -20
# Find specific session
find ~/.local/share/opencode/storage/message -name "ses_*" -type d | grep "ses_4ff9f7975ffeWYqM564A5ooo4y"
SESSION_ID="ses_4ff9f7975ffeWYqM564A5ooo4y"
# List all messages in session
ls -la ~/.local/share/opencode/storage/message/$SESSION_ID/
# View message content
cat ~/.local/share/opencode/storage/message/$SESSION_ID/msg_*.json | jq '.summary.body'
MESSAGE_ID="msg_b006086a5001CXI2Ks0mFkyPxU"
# List message parts
ls -la ~/.local/share/opencode/storage/part/$MESSAGE_ID/
# View all parts
for file in ~/.local/share/opencode/storage/part/$MESSAGE_ID/*.json; do
cat "$file" | jq '.'
done
What to check:
How to verify:
npm run eval:sdk -- --agent=opencoder --pattern="planning/*.yaml" --debug
# Look for "Approval needed" in output
What to check:
.opencode/context/core/standards/code.md
How to verify:
npm run eval:sdk -- --agent=opencoder --pattern="context-loading/*.yaml" --debug
# Check test output for:
# "✓ Loaded: .opencode/context/core/standards/code.md"
# "✓ Timing: Context loaded XXXXms before execution"
What to check:
How to verify:
npm run eval:sdk -- --agent=opencoder --pattern="delegation/*.yaml" --debug
# Look for "task-manager" or multi-step plan
✅ test-name - Test Description
Duration: 23291ms
Events: 28
Approvals: 0
Context Loading:
✓ Loaded: /path/to/context/file.md
✓ Timing: Context loaded 25272ms before execution
Violations: 0 (0 errors, 0 warnings)
Errors (test fails):
missing-approval - Execution without approval
missing-required-tool - Expected tool not used
insufficient-tool-calls - Not enough tool calls
execution-before-read - Modified without reading first
Warnings (test passes with warnings):
insufficient-read - Low read/execution ratio
Solution: Session may have been cleaned up. Run test again with --debug flag.
Solution: Test expectations may be wrong. Check test YAML file:
cat evals/agents/opencoder/tests/planning/planning-approval-workflow.yaml
Solution: Tool calls are in separate part files:
ls ~/.local/share/opencode/storage/part/msg_XXXXX/
Solution: Check if context file exists:
ls -la .opencode/context/core/standards/code.md
# Run test
npm run eval:sdk -- --agent=opencoder --pattern="planning/*.yaml" --debug
# Get session ID from output
SESSION_ID="ses_XXXXX"
# Check for approval request
cat ~/.local/share/opencode/storage/message/$SESSION_ID/*.json | \
jq -r '.summary.body' | \
grep -i "approval needed"
# Run test
npm run eval:sdk -- --agent=opencoder --pattern="context-loading/*.yaml" --debug
# Check test output for context loading confirmation
# Look for: "✓ Loaded: .opencode/context/core/standards/code.md"
# Get session ID from test output
SESSION_ID="ses_XXXXX"
# View all tool calls in order
for msg in ~/.local/share/opencode/storage/message/$SESSION_ID/*.json; do
MSG_ID=$(cat "$msg" | jq -r '.id')
if [ -d ~/.local/share/opencode/storage/part/$MSG_ID ]; then
echo "Message: $MSG_ID"
cat ~/.local/share/opencode/storage/part/$MSG_ID/*.json | \
jq -r 'select(.type == "tool") | " \(.tool): \(.input)"'
fi
done
id: my-test-name
name: Human Readable Test Name
description: |
What this test validates
category: developer # or business, creative, edge-case
agent: opencoder
model: anthropic/claude-sonnet-4-5
prompt: |
Your test prompt here
behavior:
mustContain:
- "Expected text in response"
mustNotUseTools: [write, edit] # Tools that should NOT be used
mustUseTools: [read] # Tools that MUST be used
expectedViolations:
- rule: approval-gate
shouldViolate: false # false = should NOT violate
severity: error
approvalStrategy:
type: auto-approve
timeout: 60000
tags:
- tag1
- tag2
prompts:
- text: "First prompt"
expectContext: false
- text: "approve"
delayMs: 2000
expectContext: true
contextFile: "code.md"
# Set debug environment variable
DEBUG=* npm run eval:sdk -- --agent=opencoder --pattern="planning/*.yaml" --debug
# Run test twice and compare
npm run eval:sdk -- --agent=opencoder --pattern="planning/*.yaml" --debug > run1.log
npm run eval:sdk -- --agent=opencoder --pattern="planning/*.yaml" --debug > run2.log
diff run1.log run2.log
SESSION_ID="ses_XXXXX"
# Create tool call report
echo "Tool Calls in Session: $SESSION_ID"
echo "======================================"
for msg in ~/.local/share/opencode/storage/message/$SESSION_ID/*.json; do
MSG_ID=$(cat "$msg" | jq -r '.id')
ROLE=$(cat "$msg" | jq -r '.role')
if [ "$ROLE" = "assistant" ] && [ -d ~/.local/share/opencode/storage/part/$MSG_ID ]; then
cat ~/.local/share/opencode/storage/part/$MSG_ID/*.json | \
jq -r 'select(.type == "tool") | "\(.tool): \(.input | tostring)"'
fi
done
TEST_VALIDATION_REPORT.md
config/config.yaml
../../.opencode/agent/opencoder.md
../../evals/framework/scripts/debug/
Use the --debug flag when investigating test failures
Last Updated: 2025-12-08