Reusable framework for evaluating OpenCode agent behavior. Can be used to test any agent against defined standards.
framework/
├── src/
│ ├── collector/ # Session data collection
│ │ ├── session-reader.ts
│ │ ├── message-parser.ts
│ │ └── timeline-builder.ts
│ ├── evaluators/ # Evaluation logic
│ │ ├── base-evaluator.ts
│ │ ├── approval-gate.ts
│ │ ├── context-loading.ts
│ │ ├── delegation.ts
│ │ ├── tool-usage.ts
│ │ └── model-selection.ts
│ ├── runner/ # Test execution
│ │ ├── test-runner.ts
│ │ └── session-analyzer.ts
│ ├── reporters/ # Result reporting
│ │ ├── console-reporter.ts
│ │ ├── json-reporter.ts
│ │ └── markdown-reporter.ts
│ ├── types/ # TypeScript types
│ │ └── index.ts
│ ├── config.ts # Configuration
│ └── index.ts # Main exports
├── tests/ # Framework tests
├── package.json
├── tsconfig.json
└── README.md
npm install
import { SessionReader, ApprovalGateEvaluator, TimelineBuilder } from '@evals/framework';
// Read session
const reader = new SessionReader('/path/to/project');
const sessionInfo = reader.getSessionInfo('ses_xxxxx');
const messages = reader.getMessages('ses_xxxxx');
// Build timeline
const builder = new TimelineBuilder(reader);
const timeline = builder.buildTimeline('ses_xxxxx');
// Evaluate
const evaluator = new ApprovalGateEvaluator();
const result = evaluator.evaluate(timeline);
console.log(`Passed: ${result.passed}`);
console.log(`Score: ${result.score}/100`);
console.log(`Violations: ${result.violations.length}`);
import { TestRunner } from '@evals/framework';
const runner = new TestRunner({
projectPath: process.cwd(),
evaluatorsPath: './src/evaluators',
resultsPath: '../results'
});
// Load test cases
const testCases = runner.loadTestCases('../opencode/openagent/test-cases/approval-gates.yaml');
// Run tests
const suite = runner.runAll(testCases);
console.log(`Pass Rate: ${suite.summary.passRate}%`);
SessionReader - Read OpenCode session files
const reader = new SessionReader(projectPath);
const info = reader.getSessionInfo(sessionId);
const messages = reader.getMessages(sessionId);
const parts = reader.getParts(sessionId, messageId);
MessageParser - Parse message structure
const parser = new MessageParser();
const agent = parser.getAgent(message);
const model = parser.getModel(message);
const metrics = parser.getMetrics(message);
TimelineBuilder - Build event timeline
const builder = new TimelineBuilder(reader);
const timeline = builder.buildTimeline(sessionId);
const toolCalls = builder.filterByType(timeline, 'tool_call');
All evaluators extend BaseEvaluator and implement:
evaluate(timeline: TimelineEvent[]): EvaluationResult
ApprovalGateEvaluator - Check approval before execution
const evaluator = new ApprovalGateEvaluator();
const result = evaluator.evaluate(timeline);
ContextLoadingEvaluator - Verify context loading
const evaluator = new ContextLoadingEvaluator();
const result = evaluator.evaluate(timeline);
DelegationEvaluator - Validate delegation decisions
const evaluator = new DelegationEvaluator();
const result = evaluator.evaluate(timeline);
ToolUsageEvaluator - Check tool selection
const evaluator = new ToolUsageEvaluator();
const result = evaluator.evaluate(timeline);
ExecutionBalanceEvaluator - Assess balance and ordering of read vs execution actions
import { ExecutionBalanceEvaluator } from '@evals/framework';
const evaluator = new ExecutionBalanceEvaluator();
const result = evaluator.evaluate(timeline);
console.log(result.meta?.ratio); // read/exec ratio
Violations it may produce:
execution-before-read (error): first execution tool used before any read tools.
insufficient-read (warning): fewer reads than executions overall.
TestRunner - Execute test suites
const runner = new TestRunner(config);
const testCases = runner.loadTestCases(path);
const suite = runner.runAll(testCases);
SessionAnalyzer - Analyze historical sessions
const analyzer = new SessionAnalyzer(reader);
const result = analyzer.analyze(sessionId, testCase);
ConsoleReporter - Pretty console output
const reporter = new ConsoleReporter();
reporter.report(testSuite);
JSONReporter - Machine-readable JSON
const reporter = new JSONReporter();
reporter.export(testSuite, 'results.json');
MarkdownReporter - Documentation format
const reporter = new MarkdownReporter();
reporter.generate(testSuite, 'report.md');
interface SessionInfo {
id: string;
version: string;
title: string;
time: { created: number; updated: number };
}
interface Message {
id: string;
role: 'user' | 'assistant';
sessionID: string;
mode?: string;
modelID?: string;
providerID?: string;
tokens?: TokenUsage;
cost?: number;
time: { created: number; completed?: number };
}
interface TimelineEvent {
timestamp: number;
type: 'user_message' | 'assistant_message' | 'tool_call' | 'patch';
agent?: string;
model?: string;
data: any;
}
interface EvaluationResult {
evaluator: string;
passed: boolean;
score: number;
violations: Violation[];
evidence: Evidence[];
}
interface TestResult {
testCaseId: string;
sessionId: string;
passed: boolean;
score: number;
evaluationResults: EvaluationResult[];
metadata: any;
}
// config.ts
export const config = {
projectPath: process.cwd(),
sessionStoragePath: '~/.local/share/opencode/',
resultsPath: '../results/',
passThreshold: 75,
evaluators: {
'approval-gate': ApprovalGateEvaluator,
'context-loading': ContextLoadingEvaluator,
'delegation': DelegationEvaluator,
'tool-usage': ToolUsageEvaluator,
}
};
npm install
npm run build
# Run all tests
npm test
# Run specific test
npm test -- session-reader
# Watch mode
npm run test:watch
# Build TypeScript
npm run build
# Watch mode
npm run build:watch
npm run lint
npm run lint:fix
To add a new evaluator:
1. Create a new file in src/evaluators/
2. Extend BaseEvaluator and implement the evaluate() method
3. Export the new evaluator from index.ts
For a detailed step-by-step contributor guide see: docs/contributing/ADDING_EVALUATOR.md
Example:
// src/evaluators/my-evaluator.ts
import { BaseEvaluator } from './base-evaluator';
import { EvaluationResult, TimelineEvent } from '../types';
export class MyEvaluator extends BaseEvaluator {
evaluate(timeline: TimelineEvent[]): EvaluationResult {
// Your evaluation logic
const checks = [
{ name: 'check1', passed: true, weight: 50 },
{ name: 'check2', passed: false, weight: 50 }
];
const score = this.calculateScore(checks);
const violations = this.findViolations(timeline);
return {
evaluator: 'my-evaluator',
passed: score >= 75,
score,
violations,
evidence: []
};
}
}
See API.md for complete API documentation.
MIT
Development and debugging scripts are organized in the scripts/ directory:
scripts/
├── debug/ # Session and event debugging
├── test/ # Framework component tests
├── utils/ # Utility scripts (batch runner, etc.)
└── README.md # Script documentation
See scripts/README.md for detailed usage.
# Run tests in batches
./scripts/utils/run-tests-batch.sh openagent 3 10
# Debug a session
node scripts/debug/inspect-session.mjs
# Test framework component
npx tsx scripts/test/test-timeline.ts