agent-validator.ts 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086
  import type { Plugin } from "@opencode-ai/plugin"
  import { tool } from "@opencode-ai/plugin"
  import { mkdir, writeFile } from "fs/promises"
  import path from "path"
  /**
   * Reports whether any text part of a message contains approval-request wording.
   *
   * Matching is a case-insensitive substring test against a fixed phrase list.
   *
   * @param msg - Message-like object; only entries of `msg.parts` whose `type` is
   *   `"text"` and whose `text` is truthy are inspected.
   * @returns `true` as soon as one text part contains one of the phrases; `false`
   *   otherwise, including when `msg.parts` is missing.
   */
  function checkForApprovalLanguage(msg: any): boolean {
    if (!msg.parts) return false

    const phrases = [
      "approval",
      "approve",
      "proceed",
      "confirm",
      "permission",
      "before proceeding",
      "should i",
      "may i",
      "can i proceed",
    ]

    for (const piece of msg.parts) {
      // Only non-empty text parts can carry approval wording.
      if (piece.type !== "text" || !piece.text) continue

      const lowered = piece.text.toLowerCase()
      const mentionsApproval = phrases.some(phrase => lowered.includes(phrase))
      if (mentionsApproval) return true
    }

    return false
  }
  /**
   * Reports whether a user message contains an approval response.
   *
   * Each text part is lower-cased and tested for one of the approval keywords as a
   * whole word or phrase. Whole-word matching is deliberate: a plain substring test
   * treats "token" or "broken" as containing "ok", "yesterday" as containing "yes",
   * and "measure" as containing "sure", which would report approvals the user never gave.
   *
   * @param msg - Message-like object; only entries of `msg.parts` whose `type` is
   *   `"text"` and whose `text` is a string are inspected.
   * @returns `true` when any text part contains an approval keyword as a whole
   *   word/phrase; `false` otherwise, including when `msg` or `msg.parts` is missing.
   */
  function checkForUserApproval(msg: any): boolean {
    if (!msg || !Array.isArray(msg.parts)) return false

    const userApprovalKeywords = [
      "proceed",
      "approved",
      "yes",
      "go ahead",
      "ok",
      "okay",
      "sure",
      "do it",
      "continue",
    ]

    // Keywords contain only letters and single spaces, so no regex escaping is needed.
    // A space inside a phrase is allowed to match any run of whitespace.
    const alternatives = userApprovalKeywords.map(keyword => keyword.replace(/ /g, "\\s+"))
    const approvalPattern = new RegExp(`\\b(?:${alternatives.join("|")})\\b`)

    for (const part of msg.parts) {
      if (part?.type !== "text" || typeof part.text !== "string") continue
      if (approvalPattern.test(part.text.toLowerCase())) return true
    }

    return false
  }
  /**
   * Agent Validation Plugin
   *
   * Validates that agents follow their defined prompts and execution rules.
   * Tracks tool calls, approval gates, delegation decisions, and critical rule compliance.
   *
   * All tracking state is held in memory inside this closure and is shared by every
   * session this plugin instance observes.
   *
   * @param client - SDK client; used here only to fetch session messages.
   * @param directory - Base directory used to build the default report export path.
   * @returns The hook handlers (`event`, `chat.message`, `tool.execute.before`,
   *   `tool.execute.after`) and the validation tools exposed under `tool`.
   */
  export const AgentValidatorPlugin: Plugin = async ({ client, directory }) => {
    // Track agent behavior in real-time (append-only, all sessions).
    const behaviorLog: Array<{
      timestamp: number
      sessionID: string
      agent: string
      event: string
      data: any
    }> = []

    // Track tool execution for approval gate validation, keyed by sessionID.
    const toolExecutionTracker = new Map<string, {
      approvalRequested: boolean
      toolsExecuted: string[]
      timestamp: number
    }>()

    // Track current agent for each session.
    const sessionAgentTracker = new Map<string, string>()

    // Renders an arbitrary error value as readable text. Plain interpolation of an
    // object would produce "[object Object]", so non-Error objects are JSON-encoded.
    const describeError = (err: unknown): string => {
      if (err instanceof Error) return err.message
      if (typeof err === "string") return err
      try {
        return JSON.stringify(err) ?? String(err)
      } catch {
        return String(err)
      }
    }

    return {
      // Listen to all events; only "message.updated" is recorded.
      async event(input) {
        const { event } = input

        // Track session-level events for validation (silently, no console output).
        if (event.type === "message.updated") {
          const msg = event.properties.info
          behaviorLog.push({
            timestamp: Date.now(),
            sessionID: msg.sessionID,
            agent: msg.role === "user" ? msg.agent : "assistant",
            event: "message_created",
            data: {
              messageID: msg.id,
              role: msg.role,
            },
          })
        }
      },

      // Capture agent information from chat messages.
      "chat.message": async (input) => {
        const { sessionID, agent } = input

        // Remember which agent is currently active for this session.
        if (agent) {
          sessionAgentTracker.set(sessionID, agent)
        }
      },

      // Monitor tool execution before the tool runs.
      "tool.execute.before": async (input, output) => {
        const { tool: toolName, sessionID, callID } = input

        // Agent last seen on this session via "chat.message".
        const currentAgent = sessionAgentTracker.get(sessionID) || "unknown"

        // Track context file reads (any read whose path contains ".opencode/").
        if (toolName === "read") {
          const filePath = output.args?.filePath || output.args?.target_file
          if (filePath && filePath.includes(".opencode/")) {
            behaviorLog.push({
              timestamp: Date.now(),
              sessionID,
              agent: currentAgent,
              event: "context_file_read",
              data: {
                tool: "read",
                filePath,
                callID,
              },
            })
          }
        }

        // Track execution tools that require approval.
        const executionTools = ["bash", "write", "edit", "task"]
        if (!executionTools.includes(toolName)) return

        const tracker = toolExecutionTracker.get(sessionID) || {
          approvalRequested: false,
          toolsExecuted: [],
          timestamp: Date.now(),
        }

        // Check recent messages for the approval flow: an assistant message with
        // approval wording immediately followed by an approving user message.
        // NOTE(review): approvalRequested is never reset, so one detected approval
        // marks every execution in the session (earlier and later) as approved.
        try {
          const messagesResponse = await client.session.messages({
            path: { id: sessionID },
          })
          const messages = messagesResponse.data || []

          // Look at the last few messages only.
          const recentMessages = messages.slice(-5)
          for (let i = 0; i < recentMessages.length - 1; i++) {
            const msg = recentMessages[i]
            const nextMsg = recentMessages[i + 1]
            const role = msg.info?.role
            const nextRole = nextMsg.info?.role
            if (
              role === "assistant" && checkForApprovalLanguage(msg) &&
              nextRole === "user" && checkForUserApproval(nextMsg)
            ) {
              tracker.approvalRequested = true
              break
            }
          }
        } catch {
          // Deliberately best-effort: a failed message lookup must not block the tool
          // call, so the tracker simply keeps its previous approval state.
        }

        tracker.toolsExecuted.push(toolName)
        toolExecutionTracker.set(sessionID, tracker)

        behaviorLog.push({
          timestamp: Date.now(),
          sessionID,
          agent: currentAgent,
          event: "execution_tool_called",
          data: {
            tool: toolName,
            callID,
            args: output.args,
            approvalRequested: tracker.approvalRequested,
          },
        })
      },

      // Track tool execution results.
      "tool.execute.after": async (input, output) => {
        const { tool: toolName, sessionID } = input
        const currentAgent = sessionAgentTracker.get(sessionID) || "unknown"

        behaviorLog.push({
          timestamp: Date.now(),
          sessionID,
          agent: currentAgent,
          event: "tool_executed",
          data: {
            tool: toolName,
            title: output.title,
            metadata: output.metadata,
          },
        })
      },

      // Provide validation tools.
      tool: {
        // Validate current session.
        validate_session: tool({
          description: "Validate that the current agent session is following its defined prompt rules and execution patterns. Returns a detailed validation report.",
          args: {
            include_details: tool.schema.boolean()
              .optional()
              .describe("Include detailed evidence for each validation check"),
          },
          async execute(args, context) {
            const { sessionID } = context
            try {
              // Fetch session messages using SDK.
              const messagesResponse = await client.session.messages({
                path: { id: sessionID },
              })
              if (messagesResponse.error) {
                return `Error fetching session: ${describeError(messagesResponse.error)}`
              }
              const messages = messagesResponse.data || []

              // Analyze agent behavior.
              const validation = await validateSessionBehavior({
                sessionID,
                messages,
                behaviorLog: behaviorLog.filter(log => log.sessionID === sessionID),
                includeDetails: args.include_details ?? false,
              })
              return formatValidationReport(validation)
            } catch (err) {
              return `Validation error: ${describeError(err)}`
            }
          },
        }),

        // Check approval gate compliance.
        check_approval_gates: tool({
          description: "Check if approval gates were properly enforced before execution operations (bash, write, edit, task). Returns compliance status.",
          args: {},
          async execute(_args, context) {
            const { sessionID } = context
            const tracker = toolExecutionTracker.get(sessionID)
            if (!tracker) {
              return "No execution operations tracked in this session."
            }

            // All-or-nothing: either an approval was seen for the session, or every
            // tracked execution is reported as a violation.
            const { approvalRequested, toolsExecuted } = tracker
            const violations = approvalRequested ? [] : toolsExecuted
            if (violations.length === 0) {
              return `✅ Approval gate compliance: PASSED\n\nAll ${toolsExecuted.length} execution operation(s) were properly approved.`
            }
            return `⚠️ Approval gate compliance: FAILED\n\nExecuted ${violations.length} operation(s) without approval:\n${violations.map(t => ` - ${t}`).join("\n")}\n\nCritical rule violated: approval_gate`
          },
        }),

        // Export validation report.
        export_validation_report: tool({
          description: "Export a comprehensive validation report for the current session to a markdown file",
          args: {
            output_path: tool.schema.string()
              .optional()
              .describe("Path to save the report (defaults to .tmp/validation-{sessionID}.md)"),
          },
          async execute(args, context) {
            const { sessionID } = context
            try {
              const messagesResponse = await client.session.messages({
                path: { id: sessionID },
              })
              if (messagesResponse.error) {
                return `Error fetching session: ${describeError(messagesResponse.error)}`
              }
              const messages = messagesResponse.data || []

              const validation = await validateSessionBehavior({
                sessionID,
                messages,
                behaviorLog: behaviorLog.filter(log => log.sessionID === sessionID),
                includeDetails: true,
              })
              const report = generateDetailedReport(validation, messages)
              const outputPath = args.output_path || path.join(directory, `.tmp/validation-${sessionID.slice(0, 8)}.md`)

              // The default `.tmp/` directory may not exist yet; writeFile does not
              // create parent directories, so create them first.
              await mkdir(path.dirname(outputPath), { recursive: true })
              await writeFile(outputPath, report, "utf-8")
              return `✅ Validation report exported to: ${outputPath}\n\n${formatValidationReport(validation)}`
            } catch (err) {
              return `Export error: ${describeError(err)}`
            }
          },
        }),

        // Analyze delegation decisions.
        analyze_delegation: tool({
          description: "Analyze whether delegation decisions followed the 4+ file rule and complexity criteria",
          args: {},
          async execute(_args, context) {
            const { sessionID } = context
            try {
              const messagesResponse = await client.session.messages({
                path: { id: sessionID },
              })
              if (messagesResponse.error) {
                return `Error: ${describeError(messagesResponse.error)}`
              }
              const messages = messagesResponse.data || []
              const analysis = analyzeDelegationDecisions(messages)
              return formatDelegationAnalysis(analysis)
            } catch (err) {
              return `Error: ${describeError(err)}`
            }
          },
        }),

        // Analyze context file reads.
        analyze_context_reads: tool({
          description: "Show all context files that were read during the session (e.g., .opencode/agent/openagent.md)",
          args: {},
          async execute(_args, context) {
            const { sessionID } = context

            // Filter behavior log for context file reads.
            const contextReads = behaviorLog.filter(
              log => log.sessionID === sessionID && log.event === "context_file_read"
            )
            if (contextReads.length === 0) {
              return "📚 No context files read in this session yet.\n\nContext files are in `.opencode/` directories (agent definitions, workflows, standards, etc.)"
            }

            const lines: string[] = [
              `## Context Files Read`,
              ``,
              `**Total reads:** ${contextReads.length}`,
              ``,
            ]

            // Group by file path.
            const fileReadCounts = new Map<string, number>()
            for (const log of contextReads) {
              const filePath = log.data.filePath
              fileReadCounts.set(filePath, (fileReadCounts.get(filePath) || 0) + 1)
            }

            // Sort by read count (most read first).
            const sorted = Array.from(fileReadCounts.entries()).sort((a, b) => b[1] - a[1])
            lines.push(`### Files Read:`)
            for (const [filePath, count] of sorted) {
              const fileName = filePath.split('/').pop()
              const readText = count === 1 ? "read" : "reads"
              lines.push(`- **${fileName}** (${count} ${readText})`)
              lines.push(` \`${filePath}\``)
            }
            lines.push(``)

            lines.push(`### Timeline:`)
            contextReads.forEach((log, idx) => {
              const time = new Date(log.timestamp).toLocaleTimeString()
              const fileName = log.data.filePath.split('/').pop()
              lines.push(`${idx + 1}. [${time}] ${fileName}`)
            })
            return lines.join("\n")
          },
        }),

        // Check context loading compliance.
        check_context_compliance: tool({
          description: "Check if required context files were read BEFORE executing tasks (e.g., read docs.md before writing documentation)",
          args: {},
          async execute(_args, context) {
            const { sessionID } = context
            try {
              const messagesResponse = await client.session.messages({
                path: { id: sessionID },
              })
              if (messagesResponse.error) {
                return `Error: ${describeError(messagesResponse.error)}`
              }
              const messages = messagesResponse.data || []
              const sessionBehaviorLog = behaviorLog.filter(log => log.sessionID === sessionID)
              const checks = analyzeContextLoadingCompliance(messages, sessionBehaviorLog)
              if (checks.length === 0) {
                return "📋 No tasks detected that require specific context files.\n\nContext loading rules apply when:\n- Writing documentation → should read standards/docs.md\n- Writing code → should read standards/code.md\n- Reviewing code → should read workflows/review.md\n- Delegating tasks → should read workflows/delegation.md\n- Writing tests → should read standards/tests.md"
              }

              const compliant = checks.filter(c => c.passed)
              const nonCompliant = checks.filter(c => !c.passed)
              const passed = compliant.length
              const failed = nonCompliant.length
              const score = Math.round((passed / checks.length) * 100)

              const lines: string[] = [
                `## Context Loading Compliance`,
                ``,
                `**Score:** ${score}%`,
                `- ✅ Compliant: ${passed}`,
                `- ⚠️ Non-compliant: ${failed}`,
                ``,
              ]
              if (failed > 0) {
                lines.push(`### ⚠️ Issues Found:`)
                for (const check of nonCompliant) {
                  lines.push(`- ${check.details}`)
                }
                lines.push(``)
              }
              if (passed > 0) {
                lines.push(`### ✅ Compliant Actions:`)
                for (const check of compliant) {
                  lines.push(`- ${check.details}`)
                }
                lines.push(``)
              }
              lines.push(`### Context Loading Rules:`)
              lines.push(`According to OpenAgent prompt, the agent should:`)
              lines.push(`1. Detect task type from user request`)
              lines.push(`2. Read required context file FIRST`)
              lines.push(`3. Then execute task following those standards`)
              lines.push(``)
              lines.push(`**Pattern:** "Fetch context BEFORE starting work, not during or after"`)
              return lines.join("\n")
            } catch (err) {
              return `Error: ${describeError(err)}`
            }
          },
        }),

        // Analyze which agents were used.
        analyze_agent_usage: tool({
          description: "Show which agents were active during the session and what tools they used",
          args: {},
          async execute(_args, context) {
            const { sessionID } = context
            const sessionBehaviorLog = behaviorLog.filter(log => log.sessionID === sessionID)
            if (sessionBehaviorLog.length === 0) {
              return "📊 No agent activity tracked yet in this session."
            }

            // Group by agent.
            const agentStats = new Map<string, {
              toolCalls: Map<string, number>
              events: string[]
              firstSeen: number
              lastSeen: number
            }>()
            for (const log of sessionBehaviorLog) {
              const agent = log.agent || "unknown"
              let stats = agentStats.get(agent)
              if (!stats) {
                stats = {
                  toolCalls: new Map(),
                  events: [],
                  firstSeen: log.timestamp,
                  lastSeen: log.timestamp,
                }
                agentStats.set(agent, stats)
              }
              stats.lastSeen = log.timestamp
              stats.events.push(log.event)

              // Track tool usage. Both the "before" and "after" events are counted,
              // so an execution tool contributes two entries per call.
              if (log.event === "execution_tool_called" || log.event === "tool_executed") {
                const toolName = log.data.tool
                stats.toolCalls.set(toolName, (stats.toolCalls.get(toolName) || 0) + 1)
              }
            }

            const lines: string[] = [
              `## Agent Usage Report`,
              ``,
              `**Agents detected:** ${agentStats.size}`,
              `**Total events:** ${sessionBehaviorLog.length}`,
              ``,
            ]

            // Sort agents by first seen.
            const sortedAgents = Array.from(agentStats.entries()).sort((a, b) => a[1].firstSeen - b[1].firstSeen)
            for (const [agent, stats] of sortedAgents) {
              const duration = stats.lastSeen - stats.firstSeen
              const durationStr = duration > 0 ? `${Math.round(duration / 1000)}s` : "instant"
              lines.push(`### ${agent === "unknown" ? "Unknown Agent" : agent}`)
              lines.push(``)
              lines.push(`**Active duration:** ${durationStr}`)
              lines.push(`**Events:** ${stats.events.length}`)
              if (stats.toolCalls.size > 0) {
                lines.push(``)
                lines.push(`**Tools used:**`)
                const sortedTools = Array.from(stats.toolCalls.entries()).sort((a, b) => b[1] - a[1])
                for (const [toolName, count] of sortedTools) {
                  lines.push(`- ${toolName}: ${count}x`)
                }
              }
              lines.push(``)
            }
            return lines.join("\n")
          },
        }),

        // Debug tool to inspect tracking.
        debug_validator: tool({
          description: "Debug tool to inspect what the validator is tracking (behavior log, messages, etc.)",
          args: {},
          async execute(_args, context) {
            const { sessionID } = context

            // Get messages from SDK.
            const messagesResponse = await client.session.messages({
              path: { id: sessionID },
            })
            const messages = messagesResponse.data || []
            const sessionBehaviorLog = behaviorLog.filter(log => log.sessionID === sessionID)
            const tracker = toolExecutionTracker.get(sessionID)

            const debug = {
              sessionID,
              behaviorLogEntries: sessionBehaviorLog.length,
              behaviorLogSampleFirst: sessionBehaviorLog.slice(0, 3),
              behaviorLogSampleLast: sessionBehaviorLog.slice(-3),
              messagesCount: messages.length,
              messagesSample: messages.slice(0, 2).map(m => ({
                role: m.info?.role,
                partsCount: m.parts?.length,
                partTypes: m.parts?.map((p: any) => p.type),
              })),
              toolTracker: tracker ? {
                approvalRequested: tracker.approvalRequested,
                toolsExecuted: tracker.toolsExecuted,
              } : null,
              allBehaviorLogs: behaviorLog.length,
            }
            return `## Debug Information\n\n\`\`\`json\n${JSON.stringify(debug, null, 2)}\n\`\`\`\n\n**Analysis:**\n- Behavior log entries for this session: ${sessionBehaviorLog.length}\n- Total behavior log entries: ${behaviorLog.length}\n- Messages in session: ${messages.length}\n- Tool execution tracker: ${tracker ? 'Active' : 'None'}`
          },
        }),
      },
    }
  }
  // Validation logic

  /** Outcome of one validation rule applied to a session. */
  interface ValidationCheck {
    /** Identifier of the rule that produced this check (e.g. "tool_usage"). */
    rule: string
    /** Whether the rule was satisfied. */
    passed: boolean
    /** How serious the finding is. */
    severity: "info" | "warning" | "error"
    /** Human-readable description of the finding. */
    details: string
    /** Optional supporting data; its shape depends on the rule. */
    evidence?: any
  }
  /** Aggregated result of validating one session. */
  interface ValidationResult {
    /** Session the checks were computed for. */
    sessionID: string
    /** Every individual check, in the order the analyzers produced them. */
    checks: ValidationCheck[]
    /** Counts and overall score derived from `checks`. */
    summary: {
      /** Number of checks that passed. */
      passed: number
      /** Number of failing checks with severity "error". */
      failed: number
      /** Number of failing checks with severity "warning". */
      warnings: number
      /** Percentage of passing checks, rounded to an integer. */
      score: number
    }
  }
  /**
   * Runs every analyzer over a session and summarizes the outcome.
   *
   * @param input - `sessionID`, the session `messages`, the session's `behaviorLog`
   *   entries, and `includeDetails` (accepted but not read by this function).
   * @returns All checks in analyzer order plus a summary: `passed` counts passing
   *   checks, `failed` counts failing checks of severity "error", `warnings` counts
   *   failing checks of severity "warning", and `score` is the rounded percentage of
   *   passing checks (0 when there are no checks).
   */
  async function validateSessionBehavior(input: {
    sessionID: string
    messages: any[]
    behaviorLog: any[]
    includeDetails: boolean
  }): Promise<ValidationResult> {
    const { sessionID, messages, behaviorLog } = input

    const checks: ValidationCheck[] = [
      // Check 1: Tool usage patterns
      ...analyzeToolUsage(messages),
      // Check 2: Approval gate enforcement
      ...analyzeApprovalGates(messages, behaviorLog),
      // Check 3: Lazy context loading
      ...analyzeContextLoading(messages),
      // Check 4: Delegation appropriateness
      ...analyzeDelegation(messages),
      // Check 5: Critical rule compliance
      ...analyzeCriticalRules(messages),
      // Check 6: Context loading compliance (read required files BEFORE execution)
      ...analyzeContextLoadingCompliance(messages, behaviorLog),
    ]

    // Tally the summary in a single pass.
    let passed = 0
    let failed = 0
    let warnings = 0
    for (const check of checks) {
      if (check.passed) {
        passed += 1
      } else if (check.severity === "error") {
        failed += 1
      } else if (check.severity === "warning") {
        warnings += 1
      }
    }

    const score = checks.length === 0 ? 0 : Math.round((passed / checks.length) * 100)

    return {
      sessionID,
      checks,
      summary: { passed, failed, warnings, score },
    }
  }
  /**
   * Emits one informational check per assistant message that used at least one tool.
   *
   * @param messages - Session messages, shaped `{ info, parts }`; the role is read
   *   from `info.role`, falling back to a top-level `role`.
   * @returns A passing "tool_usage" check for each assistant message with tools,
   *   listing the tool names.
   */
  function analyzeToolUsage(messages: any[]): ValidationCheck[] {
    return messages
      .filter(entry => (entry.info?.role || entry.role) === "assistant")
      .map(entry => extractToolsFromMessage(entry))
      .filter(toolNames => toolNames.length > 0)
      .map((toolNames): ValidationCheck => ({
        rule: "tool_usage",
        passed: true,
        severity: "info",
        details: `Used ${toolNames.length} tool(s): ${toolNames.join(", ")}`,
      }))
  }
  /**
   * Checks that execution tools (bash, write, edit, task) were preceded by an
   * approval request.
   *
   * An assistant message counts as approved when it contains approval wording itself,
   * or when one of the up-to-three messages before it is an assistant message with
   * approval wording that is immediately followed by an approving user message.
   *
   * @param messages - Session messages in order.
   * @param behaviorLog - Accepted for signature parity; not read by this function.
   * @returns One "approval_gate_enforcement" check per assistant message that ran
   *   at least one execution tool.
   */
  function analyzeApprovalGates(messages: any[], behaviorLog: any[]): ValidationCheck[] {
    const gatedTools = ["bash", "write", "edit", "task"]
    const roleOf = (entry: any) => entry.info?.role || entry.role

    // Scans the (at most three) messages before `index` for a request/approval pair.
    const approvedInLookback = (index: number): boolean => {
      for (let j = Math.max(0, index - 3); j < index; j++) {
        const earlier = messages[j]
        if (roleOf(earlier) !== "assistant" || !checkForApprovalLanguage(earlier)) continue

        // j + 1 <= index, so the follow-up message always exists.
        const reply = messages[j + 1]
        if (roleOf(reply) === "user" && checkForUserApproval(reply)) return true
      }
      return false
    }

    const results: ValidationCheck[] = []
    messages.forEach((msg, index) => {
      if (roleOf(msg) !== "assistant") return

      const executionOps = extractToolsFromMessage(msg).filter(name => gatedTools.includes(name))
      if (executionOps.length === 0) return

      // Approval wording in the executing message itself short-circuits the lookback.
      const hasApprovalRequest = checkForApprovalLanguage(msg) || approvedInLookback(index)

      let severity: ValidationCheck["severity"]
      let details: string
      if (hasApprovalRequest) {
        severity = "info"
        details = `Properly requested approval before ${executionOps.length} execution op(s)`
      } else {
        severity = "warning"
        details = `⚠️ Executed ${executionOps.length} operation(s) without explicit approval request`
      }

      results.push({
        rule: "approval_gate_enforcement",
        passed: hasApprovalRequest,
        severity,
        details,
        evidence: { executionOps, hasApprovalRequest },
      })
    })
    return results
  }
  /**
   * Emits one informational check per assistant message that read context files.
   *
   * @param messages - Session messages; only assistant messages are inspected.
   * @returns A passing "lazy_context_loading" check for each assistant message for
   *   which `extractContextReads` reports at least one file.
   */
  function analyzeContextLoading(messages: any[]): ValidationCheck[] {
    return messages
      .filter(entry => (entry.info?.role || entry.role) === "assistant")
      // Look for read operations on context files.
      .map(entry => extractContextReads(entry))
      .filter(files => files.length > 0)
      .map((files): ValidationCheck => ({
        rule: "lazy_context_loading",
        passed: true,
        severity: "info",
        details: `Lazy-loaded ${files.length} context file(s): ${files.join(", ")}`,
      }))
  }
  /**
   * Judges each assistant message against the "delegate at 4+ files" rule.
   *
   * The file count is the number of `write`/`edit` tool calls in the message; a
   * `task` tool call counts as delegation.
   *
   * @param messages - Session messages; only assistant messages are inspected.
   * @returns A "delegation_appropriateness" check for every assistant message that
   *   delegated, and a failing one for every message that made 4+ write/edit calls
   *   without delegating.
   */
  function analyzeDelegation(messages: any[]): ValidationCheck[] {
    const results: ValidationCheck[] = []

    for (const entry of messages) {
      if ((entry.info?.role || entry.role) !== "assistant") continue

      const toolNames = extractToolsFromMessage(entry)
      const delegated = toolNames.includes("task")
      const writeEditCount = toolNames.filter(name => name === "write" || name === "edit").length
      const overThreshold = writeEditCount >= 4

      if (delegated && overThreshold) {
        results.push({
          rule: "delegation_appropriateness",
          passed: true,
          severity: "info",
          details: `Appropriately delegated (${writeEditCount} files)`,
        })
      } else if (delegated) {
        results.push({
          rule: "delegation_appropriateness",
          passed: false,
          severity: "warning",
          details: `Delegated but only ${writeEditCount} files (< 4 threshold)`,
        })
      } else if (overThreshold) {
        results.push({
          rule: "delegation_appropriateness",
          passed: false,
          severity: "warning",
          details: `Should have delegated (${writeEditCount} files >= 4 threshold)`,
        })
      }
    }

    return results
  }
  /**
   * Detects auto-fix attempts after errors (the "stop_on_failure" rule).
   *
   * For every assistant message whose metadata carries an `error`, the message right
   * after it is inspected; if that message used `write`, `edit` or `bash`, a failing
   * check of severity "error" is produced.
   *
   * @param messages - Session messages in order.
   * @returns One "stop_on_failure" check per detected violation.
   */
  function analyzeCriticalRules(messages: any[]): ValidationCheck[] {
    const fixTools = ["write", "edit", "bash"]
    const violations: ValidationCheck[] = []

    // The last message has no successor, so it is never a candidate.
    messages.slice(0, -1).forEach((current, position) => {
      const role = current.info?.role || current.role
      const metadata = current.info?.metadata || current.metadata
      if (role !== "assistant" || !metadata?.error) return

      const followUpTools = extractToolsFromMessage(messages[position + 1])
      const attemptedFix = followUpTools.some(name => fixTools.includes(name))
      if (!attemptedFix) return

      violations.push({
        rule: "stop_on_failure",
        passed: false,
        severity: "error",
        details: "⛔ Auto-fix attempted after error - violates stop_on_failure rule",
        evidence: { error: metadata.error, autoFixTools: followUpTools },
      })
    })

    return violations
  }
  /**
   * Checks that the required context file was read BEFORE a matching task was executed.
   *
   * For every assistant message that used an execution tool (write, edit, bash, task),
   * the message text is matched against keyword lists; each matching rule yields one
   * check stating whether a "context_file_read" log entry for the rule's required file
   * has a timestamp earlier than the message.
   *
   * @param messages - Session messages in order, shaped `{ info, parts }`.
   * @param behaviorLog - Behavior log entries for the session; only entries with
   *   `event === "context_file_read"` are used (`timestamp` and `data.filePath`).
   * @returns One "context_loading_compliance" check per (message, matching rule) pair.
   */
  function analyzeContextLoadingCompliance(messages: any[], behaviorLog: any[]): ValidationCheck[] {
    const checks: ValidationCheck[] = []

    // Required context files for the different task types.
    const contextRules = [
      {
        taskKeywords: ["write doc", "create doc", "documentation", "write readme", "document"],
        requiredFile: "standards/docs.md",
        taskType: "documentation",
      },
      {
        taskKeywords: ["write code", "create function", "implement", "add feature", "build"],
        requiredFile: "standards/code.md",
        taskType: "code writing",
      },
      {
        taskKeywords: ["review code", "check code", "analyze code", "code review"],
        requiredFile: "workflows/review.md",
        taskType: "code review",
      },
      {
        taskKeywords: ["delegate", "create task", "subagent"],
        requiredFile: "workflows/delegation.md",
        taskType: "delegation",
      },
      {
        taskKeywords: ["write test", "create test", "test coverage", "unit test"],
        requiredFile: "standards/tests.md",
        taskType: "testing",
      },
    ]
    const executionToolNames = ["write", "edit", "bash", "task"]

    // All context file reads recorded in the behavior log.
    const contextReads = behaviorLog
      .filter(log => log.event === "context_file_read")
      .map(log => ({
        timestamp: log.timestamp,
        filePath: log.data.filePath,
      }))

    // Analyze each assistant message for task execution.
    for (const msg of messages) {
      const role = msg.info?.role || msg.role
      if (role !== "assistant") continue

      const executionTools = extractToolsFromMessage(msg).filter(t => executionToolNames.includes(t))
      if (executionTools.length === 0) continue

      // Message text is used to detect the task type.
      const messageText = extractMessageText(msg).toLowerCase()

      // Creation time of the message. Prefer `info.time.created`; `info.timestamp`
      // is kept as a fallback for other message shapes. Without a usable time the
      // check degrades to "was the file ever read", because Date.now() is later than
      // every logged read.
      // NOTE(review): assumes SDK messages expose `info.time.created` in epoch
      // milliseconds, comparable with the Date.now() log timestamps — confirm.
      const msgTimestamp: number = msg.info?.time?.created ?? msg.info?.timestamp ?? Date.now()

      for (const rule of contextRules) {
        const matchesTask = rule.taskKeywords.some(keyword => messageText.includes(keyword))
        if (!matchesTask) continue

        // Was the required context file read BEFORE this message?
        const contextReadBefore = contextReads.some(read =>
          read.filePath.includes(rule.requiredFile) && read.timestamp < msgTimestamp
        )

        checks.push({
          rule: "context_loading_compliance",
          passed: contextReadBefore,
          severity: contextReadBefore ? "info" : "warning",
          details: contextReadBefore
            ? `✅ Loaded ${rule.requiredFile} before ${rule.taskType}`
            : `⚠️ Did not load ${rule.requiredFile} before ${rule.taskType} task`,
          evidence: {
            taskType: rule.taskType,
            requiredFile: rule.requiredFile,
            contextReadBefore,
            executionTools,
          },
        })
      }
    }

    return checks
  }
  /**
   * Tallies how often assistant messages delegated work via the "task" tool.
   *
   * Only messages whose role (read from `msg.info.role`, falling back to
   * `msg.role`) is "assistant" are inspected. A message counts as one
   * delegation when its tool list contains "task". The number of "write" and
   * "edit" tool calls in that same message is recorded, and the delegation is
   * classed as appropriate when that number is 4 or more, otherwise as
   * inappropriate.
   *
   * @param messages - Session messages to scan.
   * @returns Delegation totals plus the per-delegation write/edit counts.
   */
  function analyzeDelegationDecisions(messages: any[]): {
    delegations: number
    appropriate: number
    inappropriate: number
    fileCountStats: number[]
  } {
    let delegations = 0
    let appropriate = 0
    let inappropriate = 0
    const fileCountStats: number[] = []

    for (const message of messages) {
      const sender = message.info?.role || message.role
      if (sender !== "assistant") continue

      const invoked = extractToolsFromMessage(message)
      if (!invoked.includes("task")) continue

      // Count file-modifying tool calls that share the message with the delegation.
      let fileEdits = 0
      for (const toolName of invoked) {
        if (toolName === "write" || toolName === "edit") fileEdits += 1
      }

      delegations += 1
      fileCountStats.push(fileEdits)
      if (fileEdits < 4) {
        inappropriate += 1
      } else {
        appropriate += 1
      }
    }

    return { delegations, appropriate, inappropriate, fileCountStats }
  }
  791. // Helper functions
  /**
   * Collects the names of the tools invoked in a message, in part order.
   *
   * Two part shapes are recognised:
   * - SDK format: `{ type: "tool", tool: <name> }`
   * - legacy format: `{ type: "tool-invocation", toolInvocation: { toolName: <name> } }`
   *
   * Parts without a usable tool name are skipped in both formats, so the
   * result never contains `undefined`. A missing `parts` list and null
   * entries inside it are tolerated.
   *
   * @param msg - Message object; expected to carry a `parts` list.
   * @returns Tool names found in the message (empty when there are none).
   */
  function extractToolsFromMessage(msg: any): string[] {
    const tools: string[] = []
    // Messages from SDK have structure: { info: Message, parts: Part[] }
    const parts = msg?.parts || []
    for (const part of parts) {
      // Skip null/undefined entries instead of throwing on property access.
      if (!part) continue
      // Check for tool type (from SDK: part.type === "tool")
      if (part.type === "tool" && part.tool) {
        tools.push(part.tool)
      }
      // Also check for tool-invocation format (legacy); require a name here
      // too, mirroring the guard on the SDK branch.
      if (part.type === "tool-invocation" && part.toolInvocation?.toolName) {
        tools.push(part.toolInvocation.toolName)
      }
    }
    return tools
  }
  /**
   * Concatenates the text of every "text" part of a message.
   *
   * Each part with `type === "text"` and a truthy `text` contributes its text
   * followed by a single space; the combined string is trimmed before being
   * returned. A message without `parts` yields an empty string.
   *
   * @param msg - Message object whose `parts` are scanned.
   * @returns The joined, trimmed message text.
   */
  function extractMessageText(msg: any): string {
    const segments: string[] = []
    for (const piece of msg.parts || []) {
      const isText = piece.type === "text"
      if (isText && piece.text) {
        segments.push(piece.text + " ")
      }
    }
    return segments.join("").trim()
  }
  /**
   * Lists the context files a message read through the legacy "read" tool.
   *
   * A part is counted when it has `type === "tool-invocation"`, its
   * `toolInvocation.toolName` is "read", and `toolInvocation.args.target_file`
   * is a string containing ".opencode/context/". Parts in the SDK
   * `type === "tool"` format are not inspected by this helper.
   *
   * Malformed input is skipped rather than raising: a missing `parts` list,
   * null entries, and a `target_file` that is not a string are all ignored,
   * which keeps the result a true `string[]`.
   *
   * @param msg - Message object whose `parts` are scanned.
   * @returns The matching `target_file` paths, in part order.
   */
  function extractContextReads(msg: any): string[] {
    const contextFiles: string[] = []
    if (!msg?.parts) return contextFiles
    for (const part of msg.parts) {
      if (part?.type !== "tool-invocation") continue
      const invocation = part.toolInvocation
      if (invocation?.toolName !== "read") continue
      const target = invocation.args?.target_file
      // Only string paths can be matched; anything else would either throw on
      // `.includes` or put a non-string into the result.
      if (typeof target === "string" && target.includes(".opencode/context/")) {
        contextFiles.push(target)
      }
    }
    return contextFiles
  }
  830. // Formatting functions
  /**
   * Renders the validation summary as a Markdown section.
   *
   * The output opens with the score and the passed / warnings / failed
   * counters from `validation.summary`, followed by an "Errors" list and a
   * "Warnings" list. Only checks that did not pass are listed, grouped by
   * their `severity`; a group with no entries is left out entirely.
   *
   * @param validation - Result whose `summary` and `checks` are rendered.
   * @returns Markdown text with lines joined by "\n".
   */
  function formatValidationReport(validation: ValidationResult): string {
    const totals = validation.summary
    const report: string[] = []

    report.push(`## Validation Report`)
    report.push(``)
    report.push(`**Score:** ${totals.score}%`)
    report.push(`- ✅ Passed: ${totals.passed}`)
    report.push(`- ⚠️ Warnings: ${totals.warnings}`)
    report.push(`- ❌ Failed: ${totals.failed}`)
    report.push(``)

    // Group by severity: one heading per severity level, errors first.
    const failing = validation.checks.filter(entry => !entry.passed)
    const sections: Array<[string, string]> = [
      [`### ❌ Errors`, "error"],
      [`### ⚠️ Warnings`, "warning"],
    ]
    for (const [heading, level] of sections) {
      const hits = failing.filter(entry => entry.severity === level)
      if (hits.length === 0) continue
      report.push(heading)
      for (const hit of hits) {
        report.push(`- **${hit.rule}**: ${hit.details}`)
      }
      report.push(``)
    }

    return report.join("\n")
  }
  /**
   * Renders delegation statistics as a Markdown section.
   *
   * Always prints the total, appropriate and questionable delegation counts.
   * When `analysis.fileCountStats` has at least one entry, it also prints the
   * average (one decimal place), the minimum-to-maximum range, and the fixed
   * "4+ files" threshold line.
   *
   * @param analysis - Object with `delegations`, `appropriate`,
   *   `inappropriate` and `fileCountStats` fields.
   * @returns Markdown text with lines joined by "\n".
   */
  function formatDelegationAnalysis(analysis: any): string {
    const report: string[] = []
    report.push(`## Delegation Analysis`)
    report.push(``)
    report.push(`**Total delegations:** ${analysis.delegations}`)
    report.push(`- ✅ Appropriate: ${analysis.appropriate}`)
    report.push(`- ⚠️ Questionable: ${analysis.inappropriate}`)
    report.push(``)

    const hasStats = analysis.fileCountStats.length > 0
    if (!hasStats) {
      return report.join("\n")
    }

    let total = 0
    analysis.fileCountStats.forEach((count: number) => {
      total = total + count
    })
    const mean = total / analysis.fileCountStats.length
    const smallest = Math.min(...analysis.fileCountStats)
    const largest = Math.max(...analysis.fileCountStats)

    report.push(`**File count per delegation:**`)
    report.push(`- Average: ${mean.toFixed(1)} files`)
    report.push(`- Range: ${smallest} - ${largest} files`)
    report.push(`- Threshold: 4+ files`)

    return report.join("\n")
  }
  /**
   * Builds the full Markdown validation report for a session.
   *
   * The report contains a header (session ID, generation time as an ISO
   * string taken from the current clock, and the number of messages), the
   * summary produced by `formatValidationReport`, and then one subsection per
   * check. Each subsection shows an icon (passed, error, or warning for any
   * other non-passing severity), the check's details, and - when the check
   * carries `evidence` - that evidence as pretty-printed JSON in a fenced
   * block.
   *
   * @param validation - Validation result to report on.
   * @param messages - Analysed messages; only their count is used here.
   * @returns Markdown text with lines joined by "\n".
   */
  function generateDetailedReport(validation: ValidationResult, messages: any[]): string {
    const report: string[] = []
    report.push(
      `# Agent Validation Report`,
      ``,
      `**Session:** ${validation.sessionID}`,
      `**Generated:** ${new Date().toISOString()}`,
      `**Messages analyzed:** ${messages.length}`,
      ``,
      formatValidationReport(validation),
      ``,
      `## Detailed Checks`,
      ``
    )

    validation.checks.forEach(entry => {
      // Passed wins; otherwise only "error" gets the cross, everything else a warning sign.
      let icon = "⚠️"
      if (entry.passed) {
        icon = "✅"
      } else if (entry.severity === "error") {
        icon = "❌"
      }

      report.push(`### ${icon} ${entry.rule}`, ``, entry.details, ``)

      if (!entry.evidence) return
      const evidenceJson = JSON.stringify(entry.evidence, null, 2)
      report.push(`**Evidence:**`, "```json", evidenceJson, "```", ``)
    })

    return report.join("\n")
  }
  908. export default AgentValidatorPlugin