index.html 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073
  1. <!DOCTYPE html>
  2. <html lang="en">
  3. <head>
  4. <meta charset="UTF-8">
  5. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6. <title>OpenCode Agent Test Dashboard</title>
  7. <style>
  8. /* CSS Variables for theming */
  9. :root {
  10. --bg-primary: #ffffff;
  11. --bg-secondary: #f8f9fa;
  12. --bg-card: #ffffff;
  13. --text-primary: #212529;
  14. --text-secondary: #6c757d;
  15. --border-color: #dee2e6;
  16. --success: #28a745;
  17. --danger: #dc3545;
  18. --warning: #ffc107;
  19. --info: #17a2b8;
  20. --primary: #007bff;
  21. --shadow: rgba(0, 0, 0, 0.1);
  22. }
  23. [data-theme="dark"] {
  24. --bg-primary: #1a1a1a;
  25. --bg-secondary: #2d2d2d;
  26. --bg-card: #242424;
  27. --text-primary: #e9ecef;
  28. --text-secondary: #adb5bd;
  29. --border-color: #495057;
  30. --shadow: rgba(0, 0, 0, 0.3);
  31. }
  32. * {
  33. margin: 0;
  34. padding: 0;
  35. box-sizing: border-box;
  36. }
  37. body {
  38. font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
  39. background: var(--bg-primary);
  40. color: var(--text-primary);
  41. line-height: 1.6;
  42. transition: background-color 0.3s, color 0.3s;
  43. }
  44. .container {
  45. max-width: 1400px;
  46. margin: 0 auto;
  47. padding: 20px;
  48. }
  49. /* Header */
  50. header {
  51. background: var(--bg-card);
  52. border-bottom: 2px solid var(--border-color);
  53. padding: 20px 0;
  54. margin-bottom: 30px;
  55. box-shadow: 0 2px 4px var(--shadow);
  56. }
  57. .header-content {
  58. display: flex;
  59. justify-content: space-between;
  60. align-items: center;
  61. flex-wrap: wrap;
  62. gap: 20px;
  63. }
  64. h1 {
  65. font-size: 28px;
  66. font-weight: 600;
  67. color: var(--text-primary);
  68. }
  69. .header-actions {
  70. display: flex;
  71. gap: 10px;
  72. align-items: center;
  73. }
  74. /* Buttons */
  75. button {
  76. padding: 8px 16px;
  77. border: 1px solid var(--border-color);
  78. background: var(--bg-card);
  79. color: var(--text-primary);
  80. border-radius: 6px;
  81. cursor: pointer;
  82. font-size: 14px;
  83. transition: all 0.2s;
  84. }
  85. button:hover {
  86. background: var(--bg-secondary);
  87. transform: translateY(-1px);
  88. }
  89. button.primary {
  90. background: var(--primary);
  91. color: white;
  92. border-color: var(--primary);
  93. }
  94. button.primary:hover {
  95. background: #0056b3;
  96. }
  97. /* Filters */
  98. .filters {
  99. background: var(--bg-card);
  100. padding: 20px;
  101. border-radius: 8px;
  102. margin-bottom: 30px;
  103. box-shadow: 0 2px 4px var(--shadow);
  104. }
  105. .filter-row {
  106. display: flex;
  107. gap: 15px;
  108. flex-wrap: wrap;
  109. align-items: center;
  110. }
  111. .filter-group {
  112. display: flex;
  113. flex-direction: column;
  114. gap: 5px;
  115. }
  116. .filter-group label {
  117. font-size: 12px;
  118. font-weight: 600;
  119. color: var(--text-secondary);
  120. text-transform: uppercase;
  121. }
  122. select, input {
  123. padding: 8px 12px;
  124. border: 1px solid var(--border-color);
  125. background: var(--bg-primary);
  126. color: var(--text-primary);
  127. border-radius: 6px;
  128. font-size: 14px;
  129. }
  130. /* Stats Cards */
  131. .stats-grid {
  132. display: grid;
  133. grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
  134. gap: 20px;
  135. margin-bottom: 30px;
  136. }
  137. .stat-card {
  138. background: var(--bg-card);
  139. padding: 20px;
  140. border-radius: 8px;
  141. box-shadow: 0 2px 4px var(--shadow);
  142. border-left: 4px solid var(--primary);
  143. }
  144. .stat-card.success {
  145. border-left-color: var(--success);
  146. }
  147. .stat-card.danger {
  148. border-left-color: var(--danger);
  149. }
  150. .stat-card.warning {
  151. border-left-color: var(--warning);
  152. }
  153. .stat-label {
  154. font-size: 12px;
  155. font-weight: 600;
  156. color: var(--text-secondary);
  157. text-transform: uppercase;
  158. margin-bottom: 8px;
  159. }
  160. .stat-value {
  161. font-size: 32px;
  162. font-weight: 700;
  163. color: var(--text-primary);
  164. }
  165. .stat-subtitle {
  166. font-size: 14px;
  167. color: var(--text-secondary);
  168. margin-top: 5px;
  169. }
  170. /* Chart Container */
  171. .chart-container {
  172. background: var(--bg-card);
  173. padding: 20px;
  174. border-radius: 8px;
  175. margin-bottom: 30px;
  176. box-shadow: 0 2px 4px var(--shadow);
  177. }
  178. .chart-container h2 {
  179. font-size: 18px;
  180. margin-bottom: 15px;
  181. color: var(--text-primary);
  182. }
  183. #trendChart {
  184. max-height: 300px;
  185. }
  186. /* Test Results Table */
  187. .results-container {
  188. background: var(--bg-card);
  189. border-radius: 8px;
  190. box-shadow: 0 2px 4px var(--shadow);
  191. overflow: hidden;
  192. }
  193. .results-header {
  194. padding: 20px;
  195. border-bottom: 1px solid var(--border-color);
  196. display: flex;
  197. justify-content: space-between;
  198. align-items: center;
  199. }
  200. .results-header h2 {
  201. font-size: 18px;
  202. color: var(--text-primary);
  203. }
  204. .search-box {
  205. position: relative;
  206. }
  207. .search-box input {
  208. padding-left: 35px;
  209. width: 300px;
  210. }
  211. .search-box::before {
  212. content: "🔍";
  213. position: absolute;
  214. left: 12px;
  215. top: 50%;
  216. transform: translateY(-50%);
  217. }
  218. table {
  219. width: 100%;
  220. border-collapse: collapse;
  221. }
  222. thead {
  223. background: var(--bg-secondary);
  224. }
  225. th {
  226. padding: 12px 16px;
  227. text-align: left;
  228. font-size: 12px;
  229. font-weight: 600;
  230. color: var(--text-secondary);
  231. text-transform: uppercase;
  232. cursor: pointer;
  233. user-select: none;
  234. }
  235. th:hover {
  236. background: var(--border-color);
  237. }
  238. th.sortable::after {
  239. content: " ↕";
  240. opacity: 0.3;
  241. }
  242. th.sort-asc::after {
  243. content: " ↑";
  244. opacity: 1;
  245. }
  246. th.sort-desc::after {
  247. content: " ↓";
  248. opacity: 1;
  249. }
  250. td {
  251. padding: 12px 16px;
  252. border-bottom: 1px solid var(--border-color);
  253. }
  254. tr:hover {
  255. background: var(--bg-secondary);
  256. }
  257. .status-badge {
  258. display: inline-block;
  259. padding: 4px 8px;
  260. border-radius: 4px;
  261. font-size: 12px;
  262. font-weight: 600;
  263. }
  264. .status-badge.passed {
  265. background: #d4edda;
  266. color: #155724;
  267. }
  268. .status-badge.failed {
  269. background: #f8d7da;
  270. color: #721c24;
  271. }
  272. [data-theme="dark"] .status-badge.passed {
  273. background: #1e4620;
  274. color: #7dce82;
  275. }
  276. [data-theme="dark"] .status-badge.failed {
  277. background: #5a1f1f;
  278. color: #f5a3a3;
  279. }
  280. .category-badge {
  281. display: inline-block;
  282. padding: 4px 8px;
  283. border-radius: 4px;
  284. font-size: 11px;
  285. font-weight: 600;
  286. background: var(--bg-secondary);
  287. color: var(--text-secondary);
  288. }
  289. .variant-badge {
  290. display: inline-block;
  291. padding: 4px 8px;
  292. border-radius: 4px;
  293. font-size: 11px;
  294. font-weight: 600;
  295. background: #e3f2fd;
  296. color: #1565c0;
  297. }
  298. .variant-badge.default {
  299. background: var(--bg-secondary);
  300. color: var(--text-secondary);
  301. }
  302. [data-theme="dark"] .variant-badge {
  303. background: #1a237e;
  304. color: #90caf9;
  305. }
  306. [data-theme="dark"] .variant-badge.default {
  307. background: var(--bg-secondary);
  308. color: var(--text-secondary);
  309. }
  310. .expandable-row {
  311. cursor: pointer;
  312. }
  313. .details-row {
  314. display: none;
  315. background: var(--bg-secondary);
  316. }
  317. .details-row.show {
  318. display: table-row;
  319. }
  320. .details-content {
  321. padding: 20px;
  322. }
  323. .violation-item {
  324. padding: 10px;
  325. margin: 5px 0;
  326. border-left: 3px solid var(--danger);
  327. background: var(--bg-card);
  328. border-radius: 4px;
  329. }
  330. .violation-item.warning {
  331. border-left-color: var(--warning);
  332. }
  333. /* Loading State */
  334. .loading {
  335. text-align: center;
  336. padding: 40px;
  337. color: var(--text-secondary);
  338. }
  339. .spinner {
  340. border: 3px solid var(--border-color);
  341. border-top: 3px solid var(--primary);
  342. border-radius: 50%;
  343. width: 40px;
  344. height: 40px;
  345. animation: spin 1s linear infinite;
  346. margin: 0 auto 20px;
  347. }
  348. @keyframes spin {
  349. 0% { transform: rotate(0deg); }
  350. 100% { transform: rotate(360deg); }
  351. }
  352. /* Empty State */
  353. .empty-state {
  354. text-align: center;
  355. padding: 60px 20px;
  356. color: var(--text-secondary);
  357. }
  358. .empty-state-icon {
  359. font-size: 64px;
  360. margin-bottom: 20px;
  361. opacity: 0.3;
  362. }
  363. /* Responsive */
  364. @media (max-width: 768px) {
  365. .header-content {
  366. flex-direction: column;
  367. align-items: flex-start;
  368. }
  369. .filter-row {
  370. flex-direction: column;
  371. align-items: stretch;
  372. }
  373. .search-box input {
  374. width: 100%;
  375. }
  376. table {
  377. font-size: 14px;
  378. }
  379. th, td {
  380. padding: 8px;
  381. }
  382. }
  383. /* Theme Toggle */
  384. .theme-toggle {
  385. background: none;
  386. border: none;
  387. font-size: 24px;
  388. cursor: pointer;
  389. padding: 8px;
  390. }
  391. </style>
  392. </head>
  393. <body>
  394. <header>
  395. <div class="container">
  396. <div class="header-content">
  397. <h1>🎯 OpenCode Agent Test Dashboard</h1>
  398. <div class="header-actions">
  399. <button id="refreshBtn" class="primary">🔄 Refresh</button>
  400. <button id="exportBtn">📊 Export CSV</button>
  401. <button class="theme-toggle" id="themeToggle" title="Toggle dark mode">🌙</button>
  402. </div>
  403. </div>
  404. </div>
  405. </header>
  406. <div class="container">
  407. <!-- Filters -->
  408. <div class="filters">
  409. <div class="filter-row">
  410. <div class="filter-group">
  411. <label>Agent</label>
  412. <select id="agentFilter">
  413. <option value="all">All Agents</option>
  414. </select>
  415. </div>
  416. <div class="filter-group">
  417. <label>Category</label>
  418. <select id="categoryFilter">
  419. <option value="all">All Categories</option>
  420. <option value="developer">Developer</option>
  421. <option value="business">Business</option>
  422. <option value="creative">Creative</option>
  423. <option value="edge-case">Edge Case</option>
  424. </select>
  425. </div>
  426. <div class="filter-group">
  427. <label>Status</label>
  428. <select id="statusFilter">
  429. <option value="all">All Tests</option>
  430. <option value="passed">Passed Only</option>
  431. <option value="failed">Failed Only</option>
  432. </select>
  433. </div>
  434. <div class="filter-group">
  435. <label>Prompt Variant</label>
  436. <select id="variantFilter">
  437. <option value="all">All Variants</option>
  438. </select>
  439. </div>
  440. <div class="filter-group">
  441. <label>Time Range</label>
  442. <select id="timeFilter">
  443. <option value="latest">Latest Run</option>
  444. <option value="today">Today</option>
  445. <option value="week">Last 7 Days</option>
  446. <option value="month">Last 30 Days</option>
  447. </select>
  448. </div>
  449. </div>
  450. </div>
  451. <!-- Stats Cards -->
  452. <div class="stats-grid" id="statsGrid">
  453. <div class="stat-card">
  454. <div class="stat-label">Total Tests</div>
  455. <div class="stat-value" id="totalTests">-</div>
  456. <div class="stat-subtitle">Across all agents</div>
  457. </div>
  458. <div class="stat-card success">
  459. <div class="stat-label">Pass Rate</div>
  460. <div class="stat-value" id="passRate">-</div>
  461. <div class="stat-subtitle" id="passedCount">- passed</div>
  462. </div>
  463. <div class="stat-card danger">
  464. <div class="stat-label">Failed Tests</div>
  465. <div class="stat-value" id="failedTests">-</div>
  466. <div class="stat-subtitle" id="failedSubtitle">-</div>
  467. </div>
  468. <div class="stat-card warning">
  469. <div class="stat-label">Avg Duration</div>
  470. <div class="stat-value" id="avgDuration">-</div>
  471. <div class="stat-subtitle">Per test</div>
  472. </div>
  473. </div>
  474. <!-- Trend Chart -->
  475. <div class="chart-container">
  476. <h2>📈 Pass Rate Trend (Last 30 Days)</h2>
  477. <canvas id="trendChart"></canvas>
  478. </div>
  479. <!-- Test Results Table -->
  480. <div class="results-container">
  481. <div class="results-header">
  482. <h2>Test Results</h2>
  483. <div class="search-box">
  484. <input type="text" id="searchInput" placeholder="Search tests...">
  485. </div>
  486. </div>
  487. <div id="tableContainer">
  488. <div class="loading">
  489. <div class="spinner"></div>
  490. <p>Loading test results...</p>
  491. </div>
  492. </div>
  493. </div>
  494. </div>
  495. <!-- Chart.js from CDN -->
  496. <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
  497. <script>
  498. // Dashboard State
  // Shared mutable state for the dashboard script.
  let allResults = []; // result documents assigned by loadResults()
  let filteredResults = []; // flattened tests produced by applyFilters()
  let currentSort = { column: null, direction: 'asc' }; // maintained by sortTable()
  let trendChart = null; // chart instance created by updateTrendChart()

  // Boot sequence: restore the theme, wire the controls, then load the data.
  document.addEventListener('DOMContentLoaded', function onDomReady() {
    initializeTheme();
    setupEventListeners();
    loadResults();
  });
  509. // Theme Management
  /**
   * Applies the persisted colour theme to the document at start-up.
   *
   * Reads the `theme` key from localStorage; only the value `'dark'` selects
   * the dark theme, anything else (missing, unknown, unreadable) means light.
   * The chosen theme is written to the `data-theme` attribute of the root
   * element and mirrored on the toggle button via updateThemeIcon().
   */
  function initializeTheme() {
    let savedTheme = 'light';
    try {
      // Storage access can throw (for example when the browser blocks site
      // data). The theme is cosmetic, so it must never abort start-up.
      if (localStorage.getItem('theme') === 'dark') {
        savedTheme = 'dark';
      }
    } catch (error) {
      console.warn('Theme preference unavailable, using light theme:', error);
    }
    document.documentElement.setAttribute('data-theme', savedTheme);
    updateThemeIcon(savedTheme);
  }
  /**
   * Switches between the light and dark themes.
   *
   * Flips the `data-theme` attribute on the root element, tries to persist
   * the new value under the `theme` localStorage key, and refreshes the
   * toggle button icon.
   */
  function toggleTheme() {
    const current = document.documentElement.getAttribute('data-theme');
    const newTheme = current === 'dark' ? 'light' : 'dark';
    document.documentElement.setAttribute('data-theme', newTheme);
    try {
      localStorage.setItem('theme', newTheme);
    } catch (error) {
      // Persisting is best-effort: a storage failure must not leave the icon
      // out of sync with the theme that was just applied.
      console.warn('Could not persist theme preference:', error);
    }
    updateThemeIcon(newTheme);
  }
  /**
   * Shows the icon for the theme the button would switch to: a sun while the
   * dark theme is active, a moon otherwise.
   *
   * @param {string} theme - Currently active theme name.
   */
  function updateThemeIcon(theme) {
    const toggleButton = document.getElementById('themeToggle');
    let icon = '🌙';
    if (theme === 'dark') {
      icon = '☀️';
    }
    toggleButton.textContent = icon;
  }
  525. // Event Listeners
  /**
   * Wires every interactive control to its handler.
   *
   * The content filters re-filter the data already in memory; the time range
   * selector and the refresh button trigger a full reload.
   */
  function setupEventListeners() {
    // [element id, event name, handler] - registered in this order.
    const bindings = [
      ['themeToggle', 'click', toggleTheme],
      ['refreshBtn', 'click', loadResults],
      ['exportBtn', 'click', exportToCSV],
      ['searchInput', 'input', applyFilters],
      ['agentFilter', 'change', applyFilters],
      ['categoryFilter', 'change', applyFilters],
      ['statusFilter', 'change', applyFilters],
      ['variantFilter', 'change', applyFilters],
      ['timeFilter', 'change', loadResults],
    ];
    for (const [elementId, eventName, handler] of bindings) {
      document.getElementById(elementId).addEventListener(eventName, handler);
    }
  }
  537. // Load Results
  /**
   * Reloads the result data for the selected time range and refreshes every
   * view: filter dropdowns, table, stat cards and trend chart.
   *
   * Any failure along the way is reported through showError().
   */
  async function loadResults() {
    showLoading();
    try {
      const selectedRange = document.getElementById('timeFilter').value;
      allResults = await fetchResults(selectedRange);

      // Dropdowns first, so applyFilters() reads their freshly built options.
      populateAgentFilter(allResults);
      populateVariantFilter(allResults);
      applyFilters();

      updateStats(allResults);
      updateTrendChart(allResults);
    } catch (error) {
      showError('Failed to load results: ' + error.message);
    }
  }
  553. // Fetch Results
  /**
   * Loads the result documents for a time range.
   *
   * For `'latest'` only `latest.json` is fetched; for any other value the
   * file list comes from fetchHistoryFiles() and all files are fetched in
   * parallel. Every response is checked for an HTTP error status before its
   * body is parsed, so a missing file is reported as such rather than as a
   * JSON syntax error.
   *
   * @param {string} timeFilter - Value of the time range selector.
   * @returns {Promise<Array<object>>} Parsed result documents.
   * @throws {Error} With hosting instructions and the underlying failure
   *   (also attached as `cause`) when any file cannot be loaded.
   */
  async function fetchResults(timeFilter) {
    // Fetches one file and parses it, rejecting on HTTP error statuses.
    const fetchJson = async (file) => {
      const response = await fetch(file);
      if (!response.ok) {
        throw new Error(`Cannot load ${file} (HTTP ${response.status})`);
      }
      return response.json();
    };

    try {
      if (timeFilter === 'latest') {
        return [await fetchJson('latest.json')];
      }
      const files = await fetchHistoryFiles(timeFilter);
      return await Promise.all(files.map((file) => fetchJson(file)));
    } catch (error) {
      // If fetch fails (CORS/local file), show helpful message
      throw new Error('Cannot load results from local file. Please use one of these methods:\n\n' +
        '1. Serve via HTTP:\n' +
        ' cd evals/results && python3 -m http.server 8000\n' +
        ' Then open: http://localhost:8000\n\n' +
        '2. Use browser flag:\n' +
        ' Chrome: --allow-file-access-from-files\n\n' +
        'Original error: ' + error.message, { cause: error });
    }
  }
  583. // Fetch History Files
  /**
   * Resolves the list of result files to load for a time range.
   *
   * Placeholder: the range is currently ignored and only `latest.json` is
   * returned. A real implementation needs a file listing endpoint or a
   * generated index.json naming every available file.
   *
   * @param {string} timeFilter - Selected time range (currently unused).
   * @returns {Promise<string[]>} Relative URLs of the files to fetch.
   */
  async function fetchHistoryFiles(timeFilter) {
    const availableFiles = ['latest.json'];
    return availableFiles;
  }
  590. // Populate Agent Filter
  /**
   * Rebuilds the agent dropdown from the agents named in the loaded results.
   *
   * The "All Agents" entry always comes first, followed by one capitalised
   * entry per distinct `meta.agent`. Results without an agent name are
   * skipped. The previous selection is restored when that agent is still
   * present, so a refresh does not silently reset the user's filter.
   *
   * @param {Array<object>} results - Loaded result documents.
   */
  function populateAgentFilter(results) {
    const select = document.getElementById('agentFilter');
    // Capture before the options are replaced; replacing them resets the value.
    const previousSelection = select.value;
    const agents = [...new Set(results.map(r => r.meta?.agent).filter(Boolean))];

    // Keep "All Agents" option
    select.innerHTML = '<option value="all">All Agents</option>';
    agents.forEach(agent => {
      const option = document.createElement('option');
      option.value = agent;
      option.textContent = agent.charAt(0).toUpperCase() + agent.slice(1);
      select.appendChild(option);
    });

    if (agents.includes(previousSelection)) {
      select.value = previousSelection;
    }
  }
  603. // Populate Variant Filter
  /**
   * Rebuilds the prompt-variant dropdown from the loaded results.
   *
   * Options, in order: "All Variants"; "Default (no variant)" (value `none`)
   * when at least one result has no `meta.prompt_variant`; then one
   * capitalised entry per distinct variant. The previous selection is
   * restored when it is still offered, so a refresh keeps the user's filter.
   *
   * @param {Array<object>} results - Loaded result documents.
   */
  function populateVariantFilter(results) {
    const select = document.getElementById('variantFilter');
    // Capture before the options are replaced; replacing them resets the value.
    const previousSelection = select.value;
    const variants = [...new Set(results.map(r => r.meta?.prompt_variant).filter(Boolean))];
    const offered = [];

    // Keep "All Variants" option
    select.innerHTML = '<option value="all">All Variants</option>';

    // Add "No Variant" option if there are results without variant
    const hasNoVariant = results.some(r => !r.meta?.prompt_variant);
    if (hasNoVariant) {
      const option = document.createElement('option');
      option.value = 'none';
      option.textContent = 'Default (no variant)';
      select.appendChild(option);
      offered.push('none');
    }

    variants.forEach(variant => {
      const option = document.createElement('option');
      option.value = variant;
      option.textContent = variant.charAt(0).toUpperCase() + variant.slice(1);
      select.appendChild(option);
      offered.push(variant);
    });

    if (offered.includes(previousSelection)) {
      select.value = previousSelection;
    }
  }
  624. // Apply Filters
  /**
   * Recomputes `filteredResults` from `allResults` using the current search
   * box and dropdown values, then re-renders the table.
   *
   * Each test is flattened together with the run-level metadata of its
   * result document (agent, timestamp, model, prompt variant, model family).
   * The search term is matched case-insensitively against the test id.
   * Result documents without a `tests` array contribute no rows, and tests
   * without an id simply never match a search term.
   */
  function applyFilters() {
    const searchTerm = document.getElementById('searchInput').value.toLowerCase();
    const agentFilter = document.getElementById('agentFilter').value;
    const categoryFilter = document.getElementById('categoryFilter').value;
    const statusFilter = document.getElementById('statusFilter').value;
    const variantFilter = document.getElementById('variantFilter').value;

    // Flatten all tests from all results
    const allTests = allResults.flatMap(result =>
      (result.tests ?? []).map(test => ({
        ...test,
        agent: result.meta.agent,
        timestamp: result.meta.timestamp,
        model: result.meta.model,
        prompt_variant: result.meta.prompt_variant,
        model_family: result.meta.model_family
      }))
    );

    filteredResults = allTests.filter(test => {
      // Search filter
      if (searchTerm && !String(test.id ?? '').toLowerCase().includes(searchTerm)) {
        return false;
      }
      // Agent filter
      if (agentFilter !== 'all' && test.agent !== agentFilter) {
        return false;
      }
      // Category filter
      if (categoryFilter !== 'all' && test.category !== categoryFilter) {
        return false;
      }
      // Status filter
      if (statusFilter === 'passed' && !test.passed) {
        return false;
      }
      if (statusFilter === 'failed' && test.passed) {
        return false;
      }
      // Variant filter: 'none' selects tests without a variant
      if (variantFilter !== 'all') {
        if (variantFilter === 'none' && test.prompt_variant) {
          return false;
        }
        if (variantFilter !== 'none' && test.prompt_variant !== variantFilter) {
          return false;
        }
      }
      return true;
    });

    // The rebuilt list is in load order again, so forget the previous sort;
    // otherwise the next header click would flip a sort that is not shown.
    currentSort.column = null;
    currentSort.direction = 'asc';

    renderTable(filteredResults);
  }
  675. // Render Table
  /**
   * Draws the results table into the table container, or an empty-state
   * panel when there is nothing to show.
   *
   * After the markup is inserted, click handlers are attached to the
   * sortable headers (sortTable) and to the expandable rows (toggleDetails).
   *
   * @param {Array<object>} tests - Flattened tests to display.
   */
  function renderTable(tests) {
    const tableHost = document.getElementById('tableContainer');

    const nothingToShow = tests.length === 0;
    if (nothingToShow) {
      tableHost.innerHTML = `
        <div class="empty-state">
          <div class="empty-state-icon">📭</div>
          <h3>No results found</h3>
          <p>Try adjusting your filters or run some tests</p>
        </div>
      `;
      return;
    }

    const bodyRows = tests.map((test, idx) => renderTestRow(test, idx)).join('');
    tableHost.innerHTML = `
      <table>
        <thead>
          <tr>
            <th class="sortable" data-column="id">Test ID</th>
            <th class="sortable" data-column="agent">Agent</th>
            <th class="sortable" data-column="prompt_variant">Variant</th>
            <th class="sortable" data-column="category">Category</th>
            <th class="sortable" data-column="passed">Status</th>
            <th class="sortable" data-column="duration_ms">Duration</th>
            <th class="sortable" data-column="violations.total">Violations</th>
          </tr>
        </thead>
        <tbody>
          ${bodyRows}
        </tbody>
      </table>
    `;

    // Header clicks sort by the column named in data-column.
    for (const header of tableHost.querySelectorAll('th.sortable')) {
      header.addEventListener('click', () => sortTable(header.dataset.column));
    }

    // Row clicks expand or collapse the matching details row.
    for (const row of tableHost.querySelectorAll('.expandable-row')) {
      row.addEventListener('click', () => toggleDetails(row.dataset.index));
    }
  }
  716. // Render Test Row
  /**
   * Escapes a value for safe interpolation into HTML text or a double-quoted
   * attribute. `null` and `undefined` become the empty string.
   *
   * @param {*} value - Value to render.
   * @returns {string} HTML-safe text.
   */
  function escapeHtml(value) {
    return String(value ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  /**
   * Builds the markup for one test: a clickable summary row followed by its
   * initially hidden details row.
   *
   * All values taken from the result data are HTML-escaped because the
   * markup is inserted through `innerHTML`. A test without a `violations`
   * object is shown with zero violations, and a duration that is not a
   * number is shown as `-`.
   *
   * @param {object} test - Flattened test record.
   * @param {number} index - Row index, used to pair the two rows.
   * @returns {string} HTML for both table rows.
   */
  function renderTestRow(test, index) {
    const statusClass = test.passed ? 'passed' : 'failed';
    const statusText = test.passed ? '✅ Passed' : '❌ Failed';
    const durationMs = Number(test.duration_ms);
    const duration = Number.isFinite(durationMs) ? (durationMs / 1000).toFixed(2) + 's' : '-';
    const variant = test.prompt_variant || 'default';
    const variantClass = test.prompt_variant ? 'variant-badge' : 'variant-badge default';
    const violations = test.violations ?? {};
    const rowIndex = escapeHtml(index);

    return `
      <tr class="expandable-row" data-index="${rowIndex}">
        <td><strong>${escapeHtml(test.id)}</strong></td>
        <td>${escapeHtml(test.agent)}</td>
        <td><span class="category-badge ${variantClass}">${escapeHtml(variant)}</span></td>
        <td><span class="category-badge">${escapeHtml(test.category)}</span></td>
        <td><span class="status-badge ${statusClass}">${statusText}</span></td>
        <td>${duration}</td>
        <td>${escapeHtml(violations.total ?? 0)} ${violations.errors > 0 ? '⚠️' : ''}</td>
      </tr>
      <tr class="details-row" id="details-${rowIndex}">
        <td colspan="7">
          ${renderTestDetails(test)}
        </td>
      </tr>
    `;
  }

  // Render Test Details
  /**
   * Builds the expandable details panel for one test: model information,
   * approval and event counts, and the list of violations (if any).
   *
   * Every interpolated value is HTML-escaped. A violation without a
   * severity is labelled `UNKNOWN` instead of raising an error.
   *
   * @param {object} test - Flattened test record.
   * @returns {string} HTML for the details panel.
   */
  function renderTestDetails(test) {
    const violations = test.violations ?? {};
    let html = '<div class="details-content">';
    html += `<p><strong>Model:</strong> ${escapeHtml(test.model || 'unknown')}</p>`;
    if (test.prompt_variant) {
      html += `<p><strong>Prompt Variant:</strong> ${escapeHtml(test.prompt_variant)}</p>`;
    }
    if (test.model_family) {
      html += `<p><strong>Model Family:</strong> ${escapeHtml(test.model_family)}</p>`;
    }
    html += `<p><strong>Approvals:</strong> ${escapeHtml(test.approvals)}</p>`;
    html += `<p><strong>Events:</strong> ${escapeHtml(test.events)}</p>`;

    if (violations.total > 0) {
      html += '<h4>Violations:</h4>';
      violations.details?.forEach(v => {
        const severity = String(v.severity ?? 'unknown');
        html += `
          <div class="violation-item ${escapeHtml(severity)}">
            <strong>[${escapeHtml(severity.toUpperCase())}] ${escapeHtml(v.type)}</strong><br>
            ${escapeHtml(v.message)}
          </div>
        `;
      });
    } else {
      html += '<p>✅ No violations</p>';
    }

    html += '</div>';
    return html;
  }
  768. // Toggle Details
  /**
   * Expands or collapses the details row paired with a summary row.
   *
   * @param {number|string} index - Row index used in the `details-<index>` id.
   */
  function toggleDetails(index) {
    const rowId = `details-${index}`;
    document.getElementById(rowId).classList.toggle('show');
  }
  773. // Sort Table
  /**
   * Sorts `filteredResults` in place by the given column and re-renders.
   *
   * Clicking the column that is already active flips the direction; a new
   * column starts ascending. Strings compare case-insensitively. Rows where
   * the column has no value (for example tests without a prompt variant)
   * always sort after rows that have one, in either direction, instead of
   * raising a TypeError or producing an inconsistent order.
   *
   * @param {string} column - Dot-separated property path, e.g. `violations.total`.
   */
  function sortTable(column) {
    if (currentSort.column === column) {
      currentSort.direction = currentSort.direction === 'asc' ? 'desc' : 'asc';
    } else {
      currentSort.column = column;
      currentSort.direction = 'asc';
    }

    const factor = currentSort.direction === 'asc' ? 1 : -1;
    // Lower-case strings so that ordering ignores letter case.
    const sortKey = (row) => {
      const value = getNestedValue(row, column);
      return typeof value === 'string' ? value.toLowerCase() : value;
    };

    filteredResults.sort((a, b) => {
      const aVal = sortKey(a);
      const bVal = sortKey(b);
      const aMissing = aVal == null;
      const bMissing = bVal == null;
      if (aMissing || bMissing) {
        if (aMissing && bMissing) return 0;
        return aMissing ? 1 : -1;
      }
      if (aVal < bVal) return -factor;
      if (aVal > bVal) return factor;
      return 0;
    });

    renderTable(filteredResults);
    updateSortIndicators();
  }
  795. // Get Nested Value
  /**
   * Reads a possibly nested property using a dot-separated path.
   *
   * Traversal is null-safe: once an intermediate value is null or undefined
   * the result is `undefined`.
   *
   * @param {object} obj - Object to read from.
   * @param {string} path - Dot-separated path such as `violations.total`.
   * @returns {*} The value at the path, or `undefined` when unreachable.
   */
  function getNestedValue(obj, path) {
    let current = obj;
    for (const key of path.split('.')) {
      current = current?.[key];
    }
    return current;
  }
  799. // Update Sort Indicators
  /**
   * Marks the header of the active sort column with `sort-asc` or
   * `sort-desc` and clears both markers from every other sortable header.
   */
  function updateSortIndicators() {
    for (const header of document.querySelectorAll('th.sortable')) {
      header.classList.remove('sort-asc', 'sort-desc');
      if (header.dataset.column !== currentSort.column) {
        continue;
      }
      header.classList.add(`sort-${currentSort.direction}`);
    }
  }
  808. // Update Stats
  /**
   * Fills the four stat cards (total, pass rate, failures, average duration)
   * from every test in the given result documents.
   *
   * With no tests at all, the pass rate and average duration are shown as a
   * plain `0`.
   *
   * @param {Array<object>} results - Loaded result documents.
   */
  function updateStats(results) {
    const everyTest = results.flatMap(r => r.tests);
    const total = everyTest.length;

    let passed = 0;
    let durationSum = 0;
    for (const t of everyTest) {
      if (t.passed) {
        passed += 1;
      }
      durationSum = durationSum + t.duration_ms;
    }
    const failed = total - passed;

    let passRate = 0;
    let avgDuration = 0;
    if (total > 0) {
      passRate = ((passed / total) * 100).toFixed(1);
      avgDuration = (durationSum / total / 1000).toFixed(2);
    }

    let failedNote = `${failed} tests need attention`;
    if (failed === 0) {
      failedNote = 'All tests passing! 🎉';
    }

    const setText = (id, value) => {
      document.getElementById(id).textContent = value;
    };
    setText('totalTests', total);
    setText('passRate', passRate + '%');
    setText('passedCount', `${passed} passed`);
    setText('failedTests', failed);
    setText('failedSubtitle', failedNote);
    setText('avgDuration', avgDuration + 's');
  }
  825. // Update Trend Chart
  /**
   * Draws (or redraws) the pass-rate line chart, one point per result
   * document ordered by `meta.timestamp`.
   *
   * The chart is an optional enhancement: when the Chart.js script did not
   * load, the function logs a warning and returns instead of throwing, so
   * the results table stays usable. A result without a numeric
   * `summary.pass_rate` is plotted as a gap (`null`).
   *
   * @param {Array<object>} results - Loaded result documents.
   */
  function updateTrendChart(results) {
    if (typeof Chart === 'undefined') {
      console.warn('Chart.js is not available; skipping the trend chart.');
      return;
    }

    const ctx = document.getElementById('trendChart');

    // Sort by timestamp
    const sorted = [...results].sort((a, b) =>
      new Date(a.meta.timestamp) - new Date(b.meta.timestamp)
    );

    const labels = sorted.map(r => {
      const date = new Date(r.meta.timestamp);
      return date.toLocaleDateString() + ' ' + date.toLocaleTimeString([], {hour: '2-digit', minute:'2-digit'});
    });

    // Percentages as numbers rounded to one decimal, not formatted strings.
    const passRates = sorted.map(r => {
      const rate = r.summary?.pass_rate;
      return typeof rate === 'number' ? Number((rate * 100).toFixed(1)) : null;
    });

    // Destroy the previous chart before drawing on the same canvas again.
    if (trendChart) {
      trendChart.destroy();
    }

    trendChart = new Chart(ctx, {
      type: 'line',
      data: {
        labels: labels,
        datasets: [{
          label: 'Pass Rate (%)',
          data: passRates,
          borderColor: '#28a745',
          backgroundColor: 'rgba(40, 167, 69, 0.1)',
          tension: 0.4,
          fill: true
        }]
      },
      options: {
        responsive: true,
        maintainAspectRatio: true,
        plugins: {
          legend: {
            display: false
          }
        },
        scales: {
          y: {
            beginAtZero: true,
            max: 100,
            ticks: {
              callback: function(value) {
                return value + '%';
              }
            }
          }
        }
      }
    });
  }
  875. // Export to CSV
  /**
   * Downloads the currently displayed tests as a CSV file named
   * `test-results-<YYYY-MM-DD>.csv`.
   *
   * When the filtered list is empty, every loaded test is exported instead,
   * each labelled with the agent of its result document. Every cell is
   * quoted and embedded double quotes are doubled, so values containing
   * quotes, commas or line breaks do not corrupt the file. Missing values
   * become empty cells.
   */
  function exportToCSV() {
    const tests = filteredResults.length > 0
      ? filteredResults
      // The raw tests do not carry the run-level agent; add it here.
      : allResults.flatMap(r => (r.tests ?? []).map(test => ({ ...test, agent: r.meta?.agent })));

    const headers = ['Test ID', 'Agent', 'Category', 'Status', 'Duration (ms)', 'Events', 'Approvals', 'Violations'];
    const rows = tests.map(test => [
      test.id,
      test.agent || 'unknown',
      test.category,
      test.passed ? 'Passed' : 'Failed',
      test.duration_ms,
      test.events,
      test.approvals,
      test.violations?.total ?? 0
    ]);

    const toCell = (cell) => `"${String(cell ?? '').replace(/"/g, '""')}"`;
    const csv = [headers, ...rows]
      .map(row => row.map(toCell).join(','))
      .join('\n');

    const blob = new Blob([csv], { type: 'text/csv' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = `test-results-${new Date().toISOString().split('T')[0]}.csv`;
    a.click();
    // Release the URL on a later task so the download started by click()
    // is not cut short by a synchronous revoke.
    setTimeout(() => URL.revokeObjectURL(url), 0);
  }
  900. // Show Loading
  /**
   * Replaces the table area with the spinner shown while results load.
   */
  function showLoading() {
    const tableHost = document.getElementById('tableContainer');
    const spinnerMarkup = `
      <div class="loading">
        <div class="spinner"></div>
        <p>Loading test results...</p>
      </div>
    `;
    tableHost.innerHTML = spinnerMarkup;
  }
  909. // Show Error
  /**
   * Replaces the table area with an error panel.
   *
   * The panel shows the fixed "serve via HTTP" guidance, the actual error
   * text in a collapsible "Error details" section, and a "Try Again" button
   * that calls loadResults(). The message is HTML-escaped line by line
   * before it is inserted, because it can contain text from failed requests.
   *
   * @param {string} message - Error text; may span several lines.
   */
  function showError(message) {
    const escapeText = (text) => String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

    // Format multi-line messages: one paragraph per line, blank lines as breaks
    const formattedMessage = String(message ?? '').split('\n').map(line =>
      line.trim() ? `<p style="margin: 5px 0; text-align: left;">${escapeText(line)}</p>` : '<br>'
    ).join('');

    document.getElementById('tableContainer').innerHTML = `
      <div class="empty-state">
        <div class="empty-state-icon">⚠️</div>
        <h3>Cannot Load Results</h3>
        <div style="max-width: 600px; margin: 20px auto; background: var(--bg-secondary); padding: 20px; border-radius: 8px; text-align: left;">
          <h4 style="margin-top: 0;">Solution: Serve via HTTP</h4>
          <pre style="background: var(--bg-card); padding: 10px; border-radius: 4px; overflow-x: auto;">cd evals/results
python3 -m http.server 8000</pre>
          <p>Then open: <a href="http://localhost:8000" target="_blank">http://localhost:8000</a></p>
          <hr style="margin: 15px 0; border: none; border-top: 1px solid var(--border-color);">
          <p style="font-size: 12px; color: var(--text-secondary);">
            <strong>Why?</strong> Browsers block loading local JSON files for security.
            Serving via HTTP solves this.
          </p>
          <details style="margin-top: 15px; font-size: 12px;">
            <summary>Error details</summary>
            ${formattedMessage}
          </details>
        </div>
        <button onclick="loadResults()" class="primary">Try Again</button>
      </div>
    `;
  }
  934. </script>
  935. </body>
  936. </html>