// engine.test.ts — unit tests for the v2 scoring/ranking engine.
/// <reference types="bun-types" />
import { describe, expect, test } from 'bun:test';
import type { DiscoveredModel, ExternalSignalMap } from '../types';
import { rankModelsV2, scoreCandidateV2 } from './engine';
  5. function model(
  6. input: Partial<DiscoveredModel> & { model: string },
  7. ): DiscoveredModel {
  8. const [providerID] = input.model.split('/');
  9. return {
  10. providerID: providerID ?? 'openai',
  11. model: input.model,
  12. name: input.name ?? input.model,
  13. status: input.status ?? 'active',
  14. contextLimit: input.contextLimit ?? 200000,
  15. outputLimit: input.outputLimit ?? 32000,
  16. reasoning: input.reasoning ?? true,
  17. toolcall: input.toolcall ?? true,
  18. attachment: input.attachment ?? false,
  19. dailyRequestLimit: input.dailyRequestLimit,
  20. costInput: input.costInput,
  21. costOutput: input.costOutput,
  22. };
  23. }
  24. describe('scoring-v2', () => {
  25. test('returns explain breakdown with deterministic total', () => {
  26. const candidate = model({ model: 'openai/gpt-5.3-codex' });
  27. const signalMap: ExternalSignalMap = {
  28. 'openai/gpt-5.3-codex': {
  29. source: 'artificial-analysis',
  30. qualityScore: 70,
  31. codingScore: 75,
  32. latencySeconds: 1.2,
  33. inputPricePer1M: 1,
  34. outputPricePer1M: 3,
  35. },
  36. };
  37. const first = scoreCandidateV2(candidate, 'oracle', signalMap);
  38. const second = scoreCandidateV2(candidate, 'oracle', signalMap);
  39. expect(first.totalScore).toBe(second.totalScore);
  40. expect(first.scoreBreakdown.features.quality).toBe(0.7);
  41. expect(first.scoreBreakdown.weighted.coding).toBeGreaterThan(0);
  42. });
  43. test('uses stable tie-break when scores are equal', () => {
  44. const ranked = rankModelsV2(
  45. [
  46. model({ model: 'zai-coding-plan/glm-4.7', reasoning: false }),
  47. model({ model: 'openai/gpt-5.3-codex', reasoning: false }),
  48. ],
  49. 'explorer',
  50. );
  51. expect(ranked[0]?.model.providerID).toBe('openai');
  52. expect(ranked[1]?.model.providerID).toBe('zai-coding-plan');
  53. });
  54. test('matches external signals for multi-segment chutes ids', () => {
  55. const candidate = model({
  56. model: 'chutes/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8-TEE',
  57. });
  58. const signalMap: ExternalSignalMap = {
  59. 'qwen/qwen3-coder-480b-a35b-instruct': {
  60. source: 'artificial-analysis',
  61. qualityScore: 95,
  62. codingScore: 92,
  63. },
  64. };
  65. const scored = scoreCandidateV2(candidate, 'fixer', signalMap);
  66. expect(scored.scoreBreakdown.features.quality).toBe(0.95);
  67. expect(scored.scoreBreakdown.features.coding).toBe(0.92);
  68. });
  69. test('applies designer output threshold rule', () => {
  70. const belowThreshold = model({
  71. model: 'chutes/moonshotai/Kimi-K2.5-TEE',
  72. outputLimit: 63999,
  73. });
  74. const aboveThreshold = model({
  75. model: 'zai-coding-plan/glm-4.7',
  76. outputLimit: 64000,
  77. });
  78. const low = scoreCandidateV2(belowThreshold, 'designer');
  79. const high = scoreCandidateV2(aboveThreshold, 'designer');
  80. expect(low.scoreBreakdown.features.output).toBe(-1);
  81. expect(low.scoreBreakdown.weighted.output).toBe(-10);
  82. expect(high.scoreBreakdown.features.output).toBe(0);
  83. expect(high.scoreBreakdown.weighted.output).toBe(0);
  84. });
  85. test('prefers kimi k2.5 over kimi k2 when otherwise equal', () => {
  86. const ranked = rankModelsV2(
  87. [
  88. model({
  89. model: 'chutes/moonshotai/Kimi-K2-TEE',
  90. contextLimit: 262144,
  91. outputLimit: 65535,
  92. reasoning: true,
  93. toolcall: true,
  94. attachment: false,
  95. }),
  96. model({
  97. model: 'chutes/moonshotai/Kimi-K2.5-TEE',
  98. contextLimit: 262144,
  99. outputLimit: 65535,
  100. reasoning: true,
  101. toolcall: true,
  102. attachment: false,
  103. }),
  104. ],
  105. 'designer',
  106. );
  107. expect(ranked[0]?.model.model).toBe('chutes/moonshotai/Kimi-K2.5-TEE');
  108. expect(ranked[1]?.model.model).toBe('chutes/moonshotai/Kimi-K2-TEE');
  109. });
  110. test('downranks chutes qwen3 against kimi/minimax priors', () => {
  111. const ranked = rankModelsV2(
  112. [
  113. model({
  114. model: 'chutes/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8-TEE',
  115. contextLimit: 262144,
  116. outputLimit: 262144,
  117. reasoning: true,
  118. toolcall: true,
  119. }),
  120. model({
  121. model: 'chutes/moonshotai/Kimi-K2.5-TEE',
  122. contextLimit: 262144,
  123. outputLimit: 65535,
  124. reasoning: true,
  125. toolcall: true,
  126. }),
  127. model({
  128. model: 'chutes/minimax-m2.1',
  129. contextLimit: 500000,
  130. outputLimit: 64000,
  131. reasoning: true,
  132. toolcall: true,
  133. }),
  134. ],
  135. 'fixer',
  136. );
  137. expect(ranked[0]?.model.model).not.toContain('Qwen3-Coder-480B');
  138. });
  139. });