test-prompt.sh 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
  1. #!/bin/bash
  2. #
  3. # test-prompt.sh - Test a specific prompt variant for an agent
  4. #
  5. # Usage:
  6. # ./scripts/prompts/test-prompt.sh --agent=openagent --variant=default
  7. # ./scripts/prompts/test-prompt.sh --agent=openagent --variant=default --model=anthropic/claude-sonnet-4-5
  8. # ./scripts/prompts/test-prompt.sh --agent=openagent --variant=sonnet-4 --model=opencode/grok-code-fast
  9. #
  10. # What it does:
  11. # 1. Backs up current agent prompt
  12. # 2. Copies the specified prompt variant to the agent location
  13. # 3. Runs the eval tests with the specified model (defaults to the prompt's primary recommended model, or Sonnet 4.5 if the prompt lists none)
  14. # 4. Restores the original prompt (keeps default in place)
  15. # 5. Outputs results summary
  16. #
  17. set -e
  18. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  19. ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
  20. # Colors
  21. RED='\033[0;31m'
  22. GREEN='\033[0;32m'
  23. YELLOW='\033[1;33m'
  24. BLUE='\033[0;34m'
  25. NC='\033[0m' # No Color
  26. # Default values
  27. AGENT_NAME=""
  28. PROMPT_VARIANT=""
  29. MODEL="" # Will be set from metadata or user input
  30. # Paths
  31. PROMPTS_DIR="$ROOT_DIR/.opencode/prompts"
  32. AGENT_DIR="$ROOT_DIR/.opencode/agent"
  33. EVALS_DIR="$ROOT_DIR/evals/framework"
  34. RESULTS_FILE="$ROOT_DIR/evals/results/latest.json"
  35. # Function to extract metadata from prompt file
  36. extract_metadata() {
  37. local file="$1"
  38. local key="$2"
  39. # Extract YAML frontmatter between --- markers
  40. # Look for key in the metadata section
  41. awk -v key="$key" '
  42. /^---$/ { in_yaml = !in_yaml; next }
  43. in_yaml && $0 ~ "^" key ":" {
  44. sub("^" key ": *", "")
  45. gsub(/"/, "")
  46. print
  47. exit
  48. }
  49. ' "$file"
  50. }
  51. # Function to extract recommended models array from metadata
  52. extract_recommended_models() {
  53. local file="$1"
  54. # Extract recommended_models array from YAML
  55. awk '
  56. /^---$/ { in_yaml = !in_yaml; next }
  57. in_yaml && /^recommended_models:/ { in_models = 1; next }
  58. in_yaml && in_models && /^ - / {
  59. # Remove leading spaces, dash, and quotes
  60. gsub(/^ - /, "")
  61. gsub(/"/, "")
  62. # Remove comments
  63. sub(/ *#.*$/, "")
  64. # Trim whitespace
  65. gsub(/^ +| +$/, "")
  66. print
  67. }
  68. in_yaml && in_models && /^[a-z_]+:/ { exit }
  69. ' "$file"
  70. }
  71. usage() {
  72. echo "Usage: $0 --agent=<name> --variant=<name> [--model=<model>]"
  73. echo ""
  74. echo "Required:"
  75. echo " --agent=NAME Agent name (e.g., openagent, opencoder)"
  76. echo " --variant=NAME Prompt variant (default, gpt, gemini, grok, llama)"
  77. echo ""
  78. echo "Optional:"
  79. echo " --model=MODEL Model to test with (uses prompt metadata if not specified)"
  80. echo " --help, -h Show this help"
  81. echo ""
  82. echo -e "${BLUE}Architecture:${NC}"
  83. echo " • default = Canonical agent file (.opencode/agent/<agent>.md)"
  84. echo " • Other variants = Model-specific optimizations (.opencode/prompts/<agent>/<model>.md)"
  85. echo " • Results always saved to .opencode/prompts/<agent>/results/"
  86. echo ""
  87. echo "Examples:"
  88. echo " # Test default (canonical agent file)"
  89. echo " $0 --agent=openagent --variant=default"
  90. echo ""
  91. echo " # Test GPT-optimized prompt"
  92. echo " $0 --agent=openagent --variant=gpt"
  93. echo ""
  94. echo " # Test Gemini prompt with specific model"
  95. echo " $0 --agent=openagent --variant=gemini --model=google/gemini-2.0-flash-exp"
  96. echo ""
  97. echo " # Test Grok prompt"
  98. echo " $0 --agent=openagent --variant=grok"
  99. echo ""
  100. echo "Available model families:"
  101. echo " default # Canonical agent file (Claude Sonnet 4.5)"
  102. echo " gpt # OpenAI GPT-4o, GPT-4o-mini, o1"
  103. echo " gemini # Google Gemini 2.0 Flash, Pro"
  104. echo " grok # xAI Grok (free tier available)"
  105. echo " llama # Meta Llama 3.1/3.2 (local or hosted)"
  106. echo ""
  107. echo "Note: Model-specific prompts contain metadata with recommended models."
  108. echo " If --model is not specified, the primary recommendation is used."
  109. echo ""
  110. echo "Available variants for an agent:"
  111. echo " ls $PROMPTS_DIR/<agent-name>/"
  112. exit 1
  113. }
  114. # Parse arguments
  115. for arg in "$@"; do
  116. case $arg in
  117. --agent=*)
  118. AGENT_NAME="${arg#*=}"
  119. shift
  120. ;;
  121. --variant=*)
  122. PROMPT_VARIANT="${arg#*=}"
  123. shift
  124. ;;
  125. --model=*)
  126. MODEL="${arg#*=}"
  127. shift
  128. ;;
  129. --help|-h)
  130. usage
  131. ;;
  132. *)
  133. echo -e "${RED}Unknown argument: $arg${NC}"
  134. echo ""
  135. usage
  136. ;;
  137. esac
  138. done
  139. # Validate required arguments
  140. if [[ -z "$AGENT_NAME" ]] || [[ -z "$PROMPT_VARIANT" ]]; then
  141. echo -e "${RED}Error: Missing required arguments${NC}"
  142. echo ""
  143. usage
  144. fi
  145. AGENT_FILE="$AGENT_DIR/$AGENT_NAME.md"
  146. BACKUP_FILE="$AGENT_DIR/.$AGENT_NAME.md.backup"
  147. VARIANT_RESULTS_DIR="$PROMPTS_DIR/$AGENT_NAME/results"
  148. VARIANT_RESULTS_FILE="$VARIANT_RESULTS_DIR/$PROMPT_VARIANT-results.json"
  149. # Handle "default" variant - use agent file directly
  150. if [[ "$PROMPT_VARIANT" == "default" ]]; then
  151. PROMPT_FILE="$AGENT_FILE"
  152. echo -e "${BLUE}Testing default prompt (canonical agent file)${NC}"
  153. else
  154. PROMPT_FILE="$PROMPTS_DIR/$AGENT_NAME/$PROMPT_VARIANT.md"
  155. # Check prompt exists
  156. if [[ ! -f "$PROMPT_FILE" ]]; then
  157. echo -e "${RED}Error: Prompt variant not found: $PROMPT_FILE${NC}"
  158. echo ""
  159. echo "Available variants for $AGENT_NAME:"
  160. echo " - default (canonical agent file)"
  161. if [[ -d "$PROMPTS_DIR/$AGENT_NAME" ]]; then
  162. find "$PROMPTS_DIR/$AGENT_NAME" -maxdepth 1 -name "*.md" -not -name "TEMPLATE.md" -not -name "README.md" -exec basename {} .md \; || echo " (no model variants found)"
  163. fi
  164. exit 1
  165. fi
  166. fi
  167. # Read metadata from prompt file
  168. MODEL_FAMILY=$(extract_metadata "$PROMPT_FILE" "model_family")
  169. RECOMMENDED_MODELS=$(extract_recommended_models "$PROMPT_FILE")
  170. # If no model specified, suggest from metadata
  171. if [[ -z "$MODEL" ]]; then
  172. if [[ -n "$RECOMMENDED_MODELS" ]]; then
  173. echo -e "${YELLOW}No model specified. Reading recommendations from prompt metadata...${NC}"
  174. echo ""
  175. echo -e "${BLUE}Recommended models for '$PROMPT_VARIANT' (${MODEL_FAMILY} family):${NC}"
  176. # Display recommended models with numbers
  177. i=1
  178. while IFS= read -r model; do
  179. echo " $i. $model"
  180. if [[ $i -eq 1 ]]; then
  181. PRIMARY_MODEL="$model"
  182. fi
  183. ((i++))
  184. done <<< "$RECOMMENDED_MODELS"
  185. echo ""
  186. echo -e "${YELLOW}Using primary recommendation: ${GREEN}$PRIMARY_MODEL${NC}"
  187. echo ""
  188. echo "To use a different model, run with: --model=<model-id>"
  189. echo ""
  190. MODEL="$PRIMARY_MODEL"
  191. else
  192. # Fallback to default if no metadata
  193. echo -e "${YELLOW}No metadata found. Using default model: anthropic/claude-sonnet-4-5${NC}"
  194. MODEL="anthropic/claude-sonnet-4-5"
  195. fi
  196. fi
  197. echo -e "${BLUE}╔═══════════════════════════════════════════════════════════════╗${NC}"
  198. echo -e "${BLUE}║ Testing Prompt: $AGENT_NAME / $PROMPT_VARIANT${NC}"
  199. echo -e "${BLUE}║ Model: $MODEL${NC}"
  200. echo -e "${BLUE}╚═══════════════════════════════════════════════════════════════╝${NC}"
  201. echo ""
  202. # Step 1: Backup current agent prompt
  203. echo -e "${YELLOW}[1/5] Backing up current agent prompt...${NC}"
  204. if [[ -f "$AGENT_FILE" ]]; then
  205. cp "$AGENT_FILE" "$BACKUP_FILE"
  206. echo " Backed up to $BACKUP_FILE"
  207. else
  208. echo " No existing agent file to backup"
  209. fi
  210. # Step 2: Copy prompt variant to agent location (skip if testing default)
  211. if [[ "$PROMPT_VARIANT" == "default" ]]; then
  212. echo -e "${YELLOW}[2/5] Using default prompt (already in place)...${NC}"
  213. echo " Testing: $AGENT_FILE"
  214. else
  215. echo -e "${YELLOW}[2/5] Copying prompt variant to agent location...${NC}"
  216. cp "$PROMPT_FILE" "$AGENT_FILE"
  217. echo " Copied $PROMPT_FILE"
  218. echo " To $AGENT_FILE"
  219. fi
  220. # Step 3: Run tests
  221. echo -e "${YELLOW}[3/5] Running core eval tests...${NC}"
  222. echo ""
  223. echo -e "${BLUE}Model: ${GREEN}$MODEL${NC}"
  224. echo -e "${BLUE}Running 7 core tests (estimated 5-8 minutes):${NC}"
  225. echo " 1. Approval Gate"
  226. echo " 2. Context Loading (Simple)"
  227. echo " 3. Context Loading (Multi-Turn)"
  228. echo " 4. Stop on Failure"
  229. echo " 5. Simple Task (No Delegation)"
  230. echo " 6. Subagent Delegation"
  231. echo " 7. Tool Usage"
  232. echo ""
  233. echo -e "${BLUE}Test output:${NC}"
  234. echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
  235. echo ""
  236. cd "$EVALS_DIR"
  237. # Run tests with real-time output
  238. set +e # Don't exit on test failure
  239. npm run eval:sdk:core -- --agent="$AGENT_NAME" --model="$MODEL"
  240. TEST_EXIT_CODE=$?
  241. export TEST_EXIT_CODE
  242. set -e
  243. echo ""
  244. echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
  245. # Step 4: Restore original prompt (if we changed it)
  246. echo ""
  247. if [[ "$PROMPT_VARIANT" == "default" ]]; then
  248. echo -e "${YELLOW}[4/5] No restore needed (tested default)...${NC}"
  249. echo " Agent file unchanged"
  250. else
  251. echo -e "${YELLOW}[4/5] Restoring original prompt...${NC}"
  252. if [[ -f "$BACKUP_FILE" ]]; then
  253. cp "$BACKUP_FILE" "$AGENT_FILE"
  254. echo " Restored from backup"
  255. else
  256. echo " No backup to restore"
  257. fi
  258. fi
  259. # Clean up backup
  260. rm -f "$BACKUP_FILE"
  261. # Step 5: Save and show results summary
  262. echo ""
  263. echo -e "${YELLOW}[5/5] Saving Results${NC}"
  264. # Create results directory if it doesn't exist
  265. mkdir -p "$VARIANT_RESULTS_DIR"
  266. # Save the test output for reference
  267. if [[ -f "/tmp/test-output-$AGENT_NAME.txt" ]]; then
  268. cp "/tmp/test-output-$AGENT_NAME.txt" "$VARIANT_RESULTS_DIR/$PROMPT_VARIANT-output.log"
  269. echo " Saved test output to: $VARIANT_RESULTS_DIR/$PROMPT_VARIANT-output.log"
  270. fi
  271. if [[ -f "$RESULTS_FILE" ]]; then
  272. # Extract summary from results JSON
  273. if command -v jq &> /dev/null; then
  274. PASS_COUNT=$(jq -r '.summary.passed // 0' "$RESULTS_FILE")
  275. TOTAL_COUNT=$(jq -r '.summary.total // 0' "$RESULTS_FILE")
  276. FAIL_COUNT=$(jq -r '.summary.failed // 0' "$RESULTS_FILE")
  277. else
  278. # Fallback if jq not available
  279. PASS_COUNT=$(grep -o '"passed":[0-9]*' "$RESULTS_FILE" | head -1 | grep -o '[0-9]*')
  280. TOTAL_COUNT=$(grep -o '"total":[0-9]*' "$RESULTS_FILE" | head -1 | grep -o '[0-9]*')
  281. FAIL_COUNT=$((TOTAL_COUNT - PASS_COUNT))
  282. fi
  283. # Calculate pass rate
  284. if [ $TOTAL_COUNT -gt 0 ]; then
  285. PASS_RATE=$(echo "scale=1; ($PASS_COUNT * 100) / $TOTAL_COUNT" | bc)
  286. else
  287. PASS_RATE="0.0"
  288. fi
  289. # Create variant results JSON
  290. cat > "$VARIANT_RESULTS_FILE" <<EOF
  291. {
  292. "variant": "$PROMPT_VARIANT",
  293. "agent": "$AGENT_NAME",
  294. "model": "$MODEL",
  295. "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  296. "passed": $PASS_COUNT,
  297. "failed": $FAIL_COUNT,
  298. "total": $TOTAL_COUNT,
  299. "passRate": "${PASS_RATE}%",
  300. "fullResults": "$RESULTS_FILE"
  301. }
  302. EOF
  303. echo " Saved results to: $VARIANT_RESULTS_FILE"
  304. echo ""
  305. echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
  306. echo ""
  307. echo -e " Agent: ${GREEN}$AGENT_NAME${NC}"
  308. echo -e " Prompt: ${GREEN}$PROMPT_VARIANT${NC}"
  309. echo -e " Model: ${GREEN}$MODEL${NC}"
  310. echo -e " Results: ${GREEN}$PASS_COUNT/$TOTAL_COUNT tests passed${NC} (${PASS_RATE}%)"
  311. echo ""
  312. echo " Variant results: $VARIANT_RESULTS_FILE"
  313. echo " Full results: $RESULTS_FILE"
  314. else
  315. echo -e " ${RED}No results file found${NC}"
  316. echo " Tests may not have run successfully"
  317. fi
  318. echo ""
  319. echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
  320. echo ""
  321. echo -e "${GREEN}Done!${NC} Default prompt restored to agent location."
  322. echo ""
  323. echo "To use this prompt permanently:"
  324. echo " ./scripts/prompts/use-prompt.sh --agent=$AGENT_NAME --variant=$PROMPT_VARIANT"