logic855
/
OpenAgentsControl
mirror of https://github.com/darrenhinde/OpenAgentsControl.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
							#!/bin/bash
#
# test-prompt.sh - Test a specific prompt variant for an agent
#
# Usage:
#   ./scripts/prompts/test-prompt.sh --agent=openagent --variant=default
#   ./scripts/prompts/test-prompt.sh --agent=openagent --variant=default --model=anthropic/claude-sonnet-4-5
#   ./scripts/prompts/test-prompt.sh --agent=openagent --variant=sonnet-4 --model=opencode/grok-code-fast
#
# What it does:
#   1. Backs up current agent prompt
#   2. Copies the specified prompt variant to the agent location
#   3. Runs the eval tests with specified model (defaults to Sonnet 4.5)
#   4. Restores the original prompt (keeps default in place)
#   5. Outputs results summary
#

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default values
AGENT_NAME=""
PROMPT_VARIANT=""
MODEL=""  # Will be set from metadata or user input

# Paths
PROMPTS_DIR="$ROOT_DIR/.opencode/prompts"
AGENT_DIR="$ROOT_DIR/.opencode/agent"
EVALS_DIR="$ROOT_DIR/evals/framework"
RESULTS_FILE="$ROOT_DIR/evals/results/latest.json"

# Function to extract metadata from prompt file
extract_metadata() {
    local file="$1"
    local key="$2"
    
    # Extract YAML frontmatter between --- markers
    # Look for key in the metadata section
    awk -v key="$key" '
        /^---$/ { in_yaml = !in_yaml; next }
        in_yaml && $0 ~ "^" key ":" {
            sub("^" key ": *", "")
            gsub(/"/, "")
            print
            exit
        }
    ' "$file"
}

# Function to extract recommended models array from metadata
extract_recommended_models() {
    local file="$1"
    
    # Extract recommended_models array from YAML
    awk '
        /^---$/ { in_yaml = !in_yaml; next }
        in_yaml && /^recommended_models:/ { in_models = 1; next }
        in_yaml && in_models && /^  - / {
            # Remove leading spaces, dash, and quotes
            gsub(/^  - /, "")
            gsub(/"/, "")
            # Remove comments
            sub(/ *#.*$/, "")
            # Trim whitespace
            gsub(/^ +| +$/, "")
            print
        }
        in_yaml && in_models && /^[a-z_]+:/ { exit }
    ' "$file"
}

usage() {
    echo "Usage: $0 --agent=<name> --variant=<name> [--model=<model>]"
    echo ""
    echo "Required:"
    echo "  --agent=NAME       Agent name (e.g., openagent, opencoder)"
    echo "  --variant=NAME     Prompt variant (default, gpt, gemini, grok, llama)"
    echo ""
    echo "Optional:"
    echo "  --model=MODEL      Model to test with (uses prompt metadata if not specified)"
    echo "  --help, -h         Show this help"
    echo ""
    echo -e "${BLUE}Architecture:${NC}"
    echo "  • default = Canonical agent file (.opencode/agent/<agent>.md)"
    echo "  • Other variants = Model-specific optimizations (.opencode/prompts/<agent>/<model>.md)"
    echo "  • Results always saved to .opencode/prompts/<agent>/results/"
    echo ""
    echo "Examples:"
    echo "  # Test default (canonical agent file)"
    echo "  $0 --agent=openagent --variant=default"
    echo ""
    echo "  # Test GPT-optimized prompt"
    echo "  $0 --agent=openagent --variant=gpt"
    echo ""
    echo "  # Test Gemini prompt with specific model"
    echo "  $0 --agent=openagent --variant=gemini --model=google/gemini-2.0-flash-exp"
    echo ""
    echo "  # Test Grok prompt"
    echo "  $0 --agent=openagent --variant=grok"
    echo ""
    echo "Available model families:"
    echo "  default    # Canonical agent file (Claude Sonnet 4.5)"
    echo "  gpt        # OpenAI GPT-4o, GPT-4o-mini, o1"
    echo "  gemini     # Google Gemini 2.0 Flash, Pro"
    echo "  grok       # xAI Grok (free tier available)"
    echo "  llama      # Meta Llama 3.1/3.2 (local or hosted)"
    echo ""
    echo "Note: Model-specific prompts contain metadata with recommended models."
    echo "      If --model is not specified, the primary recommendation is used."
    echo ""
    echo "Available variants for an agent:"
    echo "  ls $PROMPTS_DIR/<agent-name>/"
    exit 1
}

# Parse arguments
for arg in "$@"; do
    case $arg in
        --agent=*)
            AGENT_NAME="${arg#*=}"
            shift
            ;;
        --variant=*)
            PROMPT_VARIANT="${arg#*=}"
            shift
            ;;
        --model=*)
            MODEL="${arg#*=}"
            shift
            ;;
        --help|-h)
            usage
            ;;
        *)
            echo -e "${RED}Unknown argument: $arg${NC}"
            echo ""
            usage
            ;;
    esac
done

# Validate required arguments
if [[ -z "$AGENT_NAME" ]] || [[ -z "$PROMPT_VARIANT" ]]; then
    echo -e "${RED}Error: Missing required arguments${NC}"
    echo ""
    usage
fi

AGENT_FILE="$AGENT_DIR/$AGENT_NAME.md"
BACKUP_FILE="$AGENT_DIR/.$AGENT_NAME.md.backup"
VARIANT_RESULTS_DIR="$PROMPTS_DIR/$AGENT_NAME/results"
VARIANT_RESULTS_FILE="$VARIANT_RESULTS_DIR/$PROMPT_VARIANT-results.json"

# Handle "default" variant - use agent file directly
if [[ "$PROMPT_VARIANT" == "default" ]]; then
    PROMPT_FILE="$AGENT_FILE"
    echo -e "${BLUE}Testing default prompt (canonical agent file)${NC}"
else
    PROMPT_FILE="$PROMPTS_DIR/$AGENT_NAME/$PROMPT_VARIANT.md"
    
    # Check prompt exists
    if [[ ! -f "$PROMPT_FILE" ]]; then
        echo -e "${RED}Error: Prompt variant not found: $PROMPT_FILE${NC}"
        echo ""
        echo "Available variants for $AGENT_NAME:"
        echo "  - default (canonical agent file)"
        if [[ -d "$PROMPTS_DIR/$AGENT_NAME" ]]; then
            find "$PROMPTS_DIR/$AGENT_NAME" -maxdepth 1 -name "*.md" -not -name "TEMPLATE.md" -not -name "README.md" -exec basename {} .md \; || echo "  (no model variants found)"
        fi
        exit 1
    fi
fi

# Read metadata from prompt file
MODEL_FAMILY=$(extract_metadata "$PROMPT_FILE" "model_family")
RECOMMENDED_MODELS=$(extract_recommended_models "$PROMPT_FILE")

# If no model specified, suggest from metadata
if [[ -z "$MODEL" ]]; then
    if [[ -n "$RECOMMENDED_MODELS" ]]; then
        echo -e "${YELLOW}No model specified. Reading recommendations from prompt metadata...${NC}"
        echo ""
        echo -e "${BLUE}Recommended models for '$PROMPT_VARIANT' (${MODEL_FAMILY} family):${NC}"
        
        # Display recommended models with numbers
        i=1
        while IFS= read -r model; do
            echo "  $i. $model"
            if [[ $i -eq 1 ]]; then
                PRIMARY_MODEL="$model"
            fi
            ((i++))
        done <<< "$RECOMMENDED_MODELS"
        
        echo ""
        echo -e "${YELLOW}Using primary recommendation: ${GREEN}$PRIMARY_MODEL${NC}"
        echo ""
        echo "To use a different model, run with: --model=<model-id>"
        echo ""
        
        MODEL="$PRIMARY_MODEL"
    else
        # Fallback to default if no metadata
        echo -e "${YELLOW}No metadata found. Using default model: anthropic/claude-sonnet-4-5${NC}"
        MODEL="anthropic/claude-sonnet-4-5"
    fi
fi

echo -e "${BLUE}╔═══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║  Testing Prompt: $AGENT_NAME / $PROMPT_VARIANT${NC}"
echo -e "${BLUE}║  Model: $MODEL${NC}"
echo -e "${BLUE}╚═══════════════════════════════════════════════════════════════╝${NC}"
echo ""

# Step 1: Backup current agent prompt
echo -e "${YELLOW}[1/5] Backing up current agent prompt...${NC}"
if [[ -f "$AGENT_FILE" ]]; then
    cp "$AGENT_FILE" "$BACKUP_FILE"
    echo "      Backed up to $BACKUP_FILE"
else
    echo "      No existing agent file to backup"
fi

# Step 2: Copy prompt variant to agent location (skip if testing default)
if [[ "$PROMPT_VARIANT" == "default" ]]; then
    echo -e "${YELLOW}[2/5] Using default prompt (already in place)...${NC}"
    echo "      Testing: $AGENT_FILE"
else
    echo -e "${YELLOW}[2/5] Copying prompt variant to agent location...${NC}"
    cp "$PROMPT_FILE" "$AGENT_FILE"
    echo "      Copied $PROMPT_FILE"
    echo "      To     $AGENT_FILE"
fi

# Step 3: Run tests
echo -e "${YELLOW}[3/5] Running core eval tests...${NC}"
echo ""
echo -e "${BLUE}Model: ${GREEN}$MODEL${NC}"
echo -e "${BLUE}Running 7 core tests (estimated 5-8 minutes):${NC}"
echo "  1. Approval Gate"
echo "  2. Context Loading (Simple)"
echo "  3. Context Loading (Multi-Turn)"
echo "  4. Stop on Failure"
echo "  5. Simple Task (No Delegation)"
echo "  6. Subagent Delegation"
echo "  7. Tool Usage"
echo ""
echo -e "${BLUE}Test output:${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
echo ""

cd "$EVALS_DIR"

# Run tests with real-time output
set +e  # Don't exit on test failure
npm run eval:sdk:core -- --agent="$AGENT_NAME" --model="$MODEL"
TEST_EXIT_CODE=$?
export TEST_EXIT_CODE
set -e

echo ""
echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"

# Step 4: Restore original prompt (if we changed it)
echo ""
if [[ "$PROMPT_VARIANT" == "default" ]]; then
    echo -e "${YELLOW}[4/5] No restore needed (tested default)...${NC}"
    echo "      Agent file unchanged"
else
    echo -e "${YELLOW}[4/5] Restoring original prompt...${NC}"
    if [[ -f "$BACKUP_FILE" ]]; then
        cp "$BACKUP_FILE" "$AGENT_FILE"
        echo "      Restored from backup"
    else
        echo "      No backup to restore"
    fi
fi

# Clean up backup
rm -f "$BACKUP_FILE"

# Step 5: Save and show results summary
echo ""
echo -e "${YELLOW}[5/5] Saving Results${NC}"

# Create results directory if it doesn't exist
mkdir -p "$VARIANT_RESULTS_DIR"

# Save the test output for reference
if [[ -f "/tmp/test-output-$AGENT_NAME.txt" ]]; then
    cp "/tmp/test-output-$AGENT_NAME.txt" "$VARIANT_RESULTS_DIR/$PROMPT_VARIANT-output.log"
    echo "      Saved test output to: $VARIANT_RESULTS_DIR/$PROMPT_VARIANT-output.log"
fi

if [[ -f "$RESULTS_FILE" ]]; then
    # Extract summary from results JSON
    if command -v jq &> /dev/null; then
        PASS_COUNT=$(jq -r '.summary.passed // 0' "$RESULTS_FILE")
        TOTAL_COUNT=$(jq -r '.summary.total // 0' "$RESULTS_FILE")
        FAIL_COUNT=$(jq -r '.summary.failed // 0' "$RESULTS_FILE")
    else
        # Fallback if jq not available
        PASS_COUNT=$(grep -o '"passed":[0-9]*' "$RESULTS_FILE" | head -1 | grep -o '[0-9]*')
        TOTAL_COUNT=$(grep -o '"total":[0-9]*' "$RESULTS_FILE" | head -1 | grep -o '[0-9]*')
        FAIL_COUNT=$((TOTAL_COUNT - PASS_COUNT))
    fi
    
    # Calculate pass rate
    if [ $TOTAL_COUNT -gt 0 ]; then
        PASS_RATE=$(echo "scale=1; ($PASS_COUNT * 100) / $TOTAL_COUNT" | bc)
    else
        PASS_RATE="0.0"
    fi
    
    # Create variant results JSON
    cat > "$VARIANT_RESULTS_FILE" <<EOF
{
  "variant": "$PROMPT_VARIANT",
  "agent": "$AGENT_NAME",
  "model": "$MODEL",
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "passed": $PASS_COUNT,
  "failed": $FAIL_COUNT,
  "total": $TOTAL_COUNT,
  "passRate": "${PASS_RATE}%",
  "fullResults": "$RESULTS_FILE"
}
EOF
    
    echo "      Saved results to: $VARIANT_RESULTS_FILE"
    
    echo ""
    echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
    echo ""
    echo -e "  Agent:     ${GREEN}$AGENT_NAME${NC}"
    echo -e "  Prompt:    ${GREEN}$PROMPT_VARIANT${NC}"
    echo -e "  Model:     ${GREEN}$MODEL${NC}"
    echo -e "  Results:   ${GREEN}$PASS_COUNT/$TOTAL_COUNT tests passed${NC} (${PASS_RATE}%)"
    echo ""
    echo "  Variant results: $VARIANT_RESULTS_FILE"
    echo "  Full results:    $RESULTS_FILE"
else
    echo -e "  ${RED}No results file found${NC}"
    echo "  Tests may not have run successfully"
fi

echo ""
echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
echo ""
echo -e "${GREEN}Done!${NC} Default prompt restored to agent location."
echo ""
echo "To use this prompt permanently:"
echo "  ./scripts/prompts/use-prompt.sh --agent=$AGENT_NAME --variant=$PROMPT_VARIANT"