loop-doctor.sh 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. #!/usr/bin/env bash
  2. # Preflight a loop config - will this loop actually RUN, or die at 3am?
  3. #
  4. # loop-check checks the config is well-formed; loop-doctor checks the loop will
  5. # execute: the gate command's binary resolves, claude/git are on PATH, the budget
  6. # can fit a tick, and the permission mode is achievable from where it launches.
  7. # Modeled on fleet-worker/scripts/fleet-doctor.sh.
  8. #
  9. # Usage: loop-doctor.sh [--offline|--live] [--json] [-q] <loop.config.yaml>
  10. # Input: argv flags + a config path (no stdin).
  11. # Output: stdout = check rows (TSV: state<TAB>check<TAB>detail), or a --json envelope.
  12. # Stderr: the preflight panel, notices, errors.
  13. # Exit: 0 ok, 2 usage, 3 config not found, 4 unparseable, 5 missing core dep,
  14. # 10 a check predicts a runtime failure (a gate binary missing, bypass on
  15. # host without isolation, budget too small for a tick)
  16. #
  17. # --offline (default): no PATH/exec - config-shape + budget-vs-cost + permission/
  18. # isolation coherence. Safe for PR CI.
  19. # --live: adds runtime preflight - claude/git on PATH, the verify/guard
  20. # leading binary resolvable, the kill-switch path's parent exists.
  21. #
  22. # Examples:
  23. # loop-doctor.sh --offline .loops/pr-watch/loop.config.yaml
  24. # loop-doctor.sh --live .loops/ci-watch/loop.config.yaml
  25. # loop-doctor.sh --live --json .loops/dep-bump/loop.config.yaml | jq '.data[] | select(.state=="bad")'
  # Strict-ish mode: unset variables and failed pipeline stages are errors, but
  # there is no `-e` - individual checks are expected to fail and be reported.
  set -uo pipefail

  # Exit codes (mirrors the "Exit:" line in the header).
  readonly EX_OK=0
  readonly EX_USAGE=2
  readonly EX_NOTFOUND=3
  readonly EX_UNPARSEABLE=4
  readonly EX_MISSING_DEP=5
  readonly EX_FINDINGS=10

  # Terminal helpers: source ../../_lib/term.sh when it exists next to this
  # script, otherwise define plain-text stand-ins with the same names.
  __lib="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../_lib" 2>/dev/null && pwd || true)"
  if [[ -n "${__lib:-}" && -f "$__lib/term.sh" ]]; then
    . "$__lib/term.sh"
    term_init 2
  else
    term_panel_open() { :; }
    term_panel_close() { :; }
    term_panel_vert() { :; }
    # Drops the state argument, prints label + optional detail.
    term_status_row() {
      shift
      printf ' - %s %s\n' "$1" "${2:-}"
    }
    # Drops the colour argument, prints the remaining text unchanged.
    term_color() {
      shift
      printf '%s' "$*"
    }
    TERM_DOT="|"
  fi

  # Script directory and the pricing table used by the budget check.
  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  PRICING="$HERE/../assets/model-pricing.json"

  # CLI state, filled in by the argument loop.
  CFG=""
  MODE="offline"
  JSON=0
  QUIET=0
  # Print the help text (synopsis, options, exit codes, examples) on stdout.
  # Callers redirect it to stderr themselves when reporting a usage error.
  usage() {
    printf '%s\n' \
      'loop-doctor.sh - preflight a loop config (will it actually run?).' \
      'Usage:' \
      'loop-doctor.sh [--offline|--live] [--json] [-q] <loop.config.yaml>' \
      'Options:' \
      '--offline config-shape + budget-vs-cost + permission coherence (default; no PATH/exec).' \
      '--live adds runtime preflight: claude/git on PATH, verify/guard binary resolvable.' \
      '--json emit a JSON envelope.' \
      '-q, --quiet suppress the stderr panel.' \
      '-h, --help show this help and exit 0.' \
      'Exit codes:' \
      '0 ok 2 usage 3 not found 4 unparseable 5 missing dep 10 predicted runtime failure' \
      'Examples:' \
      'loop-doctor.sh --offline .loops/pr-watch/loop.config.yaml' \
      'loop-doctor.sh --live .loops/ci-watch/loop.config.yaml' \
      "loop-doctor.sh --live --json .loops/dep-bump/loop.config.yaml | jq '.data[] | select(.state==\"bad\")'"
  }
  # Report a usage error and terminate.
  #   $1 - message shown after "error: "
  # Writes the message, a blank line and the full help text to stderr, then
  # exits with EX_USAGE.
  die_usage() {
    {
      printf 'error: %s\n' "$1"
      echo
      usage
    } >&2
    exit "$EX_USAGE"
  }
  # Consume argv: flags set MODE/JSON/QUIET, the single positional is the config
  # path. Every arm that does not exit falls through to the shared `shift`.
  until [[ $# -eq 0 ]]; do
    case "$1" in
      --offline)
        MODE="offline"
        ;;
      --live)
        MODE="live"
        ;;
      --json)
        JSON=1
        ;;
      -q | --quiet)
        QUIET=1
        ;;
      -h | --help)
        usage
        exit "$EX_OK"
        ;;
      -*)
        die_usage "unknown flag: $1"
        ;;
      *)
        # Only one config path is accepted.
        if [[ -n "$CFG" ]]; then
          die_usage "unexpected extra argument: $1"
        fi
        CFG="$1"
        ;;
    esac
    shift
  done
  # Hard dependencies: the flat-YAML readers are awk, the shape probe is grep.
  command -v awk >/dev/null 2>&1 || { echo "loop-doctor: awk required" >&2; exit "$EX_MISSING_DEP"; }
  command -v grep >/dev/null 2>&1 || { echo "loop-doctor: grep required" >&2; exit "$EX_MISSING_DEP"; }

  [[ -n "$CFG" ]] || die_usage "a loop.config.yaml path is required"
  [[ -f "$CFG" ]] || { printf 'error: config not found: %s\n' "$CFG" >&2; exit "$EX_NOTFOUND"; }

  # Remember the path the user passed: CFG may be swapped for a temp copy below,
  # and messages should name the real file, not the temp one.
  CFG_SRC="$CFG"

  # Normalize Windows-authored configs: strip a leading UTF-8 BOM + CR line-endings so a
  # CRLF/BOM file parses like a clean LF one (portable octal BOM + gsub \r).
  # The cleanup trap is installed as soon as the temp file exists, so it is
  # removed even when the awk pass fails; in that case CFG stays the original.
  __NORM=""
  if __NORM="$(mktemp 2>/dev/null)" && [[ -n "$__NORM" ]]; then
    trap 'rm -f -- "$__NORM"' EXIT
    if awk 'NR==1{sub(/^\357\273\277/,"")} {gsub(/\r/,""); print}' "$CFG_SRC" > "$__NORM" 2>/dev/null; then
      CFG="$__NORM"
    fi
  fi

  # Cheap shape probe: at least one top-level `key:` line must be present.
  grep -Eq '^[a-z_]+:' "$CFG" || { printf 'error: no parseable keys in %s\n' "$CFG_SRC" >&2; exit "$EX_UNPARSEABLE"; }

  # Pick a working python for the budget-vs-cost check (skipped gracefully if none).
  # Each candidate must both resolve and actually run an empty program.
  PY=""
  for c in python python3 py; do
    if command -v "$c" >/dev/null 2>&1 && "$c" -c "" >/dev/null 2>&1; then
      PY="$c"
      break
    fi
  done
  # ── flat-YAML readers (no yq) ────────────────────────────────────────────────
  # NOTE(review): the original comment says loop-check.sh shares this reader
  # contract - if so, mirror the quote/comment handling there.
  #
  # cfg_scalar KEY
  #   Print the value of the first top-level `KEY: value` line of $CFG (one line
  #   on stdout; nothing when the key is absent).
  #   - A value that is one complete quoted scalar ("..." or '...'), optionally
  #     followed by a comment, is returned verbatim without the quotes, so a
  #     `#` inside the quotes is kept.
  #   - Otherwise a `#` starts a comment only at the start of the value or after
  #     whitespace (YAML rule), so `http://x/#frag` survives intact.
  #   - Surrounding quotes are removed only as a matching pair; a lone trailing
  #     quote such as in `echo "hi"` is part of the value.
  #   KEY is matched literally at column 0.
  cfg_scalar() {
    awk -v k="$1" -v q="'" '
      index($0, k ":") == 1 {
        v = substr($0, length(k) + 2)
        sub(/^[ \t]+/, "", v)

        # Whole value is a single quoted scalar: take the inside as-is.
        c = substr(v, 1, 1)
        if (c == "\"" || c == q) {
          rest = substr(v, 2)
          e = index(rest, c)
          if (e > 0 && substr(rest, e + 1) ~ /^[ \t]*(#.*)?$/) {
            print substr(rest, 1, e - 1)
            exit
          }
        }

        # Plain scalar: drop a trailing comment, trim, unwrap a matching pair.
        sub(/^#.*$/, "", v)
        sub(/[ \t]+#.*$/, "", v)
        sub(/[ \t]+$/, "", v)
        n = length(v)
        c = substr(v, 1, 1)
        if (n >= 2 && (c == "\"" || c == q) && substr(v, n, 1) == c) {
          v = substr(v, 2, n - 2)
        }
        print v
        exit
      }' "$CFG"
  }
  # cfg_list_items KEY
  #   Print, one per line, the `- item` entries that follow a top-level `KEY:`
  #   line in $CFG. The list ends at the next line that starts in column 0 with
  #   anything other than `#`. Empty items are skipped.
  #   Each item is cleaned like a scalar value:
  #   - a fully quoted item ("..." or '...') is returned without its quotes and
  #     keeps any `#` inside them;
  #   - otherwise `#` starts a comment only after whitespace or at the start;
  #   - quotes are removed only as a matching pair, so `say "hi"` is unchanged.
  #   KEY is matched literally at column 0.
  cfg_list_items() {
    awk -v k="$1" -v q="'" '
      # Strip comment and surrounding quote pair from one item value.
      function clean(v,    c, rest, e, n) {
        c = substr(v, 1, 1)
        if (c == "\"" || c == q) {
          rest = substr(v, 2)
          e = index(rest, c)
          if (e > 0 && substr(rest, e + 1) ~ /^[ \t]*(#.*)?$/) {
            return substr(rest, 1, e - 1)
          }
        }
        sub(/^#.*$/, "", v)
        sub(/[ \t]+#.*$/, "", v)
        sub(/[ \t]+$/, "", v)
        n = length(v)
        c = substr(v, 1, 1)
        if (n >= 2 && (c == "\"" || c == q) && substr(v, n, 1) == c) {
          v = substr(v, 2, n - 2)
        }
        return v
      }

      index($0, k ":") == 1 { inlist = 1; next }

      inlist && /^[ \t]*-[ \t]+/ {
        item = $0
        sub(/^[ \t]*-[ \t]+/, "", item)
        item = clean(item)
        if (item != "") print item
        next
      }

      # Any other top-level, non-comment line closes the list.
      inlist && /^[^ \t#]/ { inlist = 0 }' "$CFG"
  }
  # Pull every scalar the checks need out of the config, one key per line.
  TIER="$(cfg_scalar tier)"
  PMODE="$(cfg_scalar permission_mode)"
  PATTERN="$(cfg_scalar pattern)"
  VERIFY="$(cfg_scalar verify)"
  GUARD="$(cfg_scalar guard)"
  BUDGET="$(cfg_scalar budget_tokens)"
  KILL="$(cfg_scalar kill_switch)"
  ESCAL="$(cfg_scalar escalation)"

  # 1 for tiers L2 and L3, 0 for anything else (including a missing tier).
  case "$TIER" in
    L2 | L3) is_l2plus=1 ;;
    *) is_l2plus=0 ;;
  esac
  # ── findings ─────────────────────────────────────────────────────────────
  ROWS=()    # one element per check: "state\tcheck\tdetail"
  FINDING=0  # flips to 1 as soon as any "bad" row is recorded

  # row STATE CHECK DETAIL
  #   Append one finding to ROWS; a STATE of "bad" also sets FINDING=1.
  #   Always returns 0. Returning the status of the "bad" comparison would make
  #   `cond && row ok ... || row bad ...` run BOTH arms for every non-bad row,
  #   recording a contradictory second row and a false failure.
  row() {
    ROWS+=("$1"$'\t'"$2"$'\t'"$3")
    if [[ "$1" == "bad" ]]; then
      FINDING=1
    fi
    return 0
  }
  # lead_bin COMMAND
  #   Print the leading binary of a command string: the first whitespace-separated
  #   token that is not a `NAME=value` environment assignment. Prints nothing when
  #   the string is empty or consists only of assignments.
  #   Only tokens that start with a valid variable name followed by `=` are
  #   skipped, so a path such as ./check=all.sh is still reported as the binary.
  lead_bin() {
    awk '{
      for (i = 1; i <= NF; i++) {
        if ($i !~ /^[A-Za-z_][A-Za-z0-9_]*=/) {
          print $i
          exit
        }
      }
    }' <<<"$1"
  }
  # ── OFFLINE checks ───────────────────────────────────────────────────────
  # tokens_below A B
  #   Status 0 when A < B. Both must be non-empty digit strings. They are forced
  #   to base 10 so a leading zero ("0800") is not parsed as octal, which would
  #   raise an arithmetic error and make the comparison fall through.
  tokens_below() { (( 10#$1 < 10#$2 )); }

  # Permission mode achievability.
  case "$PMODE" in
    default)
      row bad "permission_mode" "default is interactive - a headless 'claude -p' tick can't answer prompts; use dontAsk/auto/bypassPermissions"
      ;;
    "")
      row bad "permission_mode" "missing"
      ;;
    *)
      row ok "permission_mode" "$PMODE"
      ;;
  esac

  # L3 bypass needs an isolation boundary: look for an isolation keyword in the
  # escalation text or in any scope item (case-insensitive).
  if [[ "$TIER" == "L3" && "$PMODE" == "bypassPermissions" ]]; then
    if printf '%s %s' "$ESCAL" "$(cfg_list_items scope | tr '\n' ' ')" | grep -Eqi 'container|isolat|sandbox|devcontainer'; then
      row ok "isolation" "L3 bypass declares an isolation boundary"
    else
      row bad "isolation" "L3 + bypassPermissions with no container/sandbox note - only safe in an isolated VM/container"
    fi
  fi

  # Budget vs estimated tokens/run. Needs a numeric budget, a python, a pattern
  # and the pricing file; otherwise the check is skipped without a row.
  if [[ -n "$BUDGET" && "$BUDGET" =~ ^[0-9]+$ && -n "$PY" && -n "$PATTERN" && -f "$PRICING" ]]; then
    # Estimate = (input + output) * subagents from the pattern's defaults;
    # prints an empty line when the pattern is unknown or the file is unreadable.
    TPR="$(PR="$PRICING" PAT="$PATTERN" "$PY" -c "import json,os
try:
    d=json.load(open(os.environ['PR']))['_pattern_defaults'].get(os.environ['PAT'])
    print((int(d['input'])+int(d['output']))*int(d.get('subagents',1)) if d else '')
except Exception: print('')" 2>/dev/null)"
    # A Windows python may end the line with CR; drop it so the digit test holds.
    TPR=${TPR%$'\r'}
    if [[ -n "$TPR" && "$TPR" =~ ^[0-9]+$ ]]; then
      if tokens_below "$BUDGET" "$TPR"; then
        row bad "budget" "budget_tokens $BUDGET < ~$TPR est. tokens/run for $PATTERN - a tick can't complete"
      else
        row ok "budget" "budget_tokens $BUDGET >= ~$TPR est. tokens/run"
      fi
    fi
  fi
  # ── LIVE checks ──────────────────────────────────────────────────────────
  # run_live_checks
  #   Runtime preflight, recorded through row():
  #     claude       on PATH (advisory when missing)
  #     git          on PATH; blocking for L2+ when missing, plus a worktree probe
  #     verify/guard the command's leading binary resolves or is an executable file
  #     kill_switch  the sentinel path's parent directory exists
  #   Reads is_l2plus, VERIFY, GUARD, KILL; uses lead_bin and row.
  #   Every ok/bad decision is an explicit if/else so exactly one row is written
  #   per check, whatever status row() returns.
  run_live_checks() {
    local pair label cmd bin ks_path parent

    if command -v claude >/dev/null 2>&1; then
      row ok "claude" "on PATH"
    else
      row warn "claude" "not on PATH - the scheduler that runs 'claude -p' must have it"
    fi

    if command -v git >/dev/null 2>&1; then
      row ok "git" "on PATH"
      if [[ "$is_l2plus" -eq 1 ]] && ! git worktree list >/dev/null 2>&1; then
        row warn "worktree" "'git worktree' unavailable here - L2+ isolates changes in a worktree"
      fi
    elif [[ "$is_l2plus" -eq 1 ]]; then
      row bad "git" "git not on PATH - L2+ needs it for worktree isolation + landing"
    else
      row warn "git" "git not on PATH"
    fi

    # verify / guard leading binary resolvable
    for pair in "verify:$VERIFY" "guard:$GUARD"; do
      label="${pair%%:*}"
      cmd="${pair#*:}"
      [[ -n "$cmd" ]] || continue
      case "$cmd" in *"<"*">"*) continue ;; esac # unfilled placeholder - audit's job
      bin="$(lead_bin "$cmd")"
      [[ -n "$bin" ]] || continue
      if [[ "$bin" == */* ]]; then
        # A path: must be a regular file with the execute bit (a directory
        # also passes -x, so -f is checked too).
        if [[ -f "$bin" && -x "$bin" ]]; then
          row ok "$label" "$bin executable"
        else
          row bad "$label" "$bin not executable - the gate can't run"
        fi
      elif command -v "$bin" >/dev/null 2>&1; then
        row ok "$label" "$bin resolves"
      else
        row bad "$label" "'$bin' not on PATH - the gate command can't run at tick time"
      fi
    done

    # kill-switch path parent exists (only when it clearly names a path):
    # the first token containing a slash, quotes excluded.
    ks_path="$(grep -oE '[^ "'"'"']*/[^ "'"'"']*' <<<"$KILL" | head -1)"
    if [[ -n "$ks_path" ]]; then
      parent="$(dirname "$ks_path")"
      if [[ -d "$parent" || "$parent" == "." ]]; then
        row ok "kill_switch" "sentinel path parent exists ($parent)"
      else
        row warn "kill_switch" "sentinel parent dir missing ($parent) - create it so the switch works"
      fi
    fi
  }

  if [[ "$MODE" == "live" ]]; then
    run_live_checks
  fi
  # ── output ───────────────────────────────────────────────────────────────
  # json_escape STRING
  #   Print STRING escaped for use inside a JSON string literal: backslash and
  #   double quote, plus tab / CR / LF, which are illegal raw in JSON strings.
  #   Backslash is handled first so later escapes are not doubled.
  json_escape() {
    local s="$1" tab=$'\t' cr=$'\r' nl=$'\n'
    s="${s//\\/\\\\}"
    s="${s//\"/\\\"}"
    s="${s//$tab/\\t}"
    s="${s//$cr/\\r}"
    s="${s//$nl/\\n}"
    printf '%s' "$s"
  }

  # tally_rows
  #   Count ROWS by state into the globals n_bad / n_warn / n_ok.
  tally_rows() {
    local r
    n_bad=0; n_warn=0; n_ok=0
    for r in "${ROWS[@]:-}"; do
      case "${r%%$'\t'*}" in
        bad) n_bad=$((n_bad + 1)) ;;
        warn) n_warn=$((n_warn + 1)) ;;
        ok) n_ok=$((n_ok + 1)) ;;
      esac
    done
  }

  # emit_json
  #   Write the --json envelope to stdout: "data" holds one object per row,
  #   "meta" holds mode, counts, the will_run verdict, tier and schema id.
  #   Every config-derived string (state, check, detail, tier) is escaped.
  emit_json() {
    local i last st ck dt sep will_run
    printf '{\n "data": [\n'
    if [[ ${#ROWS[@]} -gt 0 ]]; then
      last=$(( ${#ROWS[@]} - 1 ))
      for i in "${!ROWS[@]}"; do
        # detail is the remainder of the line, so embedded tabs stay in dt
        IFS=$'\t' read -r st ck dt <<<"${ROWS[$i]}"
        sep=","
        if [[ "$i" -eq "$last" ]]; then sep=""; fi
        printf ' {"state": "%s", "check": "%s", "detail": "%s"}%s\n' \
          "$(json_escape "$st")" "$(json_escape "$ck")" "$(json_escape "$dt")" "$sep"
      done
    fi
    if [[ "$FINDING" -eq 0 ]]; then will_run=true; else will_run=false; fi
    printf ' ],\n "meta": {"mode": "%s", "ok": %d, "warn": %d, "bad": %d, "will_run": %s, "tier": "%s", "schema": "claude-mods.loop-ops.doctor/v1"}\n}\n' \
      "$MODE" "$n_ok" "$n_warn" "$n_bad" "$will_run" "$(json_escape "${TIER:-unknown}")"
  }

  # emit_text
  #   Write the check rows to stdout as aligned columns and, unless QUIET is
  #   set, the verdict panel to stderr.
  emit_text() {
    local r st ck dt verdict vstate
    if [[ ${#ROWS[@]} -gt 0 ]]; then
      for r in "${ROWS[@]}"; do
        IFS=$'\t' read -r st ck dt <<<"$r"
        printf '%-5s %-14s %s\n' "$st" "$ck" "$dt"
      done
    fi
    if [[ "$QUIET" -eq 0 ]]; then
      if [[ "$FINDING" -eq 0 ]]; then
        verdict="WILL RUN"; vstate=ok
      else
        verdict="WILL FAIL"; vstate=bad
      fi
      {
        # CFG is swapped for a normalized temp copy earlier in the script, so its
        # directory name is the temp dir. Prefer CFG_SRC (the path the user
        # passed) whenever something has set it; otherwise fall back to CFG.
        term_panel_open loop "loop ${TERM_DOT} doctor ($MODE)" "$(basename "$(dirname "${CFG_SRC:-$CFG}")")"
        term_panel_vert
        term_status_row "$vstate" "$verdict" "$n_bad blocking ${TERM_DOT} $n_warn advisory ${TERM_DOT} $n_ok ok"
        if [[ "$MODE" == "offline" ]]; then
          term_status_row skip "run --live before scheduling" "checks gate binaries + PATH"
        fi
        term_panel_vert
        term_panel_close "audit = well-formed ${TERM_DOT} doctor = will-run" ""
      } >&2
    fi
  }

  tally_rows
  if [[ "$JSON" -eq 1 ]]; then
    emit_json
  else
    emit_text
  fi
  # Exit status mirrors the verdict: EX_OK when no blocking finding was recorded,
  # EX_FINDINGS otherwise.
  if [[ "$FINDING" -eq 0 ]]; then
    exit "$EX_OK"
  fi
  exit "$EX_FINDINGS"