probe.sh 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. #!/usr/bin/env bash
  2. # net-ops :: macos/probe.sh
  3. # Full layered diagnostic ladder for macOS network troubleshooting.
  4. # Outputs structured [PASS]/[FAIL] lines so a human or LLM can scan for
  5. # the first FAIL and drill in.
  # Treat any reference to an unset variable as an error.
  set -u

  # Probe targets. TEST_HOST and TIMEOUT keep a value inherited from the
  # environment and otherwise fall back to the defaults below.
  : "${TEST_HOST:=google.com}" # hostname resolved / fetched by the DNS and IPv6 checks
  : "${TIMEOUT:=5}"            # per-check timeout, in seconds
  TEST_IPS=(
    "1.1.1.1"
    "8.8.8.8"
  )
  # Pre-scan the command line for the two flags handled directly in this
  # script: --verbose/-v switches on the full scutil dump, --help/-h prints
  # usage and exits. Every other argument is ignored by this loop.
  VERBOSE=0
  for arg in "$@"; do
    if [[ "$arg" == "--verbose" || "$arg" == "-v" ]]; then
      VERBOSE=1
    elif [[ "$arg" == "--help" || "$arg" == "-h" ]]; then
      cat <<EOF
Usage: $0 [--redact] [--verbose] [--json] [--quick]
--redact Mask private IPs, MAC addresses, and *.ts.net tailnet names
--verbose Full scutil --dns dump (default: condensed one-line-per-resolver)
--json Newline-delimited JSON output (for piping to jq, dashboards)
--quick Skip rungs 1-4 and 7 if the last full run cached as healthy
(cache: \${TMPDIR}/net-ops/last-state.json, TTL 10min)
Compose freely: --json + --redact emits sanitized NDJSON.
EOF
      exit 0
    fi
  done
  # Load the shared helper libraries from ../_lib (relative to this script).
  # Every later step calls functions that are not defined in this file
  # (section, pass, fail, info, should_run_rung, ...), so a missing library
  # is reported once and the run stops, instead of continuing into a stream
  # of "command not found" errors. Readability is checked up front rather
  # than relying on the exit status of `source`, which reflects whatever the
  # library's last command returned.
  _probe_lib_dir="$(dirname "$0")/../_lib"
  for _probe_lib in redact.sh output.sh cache.sh; do
    if [[ ! -r "$_probe_lib_dir/$_probe_lib" ]]; then
      printf '%s: cannot read helper library: %s\n' "${0##*/}" "$_probe_lib_dir/$_probe_lib" >&2
      exit 1
    fi
  done
  # shellcheck source=../_lib/redact.sh
  source "$_probe_lib_dir/redact.sh"
  # shellcheck source=../_lib/output.sh
  source "$_probe_lib_dir/output.sh"
  # shellcheck source=../_lib/cache.sh
  source "$_probe_lib_dir/cache.sh"

  PANEL_TITLE="macos probe"

  # Hand the untouched argument list to each library's own flag parser.
  parse_redact_flag "$@"
  parse_output_flags "$@"
  parse_quick_flag "$@"
  maybe_redact_self "$@"

  if cache_indicates_healthy; then
    info " [--quick: last full run was healthy, skipping rungs 1-4 and 7]"
  fi
  41. # ---------------------------------------------------------------------------
  # Print "<port name> <device>" for every hardware port described on stdin
  # (expects the output of `networksetup -listallhardwareports`). Splitting
  # on ": " keeps multi-word port names such as "Thunderbolt Bridge" whole;
  # the device is always the last space-separated word of each output line.
  _probe_hardware_ports() {
    awk -F': ' '
      /^Hardware Port: / { port = $2 }
      /^Device: /        { print port " " $2 }
    '
  }

  if should_run_rung 1; then
    section "1. LINK LAYER"
    # ---------------------------------------------------------------------------
    ACTIVE_IFS=$(networksetup -listallhardwareports 2>/dev/null | _probe_hardware_ports)

    # The loop reads from a here-string instead of a pipe so that it runs in
    # the current shell. In a pipeline it would run in a subshell and any
    # state `pass` keeps in shell variables would be thrown away
    # (presumably the PASS_COUNT tally used at the end of the script —
    # confirm against _lib/output.sh).
    while read -r line; do
      [[ -z "$line" ]] && continue
      name="${line% *}"
      dev="${line##* }"
      status=$(ifconfig "$dev" 2>/dev/null | awk '/status:/{print $2; exit}')
      if [[ "$status" == "active" ]]; then
        ip=$(ifconfig "$dev" 2>/dev/null | awk '/inet /{print $2; exit}')
        pass "Interface $name ($dev) active" "$ip"
      fi
    done <<<"$ACTIVE_IFS"

    # Query the default route once and pick both fields out of the same answer.
    default_route=$(route -n get default 2>/dev/null)
    GATEWAY=$(awk '/gateway:/{print $2}' <<<"$default_route")
    DEFAULT_IF=$(awk '/interface:/{print $2}' <<<"$default_route")
    # Explicit if/else: with `A && pass || fail`, fail would also run
    # whenever pass itself returned non-zero.
    if [[ -n "$GATEWAY" ]]; then
      pass "Default gateway" "$GATEWAY via $DEFAULT_IF"
    else
      fail "Default gateway" "none configured"
    fi
  fi # end rung 1
  59. # ---------------------------------------------------------------------------
  if should_run_rung 2; then
    section "2. IP / ICMP REACHABILITY"
    # ---------------------------------------------------------------------------
    # Send two echo requests to host $1 and record the result under label $2.
    # The wait passed to -W is TIMEOUT with "000" appended.
    _probe_ping() {
      if ping -c 2 -W "${TIMEOUT}000" "$1" >/dev/null 2>&1; then
        pass "$2"
      else
        fail "$2"
      fi
    }

    # GATEWAY is only set when rung 1 ran and found a default route.
    if [[ -n "${GATEWAY:-}" ]]; then
      _probe_ping "$GATEWAY" "Ping gateway $GATEWAY"
    fi
    for ip in "${TEST_IPS[@]}"; do
      _probe_ping "$ip" "Ping $ip"
    done
  fi # end rung 2
  70. # ---------------------------------------------------------------------------
  if should_run_rung 3; then
    section "3. TCP/UDP SOCKET REACHABILITY"
    # ---------------------------------------------------------------------------
    # TCP connect test against each probe IP on 443 and 53.
    for ip in "${TEST_IPS[@]}"; do
      for port in 443 53; do
        if nc -zv -G "$TIMEOUT" "$ip" "$port" >/dev/null 2>&1; then
          pass "TCP/$port -> $ip"
        else
          fail "TCP/$port -> $ip"
        fi
      done
    done

    # Raw UDP/53 — uses dig with explicit server, bypasses /etc/resolv.conf.
    # dig runs exactly once per server: its exit status decides PASS/FAIL and
    # the same invocation's output supplies the detail, so the reported
    # answer always belongs to the query that was judged (and no second
    # network round-trip is spent on re-fetching it).
    for ip in "${TEST_IPS[@]}"; do
      if result=$(dig +short +time="$TIMEOUT" +tries=1 @"$ip" "$TEST_HOST" 2>/dev/null); then
        pass "UDP/53 -> $ip (dig)" "$(head -1 <<<"$result")"
      else
        fail "UDP/53 -> $ip (dig)"
      fi
    done
  fi # end rung 3
  88. # ---------------------------------------------------------------------------
  # Print the first answer line found in `dig +short` output read from stdin.
  # Blank lines, dig's ";;" diagnostics and "dig: ..." error messages are
  # skipped; nothing is printed when no answer line exists.
  _probe_dig_answer() {
    awk '/^[ \t\r]*$/ || /^;/ || /^dig: / { next } { print; exit }'
  }

  if should_run_rung 4; then
    section "4. DNS INFRASTRUCTURE (bypass tools)"
    # ---------------------------------------------------------------------------
    # dig uses its own resolver — does NOT touch macOS DNS resolution chain.
    # The empty first entry means "no @server", i.e. dig's default servers.
    for srv in "" "${TEST_IPS[@]}"; do
      dig_args=(+short +time="$TIMEOUT" +tries=1)
      if [[ -n "$srv" ]]; then
        dig_args+=("@$srv")
        label="$srv"
      else
        label="default"
      fi
      out=$(dig "${dig_args[@]}" "$TEST_HOST" 2>&1)
      dig_rc=$?
      answer=$(_probe_dig_answer <<<"$out")
      # PASS needs both a zero exit status and a real answer line. Matching
      # only on "timed out"/"connection refused" let other failures (dig not
      # installed, unreachable network, ...) count as success because their
      # message made the output non-empty.
      if ((dig_rc == 0)) && [[ -n "$answer" ]]; then
        pass "dig via $label" "$answer"
      else
        fail "dig via $label" "$out"
      fi
    done
  fi # end rung 4
  108. # ---------------------------------------------------------------------------
  section "5. macOS RESOLVER PATH (the hook layer)"
  # ---------------------------------------------------------------------------
  # Unlike dig, dscacheutil resolves through the macOS system resolver chain,
  # so this lookup sees whatever is hooked into that chain.
  out=$(dscacheutil -q host -a name "$TEST_HOST" 2>&1)
  case "$out" in
    *ip_address:*)
      # Report the first address returned.
      addr=$(printf '%s\n' "$out" | awk '/ip_address:/{print $2; exit}')
      pass "dscacheutil (system resolver)" "$addr"
      ;;
    *)
      # No address at all: show the first lines of whatever came back.
      fail "dscacheutil (system resolver)" "$(printf '%s\n' "$out" | head -3)"
      ;;
  esac
  # /etc/resolver/* holds per-domain DNS overrides — classic VPN residue.
  # List each regular file there together with the nameservers it declares.
  resolver_files=""
  if [[ -d /etc/resolver ]]; then
    resolver_files=$(ls /etc/resolver/ 2>/dev/null)
  fi
  if [[ -n "$resolver_files" ]]; then
    echo " /etc/resolver/ contents (per-domain DNS overrides):"
    for f in /etc/resolver/*; do
      if [[ -f "$f" ]]; then
        domain="${f##*/}"
        # Every nameserver value, each followed by a single space.
        ns=$(awk '/^nameserver/{printf "%s ", $2}' "$f")
        echo " $domain -> $ns"
      fi
    done
  fi
  # scutil DNS state — the authoritative view of macOS resolver config.
  # Default output is one condensed line per resolver; --verbose dumps it all.
  if [[ "$VERBOSE" -ne 1 ]]; then
    echo " scutil --dns (condensed, --verbose for full):"
    scutil --dns 2>/dev/null | awk '
      # Print the resolver collected so far; scope falls back to "default".
      function emit() {
        if (!scope) scope = "default"
        print " #" id " scope=" scope " via=" servers " order=" prio
      }
      # A new "resolver #N" header flushes the previous one and resets state.
      /^resolver #/ {
        if (id) { emit() }
        id = $2
        sub(/#/, "", id)
        scope = ""
        servers = ""
        prio = ""
      }
      /search domain\[0\]/    { scope = "search=" $NF }
      /domain[[:space:]]*:/   { scope = "domain=" $NF }
      /options/               { if ($NF ~ /mdns/) scope = "mdns" }
      /nameserver\[[0-9]+\]/  { servers = (servers ? servers "," $NF : $NF) }
      /order[[:space:]]*:/    { prio = $NF }
      END { if (id) emit() }
    '
  else
    echo " scutil --dns (full):"
    scutil --dns 2>/dev/null | sed 's/^/ /'
  fi
  # Configuration profiles (MDM / VPN-installed). Without sudo we only see user-scope.
  # The count is the number of "attribute:" lines in the listing.
  profile_count=$(profiles list -type configuration 2>/dev/null | grep -c "attribute:" 2>/dev/null)
  : "${profile_count:=0}"
  case "$profile_count" in
    *[!0-9]*)
      : # not a plain number — nothing to report
      ;;
    *)
      if ((profile_count > 0)); then
        echo " Configuration profiles installed (user scope): $profile_count"
        echo " For full detail incl. system profiles: sudo profiles list -type configuration"
      fi
      ;;
  esac
  # Local DNS proxy detection — derived from scutil (works unprivileged).
  # Common with NextDNS, AdGuard, dnsmasq, Pi-hole client, Cloudflare WARP.
  # awk exits 0 as soon as any nameserver entry is 127.x or exactly ::1.
  if scutil --dns 2>/dev/null | awk '
    /nameserver\[[0-9]+\]/ && ($3 ~ /^127\./ || $3 == "::1") { hit = 1 }
    END { exit !hit }
  '; then
    echo " !! Local DNS proxy detected in resolver chain (127.x or ::1 nameserver)"
    echo " Apps using the system resolver may route DNS through it."
    echo " For PID/process: sudo lsof -nP -iUDP:53"
  fi
  # mDNSResponder state: look the daemon up by exact process name and report
  # the first PID found.
  pid=$(pgrep -x mDNSResponder | head -1)
  if [[ -z "$pid" ]]; then
    fail "mDNSResponder" "not running — system DNS will be broken"
  else
    pass "mDNSResponder running" "PID $pid"
  fi
  174. # ---------------------------------------------------------------------------
  # Time-sync deep-dive: compare the local clock to an HTTP Date header, AND
  # check whether macOS network time sync itself is enabled + which server
  # it points at. Unsynced clocks are the silent killer of TLS validation.
  # Both systemsetup values come back empty when the tool gives no
  # "label: value" answer; an empty "enabled" value is not treated as a
  # failure below.
  ntp_enabled=$(systemsetup -getusingnetworktime 2>/dev/null | awk -F': ' '{print $2}')
  ntp_server=$(systemsetup -getnetworktimeserver 2>/dev/null | awk -F': ' '{print $2}')

  # HTTP Date drift (works without elevated privs, no NTP infra needed)
  remote_date=$(curl -sIA 'net-ops-probe' --max-time 5 https://www.google.com 2>/dev/null \
    | awk -F': ' 'tolower($1)=="date"{print $2; exit}' | tr -d '\r')
  drift_ok=1
  drift_detail=""
  if [[ -n "$remote_date" ]]; then
    # HTTP dates always carry English day/month abbreviations, so they are
    # parsed in the C locale. Under a non-English LC_TIME the %a/%b fields
    # would not match and the drift check would be skipped without notice.
    remote_epoch=$(LC_ALL=C date -j -f '%a, %d %b %Y %H:%M:%S %Z' "$remote_date" +%s 2>/dev/null)
    if [[ -n "$remote_epoch" ]]; then
      local_epoch=$(date +%s)
      drift=$((local_epoch - remote_epoch))
      abs_drift=${drift#-}
      if ((abs_drift < 300)); then
        drift_detail="${drift}s vs HTTP Date (within ±5min)"
      else
        drift_ok=0
        drift_detail="${drift}s drift — will break TLS cert validation"
      fi
    fi
  fi

  # Optional: query the configured NTP server for its offset.
  # sntp's error output is suppressed; only the first signed decimal field counts.
  ntp_offset=""
  if [[ -n "$ntp_server" ]] && command -v sntp >/dev/null 2>&1; then
    ntp_offset=$(sntp -t 3 "$ntp_server" 2>/dev/null | awk '/[+-][0-9]+\.[0-9]+/{print $1; exit}')
  fi

  # Say so explicitly when no drift figure could be computed, rather than
  # starting the detail line with a dangling "; ".
  combined="${drift_detail:-HTTP Date unavailable — clock drift not checked}"
  if [[ -n "$ntp_enabled" ]]; then combined="$combined; NTP sync=$ntp_enabled"; fi
  if [[ -n "$ntp_server" ]]; then combined="$combined; server=$ntp_server"; fi
  if [[ -n "$ntp_offset" ]]; then combined="$combined; sntp offset=${ntp_offset}s"; fi

  # FAIL on excessive drift, or when sync is reported as anything but "On".
  if [[ "$drift_ok" -eq 1 ]] && { [[ "$ntp_enabled" == "On" ]] || [[ -z "$ntp_enabled" ]]; }; then
    pass "Time sync" "$combined"
  else
    fail "Time sync" "$combined"
  fi
  # MTU / path-MTU discovery test. Standard Ethernet MTU is 1500, so the
  # first probe carries a 1472-byte payload (1472 + 20 IP + 8 ICMP = 1500)
  # with DF set. If that fails while a 1400-byte payload gets through, the
  # path MTU is below 1500 (PPPoE, tunnels, lost "fragmentation needed").
  mtu_label="Path MTU 1500 (1472-byte DF payload)"
  if ping -D -s 1472 -c 1 -t 3 1.1.1.1 >/dev/null 2>&1; then
    pass "$mtu_label" "to 1.1.1.1"
  elif ping -D -s 1400 -c 1 -t 3 1.1.1.1 >/dev/null 2>&1; then
    fail "$mtu_label" "1500 fails, 1428+ works — path MTU < 1500 (VPN/PPPoE?)"
  else
    # Neither size got through — DF probes are blocked or the target is
    # unreachable, so this is not reported as an MTU problem.
    pass "Path MTU test inconclusive" "ICMP DF blocked or destination unreachable"
  fi
  # Print "<ifname>: <address>" for each IPv6 address in `ifconfig` output
  # read from stdin, skipping loopback/tunnel/virtual interfaces, the ::1
  # loopback address and fe80:: link-local addresses. Addresses are compared
  # as whole fields, so a global address that merely contains "::1" (such
  # as 2001:db8::1) is kept.
  _probe_v6_physical_addrs() {
    awk '
      /^[a-z]/ { ifn = $1 }
      /inet6 / {
        if (ifn ~ /^(lo[0-9]|utun|awdl|llw|bridge)/) next
        if ($2 == "::1" || $2 ~ /^fe80:/) next
        print ifn " " $2
      }
    '
  }

  # IPv6 deep-dive — classifies v6 stack state across distinct tiers
  # instead of a binary works/broken. Each tier maps to a distinct fix path.
  v6_state=""
  v6_detail=""

  # 1. Any usable v6 address on a physical interface?
  v6_addrs=$(ifconfig 2>/dev/null | _probe_v6_physical_addrs)

  # 2. Any GLOBAL v6 address (first one outside the fc/fd ULA prefixes)?
  v6_global=$(printf '%s\n' "$v6_addrs" | awk '$2 !~ /^fd/ && $2 !~ /^fc/{print; exit}')

  # 3. Is there a usable default route? A link-local (fe80::) gateway is the
  #    normal case for a router learned via router advertisements, so the
  #    gateway address alone says nothing. The route is only discounted when
  #    it leaves through a loopback/tunnel/virtual interface.
  v6_route=$(route -n get -inet6 default 2>&1)
  v6_default=$(awk '/gateway:/{print $2; exit}' <<<"$v6_route")
  v6_route_if=$(awk '/interface:/{print $2; exit}' <<<"$v6_route")
  v6_virtual_if_re='^(lo[0-9]|utun|awdl|llw|bridge)'
  if [[ "$v6_route_if" =~ $v6_virtual_if_re ]]; then
    v6_default=""
  fi

  if [[ -z "$v6_addrs" ]]; then
    v6_state="disabled"
    v6_detail="no v6 addresses on physical interfaces — IPv6 disabled or unconfigured"
  elif [[ -z "$v6_global" ]]; then
    v6_state="ula_only"
    v6_detail="only ULA (fd00::/8) addresses present — ISP/router not delegating public v6 prefix"
  elif [[ -z "$v6_default" ]]; then
    v6_state="no_route"
    v6_detail="global v6 address present but no default route — RA not received or NDP broken"
  else
    # We have a v6 address and a route — test actual connectivity
    aaaa=$(dig +short +time=2 +tries=1 AAAA "$TEST_HOST" 2>/dev/null | head -1)
    if [[ -n "$aaaa" ]] && curl -6 -sS -o /dev/null --max-time 4 "https://$TEST_HOST" 2>/dev/null; then
      v6_state="healthy"
      v6_detail="global addr + default route + curl -6 works"
    else
      v6_state="path_broken"
      v6_detail="addr=$v6_global, route via $v6_default, but curl -6 fails — upstream v6 path dead"
    fi
  fi

  # "disabled" counts as PASS: no v6 at all is a consistent state, whereas
  # the half-configured states below are reported as failures.
  case "$v6_state" in
    disabled | healthy)
      pass "IPv6 stack ($v6_state)" "$v6_detail"
      ;;
    ula_only)
      fail "IPv6 stack ($v6_state)" "$v6_detail — apps may try v6 first, hit 'no route', fall back to v4 (slow). Fix: sudo networksetup -setv6off <service>"
      ;;
    no_route)
      fail "IPv6 stack ($v6_state)" "$v6_detail — check ndp -an for RA receipt; restart interface or check router RA config"
      ;;
    path_broken)
      fail "IPv6 stack ($v6_state)" "$v6_detail — VPN/firewall blocking v6, or ISP black-holing v6 traffic"
      ;;
  esac
  268. # ---------------------------------------------------------------------------
  section "6. APPLICATION LAYER (real HTTP request)"
  # ---------------------------------------------------------------------------
  # Fetch each URL for real, discarding the body. The detail line is curl's
  # "<status code> <bytes>b" summary, or its error text when the transfer fails.
  http_summary_fmt="%{http_code} %{size_download}b"
  for url in "https://www.google.com" "https://github.com"; do
    out=$(curl -sS -o /dev/null -w "$http_summary_fmt" --max-time "$TIMEOUT" "$url" 2>&1)
    if (($? == 0)); then
      pass "GET $url" "$out"
    else
      fail "GET $url" "$out"
    fi
  done
  278. # ---------------------------------------------------------------------------
  # Print the value of Firefox preference $2 as stored in prefs.js file $1.
  # Works for both value styles found in prefs.js: quoted strings
  # (user_pref("name", "text");) and bare literals such as integers
  # (user_pref("name", 2);). Prints nothing when the preference is absent.
  _probe_firefox_pref() {
    awk -v key="$2" '
      index($0, "\"" key "\"") > 0 {
        val = $0
        sub(/^[^,]*,[ \t]*/, "", val)   # drop everything up to the value
        sub(/\);[ \t\r]*$/, "", val)    # drop the closing ");"
        gsub(/^"|"$/, "", val)          # unquote string values
        print val
        exit
      }
    ' "$1" 2>/dev/null
  }

  if should_run_rung 7; then
    section "7. KNOWN VPN / DNS CLIENT FOOTPRINT"
    # ---------------------------------------------------------------------------
    KNOWN_PATHS=(
      "/Applications/Proton VPN.app"
      "/Applications/Mullvad VPN.app"
      "/Applications/Tailscale.app"
      "/Applications/Cisco/Cisco Secure Client.app"
      "/Applications/Cisco/Cisco AnyConnect Secure Mobility Client.app"
      "/Applications/NordVPN.app"
      "/Applications/NextDNS.app"
      "/Applications/Little Snitch.app"
      "/Applications/Lulu.app"
      "/Library/Application Support/NextDNS"
    )
    for p in "${KNOWN_PATHS[@]}"; do
      if [[ -e "$p" ]]; then
        echo " Installed: $p"
      fi
    done

    # Browser DoH state — Chrome / Brave / Edge / Firefox have their own resolvers
    # that bypass system DNS entirely when DoH is configured. Useful for explaining
    # "Chrome works but Safari doesn't" type asymmetries.
    # Findings are collected with real newlines and printed verbatim, so a
    # backslash inside a collected value is never interpreted as an escape.
    browser_findings=""
    chrome_prefs="$HOME/Library/Application Support/Google/Chrome/Default/Preferences"
    brave_prefs="$HOME/Library/Application Support/BraveSoftware/Brave-Browser/Default/Preferences"
    edge_prefs="$HOME/Library/Application Support/Microsoft Edge/Default/Preferences"
    for label_prefs in "Chrome:$chrome_prefs" "Brave:$brave_prefs" "Edge:$edge_prefs"; do
      label="${label_prefs%%:*}"
      prefs="${label_prefs#*:}"
      if [[ -f "$prefs" ]]; then
        # Chromium stores DoH mode under dns_over_https.mode: "off" | "automatic" | "secure"
        mode=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"mode"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
        templates=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"templates"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
        if [[ -n "$mode" ]]; then
          browser_findings+=" $label DoH: mode=$mode${templates:+, server=$templates}"$'\n'
        else
          browser_findings+=" $label installed, DoH: not configured (system DNS)"$'\n'
        fi
      fi
    done

    # Firefox: per-profile prefs.js, network.trr.mode (0=off, 2=enabled w/fallback, 3=enabled only, 5=disabled)
    # The mode is stored as a bare integer, so it is read with the helper
    # above rather than by splitting the line on double quotes.
    for fx_prefs in "$HOME/Library/Application Support/Firefox/Profiles"/*.default*/prefs.js; do
      [[ -f "$fx_prefs" ]] || continue
      trr_mode=$(_probe_firefox_pref "$fx_prefs" network.trr.mode)
      trr_uri=$(_probe_firefox_pref "$fx_prefs" network.trr.uri)
      case "${trr_mode:-0}" in
        2) state="enabled (with system fallback)" ;;
        3) state="enabled (no fallback)" ;;
        5) state="disabled by policy" ;;
        *) state="off (system DNS)" ;;
      esac
      browser_findings+=" Firefox DoH: $state${trr_uri:+, server=$trr_uri}"$'\n'
      break # only check one profile
    done

    if [[ -n "$browser_findings" ]]; then
      info " Browser DoH state (browsers may bypass system DNS):"
      printf '%s' "$browser_findings"
    fi

    # Network services often reveal VPN/DNS clients that don't install at /Applications
    # (e.g. CLI-only NextDNS, kernel/system extensions, virtual interfaces)
    ns_pattern='Proton|Mullvad|NextDNS|Cisco|NordVPN|Tailscale|WireGuard|OpenVPN|Cloudflare|WARP|AdGuard'
    ns_found=$(networksetup -listallnetworkservices 2>/dev/null | grep -iE "$ns_pattern" || true)
    if [[ -n "$ns_found" ]]; then
      echo " Network services:"
      echo "$ns_found" | sed 's/^/ /'
    fi
  fi # end rung 7
  # Save this run's tallies for later --quick runs, but only when QUICK_MODE
  # is 0, i.e. the full ladder ran.
  if ((QUICK_MODE == 0)); then
    cache_save_state "$PASS_COUNT" "$FAIL_COUNT" "$FIRST_FAIL"
  fi

  emit_summary

  # Outside JSON mode, close with one suggested next step chosen from the
  # name of the first failing section (or a note when nothing failed).
  if [[ "$JSON_MODE" -eq 0 ]]; then
    if [[ -z "$FIRST_FAIL" ]]; then
      echo " (No failures. If user still reports issues, see rung 7 footprint and time-based notes in references/diagnostic-ladder.md.)"
    else
      case "$FIRST_FAIL" in
        *"LINK LAYER"*)
          next_hint=" Next: check ifconfig / networksetup, fix interface / DHCP" ;;
        *"SOCKET"*)
          next_hint=" Next: check Little Snitch / Lulu / pfctl rules; AV protocol filtering; consumer router DoH IP blocking" ;;
        *"ICMP"* | *"IP /"*)
          next_hint=" Next: check route table, ISP/upstream connectivity" ;;
        *"DNS INFRASTRUCTURE"*)
          next_hint=" Next: check UDP/53 outbound, router DNS forwarder" ;;
        *"RESOLVER PATH"*)
          next_hint=" Next: bash scripts/macos/dns-audit.sh # drill rung 5 (the hook layer)" ;;
        *"APPLICATION"*)
          next_hint=" Next: check proxy (scutil --proxy), keychain certs, IPv6 preference" ;;
        *)
          next_hint=" Next: re-run with --verbose; check references/common-culprits.md" ;;
      esac
      echo "$next_hint"
    fi
    echo
    echo "=== END PROBE ==="
  fi