probe.sh 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. #!/usr/bin/env bash
  2. # net-ops :: macos/probe.sh
  3. # Full layered diagnostic ladder for macOS network troubleshooting.
  4. # Outputs structured [PASS]/[FAIL] lines so a human or LLM can scan for
  5. # the first FAIL and drill in.
  6. set -u
  7. TEST_HOST="${TEST_HOST:-google.com}"
  8. TEST_IPS=("1.1.1.1" "8.8.8.8")
  9. TIMEOUT="${TIMEOUT:-5}"
  10. VERBOSE=0
  11. for arg in "$@"; do
  12. case "$arg" in
  13. --verbose|-v) VERBOSE=1 ;;
  14. --help|-h)
  15. cat <<EOF
  16. Usage: $0 [--redact] [--verbose] [--json] [--quick]
  17. --redact Mask private IPs, MAC addresses, and *.ts.net tailnet names
  18. --verbose Full scutil --dns dump (default: condensed one-line-per-resolver)
  19. --json Newline-delimited JSON output (for piping to jq, dashboards)
  20. --quick Skip rungs 1-4 and 7 if the last full run cached as healthy
  21. (cache: \${TMPDIR}/net-ops/last-state.json, TTL 10min)
  22. Compose freely: --json + --redact emits sanitized NDJSON.
  23. EOF
  24. exit 0 ;;
  25. esac
  26. done
  27. # shellcheck source=../_lib/redact.sh
  28. source "$(dirname "$0")/../_lib/redact.sh"
  29. # shellcheck source=../_lib/output.sh
  30. source "$(dirname "$0")/../_lib/output.sh"
  31. # shellcheck source=../_lib/cache.sh
  32. source "$(dirname "$0")/../_lib/cache.sh"
  33. parse_redact_flag "$@"
  34. parse_output_flags "$@"
  35. parse_quick_flag "$@"
  36. maybe_redact_self "$@"
  37. if cache_indicates_healthy; then
  38. info " [--quick: last full run was healthy, skipping rungs 1-4 and 7]"
  39. fi
  40. # ---------------------------------------------------------------------------
  41. if should_run_rung 1; then
  42. section "1. LINK LAYER"
  43. # ---------------------------------------------------------------------------
  44. ACTIVE_IFS=$(networksetup -listallhardwareports 2>/dev/null | awk '/Hardware Port/{port=$3} /Device/{print port" "$2}' || true)
  45. echo "$ACTIVE_IFS" | while read -r line; do
  46. [[ -z "$line" ]] && continue
  47. name="${line% *}"; dev="${line##* }"
  48. status=$(ifconfig "$dev" 2>/dev/null | awk '/status:/{print $2; exit}')
  49. if [[ "$status" == "active" ]]; then
  50. ip=$(ifconfig "$dev" 2>/dev/null | awk '/inet /{print $2; exit}')
  51. pass "Interface $name ($dev) active" "$ip"
  52. fi
  53. done
  54. GATEWAY=$(route -n get default 2>/dev/null | awk '/gateway:/{print $2}')
  55. DEFAULT_IF=$(route -n get default 2>/dev/null | awk '/interface:/{print $2}')
  56. [[ -n "$GATEWAY" ]] && pass "Default gateway" "$GATEWAY via $DEFAULT_IF" || fail "Default gateway" "none configured"
  57. fi # end rung 1
  58. # ---------------------------------------------------------------------------
  59. if should_run_rung 2; then
  60. section "2. IP / ICMP REACHABILITY"
  61. # ---------------------------------------------------------------------------
  62. [[ -n "${GATEWAY:-}" ]] && {
  63. if ping -c 2 -W "${TIMEOUT}000" "$GATEWAY" >/dev/null 2>&1; then pass "Ping gateway $GATEWAY"; else fail "Ping gateway $GATEWAY"; fi
  64. }
  65. for ip in "${TEST_IPS[@]}"; do
  66. if ping -c 2 -W "${TIMEOUT}000" "$ip" >/dev/null 2>&1; then pass "Ping $ip"; else fail "Ping $ip"; fi
  67. done
  68. fi # end rung 2
  69. # ---------------------------------------------------------------------------
  70. if should_run_rung 3; then
  71. section "3. TCP/UDP SOCKET REACHABILITY"
  72. # ---------------------------------------------------------------------------
  73. for ip in "${TEST_IPS[@]}"; do
  74. if nc -zv -G "$TIMEOUT" "$ip" 443 >/dev/null 2>&1; then pass "TCP/443 -> $ip"; else fail "TCP/443 -> $ip"; fi
  75. if nc -zv -G "$TIMEOUT" "$ip" 53 >/dev/null 2>&1; then pass "TCP/53 -> $ip"; else fail "TCP/53 -> $ip"; fi
  76. done
  77. # Raw UDP/53 — uses dig with explicit server, bypasses /etc/resolv.conf
  78. for ip in "${TEST_IPS[@]}"; do
  79. if dig +short +time="$TIMEOUT" +tries=1 @"$ip" "$TEST_HOST" >/dev/null 2>&1; then
  80. result=$(dig +short +time="$TIMEOUT" +tries=1 @"$ip" "$TEST_HOST" | head -1)
  81. pass "UDP/53 -> $ip (dig)" "$result"
  82. else
  83. fail "UDP/53 -> $ip (dig)"
  84. fi
  85. done
  86. fi # end rung 3
  87. # ---------------------------------------------------------------------------
  88. if should_run_rung 4; then
  89. section "4. DNS INFRASTRUCTURE (bypass tools)"
  90. # ---------------------------------------------------------------------------
  91. # dig uses its own resolver — does NOT touch macOS DNS resolution chain
  92. for srv in "" "${TEST_IPS[@]}"; do
  93. if [[ -z "$srv" ]]; then
  94. out=$(dig +short +time="$TIMEOUT" +tries=1 "$TEST_HOST" 2>&1)
  95. label="default"
  96. else
  97. out=$(dig +short +time="$TIMEOUT" +tries=1 @"$srv" "$TEST_HOST" 2>&1)
  98. label="$srv"
  99. fi
  100. if [[ -n "$out" && ! "$out" =~ "timed out"|"connection refused" ]]; then
  101. pass "dig via $label" "$(echo "$out" | head -1)"
  102. else
  103. fail "dig via $label" "$out"
  104. fi
  105. done
  106. fi # end rung 4
  107. # ---------------------------------------------------------------------------
  108. section "5. macOS RESOLVER PATH (the hook layer)"
  109. # ---------------------------------------------------------------------------
  110. # dscacheutil uses the macOS resolver chain — goes through everything
  111. out=$(dscacheutil -q host -a name "$TEST_HOST" 2>&1)
  112. if echo "$out" | grep -q "ip_address:"; then
  113. addr=$(echo "$out" | awk '/ip_address:/{print $2; exit}')
  114. pass "dscacheutil (system resolver)" "$addr"
  115. else
  116. fail "dscacheutil (system resolver)" "$(echo "$out" | head -3)"
  117. fi
  118. # /etc/resolver/* — per-domain overrides, classic VPN residue
  119. if [[ -d /etc/resolver ]]; then
  120. resolver_files=$(ls /etc/resolver/ 2>/dev/null)
  121. if [[ -n "$resolver_files" ]]; then
  122. echo " /etc/resolver/ contents (per-domain DNS overrides):"
  123. for f in /etc/resolver/*; do
  124. [[ -f "$f" ]] || continue
  125. domain="${f##*/}"
  126. ns=$(awk '/^nameserver/{print $2}' "$f" | tr '\n' ' ')
  127. echo " $domain -> $ns"
  128. done
  129. fi
  130. fi
  131. # scutil DNS state — the authoritative view of macOS resolver config
  132. if [[ "$VERBOSE" -eq 1 ]]; then
  133. echo " scutil --dns (full):"
  134. scutil --dns 2>/dev/null | sed 's/^/ /'
  135. else
  136. # Condensed: one line per resolver — scope (via domain or search), nameservers, order
  137. echo " scutil --dns (condensed, --verbose for full):"
  138. scutil --dns 2>/dev/null | awk '
  139. /^resolver #/{ if(num){flush()} num=$2; sub(/#/,"",num); scope=""; ns=""; ord="" }
  140. /search domain\[0\]/{ scope="search="$NF }
  141. /domain[[:space:]]*:/{ scope="domain="$NF }
  142. /options/{ if($NF~/mdns/) scope="mdns" }
  143. /nameserver\[[0-9]+\]/{ ns=ns?ns","$NF:$NF }
  144. /order[[:space:]]*:/{ ord=$NF }
  145. function flush() {
  146. if (!scope) scope="default"
  147. print " #"num" scope="scope" via="ns" order="ord
  148. }
  149. END{ if(num) flush() }
  150. '
  151. fi
  152. # Configuration profiles (MDM / VPN-installed). Without sudo we only see user-scope.
  153. profile_count=$(profiles list -type configuration 2>/dev/null | grep -c "attribute:" 2>/dev/null)
  154. profile_count="${profile_count:-0}"
  155. if [[ "$profile_count" =~ ^[0-9]+$ ]] && (( profile_count > 0 )); then
  156. echo " Configuration profiles installed (user scope): $profile_count"
  157. echo " For full detail incl. system profiles: sudo profiles list -type configuration"
  158. fi
  159. # Local DNS proxy detection — derived from scutil (works unprivileged).
  160. # Common with NextDNS, AdGuard, dnsmasq, Pi-hole client, Cloudflare WARP.
  161. if scutil --dns 2>/dev/null | awk '/nameserver\[[0-9]+\]/{print $3}' | grep -qE '^(127\.|::1$)'; then
  162. echo " !! Local DNS proxy detected in resolver chain (127.x or ::1 nameserver)"
  163. echo " Apps using the system resolver may route DNS through it."
  164. echo " For PID/process: sudo lsof -nP -iUDP:53"
  165. fi
  166. # mDNSResponder state
  167. if pgrep -x mDNSResponder >/dev/null; then
  168. pid=$(pgrep -x mDNSResponder | head -1)
  169. pass "mDNSResponder running" "PID $pid"
  170. else
  171. fail "mDNSResponder" "not running — system DNS will be broken"
  172. fi
  173. # ---------------------------------------------------------------------------
  174. # Time-sync deep-dive: compare local clock to HTTP Date, AND check whether
  175. # macOS network time sync itself is enabled + which server it's pointing at.
  176. # Stratum-16 (unsynced) clocks are the silent killer of TLS validation.
  177. ntp_enabled=$(systemsetup -getusingnetworktime 2>/dev/null | awk -F': ' '{print $2}')
  178. ntp_server=$(systemsetup -getnetworktimeserver 2>/dev/null | awk -F': ' '{print $2}')
  179. # HTTP Date drift (works without elevated privs, no NTP infra needed)
  180. remote_date=$(curl -sIA 'net-ops-probe' --max-time 5 https://www.google.com 2>/dev/null | awk -F': ' 'tolower($1)=="date"{print $2; exit}' | tr -d '\r')
  181. drift_ok=1
  182. drift_detail=""
  183. if [[ -n "$remote_date" ]]; then
  184. remote_epoch=$(date -j -f '%a, %d %b %Y %H:%M:%S %Z' "$remote_date" +%s 2>/dev/null)
  185. if [[ -n "$remote_epoch" ]]; then
  186. local_epoch=$(date +%s)
  187. drift=$(( local_epoch - remote_epoch ))
  188. abs_drift=${drift#-}
  189. if [[ "$abs_drift" -lt 300 ]]; then
  190. drift_detail="${drift}s vs HTTP Date (within ±5min)"
  191. else
  192. drift_ok=0
  193. drift_detail="${drift}s drift — will break TLS cert validation"
  194. fi
  195. fi
  196. fi
  197. # Optional: query the configured NTP server for actual stratum / offset.
  198. # sntp is built-in on macOS; suppress its noisy output.
  199. ntp_offset=""
  200. if [[ -n "$ntp_server" ]] && command -v sntp >/dev/null 2>&1; then
  201. ntp_offset=$(sntp -t 3 "$ntp_server" 2>/dev/null | awk '/[+-][0-9]+\.[0-9]+/{print $1; exit}')
  202. fi
  203. combined="$drift_detail"
  204. [[ -n "$ntp_enabled" ]] && combined="$combined; NTP sync=$ntp_enabled"
  205. [[ -n "$ntp_server" ]] && combined="$combined; server=$ntp_server"
  206. [[ -n "$ntp_offset" ]] && combined="$combined; sntp offset=${ntp_offset}s"
  207. if [[ "$drift_ok" -eq 1 ]] && { [[ "$ntp_enabled" == "On" ]] || [[ -z "$ntp_enabled" ]]; }; then
  208. pass "Time sync" "$combined"
  209. else
  210. fail "Time sync" "$combined"
  211. fi
  212. # MTU / path-MTU discovery test. Standard Ethernet MTU is 1500.
  213. # We send a 1472-byte payload (1472 + 20 IP + 8 ICMP = 1500) with DF set.
  214. # If this fails but a smaller size works, there's a path-MTU issue
  215. # (PPPoE, weird tunnel, broken ICMP "fragmentation needed" delivery).
  216. if ping -D -s 1472 -c 1 -t 3 1.1.1.1 >/dev/null 2>&1; then
  217. pass "Path MTU 1500 (1472-byte DF payload)" "to 1.1.1.1"
  218. else
  219. if ping -D -s 1400 -c 1 -t 3 1.1.1.1 >/dev/null 2>&1; then
  220. fail "Path MTU 1500 (1472-byte DF payload)" "1500 fails, 1428+ works — path MTU < 1500 (VPN/PPPoE?)"
  221. else
  222. # Both fail — DF blocking entirely; don't flag as MTU
  223. pass "Path MTU test inconclusive" "ICMP DF blocked or destination unreachable"
  224. fi
  225. fi
  226. # IPv6 deep-dive — classifies v6 stack state across four meaningful tiers
  227. # instead of a binary works/broken. Each tier maps to a distinct fix path.
  228. v6_state=""
  229. v6_detail=""
  230. # 1. Any v6 address on a non-loopback interface?
  231. v6_addrs=$(ifconfig 2>/dev/null | awk '/^[a-z]/{ifn=$1} /inet6 /{print ifn" "$2}' | grep -v "::1\|fe80::" | grep -v "^utun\|^awdl\|^llw\|^bridge")
  232. # 2. Any GLOBAL v6 address (not ULA fd00::/8)?
  233. v6_global=$(printf '%s\n' "$v6_addrs" | awk '$2 !~ /^fd/ && $2 !~ /^fc/{print; exit}')
  234. # 3. Is there an actual global default route?
  235. v6_default=$(route -n get -inet6 default 2>&1 | awk '/gateway:/{print $2; exit}')
  236. [[ "$v6_default" =~ ^fe80 ]] && v6_default="" # link-local doesn't count
  237. if [[ -z "$v6_addrs" ]]; then
  238. v6_state="disabled"
  239. v6_detail="no v6 addresses on physical interfaces — IPv6 disabled or unconfigured"
  240. elif [[ -z "$v6_global" ]]; then
  241. v6_state="ula_only"
  242. v6_detail="only ULA (fd00::/8) addresses present — ISP/router not delegating public v6 prefix"
  243. elif [[ -z "$v6_default" ]]; then
  244. v6_state="no_route"
  245. v6_detail="global v6 address present but no default route — RA not received or NDP broken"
  246. else
  247. # We have a v6 address and a route — test actual connectivity
  248. aaaa=$(dig +short +time=2 +tries=1 AAAA "$TEST_HOST" 2>/dev/null | head -1)
  249. if [[ -n "$aaaa" ]] && curl -6 -sS -o /dev/null --max-time 4 "https://$TEST_HOST" 2>/dev/null; then
  250. v6_state="healthy"
  251. v6_detail="global addr + default route + curl -6 works"
  252. else
  253. v6_state="path_broken"
  254. v6_detail="addr=$v6_global, route via $v6_default, but curl -6 fails — upstream v6 path dead"
  255. fi
  256. fi
  257. case "$v6_state" in
  258. disabled|healthy)
  259. pass "IPv6 stack ($v6_state)" "$v6_detail" ;;
  260. ula_only)
  261. fail "IPv6 stack ($v6_state)" "$v6_detail — apps may try v6 first, hit 'no route', fall back to v4 (slow). Fix: sudo networksetup -setv6off <service>" ;;
  262. no_route)
  263. fail "IPv6 stack ($v6_state)" "$v6_detail — check ndp -an for RA receipt; restart interface or check router RA config" ;;
  264. path_broken)
  265. fail "IPv6 stack ($v6_state)" "$v6_detail — VPN/firewall blocking v6, or ISP black-holing v6 traffic" ;;
  266. esac
  267. # ---------------------------------------------------------------------------
  268. section "6. APPLICATION LAYER (real HTTP request)"
  269. # ---------------------------------------------------------------------------
  270. for url in "https://www.google.com" "https://github.com"; do
  271. if out=$(curl -sS -o /dev/null -w "%{http_code} %{size_download}b" --max-time "$TIMEOUT" "$url" 2>&1); then
  272. pass "GET $url" "$out"
  273. else
  274. fail "GET $url" "$out"
  275. fi
  276. done
  277. # ---------------------------------------------------------------------------
  278. if should_run_rung 7; then
  279. section "7. KNOWN VPN / DNS CLIENT FOOTPRINT"
  280. # ---------------------------------------------------------------------------
  281. KNOWN_PATHS=(
  282. "/Applications/Proton VPN.app"
  283. "/Applications/Mullvad VPN.app"
  284. "/Applications/Tailscale.app"
  285. "/Applications/Cisco/Cisco Secure Client.app"
  286. "/Applications/Cisco/Cisco AnyConnect Secure Mobility Client.app"
  287. "/Applications/NordVPN.app"
  288. "/Applications/NextDNS.app"
  289. "/Applications/Little Snitch.app"
  290. "/Applications/Lulu.app"
  291. "/Library/Application Support/NextDNS"
  292. )
  293. for p in "${KNOWN_PATHS[@]}"; do
  294. [[ -e "$p" ]] && echo " Installed: $p"
  295. done
  296. # Browser DoH state — Chrome / Brave / Edge / Firefox have their own resolvers
  297. # that bypass system DNS entirely when DoH is configured. Useful for explaining
  298. # "Chrome works but Safari doesn't" type asymmetries.
  299. browser_findings=""
  300. chrome_prefs="$HOME/Library/Application Support/Google/Chrome/Default/Preferences"
  301. brave_prefs="$HOME/Library/Application Support/BraveSoftware/Brave-Browser/Default/Preferences"
  302. edge_prefs="$HOME/Library/Application Support/Microsoft Edge/Default/Preferences"
  303. for label_prefs in "Chrome:$chrome_prefs" "Brave:$brave_prefs" "Edge:$edge_prefs"; do
  304. label="${label_prefs%%:*}"
  305. prefs="${label_prefs#*:}"
  306. if [[ -f "$prefs" ]]; then
  307. # Chromium stores DoH mode under dns_over_https.mode: "off" | "automatic" | "secure"
  308. mode=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"mode"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  309. templates=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"templates"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  310. if [[ -n "$mode" ]]; then
  311. browser_findings+=" $label DoH: mode=$mode${templates:+, server=$templates}\n"
  312. else
  313. browser_findings+=" $label installed, DoH: not configured (system DNS)\n"
  314. fi
  315. fi
  316. done
  317. # Firefox: per-profile prefs.js, network.trr.mode (0=off, 2=enabled w/fallback, 3=enabled only, 5=disabled)
  318. for fx_prefs in "$HOME/Library/Application Support/Firefox/Profiles"/*.default*/prefs.js; do
  319. [[ -f "$fx_prefs" ]] || continue
  320. trr_mode=$(awk -F'"' '/"network.trr.mode"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  321. trr_uri=$(awk -F'"' '/"network.trr.uri"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  322. case "${trr_mode:-0}" in
  323. 2) state="enabled (with system fallback)" ;;
  324. 3) state="enabled (no fallback)" ;;
  325. 5) state="disabled by policy" ;;
  326. *) state="off (system DNS)" ;;
  327. esac
  328. browser_findings+=" Firefox DoH: $state${trr_uri:+, server=$trr_uri}\n"
  329. break # only check one profile
  330. done
  331. if [[ -n "$browser_findings" ]]; then
  332. info " Browser DoH state (browsers may bypass system DNS):"
  333. printf '%b' "$browser_findings"
  334. fi
  335. # Network services often reveal VPN/DNS clients that don't install at /Applications
  336. # (e.g. CLI-only NextDNS, kernel/system extensions, virtual interfaces)
  337. ns_pattern='Proton|Mullvad|NextDNS|Cisco|NordVPN|Tailscale|WireGuard|OpenVPN|Cloudflare|WARP|AdGuard'
  338. ns_found=$(networksetup -listallnetworkservices 2>/dev/null | grep -iE "$ns_pattern" || true)
  339. if [[ -n "$ns_found" ]]; then
  340. echo " Network services:"
  341. echo "$ns_found" | sed 's/^/ /'
  342. fi
  343. fi # end rung 7
  344. # Persist state for future --quick runs (only when we ran the FULL ladder).
  345. if [[ "$QUICK_MODE" -eq 0 ]]; then
  346. cache_save_state "$PASS_COUNT" "$FAIL_COUNT" "$FIRST_FAIL"
  347. fi
  348. emit_summary
  349. if [[ "$JSON_MODE" -eq 0 ]]; then
  350. if [[ -n "$FIRST_FAIL" ]]; then
  351. case "$FIRST_FAIL" in
  352. *"LINK LAYER"*) echo " Next: check ifconfig / networksetup, fix interface / DHCP" ;;
  353. *"SOCKET"*) echo " Next: check Little Snitch / Lulu / pfctl rules; AV protocol filtering; consumer router DoH IP blocking" ;;
  354. *"ICMP"*|*"IP /"*) echo " Next: check route table, ISP/upstream connectivity" ;;
  355. *"DNS INFRASTRUCTURE"*) echo " Next: check UDP/53 outbound, router DNS forwarder" ;;
  356. *"RESOLVER PATH"*) echo " Next: bash scripts/macos/dns-audit.sh # drill rung 5 (the hook layer)" ;;
  357. *"APPLICATION"*) echo " Next: check proxy (scutil --proxy), keychain certs, IPv6 preference" ;;
  358. *) echo " Next: re-run with --verbose; check references/common-culprits.md" ;;
  359. esac
  360. else
  361. echo " (No failures. If user still reports issues, see rung 7 footprint and time-based notes in references/diagnostic-ladder.md.)"
  362. fi
  363. echo
  364. echo "=== END PROBE ==="
  365. fi