probe.sh 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. #!/usr/bin/env bash
  2. # net-ops :: linux/probe.sh
  3. # Full layered diagnostic ladder for Linux network troubleshooting.
  4. # Outputs structured [PASS]/[FAIL] lines so a human or LLM can scan for
  5. # the first FAIL and drill in.
  6. set -u
  7. TEST_HOST="${TEST_HOST:-google.com}"
  8. TEST_IPS=("1.1.1.1" "8.8.8.8")
  9. TIMEOUT="${TIMEOUT:-5}"
  10. for arg in "$@"; do
  11. case "$arg" in
  12. --help|-h)
  13. cat <<EOF
  14. Usage: $0 [--redact] [--json] [--quick]
  15. --redact Mask private IPs, MAC addresses, and *.ts.net tailnet names
  16. --json Newline-delimited JSON output (for piping to jq, dashboards)
  17. --quick Skip rungs 1-4 and 7 if the last full run cached as healthy
  18. (cache: \${TMPDIR:-/tmp}/net-ops/last-state.json, TTL 10min)
  19. Compose freely: --json + --redact emits sanitized NDJSON.
  20. Env: TEST_HOST (default google.com), TIMEOUT (default 5s).
  21. EOF
  22. exit 0 ;;
  23. esac
  24. done
  25. # shellcheck source=../_lib/redact.sh
  26. source "$(dirname "$0")/../_lib/redact.sh"
  27. # shellcheck source=../_lib/output.sh
  28. source "$(dirname "$0")/../_lib/output.sh"
  29. parse_redact_flag "$@"
  30. parse_output_flags "$@"
  31. maybe_redact_self "$@"
  32. # ---------------------------------------------------------------------------
  33. section "1. LINK LAYER"
  34. # ---------------------------------------------------------------------------
  35. ip -br link 2>/dev/null | awk '$2=="UP"{print $1}' | while read -r dev; do
  36. [[ "$dev" == "lo" ]] && continue
  37. addr=$(ip -br -4 addr show "$dev" 2>/dev/null | awk '{print $3}')
  38. pass "Interface $dev UP" "${addr:-no IPv4}"
  39. done
  40. GATEWAY=$(ip route show default 2>/dev/null | awk '/default/{print $3; exit}')
  41. DEFAULT_IF=$(ip route show default 2>/dev/null | awk '/default/{print $5; exit}')
  42. [[ -n "$GATEWAY" ]] && pass "Default gateway" "$GATEWAY via $DEFAULT_IF" || fail "Default gateway" "none configured"
  43. # ---------------------------------------------------------------------------
  44. section "2. IP / ICMP REACHABILITY"
  45. # ---------------------------------------------------------------------------
  46. [[ -n "${GATEWAY:-}" ]] && {
  47. if ping -c 2 -W "$TIMEOUT" "$GATEWAY" >/dev/null 2>&1; then pass "Ping gateway $GATEWAY"; else fail "Ping gateway $GATEWAY"; fi
  48. }
  49. for ip in "${TEST_IPS[@]}"; do
  50. if ping -c 2 -W "$TIMEOUT" "$ip" >/dev/null 2>&1; then pass "Ping $ip"; else fail "Ping $ip"; fi
  51. done
  52. # ---------------------------------------------------------------------------
  53. section "3. TCP/UDP SOCKET REACHABILITY"
  54. # ---------------------------------------------------------------------------
  55. for ip in "${TEST_IPS[@]}"; do
  56. if timeout "$TIMEOUT" bash -c "</dev/tcp/$ip/443" 2>/dev/null; then pass "TCP/443 -> $ip"; else fail "TCP/443 -> $ip"; fi
  57. if timeout "$TIMEOUT" bash -c "</dev/tcp/$ip/53" 2>/dev/null; then pass "TCP/53 -> $ip"; else fail "TCP/53 -> $ip"; fi
  58. done
  59. # Raw UDP/53 via dig with explicit server — bypasses /etc/resolv.conf
  60. for ip in "${TEST_IPS[@]}"; do
  61. if result=$(dig +short +time="$TIMEOUT" +tries=1 @"$ip" "$TEST_HOST" 2>&1) && [[ -n "$result" ]] && [[ ! "$result" =~ "timed out"|"connection refused" ]]; then
  62. pass "UDP/53 -> $ip (dig)" "$(echo "$result" | head -1)"
  63. else
  64. fail "UDP/53 -> $ip (dig)" "$result"
  65. fi
  66. done
  67. # ---------------------------------------------------------------------------
  68. section "4. DNS INFRASTRUCTURE (bypass tools)"
  69. # ---------------------------------------------------------------------------
  70. # dig uses its own resolver — does NOT touch glibc NSS chain
  71. for srv in "" "${TEST_IPS[@]}"; do
  72. if [[ -z "$srv" ]]; then
  73. out=$(dig +short +time="$TIMEOUT" +tries=1 "$TEST_HOST" 2>&1)
  74. label="default"
  75. else
  76. out=$(dig +short +time="$TIMEOUT" +tries=1 @"$srv" "$TEST_HOST" 2>&1)
  77. label="$srv"
  78. fi
  79. if [[ -n "$out" && ! "$out" =~ "timed out"|"connection refused" ]]; then
  80. pass "dig via $label" "$(echo "$out" | head -1)"
  81. else
  82. fail "dig via $label" "$out"
  83. fi
  84. done
  85. # ---------------------------------------------------------------------------
  86. section "5. LINUX RESOLVER PATH (the hook layer)"
  87. # ---------------------------------------------------------------------------
  88. # getent uses glibc NSS — goes through the whole system resolver chain
  89. if out=$(getent hosts "$TEST_HOST" 2>&1) && [[ -n "$out" ]]; then
  90. addr=$(echo "$out" | awk '{print $1; exit}')
  91. pass "getent hosts (NSS path)" "$addr"
  92. else
  93. fail "getent hosts (NSS path)" "$out"
  94. fi
  95. # resolvectl query if systemd-resolved present
  96. if command -v resolvectl >/dev/null 2>&1; then
  97. if out=$(resolvectl query "$TEST_HOST" 2>&1) && echo "$out" | grep -q "^$TEST_HOST:"; then
  98. addr=$(echo "$out" | awk '/^[^:]+:.+[0-9]+\./{print $2; exit}')
  99. pass "resolvectl query" "$addr"
  100. else
  101. fail "resolvectl query" "$(echo "$out" | head -2)"
  102. fi
  103. fi
  104. # nsswitch.conf — name resolution order
  105. echo " /etc/nsswitch.conf hosts line:"
  106. grep "^hosts:" /etc/nsswitch.conf 2>/dev/null | sed 's/^/ /'
  107. # /etc/resolv.conf — is it the systemd-resolved stub, NetworkManager's, or static?
  108. echo " /etc/resolv.conf:"
  109. if [[ -L /etc/resolv.conf ]]; then
  110. target=$(readlink /etc/resolv.conf)
  111. echo " symlink -> $target"
  112. fi
  113. head -5 /etc/resolv.conf 2>/dev/null | sed 's/^/ /'
  114. # Active resolver listeners on 127.x:53
  115. echo " Local DNS listeners on 127.0.0.x:53:"
  116. ss -tulnp 2>/dev/null | awk '$5 ~ /^127\./ && $5 ~ /:53$/' | sed 's/^/ /' || true
  117. # systemd-resolved status (if present)
  118. if systemctl is-active systemd-resolved >/dev/null 2>&1; then
  119. echo " systemd-resolved active. Per-link DNS:"
  120. resolvectl status 2>/dev/null | awk '
  121. /^Link [0-9]+/{link=$0; show=0; printed=0}
  122. /Current DNS Server:|DNS Servers:|DNS Domain:/{
  123. if(!printed){print " "link; printed=1}
  124. print " "$0
  125. }
  126. ' | head -40
  127. fi
  128. # ---------------------------------------------------------------------------
  129. # Time-sync deep-dive: HTTP Date drift + check timedatectl/chrony/ntpd status
  130. remote_date=$(curl -sIA 'net-ops-probe' --max-time 5 https://www.google.com 2>/dev/null | awk -F': ' 'tolower($1)=="date"{print $2; exit}' | tr -d '\r')
  131. drift_ok=1
  132. drift_detail=""
  133. if [[ -n "$remote_date" ]]; then
  134. remote_epoch=$(date -d "$remote_date" +%s 2>/dev/null)
  135. if [[ -n "$remote_epoch" ]]; then
  136. local_epoch=$(date +%s)
  137. drift=$(( local_epoch - remote_epoch ))
  138. abs_drift=${drift#-}
  139. if [[ "$abs_drift" -lt 300 ]]; then
  140. drift_detail="${drift}s vs HTTP Date (within ±5min)"
  141. else
  142. drift_ok=0
  143. drift_detail="${drift}s drift — will break TLS cert validation"
  144. fi
  145. fi
  146. fi
  147. # Detect which time daemon and its sync state
  148. sync_detail=""
  149. if command -v timedatectl >/dev/null 2>&1; then
  150. sync_state=$(timedatectl show 2>/dev/null | awk -F= '/^NTPSynchronized=/{print $2}')
  151. sync_detail="systemd-timesyncd NTPSynchronized=$sync_state"
  152. elif command -v chronyc >/dev/null 2>&1; then
  153. stratum=$(chronyc tracking 2>/dev/null | awk -F': ' '/Stratum/{print $2}')
  154. sync_detail="chronyd stratum=$stratum"
  155. [[ "$stratum" == "16" ]] && drift_ok=0
  156. elif command -v ntpq >/dev/null 2>&1; then
  157. sync_detail="ntpd present (run 'ntpq -p' for peer status)"
  158. fi
  159. combined="$drift_detail${sync_detail:+; $sync_detail}"
  160. if [[ "$drift_ok" -eq 1 ]]; then
  161. pass "Time sync" "$combined"
  162. else
  163. fail "Time sync" "$combined"
  164. fi
  165. # MTU / path-MTU discovery. Linux uses -M do (don't fragment).
  166. if ping -M do -s 1472 -c 1 -W 3 1.1.1.1 >/dev/null 2>&1; then
  167. pass "Path MTU 1500 (1472-byte DF payload)" "to 1.1.1.1"
  168. else
  169. if ping -M do -s 1400 -c 1 -W 3 1.1.1.1 >/dev/null 2>&1; then
  170. fail "Path MTU 1500 (1472-byte DF payload)" "1500 fails, 1428+ works — path MTU < 1500 (VPN/PPPoE?)"
  171. else
  172. pass "Path MTU test inconclusive" "ICMP DF blocked or destination unreachable"
  173. fi
  174. fi
  175. # IPv6 deep-dive — classifies v6 stack state across four meaningful tiers.
  176. v6_state=""
  177. v6_detail=""
  178. v6_addrs=$(ip -6 -br addr show scope global 2>/dev/null | awk '{for(i=3;i<=NF;i++) print $1" "$i}' | grep -v '^lo ')
  179. v6_global=$(printf '%s\n' "$v6_addrs" | awk '$2 !~ /^fd/ && $2 !~ /^fc/{print; exit}')
  180. v6_default=$(ip -6 route show default 2>/dev/null | head -1)
  181. if [[ -z "$v6_addrs" ]]; then
  182. v6_state="disabled"
  183. v6_detail="no global v6 addresses — IPv6 disabled or unconfigured (check sysctl net.ipv6.conf.all.disable_ipv6)"
  184. elif [[ -z "$v6_global" ]]; then
  185. v6_state="ula_only"
  186. v6_detail="only ULA (fc00::/7) addresses present — router not delegating public v6 prefix"
  187. elif [[ -z "$v6_default" ]]; then
  188. v6_state="no_route"
  189. v6_detail="global v6 address present but no default route — RA not received (check accept_ra sysctl)"
  190. else
  191. aaaa=$(dig +short +time=2 +tries=1 AAAA "$TEST_HOST" 2>/dev/null | head -1)
  192. if [[ -n "$aaaa" ]] && curl -6 -sS -o /dev/null --max-time 4 "https://$TEST_HOST" 2>/dev/null; then
  193. v6_state="healthy"
  194. v6_detail="global addr + default route + curl -6 works"
  195. else
  196. v6_state="path_broken"
  197. v6_detail="addr present, default route present, but curl -6 fails — firewall or ISP black-holing"
  198. fi
  199. fi
  200. case "$v6_state" in
  201. disabled|healthy) pass "IPv6 stack ($v6_state)" "$v6_detail" ;;
  202. *) fail "IPv6 stack ($v6_state)" "$v6_detail" ;;
  203. esac
  204. # ---------------------------------------------------------------------------
  205. section "6. APPLICATION LAYER (real HTTP request)"
  206. # ---------------------------------------------------------------------------
  207. for url in "https://www.google.com" "https://github.com"; do
  208. if out=$(curl -sS -o /dev/null -w "%{http_code} %{size_download}b" --max-time "$TIMEOUT" "$url" 2>&1); then
  209. pass "GET $url" "$out"
  210. else
  211. fail "GET $url" "$out"
  212. fi
  213. done
  214. # ---------------------------------------------------------------------------
  215. section "7. KNOWN VPN / DNS CLIENT FOOTPRINT"
  216. # ---------------------------------------------------------------------------
  217. # Browser DoH state — Chrome / Brave / Edge / Firefox bypass system DNS when DoH set.
  218. browser_findings=""
  219. for label_prefs in \
  220. "Chrome:$HOME/.config/google-chrome/Default/Preferences" \
  221. "Chromium:$HOME/.config/chromium/Default/Preferences" \
  222. "Brave:$HOME/.config/BraveSoftware/Brave-Browser/Default/Preferences" \
  223. "Edge:$HOME/.config/microsoft-edge/Default/Preferences"; do
  224. label="${label_prefs%%:*}"
  225. prefs="${label_prefs#*:}"
  226. [[ -f "$prefs" ]] || continue
  227. mode=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"mode"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  228. templates=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"templates"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  229. if [[ -n "$mode" ]]; then
  230. browser_findings+=" $label DoH: mode=$mode${templates:+, server=$templates}\n"
  231. else
  232. browser_findings+=" $label installed, DoH: not configured (system DNS)\n"
  233. fi
  234. done
  235. for fx_prefs in "$HOME/.mozilla/firefox"/*.default*/prefs.js; do
  236. [[ -f "$fx_prefs" ]] || continue
  237. trr_mode=$(awk -F'"' '/"network.trr.mode"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  238. trr_uri=$(awk -F'"' '/"network.trr.uri"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  239. case "${trr_mode:-0}" in
  240. 2) state="enabled (with system fallback)" ;;
  241. 3) state="enabled (no fallback)" ;;
  242. 5) state="disabled by policy" ;;
  243. *) state="off (system DNS)" ;;
  244. esac
  245. browser_findings+=" Firefox DoH: $state${trr_uri:+, server=$trr_uri}\n"
  246. break
  247. done
  248. if [[ -n "$browser_findings" ]]; then
  249. info " Browser DoH state (browsers may bypass system DNS):"
  250. printf '%b' "$browser_findings"
  251. fi
  252. KNOWN=(
  253. /etc/openvpn /etc/wireguard /opt/cisco /etc/proton-vpn /etc/mullvad-vpn
  254. /opt/nordvpn /etc/NetworkManager/dnsmasq.d /etc/dnsmasq.d
  255. /etc/cloudflared /etc/nextdns.conf
  256. )
  257. for p in "${KNOWN[@]}"; do
  258. [[ -e "$p" ]] && echo " Found: $p"
  259. done
  260. # Running VPN / DNS proxy processes
  261. echo " VPN / DNS proxy processes:"
  262. pgrep -af 'openvpn|wireguard|wg-quick|mullvad|proton|nordvpn|cloudflared|nextdns|dnsmasq|stubby|dnscrypt' 2>/dev/null | head -10 | sed 's/^/ /' || true
  263. # ---------------------------------------------------------------------------
  264. section "8. ENVIRONMENT (WSL / container detection)"
  265. # ---------------------------------------------------------------------------
  266. env_type=""
  267. if [[ -f /proc/sys/fs/binfmt_misc/WSLInterop ]] || grep -qi microsoft /proc/version 2>/dev/null; then
  268. env_type="WSL2"
  269. elif [[ -f /.dockerenv ]]; then
  270. env_type="Docker container"
  271. elif grep -qE 'docker|containerd|kubepods' /proc/1/cgroup 2>/dev/null; then
  272. env_type="container (cgroup signature)"
  273. fi
  274. if [[ -z "$env_type" ]]; then
  275. info " Bare-metal / VM Linux (no WSL/container signature)"
  276. else
  277. info " Detected environment: $env_type"
  278. case "$env_type" in
  279. WSL2*)
  280. info " WSL2 has bespoke DNS handling. Key files if DNS misbehaves:"
  281. info " /etc/wsl.conf — controls generateResolvConf"
  282. info " /etc/resolv.conf — auto-generated by WSL unless wsl.conf opts out"
  283. info " Host Windows DNS — affects WSL DNS via mirrored mode"
  284. info " Fix pattern: edit /etc/wsl.conf, set [network] generateResolvConf=false, write static /etc/resolv.conf"
  285. [[ -f /etc/wsl.conf ]] && { info " --- /etc/wsl.conf ---"; sed 's/^/ /' /etc/wsl.conf; }
  286. info " --- /etc/resolv.conf head ---"
  287. head -5 /etc/resolv.conf 2>/dev/null | sed 's/^/ /'
  288. ;;
  289. Docker*|container*)
  290. info " Container DNS inherits from host or --dns flag at run time."
  291. info " /etc/resolv.conf here is set by runtime, not user."
  292. info " If broken inside container but fine on host: check 'docker network inspect' / runtime config."
  293. ;;
  294. esac
  295. fi
  296. emit_summary
  297. if [[ "$JSON_MODE" -eq 0 ]]; then
  298. if [[ -n "$FIRST_FAIL" ]]; then
  299. case "$FIRST_FAIL" in
  300. *"LINK LAYER"*) echo " Next: check ip link / ip addr, DHCP, NetworkManager state" ;;
  301. *"SOCKET"*) echo " Next: check iptables/nftables OUTPUT chain; AV protocol filtering; consumer router DoH IP blocking" ;;
  302. *"ICMP"*|*"IP /"*) echo " Next: check ip route, ISP/upstream connectivity" ;;
  303. *"DNS INFRASTRUCTURE"*) echo " Next: check UDP/53 outbound, /etc/resolv.conf upstream" ;;
  304. *"RESOLVER PATH"*) echo " Next: bash scripts/linux/dns-audit.sh # drill rung 5 (the hook layer)" ;;
  305. *"APPLICATION"*) echo " Next: check http_proxy/https_proxy env, CA bundle, IPv6 preference" ;;
  306. *) echo " Next: re-run with --verbose; check references/common-culprits.md" ;;
  307. esac
  308. fi
  309. echo
  310. echo "=== END PROBE ==="
  311. fi