probe.sh 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. #!/usr/bin/env bash
  2. # net-ops :: linux/probe.sh
  3. # Full layered diagnostic ladder for Linux network troubleshooting.
  4. # Outputs structured [PASS]/[FAIL] lines so a human or LLM can scan for
  5. # the first FAIL and drill in.
  6. set -u
  7. TEST_HOST="${TEST_HOST:-google.com}"
  8. TEST_IPS=("1.1.1.1" "8.8.8.8")
  9. TIMEOUT="${TIMEOUT:-5}"
  10. # shellcheck source=../_lib/redact.sh
  11. source "$(dirname "$0")/../_lib/redact.sh"
  12. # shellcheck source=../_lib/output.sh
  13. source "$(dirname "$0")/../_lib/output.sh"
  14. parse_redact_flag "$@"
  15. parse_output_flags "$@"
  16. maybe_redact_self "$@"
  17. # ---------------------------------------------------------------------------
  18. section "1. LINK LAYER"
  19. # ---------------------------------------------------------------------------
  20. ip -br link 2>/dev/null | awk '$2=="UP"{print $1}' | while read -r dev; do
  21. [[ "$dev" == "lo" ]] && continue
  22. addr=$(ip -br -4 addr show "$dev" 2>/dev/null | awk '{print $3}')
  23. pass "Interface $dev UP" "${addr:-no IPv4}"
  24. done
  25. GATEWAY=$(ip route show default 2>/dev/null | awk '/default/{print $3; exit}')
  26. DEFAULT_IF=$(ip route show default 2>/dev/null | awk '/default/{print $5; exit}')
  27. [[ -n "$GATEWAY" ]] && pass "Default gateway" "$GATEWAY via $DEFAULT_IF" || fail "Default gateway" "none configured"
  28. # ---------------------------------------------------------------------------
  29. section "2. IP / ICMP REACHABILITY"
  30. # ---------------------------------------------------------------------------
  31. [[ -n "${GATEWAY:-}" ]] && {
  32. if ping -c 2 -W "$TIMEOUT" "$GATEWAY" >/dev/null 2>&1; then pass "Ping gateway $GATEWAY"; else fail "Ping gateway $GATEWAY"; fi
  33. }
  34. for ip in "${TEST_IPS[@]}"; do
  35. if ping -c 2 -W "$TIMEOUT" "$ip" >/dev/null 2>&1; then pass "Ping $ip"; else fail "Ping $ip"; fi
  36. done
  37. # ---------------------------------------------------------------------------
  38. section "3. TCP/UDP SOCKET REACHABILITY"
  39. # ---------------------------------------------------------------------------
  40. for ip in "${TEST_IPS[@]}"; do
  41. if timeout "$TIMEOUT" bash -c "</dev/tcp/$ip/443" 2>/dev/null; then pass "TCP/443 -> $ip"; else fail "TCP/443 -> $ip"; fi
  42. if timeout "$TIMEOUT" bash -c "</dev/tcp/$ip/53" 2>/dev/null; then pass "TCP/53 -> $ip"; else fail "TCP/53 -> $ip"; fi
  43. done
  44. # Raw UDP/53 via dig with explicit server — bypasses /etc/resolv.conf
  45. for ip in "${TEST_IPS[@]}"; do
  46. if result=$(dig +short +time="$TIMEOUT" +tries=1 @"$ip" "$TEST_HOST" 2>&1) && [[ -n "$result" ]] && [[ ! "$result" =~ "timed out"|"connection refused" ]]; then
  47. pass "UDP/53 -> $ip (dig)" "$(echo "$result" | head -1)"
  48. else
  49. fail "UDP/53 -> $ip (dig)" "$result"
  50. fi
  51. done
  52. # ---------------------------------------------------------------------------
  53. section "4. DNS INFRASTRUCTURE (bypass tools)"
  54. # ---------------------------------------------------------------------------
  55. # dig uses its own resolver — does NOT touch glibc NSS chain
  56. for srv in "" "${TEST_IPS[@]}"; do
  57. if [[ -z "$srv" ]]; then
  58. out=$(dig +short +time="$TIMEOUT" +tries=1 "$TEST_HOST" 2>&1)
  59. label="default"
  60. else
  61. out=$(dig +short +time="$TIMEOUT" +tries=1 @"$srv" "$TEST_HOST" 2>&1)
  62. label="$srv"
  63. fi
  64. if [[ -n "$out" && ! "$out" =~ "timed out"|"connection refused" ]]; then
  65. pass "dig via $label" "$(echo "$out" | head -1)"
  66. else
  67. fail "dig via $label" "$out"
  68. fi
  69. done
  70. # ---------------------------------------------------------------------------
  71. section "5. LINUX RESOLVER PATH (the hook layer)"
  72. # ---------------------------------------------------------------------------
  73. # getent uses glibc NSS — goes through the whole system resolver chain
  74. if out=$(getent hosts "$TEST_HOST" 2>&1) && [[ -n "$out" ]]; then
  75. addr=$(echo "$out" | awk '{print $1; exit}')
  76. pass "getent hosts (NSS path)" "$addr"
  77. else
  78. fail "getent hosts (NSS path)" "$out"
  79. fi
  80. # resolvectl query if systemd-resolved present
  81. if command -v resolvectl >/dev/null 2>&1; then
  82. if out=$(resolvectl query "$TEST_HOST" 2>&1) && echo "$out" | grep -q "^$TEST_HOST:"; then
  83. addr=$(echo "$out" | awk '/^[^:]+:.+[0-9]+\./{print $2; exit}')
  84. pass "resolvectl query" "$addr"
  85. else
  86. fail "resolvectl query" "$(echo "$out" | head -2)"
  87. fi
  88. fi
  89. # nsswitch.conf — name resolution order
  90. echo " /etc/nsswitch.conf hosts line:"
  91. grep "^hosts:" /etc/nsswitch.conf 2>/dev/null | sed 's/^/ /'
  92. # /etc/resolv.conf — is it the systemd-resolved stub, NetworkManager's, or static?
  93. echo " /etc/resolv.conf:"
  94. if [[ -L /etc/resolv.conf ]]; then
  95. target=$(readlink /etc/resolv.conf)
  96. echo " symlink -> $target"
  97. fi
  98. head -5 /etc/resolv.conf 2>/dev/null | sed 's/^/ /'
  99. # Active resolver listeners on 127.x:53
  100. echo " Local DNS listeners on 127.0.0.x:53:"
  101. ss -tulnp 2>/dev/null | awk '$5 ~ /^127\./ && $5 ~ /:53$/' | sed 's/^/ /' || true
  102. # systemd-resolved status (if present)
  103. if systemctl is-active systemd-resolved >/dev/null 2>&1; then
  104. echo " systemd-resolved active. Per-link DNS:"
  105. resolvectl status 2>/dev/null | awk '
  106. /^Link [0-9]+/{link=$0; show=0; printed=0}
  107. /Current DNS Server:|DNS Servers:|DNS Domain:/{
  108. if(!printed){print " "link; printed=1}
  109. print " "$0
  110. }
  111. ' | head -40
  112. fi
  113. # ---------------------------------------------------------------------------
  114. # Time-sync deep-dive: HTTP Date drift + check timedatectl/chrony/ntpd status
  115. remote_date=$(curl -sIA 'net-ops-probe' --max-time 5 https://www.google.com 2>/dev/null | awk -F': ' 'tolower($1)=="date"{print $2; exit}' | tr -d '\r')
  116. drift_ok=1
  117. drift_detail=""
  118. if [[ -n "$remote_date" ]]; then
  119. remote_epoch=$(date -d "$remote_date" +%s 2>/dev/null)
  120. if [[ -n "$remote_epoch" ]]; then
  121. local_epoch=$(date +%s)
  122. drift=$(( local_epoch - remote_epoch ))
  123. abs_drift=${drift#-}
  124. if [[ "$abs_drift" -lt 300 ]]; then
  125. drift_detail="${drift}s vs HTTP Date (within ±5min)"
  126. else
  127. drift_ok=0
  128. drift_detail="${drift}s drift — will break TLS cert validation"
  129. fi
  130. fi
  131. fi
  132. # Detect which time daemon and its sync state
  133. sync_detail=""
  134. if command -v timedatectl >/dev/null 2>&1; then
  135. sync_state=$(timedatectl show 2>/dev/null | awk -F= '/^NTPSynchronized=/{print $2}')
  136. sync_detail="systemd-timesyncd NTPSynchronized=$sync_state"
  137. elif command -v chronyc >/dev/null 2>&1; then
  138. stratum=$(chronyc tracking 2>/dev/null | awk -F': ' '/Stratum/{print $2}')
  139. sync_detail="chronyd stratum=$stratum"
  140. [[ "$stratum" == "16" ]] && drift_ok=0
  141. elif command -v ntpq >/dev/null 2>&1; then
  142. sync_detail="ntpd present (run 'ntpq -p' for peer status)"
  143. fi
  144. combined="$drift_detail${sync_detail:+; $sync_detail}"
  145. if [[ "$drift_ok" -eq 1 ]]; then
  146. pass "Time sync" "$combined"
  147. else
  148. fail "Time sync" "$combined"
  149. fi
  150. # MTU / path-MTU discovery. Linux uses -M do (don't fragment).
  151. if ping -M do -s 1472 -c 1 -W 3 1.1.1.1 >/dev/null 2>&1; then
  152. pass "Path MTU 1500 (1472-byte DF payload)" "to 1.1.1.1"
  153. else
  154. if ping -M do -s 1400 -c 1 -W 3 1.1.1.1 >/dev/null 2>&1; then
  155. fail "Path MTU 1500 (1472-byte DF payload)" "1500 fails, 1428+ works — path MTU < 1500 (VPN/PPPoE?)"
  156. else
  157. pass "Path MTU test inconclusive" "ICMP DF blocked or destination unreachable"
  158. fi
  159. fi
  160. # IPv6 deep-dive — classifies v6 stack state across four meaningful tiers.
  161. v6_state=""
  162. v6_detail=""
  163. v6_addrs=$(ip -6 -br addr show scope global 2>/dev/null | awk '{for(i=3;i<=NF;i++) print $1" "$i}' | grep -v '^lo ')
  164. v6_global=$(printf '%s\n' "$v6_addrs" | awk '$2 !~ /^fd/ && $2 !~ /^fc/{print; exit}')
  165. v6_default=$(ip -6 route show default 2>/dev/null | head -1)
  166. if [[ -z "$v6_addrs" ]]; then
  167. v6_state="disabled"
  168. v6_detail="no global v6 addresses — IPv6 disabled or unconfigured (check sysctl net.ipv6.conf.all.disable_ipv6)"
  169. elif [[ -z "$v6_global" ]]; then
  170. v6_state="ula_only"
  171. v6_detail="only ULA (fc00::/7) addresses present — router not delegating public v6 prefix"
  172. elif [[ -z "$v6_default" ]]; then
  173. v6_state="no_route"
  174. v6_detail="global v6 address present but no default route — RA not received (check accept_ra sysctl)"
  175. else
  176. aaaa=$(dig +short +time=2 +tries=1 AAAA "$TEST_HOST" 2>/dev/null | head -1)
  177. if [[ -n "$aaaa" ]] && curl -6 -sS -o /dev/null --max-time 4 "https://$TEST_HOST" 2>/dev/null; then
  178. v6_state="healthy"
  179. v6_detail="global addr + default route + curl -6 works"
  180. else
  181. v6_state="path_broken"
  182. v6_detail="addr present, default route present, but curl -6 fails — firewall or ISP black-holing"
  183. fi
  184. fi
  185. case "$v6_state" in
  186. disabled|healthy) pass "IPv6 stack ($v6_state)" "$v6_detail" ;;
  187. *) fail "IPv6 stack ($v6_state)" "$v6_detail" ;;
  188. esac
  189. # ---------------------------------------------------------------------------
  190. section "6. APPLICATION LAYER (real HTTP request)"
  191. # ---------------------------------------------------------------------------
  192. for url in "https://www.google.com" "https://github.com"; do
  193. if out=$(curl -sS -o /dev/null -w "%{http_code} %{size_download}b" --max-time "$TIMEOUT" "$url" 2>&1); then
  194. pass "GET $url" "$out"
  195. else
  196. fail "GET $url" "$out"
  197. fi
  198. done
  199. # ---------------------------------------------------------------------------
  200. section "7. KNOWN VPN / DNS CLIENT FOOTPRINT"
  201. # ---------------------------------------------------------------------------
  202. # Browser DoH state — Chrome / Brave / Edge / Firefox bypass system DNS when DoH set.
  203. browser_findings=""
  204. for label_prefs in \
  205. "Chrome:$HOME/.config/google-chrome/Default/Preferences" \
  206. "Chromium:$HOME/.config/chromium/Default/Preferences" \
  207. "Brave:$HOME/.config/BraveSoftware/Brave-Browser/Default/Preferences" \
  208. "Edge:$HOME/.config/microsoft-edge/Default/Preferences"; do
  209. label="${label_prefs%%:*}"
  210. prefs="${label_prefs#*:}"
  211. [[ -f "$prefs" ]] || continue
  212. mode=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"mode"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  213. templates=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"templates"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  214. if [[ -n "$mode" ]]; then
  215. browser_findings+=" $label DoH: mode=$mode${templates:+, server=$templates}\n"
  216. else
  217. browser_findings+=" $label installed, DoH: not configured (system DNS)\n"
  218. fi
  219. done
  220. for fx_prefs in "$HOME/.mozilla/firefox"/*.default*/prefs.js; do
  221. [[ -f "$fx_prefs" ]] || continue
  222. trr_mode=$(awk -F'"' '/"network.trr.mode"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  223. trr_uri=$(awk -F'"' '/"network.trr.uri"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  224. case "${trr_mode:-0}" in
  225. 2) state="enabled (with system fallback)" ;;
  226. 3) state="enabled (no fallback)" ;;
  227. 5) state="disabled by policy" ;;
  228. *) state="off (system DNS)" ;;
  229. esac
  230. browser_findings+=" Firefox DoH: $state${trr_uri:+, server=$trr_uri}\n"
  231. break
  232. done
  233. if [[ -n "$browser_findings" ]]; then
  234. info " Browser DoH state (browsers may bypass system DNS):"
  235. printf '%b' "$browser_findings"
  236. fi
  237. KNOWN=(
  238. /etc/openvpn /etc/wireguard /opt/cisco /etc/proton-vpn /etc/mullvad-vpn
  239. /opt/nordvpn /etc/NetworkManager/dnsmasq.d /etc/dnsmasq.d
  240. /etc/cloudflared /etc/nextdns.conf
  241. )
  242. for p in "${KNOWN[@]}"; do
  243. [[ -e "$p" ]] && echo " Found: $p"
  244. done
  245. # Running VPN / DNS proxy processes
  246. echo " VPN / DNS proxy processes:"
  247. pgrep -af 'openvpn|wireguard|wg-quick|mullvad|proton|nordvpn|cloudflared|nextdns|dnsmasq|stubby|dnscrypt' 2>/dev/null | head -10 | sed 's/^/ /' || true
  248. # ---------------------------------------------------------------------------
  249. section "8. ENVIRONMENT (WSL / container detection)"
  250. # ---------------------------------------------------------------------------
  251. env_type=""
  252. if [[ -f /proc/sys/fs/binfmt_misc/WSLInterop ]] || grep -qi microsoft /proc/version 2>/dev/null; then
  253. env_type="WSL2"
  254. elif [[ -f /.dockerenv ]]; then
  255. env_type="Docker container"
  256. elif grep -qE 'docker|containerd|kubepods' /proc/1/cgroup 2>/dev/null; then
  257. env_type="container (cgroup signature)"
  258. fi
  259. if [[ -z "$env_type" ]]; then
  260. info " Bare-metal / VM Linux (no WSL/container signature)"
  261. else
  262. info " Detected environment: $env_type"
  263. case "$env_type" in
  264. WSL2*)
  265. info " WSL2 has bespoke DNS handling. Key files if DNS misbehaves:"
  266. info " /etc/wsl.conf — controls generateResolvConf"
  267. info " /etc/resolv.conf — auto-generated by WSL unless wsl.conf opts out"
  268. info " Host Windows DNS — affects WSL DNS via mirrored mode"
  269. info " Fix pattern: edit /etc/wsl.conf, set [network] generateResolvConf=false, write static /etc/resolv.conf"
  270. [[ -f /etc/wsl.conf ]] && { info " --- /etc/wsl.conf ---"; sed 's/^/ /' /etc/wsl.conf; }
  271. info " --- /etc/resolv.conf head ---"
  272. head -5 /etc/resolv.conf 2>/dev/null | sed 's/^/ /'
  273. ;;
  274. Docker*|container*)
  275. info " Container DNS inherits from host or --dns flag at run time."
  276. info " /etc/resolv.conf here is set by runtime, not user."
  277. info " If broken inside container but fine on host: check 'docker network inspect' / runtime config."
  278. ;;
  279. esac
  280. fi
  281. emit_summary
  282. if [[ "$JSON_MODE" -eq 0 ]]; then
  283. if [[ -n "$FIRST_FAIL" ]]; then
  284. case "$FIRST_FAIL" in
  285. *"LINK LAYER"*) echo " Next: check ip link / ip addr, DHCP, NetworkManager state" ;;
  286. *"SOCKET"*) echo " Next: check iptables/nftables OUTPUT chain; AV protocol filtering; consumer router DoH IP blocking" ;;
  287. *"ICMP"*|*"IP /"*) echo " Next: check ip route, ISP/upstream connectivity" ;;
  288. *"DNS INFRASTRUCTURE"*) echo " Next: check UDP/53 outbound, /etc/resolv.conf upstream" ;;
  289. *"RESOLVER PATH"*) echo " Next: bash scripts/linux/dns-audit.sh # drill rung 5 (the hook layer)" ;;
  290. *"APPLICATION"*) echo " Next: check http_proxy/https_proxy env, CA bundle, IPv6 preference" ;;
  291. *) echo " Next: re-run with --verbose; check references/common-culprits.md" ;;
  292. esac
  293. fi
  294. echo
  295. echo "=== END PROBE ==="
  296. fi