probe.sh 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. #!/usr/bin/env bash
  2. # net-ops :: linux/probe.sh
  3. # Full layered diagnostic ladder for Linux network troubleshooting.
  4. # Outputs structured [PASS]/[FAIL] lines so a human or LLM can scan for
  5. # the first FAIL and drill in.
  6. set -u
  7. TEST_HOST="${TEST_HOST:-google.com}"
  8. TEST_IPS=("1.1.1.1" "8.8.8.8")
  9. TIMEOUT="${TIMEOUT:-5}"
  10. for arg in "$@"; do
  11. case "$arg" in
  12. --help|-h)
  13. cat <<EOF
  14. Usage: $0 [--redact] [--json] [--quick]
  15. --redact Mask private IPs, MAC addresses, and *.ts.net tailnet names
  16. --json Newline-delimited JSON output (for piping to jq, dashboards)
  17. --quick Skip rungs 1-4 and 7 if the last full run cached as healthy
  18. (cache: \${TMPDIR:-/tmp}/net-ops/last-state.json, TTL 10min)
  19. Compose freely: --json + --redact emits sanitized NDJSON.
  20. Env: TEST_HOST (default google.com), TIMEOUT (default 5s).
  21. EOF
  22. exit 0 ;;
  23. esac
  24. done
  25. # shellcheck source=../_lib/redact.sh
  26. source "$(dirname "$0")/../_lib/redact.sh"
  27. # shellcheck source=../_lib/output.sh
  28. source "$(dirname "$0")/../_lib/output.sh"
  29. PANEL_TITLE="linux probe"
  30. parse_redact_flag "$@"
  31. parse_output_flags "$@"
  32. maybe_redact_self "$@"
  33. # ---------------------------------------------------------------------------
  34. section "1. LINK LAYER"
  35. # ---------------------------------------------------------------------------
  36. ip -br link 2>/dev/null | awk '$2=="UP"{print $1}' | while read -r dev; do
  37. [[ "$dev" == "lo" ]] && continue
  38. addr=$(ip -br -4 addr show "$dev" 2>/dev/null | awk '{print $3}')
  39. pass "Interface $dev UP" "${addr:-no IPv4}"
  40. done
  41. GATEWAY=$(ip route show default 2>/dev/null | awk '/default/{print $3; exit}')
  42. DEFAULT_IF=$(ip route show default 2>/dev/null | awk '/default/{print $5; exit}')
  43. [[ -n "$GATEWAY" ]] && pass "Default gateway" "$GATEWAY via $DEFAULT_IF" || fail "Default gateway" "none configured"
  44. # ---------------------------------------------------------------------------
  45. section "2. IP / ICMP REACHABILITY"
  46. # ---------------------------------------------------------------------------
  47. [[ -n "${GATEWAY:-}" ]] && {
  48. if ping -c 2 -W "$TIMEOUT" "$GATEWAY" >/dev/null 2>&1; then pass "Ping gateway $GATEWAY"; else fail "Ping gateway $GATEWAY"; fi
  49. }
  50. for ip in "${TEST_IPS[@]}"; do
  51. if ping -c 2 -W "$TIMEOUT" "$ip" >/dev/null 2>&1; then pass "Ping $ip"; else fail "Ping $ip"; fi
  52. done
  53. # ---------------------------------------------------------------------------
  54. section "3. TCP/UDP SOCKET REACHABILITY"
  55. # ---------------------------------------------------------------------------
  56. for ip in "${TEST_IPS[@]}"; do
  57. if timeout "$TIMEOUT" bash -c "</dev/tcp/$ip/443" 2>/dev/null; then pass "TCP/443 -> $ip"; else fail "TCP/443 -> $ip"; fi
  58. if timeout "$TIMEOUT" bash -c "</dev/tcp/$ip/53" 2>/dev/null; then pass "TCP/53 -> $ip"; else fail "TCP/53 -> $ip"; fi
  59. done
  60. # Raw UDP/53 via dig with explicit server — bypasses /etc/resolv.conf
  61. for ip in "${TEST_IPS[@]}"; do
  62. if result=$(dig +short +time="$TIMEOUT" +tries=1 @"$ip" "$TEST_HOST" 2>&1) && [[ -n "$result" ]] && [[ ! "$result" =~ "timed out"|"connection refused" ]]; then
  63. pass "UDP/53 -> $ip (dig)" "$(echo "$result" | head -1)"
  64. else
  65. fail "UDP/53 -> $ip (dig)" "$result"
  66. fi
  67. done
  68. # ---------------------------------------------------------------------------
  69. section "4. DNS INFRASTRUCTURE (bypass tools)"
  70. # ---------------------------------------------------------------------------
  71. # dig uses its own resolver — does NOT touch glibc NSS chain
  72. for srv in "" "${TEST_IPS[@]}"; do
  73. if [[ -z "$srv" ]]; then
  74. out=$(dig +short +time="$TIMEOUT" +tries=1 "$TEST_HOST" 2>&1)
  75. label="default"
  76. else
  77. out=$(dig +short +time="$TIMEOUT" +tries=1 @"$srv" "$TEST_HOST" 2>&1)
  78. label="$srv"
  79. fi
  80. if [[ -n "$out" && ! "$out" =~ "timed out"|"connection refused" ]]; then
  81. pass "dig via $label" "$(echo "$out" | head -1)"
  82. else
  83. fail "dig via $label" "$out"
  84. fi
  85. done
  86. # ---------------------------------------------------------------------------
  87. section "5. LINUX RESOLVER PATH (the hook layer)"
  88. # ---------------------------------------------------------------------------
  89. # getent uses glibc NSS — goes through the whole system resolver chain
  90. if out=$(getent hosts "$TEST_HOST" 2>&1) && [[ -n "$out" ]]; then
  91. addr=$(echo "$out" | awk '{print $1; exit}')
  92. pass "getent hosts (NSS path)" "$addr"
  93. else
  94. fail "getent hosts (NSS path)" "$out"
  95. fi
  96. # resolvectl query if systemd-resolved present
  97. if command -v resolvectl >/dev/null 2>&1; then
  98. if out=$(resolvectl query "$TEST_HOST" 2>&1) && echo "$out" | grep -q "^$TEST_HOST:"; then
  99. addr=$(echo "$out" | awk '/^[^:]+:.+[0-9]+\./{print $2; exit}')
  100. pass "resolvectl query" "$addr"
  101. else
  102. fail "resolvectl query" "$(echo "$out" | head -2)"
  103. fi
  104. fi
  105. # nsswitch.conf — name resolution order
  106. echo " /etc/nsswitch.conf hosts line:"
  107. grep "^hosts:" /etc/nsswitch.conf 2>/dev/null | sed 's/^/ /'
  108. # /etc/resolv.conf — is it the systemd-resolved stub, NetworkManager's, or static?
  109. echo " /etc/resolv.conf:"
  110. if [[ -L /etc/resolv.conf ]]; then
  111. target=$(readlink /etc/resolv.conf)
  112. echo " symlink -> $target"
  113. fi
  114. head -5 /etc/resolv.conf 2>/dev/null | sed 's/^/ /'
  115. # Active resolver listeners on 127.x:53
  116. echo " Local DNS listeners on 127.0.0.x:53:"
  117. ss -tulnp 2>/dev/null | awk '$5 ~ /^127\./ && $5 ~ /:53$/' | sed 's/^/ /' || true
  118. # systemd-resolved status (if present)
  119. if systemctl is-active systemd-resolved >/dev/null 2>&1; then
  120. echo " systemd-resolved active. Per-link DNS:"
  121. resolvectl status 2>/dev/null | awk '
  122. /^Link [0-9]+/{link=$0; show=0; printed=0}
  123. /Current DNS Server:|DNS Servers:|DNS Domain:/{
  124. if(!printed){print " "link; printed=1}
  125. print " "$0
  126. }
  127. ' | head -40
  128. fi
  129. # ---------------------------------------------------------------------------
  130. # Time-sync deep-dive: HTTP Date drift + check timedatectl/chrony/ntpd status
  131. remote_date=$(curl -sIA 'net-ops-probe' --max-time 5 https://www.google.com 2>/dev/null | awk -F': ' 'tolower($1)=="date"{print $2; exit}' | tr -d '\r')
  132. drift_ok=1
  133. drift_detail=""
  134. if [[ -n "$remote_date" ]]; then
  135. remote_epoch=$(date -d "$remote_date" +%s 2>/dev/null)
  136. if [[ -n "$remote_epoch" ]]; then
  137. local_epoch=$(date +%s)
  138. drift=$(( local_epoch - remote_epoch ))
  139. abs_drift=${drift#-}
  140. if [[ "$abs_drift" -lt 300 ]]; then
  141. drift_detail="${drift}s vs HTTP Date (within ±5min)"
  142. else
  143. drift_ok=0
  144. drift_detail="${drift}s drift — will break TLS cert validation"
  145. fi
  146. fi
  147. fi
  148. # Detect which time daemon and its sync state
  149. sync_detail=""
  150. if command -v timedatectl >/dev/null 2>&1; then
  151. sync_state=$(timedatectl show 2>/dev/null | awk -F= '/^NTPSynchronized=/{print $2}')
  152. sync_detail="systemd-timesyncd NTPSynchronized=$sync_state"
  153. elif command -v chronyc >/dev/null 2>&1; then
  154. stratum=$(chronyc tracking 2>/dev/null | awk -F': ' '/Stratum/{print $2}')
  155. sync_detail="chronyd stratum=$stratum"
  156. [[ "$stratum" == "16" ]] && drift_ok=0
  157. elif command -v ntpq >/dev/null 2>&1; then
  158. sync_detail="ntpd present (run 'ntpq -p' for peer status)"
  159. fi
  160. combined="$drift_detail${sync_detail:+; $sync_detail}"
  161. if [[ "$drift_ok" -eq 1 ]]; then
  162. pass "Time sync" "$combined"
  163. else
  164. fail "Time sync" "$combined"
  165. fi
  166. # MTU / path-MTU discovery. Linux uses -M do (don't fragment).
  167. if ping -M do -s 1472 -c 1 -W 3 1.1.1.1 >/dev/null 2>&1; then
  168. pass "Path MTU 1500 (1472-byte DF payload)" "to 1.1.1.1"
  169. else
  170. if ping -M do -s 1400 -c 1 -W 3 1.1.1.1 >/dev/null 2>&1; then
  171. fail "Path MTU 1500 (1472-byte DF payload)" "1500 fails, 1428+ works — path MTU < 1500 (VPN/PPPoE?)"
  172. else
  173. pass "Path MTU test inconclusive" "ICMP DF blocked or destination unreachable"
  174. fi
  175. fi
  176. # IPv6 deep-dive — classifies v6 stack state across four meaningful tiers.
  177. v6_state=""
  178. v6_detail=""
  179. v6_addrs=$(ip -6 -br addr show scope global 2>/dev/null | awk '{for(i=3;i<=NF;i++) print $1" "$i}' | grep -v '^lo ')
  180. v6_global=$(printf '%s\n' "$v6_addrs" | awk '$2 !~ /^fd/ && $2 !~ /^fc/{print; exit}')
  181. v6_default=$(ip -6 route show default 2>/dev/null | head -1)
  182. if [[ -z "$v6_addrs" ]]; then
  183. v6_state="disabled"
  184. v6_detail="no global v6 addresses — IPv6 disabled or unconfigured (check sysctl net.ipv6.conf.all.disable_ipv6)"
  185. elif [[ -z "$v6_global" ]]; then
  186. v6_state="ula_only"
  187. v6_detail="only ULA (fc00::/7) addresses present — router not delegating public v6 prefix"
  188. elif [[ -z "$v6_default" ]]; then
  189. v6_state="no_route"
  190. v6_detail="global v6 address present but no default route — RA not received (check accept_ra sysctl)"
  191. else
  192. aaaa=$(dig +short +time=2 +tries=1 AAAA "$TEST_HOST" 2>/dev/null | head -1)
  193. if [[ -n "$aaaa" ]] && curl -6 -sS -o /dev/null --max-time 4 "https://$TEST_HOST" 2>/dev/null; then
  194. v6_state="healthy"
  195. v6_detail="global addr + default route + curl -6 works"
  196. else
  197. v6_state="path_broken"
  198. v6_detail="addr present, default route present, but curl -6 fails — firewall or ISP black-holing"
  199. fi
  200. fi
  201. case "$v6_state" in
  202. disabled|healthy) pass "IPv6 stack ($v6_state)" "$v6_detail" ;;
  203. *) fail "IPv6 stack ($v6_state)" "$v6_detail" ;;
  204. esac
  205. # ---------------------------------------------------------------------------
  206. section "6. APPLICATION LAYER (real HTTP request)"
  207. # ---------------------------------------------------------------------------
  208. for url in "https://www.google.com" "https://github.com"; do
  209. if out=$(curl -sS -o /dev/null -w "%{http_code} %{size_download}b" --max-time "$TIMEOUT" "$url" 2>&1); then
  210. pass "GET $url" "$out"
  211. else
  212. fail "GET $url" "$out"
  213. fi
  214. done
  215. # ---------------------------------------------------------------------------
  216. section "7. KNOWN VPN / DNS CLIENT FOOTPRINT"
  217. # ---------------------------------------------------------------------------
  218. # Browser DoH state — Chrome / Brave / Edge / Firefox bypass system DNS when DoH set.
  219. browser_findings=""
  220. for label_prefs in \
  221. "Chrome:$HOME/.config/google-chrome/Default/Preferences" \
  222. "Chromium:$HOME/.config/chromium/Default/Preferences" \
  223. "Brave:$HOME/.config/BraveSoftware/Brave-Browser/Default/Preferences" \
  224. "Edge:$HOME/.config/microsoft-edge/Default/Preferences"; do
  225. label="${label_prefs%%:*}"
  226. prefs="${label_prefs#*:}"
  227. [[ -f "$prefs" ]] || continue
  228. mode=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"mode"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  229. templates=$(perl -ne 'if (/"dns_over_https"\s*:\s*\{[^}]*"templates"\s*:\s*"([^"]+)"/) { print "$1\n"; exit }' "$prefs" 2>/dev/null)
  230. if [[ -n "$mode" ]]; then
  231. browser_findings+=" $label DoH: mode=$mode${templates:+, server=$templates}\n"
  232. else
  233. browser_findings+=" $label installed, DoH: not configured (system DNS)\n"
  234. fi
  235. done
  236. for fx_prefs in "$HOME/.mozilla/firefox"/*.default*/prefs.js; do
  237. [[ -f "$fx_prefs" ]] || continue
  238. trr_mode=$(awk -F'"' '/"network.trr.mode"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  239. trr_uri=$(awk -F'"' '/"network.trr.uri"/{print $4; exit}' "$fx_prefs" 2>/dev/null)
  240. case "${trr_mode:-0}" in
  241. 2) state="enabled (with system fallback)" ;;
  242. 3) state="enabled (no fallback)" ;;
  243. 5) state="disabled by policy" ;;
  244. *) state="off (system DNS)" ;;
  245. esac
  246. browser_findings+=" Firefox DoH: $state${trr_uri:+, server=$trr_uri}\n"
  247. break
  248. done
  249. if [[ -n "$browser_findings" ]]; then
  250. info " Browser DoH state (browsers may bypass system DNS):"
  251. printf '%b' "$browser_findings"
  252. fi
  253. KNOWN=(
  254. /etc/openvpn /etc/wireguard /opt/cisco /etc/proton-vpn /etc/mullvad-vpn
  255. /opt/nordvpn /etc/NetworkManager/dnsmasq.d /etc/dnsmasq.d
  256. /etc/cloudflared /etc/nextdns.conf
  257. )
  258. for p in "${KNOWN[@]}"; do
  259. [[ -e "$p" ]] && echo " Found: $p"
  260. done
  261. # Running VPN / DNS proxy processes
  262. echo " VPN / DNS proxy processes:"
  263. pgrep -af 'openvpn|wireguard|wg-quick|mullvad|proton|nordvpn|cloudflared|nextdns|dnsmasq|stubby|dnscrypt' 2>/dev/null | head -10 | sed 's/^/ /' || true
  264. # ---------------------------------------------------------------------------
  265. section "8. ENVIRONMENT (WSL / container detection)"
  266. # ---------------------------------------------------------------------------
  267. env_type=""
  268. if [[ -f /proc/sys/fs/binfmt_misc/WSLInterop ]] || grep -qi microsoft /proc/version 2>/dev/null; then
  269. env_type="WSL2"
  270. elif [[ -f /.dockerenv ]]; then
  271. env_type="Docker container"
  272. elif grep -qE 'docker|containerd|kubepods' /proc/1/cgroup 2>/dev/null; then
  273. env_type="container (cgroup signature)"
  274. fi
  275. if [[ -z "$env_type" ]]; then
  276. info " Bare-metal / VM Linux (no WSL/container signature)"
  277. else
  278. info " Detected environment: $env_type"
  279. case "$env_type" in
  280. WSL2*)
  281. info " WSL2 has bespoke DNS handling. Key files if DNS misbehaves:"
  282. info " /etc/wsl.conf — controls generateResolvConf"
  283. info " /etc/resolv.conf — auto-generated by WSL unless wsl.conf opts out"
  284. info " Host Windows DNS — affects WSL DNS via mirrored mode"
  285. info " Fix pattern: edit /etc/wsl.conf, set [network] generateResolvConf=false, write static /etc/resolv.conf"
  286. [[ -f /etc/wsl.conf ]] && { info " --- /etc/wsl.conf ---"; sed 's/^/ /' /etc/wsl.conf; }
  287. info " --- /etc/resolv.conf head ---"
  288. head -5 /etc/resolv.conf 2>/dev/null | sed 's/^/ /'
  289. ;;
  290. Docker*|container*)
  291. info " Container DNS inherits from host or --dns flag at run time."
  292. info " /etc/resolv.conf here is set by runtime, not user."
  293. info " If broken inside container but fine on host: check 'docker network inspect' / runtime config."
  294. ;;
  295. esac
  296. fi
  297. emit_summary
  298. if [[ "$JSON_MODE" -eq 0 ]]; then
  299. if [[ -n "$FIRST_FAIL" ]]; then
  300. case "$FIRST_FAIL" in
  301. *"LINK LAYER"*) echo " Next: check ip link / ip addr, DHCP, NetworkManager state" ;;
  302. *"SOCKET"*) echo " Next: check iptables/nftables OUTPUT chain; AV protocol filtering; consumer router DoH IP blocking" ;;
  303. *"ICMP"*|*"IP /"*) echo " Next: check ip route, ISP/upstream connectivity" ;;
  304. *"DNS INFRASTRUCTURE"*) echo " Next: check UDP/53 outbound, /etc/resolv.conf upstream" ;;
  305. *"RESOLVER PATH"*) echo " Next: bash scripts/linux/dns-audit.sh # drill rung 5 (the hook layer)" ;;
  306. *"APPLICATION"*) echo " Next: check http_proxy/https_proxy env, CA bundle, IPv6 preference" ;;
  307. *) echo " Next: re-run with --verbose; check references/common-culprits.md" ;;
  308. esac
  309. fi
  310. echo
  311. echo "=== END PROBE ==="
  312. fi