disk-health.sh 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. #!/usr/bin/env bash
  2. # mac-ops :: disk-health.sh
  3. # Per-disk / per-volume deep dive. Maps APFS containers, surfaces IO errors
  4. # from the unified log, reports SMART status (where macOS exposes it), and
  5. # checks snapshot bloat.
  6. #
  7. # Usage:
  8. # scripts/disk-health.sh # all disks
  9. # scripts/disk-health.sh -d disk2 # by /dev/diskN
  10. # scripts/disk-health.sh -v /Volumes/Foo # by mount point
  11. # scripts/disk-health.sh -v / # boot volume
  12. set -u
  13. TARGET_DEV=""
  14. TARGET_VOL=""
  15. DAYS=30
  16. while [[ $# -gt 0 ]]; do
  17. case "$1" in
  18. -d|--disk) TARGET_DEV="$2"; shift 2 ;;
  19. -v|--volume) TARGET_VOL="$2"; shift 2 ;;
  20. --days) DAYS="$2"; shift 2 ;;
  21. --help|-h)
  22. cat <<EOF
  23. Usage: $0 [options]
  24. -d, --disk diskN Inspect specific device (e.g. disk2)
  25. -v, --volume PATH Inspect by mount point (e.g. / or /Volumes/X)
  26. --days N Log lookback window (default: 30)
  27. --json, --redact, --quiet, --verbose Standard flags
  28. Output sections:
  29. 1. Device summary (model, size, bus, SMART status)
  30. 2. APFS container + volume layout (if APFS)
  31. 3. IO errors via unified log (last --days)
  32. 4. Snapshot bloat (Time Machine local snapshots)
  33. 5. Free space / purgeable space breakdown
  34. 6. Mount + fsck verification status
  35. EOF
  36. exit 0 ;;
  37. *) shift ;;
  38. esac
  39. done
  40. source "$(dirname "$0")/_lib/common.sh"
  41. parse_common_flags "$@"
  42. maybe_filter_self "$@"
  43. # Resolve target → /dev/diskN
  44. resolve_target() {
  45. if [[ -n "$TARGET_DEV" ]]; then
  46. echo "${TARGET_DEV#/dev/}"
  47. return
  48. fi
  49. if [[ -n "$TARGET_VOL" ]]; then
  50. diskutil info "$TARGET_VOL" 2>/dev/null | awk -F': *' '/Device Identifier/{print $2; exit}'
  51. return
  52. fi
  53. # No target — return empty (we'll iterate all)
  54. echo ""
  55. }
  56. disk_id=$(resolve_target)
  57. # ----------------------------------------------------------------------------
  58. section "1. DEVICE SUMMARY"
  59. # ----------------------------------------------------------------------------
  60. if [[ -n "$disk_id" ]]; then
  61. targets=("$disk_id")
  62. else
  63. # All physical disks (not partitions / synthesized)
  64. mapfile -t targets < <(diskutil list 2>/dev/null | awk '/^\/dev\/disk[0-9]+ /{gsub("/dev/",""); print $1}' | sort -u | head -20)
  65. fi
  66. for d in "${targets[@]}"; do
  67. [[ -z "$d" ]] && continue
  68. info=$(diskutil info "$d" 2>/dev/null)
  69. [[ -z "$info" ]] && { log_warn "diskutil info $d" "no data"; continue; }
  70. model=$(echo "$info" | awk -F': *' '/Device \/ Media Name/{print $2; exit}')
  71. bus=$(echo "$info" | awk -F': *' '/Protocol/{print $2; exit}')
  72. size=$(echo "$info" | awk -F': *' '/Disk Size/{print $2; exit}')
  73. smart=$(echo "$info" | awk -F': *' '/SMART Status/{print $2; exit}')
  74. internal=$(echo "$info" | awk -F': *' '/Device Location/{print $2; exit}')
  75. note " /dev/$d"
  76. note " Model: ${model:-(unknown)}"
  77. note " Bus: ${bus:-?} Location: ${internal:-?}"
  78. note " Size: ${size:-?}"
  79. case "$smart" in
  80. Verified)
  81. log_pass "/dev/$d SMART status" "Verified" ;;
  82. Failing|Failed)
  83. log_fail "/dev/$d SMART status" "$smart — back up immediately, do not write to drive" ;;
  84. "Not Supported"|"")
  85. log_info "/dev/$d SMART status" "${smart:-(not exposed; macOS limitation for many NVMe drives)}" ;;
  86. *)
  87. log_warn "/dev/$d SMART status" "$smart" ;;
  88. esac
  89. done
  90. # ----------------------------------------------------------------------------
  91. section "2. APFS CONTAINERS + VOLUMES"
  92. # ----------------------------------------------------------------------------
  93. if [[ -n "$disk_id" ]]; then
  94. diskutil apfs list "$disk_id" 2>/dev/null | sed 's/^/ /' | head -60
  95. else
  96. diskutil apfs list 2>/dev/null | sed 's/^/ /' | head -80
  97. fi
  98. # Volumes per target (with free space)
  99. note ""
  100. note " Mounted APFS volumes:"
  101. df -h | awk 'NR==1 || /\/dev\/disk.* apfs|\/dev\/disk.*\/Volumes/{print " " $0}' | head -12
  102. # ----------------------------------------------------------------------------
  103. section "3. IO ERRORS (unified log, last ${DAYS}d)"
  104. # ----------------------------------------------------------------------------
  105. io_lines=$(log show --last "${DAYS}d" --style compact \
  106. --predicate '(subsystem == "com.apple.iokit" OR subsystem == "com.apple.kernel") AND (eventMessage CONTAINS[c] "I/O error" OR eventMessage CONTAINS[c] "media error" OR eventMessage CONTAINS[c] "MEDIA_ERROR" OR eventMessage CONTAINS[c] "device timeout")' \
  107. 2>/dev/null)
  108. io_count=$(echo "$io_lines" | grep -c . || echo 0)
  109. if [[ "$io_count" -gt 50 ]]; then
  110. log_fail "IO errors in log" "$io_count events — active failure"
  111. note " Sample (first 5):"
  112. echo "$io_lines" | head -5 | sed 's/^/ /'
  113. elif [[ "$io_count" -gt 5 ]]; then
  114. log_warn "IO errors in log" "$io_count events"
  115. note " Sample (first 3):"
  116. echo "$io_lines" | head -3 | sed 's/^/ /'
  117. elif [[ "$io_count" -gt 0 ]]; then
  118. log_info "IO errors in log" "$io_count events (occasional events normal)"
  119. else
  120. log_pass "IO errors in log" "0"
  121. fi
  122. # APFS-specific corruption signal
  123. apfs_errors=$(log show --last "${DAYS}d" --style compact \
  124. --predicate 'eventMessage CONTAINS "apfs" AND (messageType == "Error" OR messageType == "Fault")' \
  125. 2>/dev/null | wc -l | tr -d ' ')
  126. if [[ "$apfs_errors" -gt 10 ]]; then
  127. log_warn "APFS error/fault events" "$apfs_errors"
  128. else
  129. log_pass "APFS error/fault events" "$apfs_errors"
  130. fi
  131. # ----------------------------------------------------------------------------
  132. section "4. APFS SNAPSHOT BLOAT"
  133. # ----------------------------------------------------------------------------
  134. # Per-volume snapshot count
  135. mount | awk '/apfs/{print $3}' | while read -r mnt; do
  136. [[ -z "$mnt" ]] && continue
  137. snap_count=$(tmutil listlocalsnapshots "$mnt" 2>/dev/null | grep -c "com.apple" || echo 0)
  138. if [[ "$snap_count" -gt 20 ]]; then
  139. log_warn "Snapshots on $mnt" "$snap_count — purgeable space tied up"
  140. elif [[ "$snap_count" -gt 0 ]]; then
  141. log_info "Snapshots on $mnt" "$snap_count"
  142. else
  143. log_pass "Snapshots on $mnt" "0"
  144. fi
  145. done
  146. # ----------------------------------------------------------------------------
  147. section "5. FREE SPACE / PURGEABLE BREAKDOWN"
  148. # ----------------------------------------------------------------------------
  149. if [[ -n "$TARGET_VOL" ]]; then
  150. volumes=("$TARGET_VOL")
  151. else
  152. mapfile -t volumes < <(mount | awk '/apfs/{print $3}' | head -6)
  153. fi
  154. for v in "${volumes[@]}"; do
  155. [[ -d "$v" ]] || continue
  156. df_line=$(df -h "$v" 2>/dev/null | tail -1)
  157. free_pct=$(echo "$df_line" | awk '{gsub("%","",$5); print 100-$5}')
  158. free_gb=$(echo "$df_line" | awk '{print $4}')
  159. note " $v: ${free_gb} free (${free_pct}%)"
  160. if [[ "$free_pct" -lt 10 ]]; then
  161. log_warn "Free space on $v" "${free_pct}% — low"
  162. else
  163. log_pass "Free space on $v" "${free_pct}%"
  164. fi
  165. # Purgeable space from APFS (requires diskutil apfs)
  166. purgeable=$(diskutil apfs list 2>/dev/null | awk -v vol="$v" '
  167. $0 ~ vol {found=1}
  168. found && /Capacity In Use/{print $NF; found=0; exit}
  169. ')
  170. done
  171. # ----------------------------------------------------------------------------
  172. section "6. VOLUME VERIFICATION (read-only)"
  173. # ----------------------------------------------------------------------------
  174. # Only verify the target if we have one; iterating all volumes is slow + noisy.
  175. if [[ -n "$TARGET_VOL" ]]; then
  176. verify_target="$TARGET_VOL"
  177. elif [[ -n "$disk_id" ]]; then
  178. verify_target="$disk_id"
  179. else
  180. verify_target=""
  181. fi
  182. if [[ -n "$verify_target" ]]; then
  183. note " Running: diskutil verifyVolume $verify_target (read-only)"
  184. if diskutil verifyVolume "$verify_target" 2>&1 | grep -q "appears to be OK"; then
  185. log_pass "verifyVolume $verify_target" "OK"
  186. else
  187. log_warn "verifyVolume $verify_target" "did not return clean (may need sudo or already in use)"
  188. fi
  189. else
  190. note " (skipped — pass -v or -d to verify a specific target)"
  191. fi
  192. emit_summary
  193. if [[ "$JSON_MODE" -eq 0 ]]; then
  194. echo
  195. note " Drilldowns:"
  196. note " drive-dependencies.sh -v <mount> # check what references a volume"
  197. note " storage-pressure.sh # snapshot bloat detail"
  198. note " recover-clone.sh # safely image data off a failing drive"
  199. fi