# disk-health.ps1

  <#
  .SYNOPSIS
      Focused per-drive health report — every diagnostic signal for one
      specific physical disk in one report.
  .DESCRIPTION
      Drill-down companion to health-audit.ps1. Targets a single physical
      disk (by number, drive letter, or model substring) and emits:
      - Hardware identification (model, serial, firmware, capacity)
      - SMART reliability counters (Windows native + smartctl if installed)
      - All disk-provider events for the disk over the time window
      - All storahci controller resets (skill correlates port to drive)
      - Per-event-ID breakdown with severity classification
      - Recovery clues — failing-LBA distribution, time-clustering
      - System dependencies — quick summary (uses drive-dependencies.ps1
        if available, else inline check)
  .PARAMETER DiskNumber
      Physical disk number from Get-Disk. Mutually exclusive with -DriveLetter
      and -Model.
  .PARAMETER DriveLetter
      Drive letter — resolves to the underlying physical disk.
  .PARAMETER Model
      Model substring match (e.g. 'HGST', '980 PRO'). Picks the first match.
  .PARAMETER Days
      Days back to scan event logs. Default: 60.
  .PARAMETER Json
      Machine-readable JSON output.
  .EXAMPLE
      scripts/disk-health.ps1 -DiskNumber 1
      Focused report on physical disk 1.
  .EXAMPLE
      scripts/disk-health.ps1 -DriveLetter Y -Days 30
      Drill on the disk that hosts Y:, 30-day window.
  .EXAMPLE
      scripts/disk-health.ps1 -Model 'HGST' -Json | jq '.errors'
      Find the HGST drive and dump its error counts as JSON.
  .NOTES
      Exit codes:
      0 success — verdict is HEALTHY or WATCHLIST (no failure verdict)
      3 not found — no matching disk
      4 validation — verdict is FAILING (drive shows failure indicators)
      EXIT_USAGE — no disk selector (-DiskNumber, -DriveLetter, or -Model) was supplied
  #>
  [CmdletBinding(DefaultParameterSetName = 'Number')]
  param(
      # Physical disk number. The -1 default is a sentinel meaning "not supplied";
      # the script checks for it explicitly when the 'Number' set is active.
      [Parameter(ParameterSetName = 'Number', Position = 0)]
      [ValidateRange(0, 99)]
      [int]$DiskNumber = -1,

      # A single drive letter (no colon), upper or lower case.
      [Parameter(ParameterSetName = 'Letter')]
      [ValidatePattern('^[A-Za-z]$')]
      [string]$DriveLetter,

      # Text to look for inside the disk's model string.
      [Parameter(ParameterSetName = 'Model')]
      [string]$Model,

      # Size of the event-log look-back window, in days.
      [ValidateRange(1, 365)]
      [int]$Days = 60,

      # Emit the collected result as JSON instead of the terminal panel.
      [switch]$Json
  )
  # Promote errors to terminating errors for the rest of the script.
  $ErrorActionPreference = 'Stop'

  # Dot-source the shared helper libraries into this scope.
  # NOTE(review): the helpers used below (Write-Log, Get-DiskMap, the *-Term*
  # functions, the EXIT_* codes) are not defined in this file and presumably
  # come from these two libraries — confirm.
  $commonLibPath = "$PSScriptRoot\_lib\common.ps1"
  . $commonLibPath
  $termLibPath = Join-Path $PSScriptRoot '..\..\_lib\term.ps1'
  . $termLibPath

  Initialize-Term
  # Resolve target disk
  #
  # Exactly one selector (-DiskNumber, -DriveLetter or -Model) decides which
  # entry of the disk map becomes $target. If nothing matches, the script
  # exits with EXIT_NOT_FOUND; if no usable selector was given, it exits with
  # EXIT_USAGE.
  $disks = Get-DiskMap
  $target = $null
  switch ($PSCmdlet.ParameterSetName) {
      'Number' {
          # -1 is the parameter's "not supplied" sentinel.
          if ($DiskNumber -lt 0) {
              Write-Log -Level FAIL -Message "Provide -DiskNumber, -DriveLetter, or -Model"
              exit $script:EXIT_USAGE
          }
          $target = $disks | Where-Object { $_.Number -eq $DiskNumber } | Select-Object -First 1
      }
      'Letter' {
          # Map the letter to its partition, then the partition to its disk.
          $L = $DriveLetter.ToUpper()
          $part = Get-Partition -ErrorAction SilentlyContinue | Where-Object { $_.DriveLetter -eq $L } | Select-Object -First 1
          if ($part) {
              $target = $disks | Where-Object { $_.Number -eq $part.DiskNumber } | Select-Object -First 1
          }
      }
      'Model' {
          # An empty/blank model would expand to '**' and silently select the
          # first disk, producing a health verdict for the wrong drive.
          if ([string]::IsNullOrWhiteSpace($Model)) {
              Write-Log -Level FAIL -Message "Provide -DiskNumber, -DriveLetter, or -Model"
              exit $script:EXIT_USAGE
          }
          # -Model is documented as a substring match, so escape wildcard
          # metacharacters ([, ], *, ?) and match the text literally.
          $modelPattern = '*' + [System.Management.Automation.WildcardPattern]::Escape($Model) + '*'
          $target = $disks | Where-Object { $_.Model -like $modelPattern } | Select-Object -First 1
      }
  }
  if (-not $target) {
      Write-Log -Level FAIL -Message "No matching disk found"
      exit $script:EXIT_NOT_FOUND
  }
  # Collect data
  #
  # $result is the single report object: the JSON output serialises it and
  # the terminal output reads from it. Keys are added in a fixed order so the
  # JSON layout stays stable.
  $result = [ordered]@{}

  # Identification, copied straight from the resolved disk-map entry.
  $result['diskNumber']     = $target.Number
  $result['model']          = $target.Model
  $result['serial']         = $target.SerialNumber
  $result['firmware']       = $target.FirmwareVersion
  $result['mediaType']      = $target.MediaType
  $result['busType']        = $target.BusType
  $result['sizeGB']         = $target.SizeGB
  $result['driveLetters']   = $target.DriveLetters
  $result['healthStatus']   = $target.HealthStatus
  $result['windowDays']     = $Days

  # Placeholders filled in by the collection and classification steps below.
  $result['smart']          = $null
  $result['eventCounts']    = @{}
  $result['eventSamples']   = @()
  $result['storahciResets'] = 0
  $result['verdict']        = 'unknown'
  $result['indicators']     = @()
  # SMART reliability counter (Windows native)
  #
  # Best effort: when the counters cannot be read, $result.smart stays $null
  # and the report says "unavailable". The reason is written to the verbose
  # stream (run with -Verbose) instead of being discarded, so an
  # "unavailable" result can be diagnosed.
  try {
      # DeviceId is compared against the disk number taken from the disk map.
      $physical = Get-PhysicalDisk | Where-Object { $_.DeviceId -eq $target.Number }
      $rel = $physical | Get-StorageReliabilityCounter -ErrorAction SilentlyContinue
      if ($rel) {
          $result.smart = @{
              temperatureC   = $rel.Temperature
              temperatureMax = $rel.TemperatureMax
              wearPct        = $rel.Wear
              readErrors     = $rel.ReadErrorsTotal
              writeErrors    = $rel.WriteErrorsTotal
              powerOnHours   = $rel.PowerOnHours
              powerCycles    = $rel.PowerCycleCount
              startStops     = $rel.StartStopCycleCount
          }
      } else {
          Write-Verbose "No storage reliability counters returned for disk $($target.Number)"
      }
  } catch {
      Write-Verbose "Reading storage reliability counters failed: $($_.Exception.Message)"
  }
  # smartctl fallback (if smartmontools installed)
  #
  # Runs only when the native counters produced nothing. On success the raw
  # attribute table is stored in $result.smartctlOutput and
  # $result.smartctlAvailable is set; on failure neither key is added.
  $smartctl = Get-Command smartctl.exe -ErrorAction SilentlyContinue
  if ($smartctl -and -not $result.smart) {
      try {
          # On Windows, smartmontools addresses \\.\PhysicalDriveN as /dev/pdN.
          # The /dev/sdX form takes a letter (sda, sdb, ...), so "/dev/sd<number>"
          # is not a valid device name.
          $smartDevice = "/dev/pd$($target.Number)"
          $smartOutput = & {
              # With 'Stop' in effect, Windows PowerShell can turn redirected
              # stderr text from a native command into a terminating error.
              # Relax the preference for this call only (child scope).
              $ErrorActionPreference = 'Continue'
              & $smartctl.Source -A $smartDevice 2>&1 | ForEach-Object { "$_" }
          }
          $smartExit = $LASTEXITCODE
          # smartctl's exit status is a bit mask: bit 0 = command line did not
          # parse, bit 1 = device could not be opened. Higher bits describe
          # the disk's condition and still come with usable output.
          if ($smartOutput -and (($smartExit -band 3) -eq 0)) {
              $result.smartctlAvailable = $true
              $result.smartctlOutput = ($smartOutput -join "`n")
          } else {
              Write-Verbose "smartctl returned no usable data for $smartDevice (exit status $smartExit)"
          }
      } catch {
          Write-Verbose "smartctl invocation failed: $($_.Exception.Message)"
      }
  }
  # Disk-provider events for this disk
  #
  # Scans the System log for 'disk' provider events inside the look-back
  # window, keeps only those whose message names the target disk number,
  # tallies them per event ID and keeps the first five as samples.
  try {
      $diskFilter = @{
          LogName      = 'System'
          ProviderName = 'disk'
          StartTime    = (Get-Date).AddDays(-$Days)
      }
      $diskEvents = Get-WinEvent -FilterHashtable $diskFilter -ErrorAction SilentlyContinue
      foreach ($evt in $diskEvents) {
          # The disk number appears in the message text in one of two forms.
          $eventDisk = $null
          if ($evt.Message -match 'Harddisk(\d+)') {
              $eventDisk = [int]$matches[1]
          } elseif ($evt.Message -match '\bfor Disk (\d+)\b') {
              $eventDisk = [int]$matches[1]
          }

          if ($eventDisk -eq $target.Number) {
              # Per-ID tally; a missing key reads as $null, which casts to 0.
              $idKey = "$($evt.Id)"
              $result.eventCounts[$idKey] = 1 + [int]$result.eventCounts[$idKey]

              if ($result.eventSamples.Count -lt 5) {
                  $sample = @{
                      time    = $evt.TimeCreated.ToString('o')
                      id      = $evt.Id
                      message = (Format-EventMessage -Message $evt.Message -MaxLength 150)
                  }
                  $result.eventSamples += $sample
              }
          }
      }
  } catch {}
  # storahci resets (controller-level; we can't always tie a port to a specific
  # disk number reliably, so report total reset count and let caller correlate
  # via drive enumeration order)
  try {
      $resetFilter = @{
          LogName      = 'System'
          ProviderName = 'storahci'
          Id           = 129
          StartTime    = (Get-Date).AddDays(-$Days)
      }
      $resetEvents = Get-WinEvent -FilterHashtable $resetFilter -ErrorAction SilentlyContinue
      # @() normalises "nothing", "one event" and "many events" to an array,
      # so Count is 0, 1 or n respectively.
      $result.storahciResets = @($resetEvents).Count
  } catch {}
  # Severity classification
  #
  # Turns the collected counts into a verdict (FAILING / WATCHLIST / HEALTHY)
  # plus the list of human-readable indicators that justify it.
  $isSsd = $target.MediaType -eq 'SSD'

  # A missing event ID reads as $null from the hashtable, which casts to 0.
  $ev7   = [int]$result.eventCounts['7']
  $ev51  = [int]$result.eventCounts['51']
  $ev154 = [int]$result.eventCounts['154']

  # Failure thresholds: start from the non-SSD values and tighten them for SSDs.
  $thresholds = @{ event7 = 50; event154 = 10; event51 = 5 }
  if ($isSsd) {
      $thresholds.event7 = 10
      $thresholds.event154 = 5
  }

  # storahci controller resets are not reliably attributable to a specific
  # physical disk number (RaidPort enumeration doesn't always map 1:1 to
  # Disk N). Only count them toward THIS disk's verdict when the disk also
  # shows its own error events — otherwise they're system-wide noise that
  # would falsely blame healthy drives sharing the same controller.
  $ownErrorTotal = $ev7 + $ev154 + $ev51
  $attributedResets = 0
  if ($ownErrorTotal -gt 0) { $attributedResets = $result.storahciResets }

  # Each failure condition contributes one reason; any reason means FAILING.
  $failReasons = @()
  if ($ev7 -gt $thresholds.event7) { $failReasons += "Event 7 (bad block): $ev7 > $($thresholds.event7) threshold" }
  if ($ev154 -gt $thresholds.event154) { $failReasons += "Event 154 (hw error): $ev154 > $($thresholds.event154) threshold" }
  if ($ev51 -gt $thresholds.event51) { $failReasons += "Event 51 (paging error): $ev51 > $($thresholds.event51) threshold" }
  if ($attributedResets -gt 5) { $failReasons += "Controller resets: $attributedResets > 5 threshold" }

  # Softer conditions, consulted only when nothing crossed a failure threshold.
  $watchReasons = @()
  if ($ev7 -gt 5) { $watchReasons += "Event 7 elevated: $ev7" }
  if ($ev154 -gt 2) { $watchReasons += "Event 154 elevated: $ev154" }
  if ($attributedResets -gt 0) { $watchReasons += "Controller resets: $attributedResets" }

  if ($failReasons.Count -gt 0) {
      $result.verdict = 'FAILING'
      $result.indicators = $failReasons
  } elseif ($watchReasons.Count -gt 0) {
      $result.verdict = 'WATCHLIST'
      $result.indicators = $watchReasons
  } else {
      $result.verdict = 'HEALTHY'
  }

  # Always retain the system-wide reset count for context, but flag separately
  $result.systemWideResets = $result.storahciResets
  # Output
  #
  # -Json: serialise $result and write it to the console's standard output.
  # Otherwise: render a terminal panel — header, verdict section with one leaf
  # per indicator, SMART section, footer.
  if ($Json) {
      [Console]::Out.WriteLine(($result | ConvertTo-Json -Depth 5))
  } else {
      $indicator = "Disk $($target.Number) / $($target.DriveLetters)"
      Write-TermLine (New-TermPanelOpen -Brand 'windows-ops' -Name 'windows-ops' -Subtitle 'disk-health' -Indicator $indicator)
      Write-TermLine (New-TermPanelVert)
      Write-TermLine (New-TermSummary -Text "$($target.Model) · $($target.FirmwareVersion) · $($target.SizeGB) GB · $($target.MediaType)/$($target.BusType)")
      Write-TermLine (New-TermPanelVert)

      # Verdict section header carries the state via section-color
      $verdictState = switch ($result.verdict) {
          'FAILING'   { 'FAILING' }
          'WATCHLIST' { 'WARN' }
          'HEALTHY'   { 'PASS' }
      }
      if ($result.indicators) {
          Write-TermLine (New-TermSection -State $verdictState -Label $result.verdict -Count $result.indicators.Count)
          # Each indicator as a leaf with pip bar showing ratio over threshold
          $idxLast = $result.indicators.Count - 1
          for ($i = 0; $i -lt $result.indicators.Count; $i++) {
              $ind = $result.indicators[$i]
              $name = $ind
              $bar = ''
              $meta = ''
              if ($ind -match '^(.+?):\s*(\d+)\s*>\s*(\d+)') {
                  # Threshold form, e.g. "Event 7 (bad block): 1943 > 50 threshold"
                  $name = $matches[1].Trim()
                  $actual = [int]$matches[2]
                  $threshold = [int]$matches[3]
                  $ratio = [math]::Min(100, [int](100 * $threshold / [math]::Max($actual, 1)))
                  # Inverted score: lower ratio = worse (more times over threshold)
                  $bar = New-TermPipBar -Type capacity -Filled (100 - $ratio) -Total 100
                  # Show the count together with how many times over the
                  # threshold it is. A bare "<count>x" would read as a
                  # multiplier while actually being the raw count.
                  $multiplier = [math]::Round($actual / [math]::Max($threshold, 1), 1)
                  $meta = "$actual (${multiplier}x)"
              } elseif ($ind -match '^(.+?):\s*(\d+)\s*$') {
                  # Plain form, e.g. "Event 7 elevated: 12". Anchor on the
                  # number after the colon: an unanchored (\d+) would pick up
                  # the event ID ("7") instead of the count.
                  $name = $matches[1].Trim()
                  $meta = $matches[2]
              }
              Write-TermLine (New-TermLeaf -Name $name -Rail $bar -Meta $meta -IsLast:($i -eq $idxLast))
          }
          if ($result.verdict -eq 'FAILING') {
              Write-TermLine (New-TermAlert -Severity critical -Text 'back up data, run drive-dependencies.ps1, then replace')
          } elseif ($result.verdict -eq 'WATCHLIST') {
              Write-TermLine (New-TermAlert -Severity warning -Text 'back up irreplaceable data, monitor weekly')
          }
          Write-TermLine (New-TermPanelVert)
      } else {
          Write-TermLine (New-TermSection -State 'PASS' -Label $result.verdict -Count -1)
          Write-TermLine (New-TermLeaf -Name 'no failure indicators' -Meta "$Days-day window clean" -IsLast)
          Write-TermLine (New-TermPanelVert)
      }

      # SMART section
      Write-TermLine (New-TermSection -State 'INFO' -Label 'SMART' -Count -1)
      if ($result.smart) {
          Write-TermLine (New-TermLeaf -Name 'temperature' -Meta "$($result.smart.temperatureC) C (max: $($result.smart.temperatureMax))")
          Write-TermLine (New-TermLeaf -Name 'wear' -Meta "$($result.smart.wearPct)%")
          Write-TermLine (New-TermLeaf -Name 'read errors' -Meta "$($result.smart.readErrors)")
          Write-TermLine (New-TermLeaf -Name 'write errors' -Meta "$($result.smart.writeErrors)")
          Write-TermLine (New-TermLeaf -Name 'power on hours' -Meta "$($result.smart.powerOnHours)" -IsLast)
      } else {
          Write-TermLine (New-TermLeaf -Name 'reliability counter' -Meta 'unavailable' -IsLast)
          if ($result['smartctlAvailable']) {
              # The smartctl fallback did capture output, so do not report it
              # as failed. The raw text is only part of the JSON result.
              Write-TermLine (New-TermHint -Text 'smartctl output captured — rerun with -Json to view it')
          } elseif ($smartctl) {
              Write-TermLine (New-TermHint -Text 'smartctl installed but call failed — try: smartctl -A /dev/sdX')
          } else {
              Write-TermLine (New-TermHint -Text 'scoop install smartmontools for SMART access')
          }
      }
      Write-TermLine (New-TermPanelVert)

      # Footer
      $health = switch ($result.verdict) {
          'FAILING'   { New-TermHealth -State 'busted' -Text 'failing' }
          'WATCHLIST' { New-TermHealth -State 'warning' -Text 'watchlist' }
          'HEALTHY'   { New-TermHealth -State 'healthy' -Text 'healthy' }
      }
      $hk = @(
          (New-TermHotkey -Key 'B' -Verb 'back')
          (New-TermHotkey -Key 'C' -Verb 'clone')
          (New-TermHotkey -Key '?' -Verb 'help')
      ) | Join-TermHotkeys
      Write-TermLine (New-TermPanelClose -Hotkeys $hk -Healths $health)
  }
  # Exit status follows the verdict: only FAILING yields the validation exit
  # code; HEALTHY and WATCHLIST both exit with EXIT_OK.
  $exitCode = $script:EXIT_OK
  if ($result.verdict -eq 'FAILING') {
      $exitCode = $script:EXIT_VALIDATION
  }
  exit $exitCode