crash-triage.ps1 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. <#
  2. .SYNOPSIS
  3. Decode an Event 41 crash record and surface the events in the N
  4. minutes leading up to it. The pre-crash timeline is where the
  5. actual cause lives.
  6. .DESCRIPTION
  7. Reads Event 41 (Kernel-Power) and properly decodes:
  8. Properties[0] = BugCheckCode (the stop code; NOT Properties[1])
  9. Properties[1-4] = BugcheckParameter1-4
  10. Properties[6] = PowerButtonTimestamp (non-zero = forced shutdown)
  11. Then walks events in the configurable window before the crash from
  12. System log providers that matter for crash correlation: storage
  13. drivers, GPU drivers, WHEA hardware errors, kernel-power.
  14. BugCheck = 0x0 with no power-button = hard power loss or hardware
  15. lockup. BugCheck = 0x0 with power-button = user force-shutdown of a
  16. hung machine. Non-zero codes are decoded against the known catalog
  17. (see references/bugcheck-codes.md).
  18. .PARAMETER CrashTime
  19. Specific crash time (datetime) to triage. If omitted, the most
  20. recent Event 41 within -DaysBack is used.
  21. .PARAMETER WindowMinutes
  22. Minutes before the crash to scan for correlated events. Default: 10.
  23. .PARAMETER DaysBack
  24. When -CrashTime is omitted, how far back to look for the most recent
  25. crash. Default: 30.
  26. .PARAMETER Json
  27. Emit machine-readable JSON.
  28. .EXAMPLE
  29. scripts/crash-triage.ps1
  30. Triage the most recent crash in the last 30 days.
  31. .EXAMPLE
  32. scripts/crash-triage.ps1 -CrashTime '2026-05-15 00:57:50'
  33. Triage a specific crash by timestamp.
  34. .EXAMPLE
  35. scripts/crash-triage.ps1 -CrashTime '2026-05-15 00:57:50' -WindowMinutes 30
  36. Widen the pre-crash window to 30 minutes (default 10).
  37. .EXAMPLE
  38. scripts/crash-triage.ps1 -Json | jq '.bugcheck'
  39. Pull just the BugCheck code from machine-readable output.
  40. .NOTES
  41. Exit codes:
  42. 0 success
  43. 3 not found (no crashes in window)
  44. 4 validation
  45. #>
  [CmdletBinding()]
  param(
      # Timestamp of the crash to triage. When omitted, the newest Event 41
      # inside the -DaysBack horizon is used instead.
      [datetime]
      $CrashTime,

      # Length (minutes) of the pre-crash window to scan; accepted range 1-240.
      [ValidateRange(1, 240)]
      [int]
      $WindowMinutes = 10,

      # Look-back horizon (days) used only when -CrashTime is omitted; range 1-365.
      [ValidateRange(1, 365)]
      [int]
      $DaysBack = 30,

      # Emit a JSON document instead of the rendered terminal panel.
      [switch]
      $Json
  )

  # Turn every error into a terminating one so a failed step cannot be
  # silently skipped; individual calls opt out with -ErrorAction.
  $ErrorActionPreference = 'Stop'

  # Dot-source the shared helper libraries into the script scope.
  # NOTE(review): the two libraries are resolved from different directories
  # (a sibling _lib versus a _lib two levels up) — confirm this is intended.
  . "$PSScriptRoot\_lib\common.ps1"
  . (Join-Path $PSScriptRoot '..\..\_lib\term.ps1')

  # Presumably provided by term.ps1; prepares the terminal rendering helpers.
  Initialize-Term
  # Quick-lookup table: stop code -> symbolic name (plus a short hint).
  # Only the most common codes are listed, in ascending numeric order; the
  # full catalog is in references/bugcheck-codes.md. Keys are Int32 hex
  # literals, so lookups must use an Int32 key.
  $bugCheckNames = @{
      0x0   = '(no bugcheck recorded — hard power loss / total hang / hardware lockup)'
      0xA   = 'IRQL_NOT_LESS_OR_EQUAL'
      0x1A  = 'MEMORY_MANAGEMENT'
      0x1E  = 'KMODE_EXCEPTION_NOT_HANDLED'
      0x3B  = 'SYSTEM_SERVICE_EXCEPTION'
      0x50  = 'PAGE_FAULT_IN_NONPAGED_AREA (often storage I/O failure for pagefile)'
      0x77  = 'KERNEL_STACK_INPAGE_ERROR (storage paging failure)'
      0x7A  = 'KERNEL_DATA_INPAGE_ERROR (storage paging failure)'
      0x7E  = 'SYSTEM_THREAD_EXCEPTION_NOT_HANDLED (often GPU/network driver)'
      0x9F  = 'DRIVER_POWER_STATE_FAILURE (driver hung during sleep/wake)'
      0xC1  = 'SPECIAL_POOL_DETECTED_MEMORY_CORRUPTION (Driver Verifier)'
      0xC2  = 'BAD_POOL_CALLER'
      0xC4  = 'DRIVER_VERIFIER_DETECTED_VIOLATION'
      0xD1  = 'DRIVER_IRQL_NOT_LESS_OR_EQUAL (driver accessed bad memory at high IRQL)'
      0xEF  = 'CRITICAL_PROCESS_DIED (critical system process killed)'
      0xF4  = 'CRITICAL_OBJECT_TERMINATION (often storage-induced)'
      0x101 = 'CLOCK_WATCHDOG_TIMEOUT (CPU stall — chipset or hardware)'
      0x124 = 'WHEA_UNCORRECTABLE_ERROR (hardware-level fault)'
      0x139 = 'KERNEL_SECURITY_CHECK_FAILURE (stack/pool corruption)'
  }
  # ─────────────────────────────────────────────────────────────────────
  # Find the target crash
  # ─────────────────────────────────────────────────────────────────────
  # Event IDs are only unique per provider, so pin the query to Kernel-Power;
  # otherwise an unrelated provider's ID 41 could be decoded as a crash record.
  $crashFilter = @{
      LogName      = 'System'
      ProviderName = 'Microsoft-Windows-Kernel-Power'
      Id           = 41
  }

  if (-not $CrashTime) {
      Write-Log -Level INFO -Message "No -CrashTime given; finding most recent Event 41 in last $DaysBack days"
      $crashFilter.StartTime = (Get-Date).AddDays(-$DaysBack)

      # Get-WinEvent returns newest-first, so -MaxEvents 1 is the latest crash.
      # "No events found" is reported as an error; silence it and test the result.
      $crash = Get-WinEvent -FilterHashtable $crashFilter -MaxEvents 1 -ErrorAction SilentlyContinue
      if (-not $crash) {
          Write-Log -Level INFO -Message "No Event 41 crashes found in last $DaysBack days. System has been stable."
          exit $script:EXIT_NOT_FOUND
      }
  } else {
      # Find the Event 41 closest to the given time (within ±60 seconds).
      # Several records can fall inside the tolerance window, so rank them by
      # absolute distance from the requested time rather than taking the newest.
      $requestedTime = $CrashTime
      $crashFilter.StartTime = $requestedTime.AddMinutes(-1)
      $crashFilter.EndTime   = $requestedTime.AddMinutes(1)

      $crash = Get-WinEvent -FilterHashtable $crashFilter -ErrorAction SilentlyContinue |
          Sort-Object { [math]::Abs(($_.TimeCreated - $requestedTime).TotalSeconds) } |
          Select-Object -First 1
      if (-not $crash) {
          Write-Log -Level FAIL -Message "No Event 41 found within ±60s of $CrashTime"
          exit $script:EXIT_NOT_FOUND
      }
  }

  # Anchor everything downstream to the record's real timestamp. A
  # user-supplied -CrashTime is usually rounded to the second (or off by a few
  # seconds), which would otherwise clip the end of the pre-crash window and
  # skew the T-minus offsets.
  $CrashTime = $crash.TimeCreated
  # ─────────────────────────────────────────────────────────────────────
  # Decode the crash record
  # ─────────────────────────────────────────────────────────────────────
  # Event 41 payload layout used here:
  #   [0]    stop code (BugCheckCode)
  #   [1..4] bugcheck parameters 1-4
  #   [6]    power-button timestamp (non-zero = button was held)
  $crashProps = $crash.Properties
  $bcCode = [int64]$crashProps[0].Value
  $param1 = [int64]$crashProps[1].Value
  $param2 = [int64]$crashProps[2].Value
  $param3 = [int64]$crashProps[3].Value
  $param4 = [int64]$crashProps[4].Value
  # Index 6 is only read when the record actually carries it.
  $pwrBtn = if ($crashProps.Count -gt 6) { [int64]$crashProps[6].Value } else { 0 }
  $bcHex = '0x{0:X}' -f $bcCode

  # The lookup table is keyed by Int32 (PowerShell hex literals are Int32, and
  # literals above 0x7FFFFFFF parse as negative numbers). A plain [int] cast
  # throws for stop codes such as 0xC000021A, so reinterpret the 32-bit pattern
  # as signed instead: this never overflows and matches how such a literal
  # would be stored as a table key.
  $bcKey = $null
  if ($bcCode -ge 0 -and $bcCode -le [uint32]::MaxValue) {
      $bcKey = [BitConverter]::ToInt32([BitConverter]::GetBytes([uint32]$bcCode), 0)
  }
  $bcName = if ($null -ne $bcKey -and $bugCheckNames.ContainsKey($bcKey)) {
      $bugCheckNames[$bcKey]
  } else {
      '(unknown — consult references/bugcheck-codes.md)'
  }

  # Cause discrimination for BugCheck = 0
  # With no stop code recorded, the power-button field is the only clue that
  # separates a user-forced shutdown from a sudden power or hardware failure.
  $causeHint = if ($bcCode -eq 0) {
      if ($pwrBtn -ne 0) { 'Power button was held → user force-shutdown of a hung machine' }
      else { 'No power button press recorded → hard power loss / hardware lockup / thermal trip' }
  } else { $null }
  # ─────────────────────────────────────────────────────────────────────
  # Walk the pre-crash window
  # ─────────────────────────────────────────────────────────────────────
  # Collect every Critical/Error/Warning (levels 1-3) System-log event from the
  # $WindowMinutes before the crash, oldest first.
  $windowStart = $CrashTime.AddMinutes(-$WindowMinutes)
  $windowFilter = @{
      LogName   = 'System'
      StartTime = $windowStart
      EndTime   = $CrashTime
      Level     = @(1, 2, 3)
  }

  # @(...) guarantees an array for zero, one or many matches, so later .Count
  # and index access never depend on PowerShell's scalar shims. An empty
  # window is normal; Get-WinEvent reports it as an error, hence
  # SilentlyContinue.
  $preEvents = @(
      Get-WinEvent -FilterHashtable $windowFilter -ErrorAction SilentlyContinue |
          Sort-Object TimeCreated
  )
  # Smoking-gun detection
  # Classify each pre-crash event against a short list of known failure
  # signatures. At most one finding is produced per event (the first matching
  # rule wins), and findings keep the order of $preEvents.
  $smokingGuns = @(
      foreach ($evt in $preEvents) {
          $source = $evt.ProviderName
          # True for event levels of 2 and below.
          $isErrorOrWorse = $evt.Level -le 2

          if ($source -eq 'storahci' -and $evt.Id -eq 129) {
              "STORAGE: storahci controller reset at $($evt.TimeCreated.ToString('HH:mm:ss')) — drive stopped responding"
          }
          elseif ($source -eq 'Microsoft-Windows-WHEA-Logger' -and $isErrorOrWorse) {
              "HARDWARE: WHEA error at $($evt.TimeCreated.ToString('HH:mm:ss')) — CPU/RAM/PCIe-level fault"
          }
          elseif ($source -match 'nvlddmkm|igdkmd|amdkmdag' -and $isErrorOrWorse) {
              "GPU: $source error at $($evt.TimeCreated.ToString('HH:mm:ss')) — GPU driver issue"
          }
          elseif ($source -eq 'disk' -and @(7, 51, 153, 154) -contains $evt.Id) {
              "STORAGE: disk Event $($evt.Id) at $($evt.TimeCreated.ToString('HH:mm:ss')) — bad block or hardware error"
          }
      }
  )
  # ─────────────────────────────────────────────────────────────────────
  # Output
  # ─────────────────────────────────────────────────────────────────────
  # Normalise the event list once: always an array, never containing $null.
  # This keeps the count, the index loop and the JSON shape consistent whether
  # the window held zero, one or many events.
  $timelineEvents = @($preEvents | Where-Object { $null -ne $_ })

  if ($Json) {
      # Always an array, so consumers get [] / [ {...} ] instead of a shape
      # that flips between null, a bare object and an array.
      $timeline = @(
          foreach ($evt in $timelineEvents) {
              @{
                  time     = $evt.TimeCreated.ToString('o')
                  provider = $evt.ProviderName
                  id       = $evt.Id
                  level    = $evt.LevelDisplayName
                  message  = (Format-EventMessage -Message $evt.Message -MaxLength 200)
              }
          }
      )

      $report = @{
          crashTime       = $CrashTime.ToString('o')
          bugcheck        = $bcHex
          bugcheckName    = $bcName
          param1          = '0x{0:X}' -f $param1
          param2          = '0x{0:X}' -f $param2
          param3          = '0x{0:X}' -f $param3
          param4          = '0x{0:X}' -f $param4
          powerButtonHeld = ($pwrBtn -ne 0)
          causeHint       = $causeHint
          windowMinutes   = $WindowMinutes
          preCrashEvents  = $timelineEvents.Count
          smokingGuns     = $smokingGuns
          timeline        = $timeline
      }

      # Written straight to the console's stdout rather than the pipeline.
      $report | ConvertTo-Json -Depth 5 | ForEach-Object { [Console]::Out.WriteLine($_) }
  } else {
      # Rendered panel. The New-Term* / Write-TermLine helpers are not defined
      # in this file (presumably they come from the dot-sourced term library).
      $indicator = $CrashTime.ToString('yyyy-MM-dd HH:mm:ss')
      Write-TermLine (New-TermPanelOpen -Brand 'windows-ops' -Name 'windows-ops' -Subtitle 'crash-triage' -Indicator $indicator)
      Write-TermLine (New-TermPanelVert)
      Write-TermLine (New-TermSummary -Text "BugCheck $bcHex · $bcName")
      Write-TermLine (New-TermPanelVert)

      # PARAMETERS section
      Write-TermLine (New-TermSection -State 'INFO' -Label 'parameters' -Count -1)
      Write-TermLine (New-TermLeaf -Name 'Param1' -Meta ('0x{0:X}' -f $param1))
      Write-TermLine (New-TermLeaf -Name 'Param2' -Meta ('0x{0:X}' -f $param2))
      Write-TermLine (New-TermLeaf -Name 'Param3' -Meta ('0x{0:X}' -f $param3))
      Write-TermLine (New-TermLeaf -Name 'Param4' -Meta ('0x{0:X}' -f $param4))
      $pwrText = if ($pwrBtn -ne 0) { 'held (forced shutdown)' } else { 'not pressed' }
      Write-TermLine (New-TermLeaf -Name 'PowerButton' -Meta $pwrText -IsLast)
      if ($causeHint) {
          Write-TermLine (New-TermAlert -Severity warning -Text $causeHint)
      }
      Write-TermLine (New-TermPanelVert)

      # TIMELINE section
      if ($timelineEvents.Count -gt 0) {
          Write-TermLine (New-TermSection -State 'WARN' -Label "pre-crash timeline" -Count $timelineEvents.Count)
          $idxLast = $timelineEvents.Count - 1
          for ($i = 0; $i -lt $timelineEvents.Count; $i++) {
              $e = $timelineEvents[$i]
              # Whole seconds between this event and the crash, shown as a
              # T-minus offset (minutes appear once the gap reaches 60s).
              $deltaSec = [int]($CrashTime - $e.TimeCreated).TotalSeconds
              $deltaStr = if ($deltaSec -ge 60) {
                  "T-{0}m{1:00}s" -f ([math]::Floor($deltaSec/60)), ($deltaSec % 60)
              } else {
                  "T-{0}s" -f $deltaSec
              }
              $msg = Format-EventMessage -Message $e.Message -MaxLength 50
              Write-TermLine (New-TermLeaf -Name "$($e.ProviderName) $($e.Id)" -Meta $msg -Age $deltaStr -IsLast:($i -eq $idxLast) -NameColWidth 24 -MetaColWidth 50)
          }
          Write-TermLine (New-TermPanelVert)
      } else {
          Write-TermLine (New-TermSection -State 'WARN' -Label "pre-crash timeline" -Count 0)
          Write-TermLine (New-TermHint -Text 'no warning/error events in window — sudden hang or instant fault')
          Write-TermLine (New-TermPanelVert)
      }

      # SMOKING GUNS section (omitted entirely when nothing matched)
      if ($smokingGuns) {
          Write-TermLine (New-TermSection -State 'FAILING' -Label 'smoking guns' -Count $smokingGuns.Count)
          $idxLast = $smokingGuns.Count - 1
          for ($i = 0; $i -lt $smokingGuns.Count; $i++) {
              Write-TermLine (New-TermLeaf -Name $smokingGuns[$i] -IsLast:($i -eq $idxLast) -NameColWidth 80 -RailColWidth 0 -MetaColWidth 0)
          }
          Write-TermLine (New-TermPanelVert)
      }

      # Footer
      # Health badge priority: any smoking gun, then "no stop code recorded",
      # then a normally decoded bugcheck.
      $health = if ($smokingGuns) {
          New-TermHealth -State 'busted' -Text 'cascade'
      } elseif ($bcCode -eq 0) {
          New-TermHealth -State 'critical' -Text 'no bugcheck'
      } else {
          New-TermHealth -State 'warning' -Text 'decoded'
      }
      $hk = @(
          (New-TermHotkey -Key 'D' -Verb 'drill')
          (New-TermHotkey -Key '?' -Verb 'help')
      ) | Join-TermHotkeys
      Write-TermLine (New-TermPanelClose -Hotkeys $hk -Healths $health)
  }

  exit $script:EXIT_OK