triage-flakes.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. #!/usr/bin/env python3
  2. # Rank Playwright tests by flakiness from a JSON report so the agent triages, not eyeballs.
  3. #
  4. # Parses a Playwright JSON report (`--reporter=json`) and surfaces the tests
  5. # worth a human's attention: flaky tests (passed only on retry) first, then
  6. # hard "unexpected" failures. Flaky tests are ranked by retry count desc, then
  7. # total duration desc, because the most-retried, slowest test is the worst
  8. # offender in your queue.
  9. #
  10. # Usage: triage-flakes.py [OPTIONS] [REPORT]
  11. # Input: REPORT = path to a Playwright JSON report (positional, default ./results.json)
  12. # Output: stdout = ranked findings (TSV, or JSON envelope with --json)
  13. # Stderr: headers, summary, progress, errors
# Exit:    0  parsed fine, no flaky/unexpected tests (clean suite)
#          2  usage error
#          3  file not found
#          4  malformed / not a Playwright report
#         10  DOMAIN SIGNAL: flaky/unexpected tests present (the thing being triaged)
  17. #
  18. # Examples:
  19. # npx playwright test --reporter=json > results.json
  20. # triage-flakes.py results.json
  21. # triage-flakes.py --outcome all -n 50 results.json
  22. # triage-flakes.py --json results.json | jq '.data[] | select(.outcome=="flaky")'
  23. import argparse
  24. import json
  25. import os
  26. import sys
  27. from pathlib import Path
  28. # Windows consoles default to cp1252; force UTF-8 so glyphs in framing don't raise
  29. # UnicodeEncodeError (the repo's standard fix).
  30. for _stream in (sys.stdout, sys.stderr):
  31. try:
  32. _stream.reconfigure(encoding="utf-8") # type: ignore[attr-defined]
  33. except (AttributeError, ValueError):
  34. pass
  35. class Term:
  36. """Tiny ANSI helper mirroring skills/_lib/term.sh (bash-only; per
  37. TERMINAL-DESIGN.md §9 the Python port is inline). Honors FORCE_COLOR /
  38. NO_COLOR / TERM_ASCII; ASCII glyph fallback on TERM_ASCII or a non-UTF stream."""
  39. _C = {"green": "\033[32m", "yellow": "\033[33m", "orange": "\033[38;5;208m",
  40. "red": "\033[31m", "cyan": "\033[36m", "dim": "\033[2m", "off": "\033[0m"}
  41. _GLYPH = {"ok": "✓", "bad": "✗", "warn": "▲", "skip": "—", "na": "—", "unknown": "?"}
  42. _ASCII = {"ok": "+", "bad": "x", "warn": "!", "skip": "-", "na": "-", "unknown": "?"}
  43. _MARK_COLOR = {"ok": "green", "bad": "red", "warn": "orange", "skip": "dim",
  44. "na": "dim", "unknown": "yellow"}
  45. def __init__(self, stream=sys.stderr):
  46. enc = (getattr(stream, "encoding", "") or "").lower()
  47. self.ascii = (os.environ.get("TERM_ASCII") == "1"
  48. or os.environ.get("FLEET_ASCII") == "1" or "utf" not in enc)
  49. if os.environ.get("FORCE_COLOR"):
  50. self.color = True
  51. elif (os.environ.get("NO_COLOR") is not None or os.environ.get("TERM") == "dumb"
  52. or not getattr(stream, "isatty", lambda: False)()):
  53. self.color = False
  54. else:
  55. self.color = True
  56. def c(self, name, text):
  57. return f"{self._C.get(name, '')}{text}{self._C['off']}" if self.color else text
  58. def mark(self, state):
  59. return self.c(self._MARK_COLOR.get(state, ""),
  60. (self._ASCII if self.ascii else self._GLYPH).get(state, "."))
  61. def hdr(self, text):
  62. return self.c("cyan", f"=== {text} ===")
# Shared terminal helper, bound to stderr: main() uses it for the summary header.
TERM = Term(sys.stderr)
# Schema identifier written into the "meta" section of the --json envelope.
SCHEMA = "claude-mods.playwright-ops.flake-triage/v1"
# Process exit codes returned by main().
EXIT_OK = 0  # report parsed, no flaky/unexpected tests
EXIT_USAGE = 2  # invalid command-line arguments
EXIT_NOT_FOUND = 3  # report missing, not a regular file, or unreadable
EXIT_VALIDATION = 4  # not valid JSON / not a Playwright report
EXIT_FINDINGS = 10  # flaky or unexpected tests present (the triage signal)
# Rank order for outcomes: flaky always sorts before unexpected.
# Outcomes missing from this map are given rank 99 by main(), so they sort last.
OUTCOME_RANK = {"flaky": 0, "unexpected": 1}
  72. def err(msg):
  73. print(msg, file=sys.stderr)
  74. def walk_suites(suites, finds, file_hint=""):
  75. """Recursively descend the suites tree collecting spec/test results."""
  76. for suite in suites or []:
  77. # A suite's file is on the suite node; specs inherit it.
  78. sfile = suite.get("file") or file_hint
  79. for spec in suite.get("specs", []) or []:
  80. collect_spec(spec, finds, sfile)
  81. walk_suites(suite.get("suites"), finds, sfile)
  82. def collect_spec(spec, finds, sfile):
  83. title = spec.get("title", "<untitled>")
  84. sline = spec.get("line", 0)
  85. sfile = spec.get("file") or sfile
  86. for test in spec.get("tests", []) or []:
  87. outcome = test.get("status") or test.get("outcome") or "unknown"
  88. results = test.get("results", []) or []
  89. # status sequence ordered by retry index; duration summed across attempts
  90. ordered = sorted(results, key=lambda r: r.get("retry", 0))
  91. statuses = [r.get("status", "unknown") for r in ordered]
  92. duration = sum(int(r.get("duration", 0) or 0) for r in ordered)
  93. retries = max((r.get("retry", 0) for r in ordered), default=0)
  94. location = f"{sfile}:{sline}" if sfile else f"?:{sline}"
  95. finds.append(
  96. {
  97. "title": title,
  98. "location": location,
  99. "outcome": outcome,
  100. "retries": retries,
  101. "statuses": statuses,
  102. "durationMs": duration,
  103. }
  104. )
  105. def load_report(path):
  106. """Return parsed Playwright report dict, or raise ValueError if not one."""
  107. try:
  108. raw = path.read_text(encoding="utf-8")
  109. except OSError as e:
  110. raise FileNotFoundError(str(e))
  111. try:
  112. data = json.loads(raw)
  113. except json.JSONDecodeError as e:
  114. raise ValueError(f"not valid JSON: {e}")
  115. if not isinstance(data, dict) or "suites" not in data:
  116. raise ValueError("missing top-level 'suites' key - not a Playwright JSON report")
  117. if not isinstance(data["suites"], list):
  118. raise ValueError("'suites' is not a list — not a Playwright JSON report")
  119. return data
  120. def main(argv=None):
  121. p = argparse.ArgumentParser(
  122. prog="triage-flakes.py",
  123. description="Rank Playwright tests by flakiness from a JSON report.",
  124. formatter_class=argparse.RawDescriptionHelpFormatter,
  125. epilog=(
  126. "EXAMPLES:\n"
  127. " npx playwright test --reporter=json > results.json\n"
  128. " triage-flakes.py results.json\n"
  129. " triage-flakes.py --outcome all -n 50 results.json\n"
  130. " triage-flakes.py --json results.json | jq '.data[] | select(.outcome==\"flaky\")'\n"
  131. "\n"
  132. "EXIT CODES:\n"
  133. " 0 parsed fine, no flaky/unexpected tests (clean suite)\n"
  134. " 2 usage 3 file not found 4 malformed report\n"
  135. " 10 flaky/unexpected tests present (the triage signal)\n"
  136. ),
  137. )
  138. p.add_argument(
  139. "report",
  140. nargs="?",
  141. default="results.json",
  142. help="path to Playwright JSON report (default: ./results.json)",
  143. )
  144. p.add_argument("--json", action="store_true", help="emit a JSON envelope instead of TSV")
  145. p.add_argument("-q", "--quiet", action="store_true",
  146. help="suppress the stderr summary header (errors still print)")
  147. p.add_argument(
  148. "-n",
  149. "--limit",
  150. type=int,
  151. default=20,
  152. metavar="N",
  153. help="cap rows printed (default 20)",
  154. )
  155. p.add_argument(
  156. "--outcome",
  157. default="flaky,unexpected",
  158. help="which outcomes to include: flaky | unexpected | all (default flaky,unexpected)",
  159. )
  160. args = p.parse_args(argv)
  161. if args.limit < 0:
  162. err("ERROR: --limit must be >= 0")
  163. return EXIT_USAGE
  164. sel = args.outcome.strip().lower()
  165. if sel == "all":
  166. wanted = None # all outcomes
  167. else:
  168. wanted = {x.strip() for x in sel.split(",") if x.strip()}
  169. unknown = wanted - {"flaky", "unexpected", "expected", "skipped"}
  170. if unknown:
  171. err(f"ERROR: unknown outcome(s): {', '.join(sorted(unknown))} (use flaky|unexpected|all)")
  172. return EXIT_USAGE
  173. path = Path(args.report).resolve()
  174. if not path.exists():
  175. err(f"ERROR: report not found: {path}")
  176. if args.json:
  177. print(json.dumps({"error": {"code": "NOT_FOUND", "message": f"report not found: {path}"}}))
  178. return EXIT_NOT_FOUND
  179. if not path.is_file():
  180. err(f"ERROR: not a file: {path}")
  181. return EXIT_NOT_FOUND
  182. try:
  183. data = load_report(path)
  184. except FileNotFoundError as e:
  185. err(f"ERROR: cannot read report: {e}")
  186. return EXIT_NOT_FOUND
  187. except ValueError as e:
  188. err(f"ERROR: malformed report: {e}")
  189. if args.json:
  190. print(json.dumps({"error": {"code": "VALIDATION", "message": str(e)}}))
  191. return EXIT_VALIDATION
  192. finds = []
  193. walk_suites(data.get("suites"), finds)
  194. # The domain signal is computed over ALL findings, regardless of the display
  195. # filter — a clean suite means zero flaky AND zero unexpected, full stop.
  196. signal_present = any(f["outcome"] in ("flaky", "unexpected") for f in finds)
  197. if wanted is None:
  198. shown = list(finds)
  199. else:
  200. shown = [f for f in finds if f["outcome"] in wanted]
  201. # Rank: flaky before unexpected (OUTCOME_RANK), then retries desc, duration desc.
  202. shown.sort(
  203. key=lambda f: (
  204. OUTCOME_RANK.get(f["outcome"], 99),
  205. -f["retries"],
  206. -f["durationMs"],
  207. )
  208. )
  209. capped = shown[: args.limit] if args.limit else shown
  210. total = len(finds)
  211. flaky_n = sum(1 for f in finds if f["outcome"] == "flaky")
  212. unexp_n = sum(1 for f in finds if f["outcome"] == "unexpected")
  213. if not args.quiet:
  214. err(TERM.hdr(f"Flake triage: {path.name}"))
  215. flaky_txt = TERM.c("orange", f"{flaky_n} flaky") if flaky_n else "0 flaky"
  216. unexp_txt = TERM.c("red", f"{unexp_n} unexpected") if unexp_n else "0 unexpected"
  217. err(f" {total} tests | {flaky_txt} | {unexp_txt} | showing {len(capped)} of {len(shown)}")
  218. if args.json:
  219. envelope = {
  220. "data": capped,
  221. "meta": {
  222. "count": len(capped),
  223. "total_matched": len(shown),
  224. "flaky": flaky_n,
  225. "unexpected": unexp_n,
  226. "schema": SCHEMA,
  227. },
  228. }
  229. print(json.dumps(envelope, indent=2))
  230. else:
  231. print("outcome\tretries\tstatuses\tduration_ms\tlocation\ttitle")
  232. for f in capped:
  233. print(
  234. f"{f['outcome']}\t{f['retries']}\t{'->'.join(f['statuses'])}\t"
  235. f"{f['durationMs']}\t{f['location']}\t{f['title']}"
  236. )
  237. return EXIT_FINDINGS if signal_present else EXIT_OK
  238. if __name__ == "__main__":
  239. sys.exit(main())