detect-segments.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. #!/usr/bin/env python3
  2. """Silence/scene boundaries as JSON segments — for STT chunking, dead-air cuts, shot splits.
  3. ffmpeg's silencedetect and scene-score output is human-oriented log text on
  4. stderr; this script runs the right filter and parses it into clean segments.
  5. --silence also derives the inverse (speech segments), which is what STT chunking
  6. and the cuts-land-in-silence EDL verification actually consume.
  7. Usage: detect-segments.py [--silence | --scenes] [options] [--json] <file>
  8. Input: one media file as positional
  9. Output: stdout = TSV segments (kind, start, end, duration), or --json envelope
  10. (schema claude-mods.ffmpeg-ops.segments/v1)
  11. Stderr: progress, errors
  12. Exit: 0 ok, 2 usage, 3 file not found, 4 stream missing for mode / parse failure,
  13. 5 ffmpeg missing
  14. Examples:
  15. detect-segments.py --silence interview.mp4
  16. detect-segments.py --silence --noise -35dB --min-silence 0.8 --json in.mp4 | jq '.data.speech'
  17. detect-segments.py --scenes --scene-threshold 0.3 --json in.mp4 | jq '.data.cuts'
  18. """
  19. import argparse
  20. import json
  21. import re
  22. import shutil
  23. import subprocess
  24. import sys
  25. from pathlib import Path
  26. from typing import NoReturn
  27. SCHEMA = "claude-mods.ffmpeg-ops.segments/v1"
  28. EXIT_OK, EXIT_USAGE, EXIT_NOT_FOUND, EXIT_VALIDATION, EXIT_MISSING_DEP = 0, 2, 3, 4, 5
  29. def err(json_mode: bool, code: str, message: str, exit_code: int) -> NoReturn:
  30. if json_mode:
  31. print(json.dumps({"error": {"code": code, "message": message, "details": {}}}))
  32. print(f"ERROR: {message}", file=sys.stderr)
  33. sys.exit(exit_code)
  34. def media_duration(ffprobe: str, path: Path) -> float:
  35. proc = subprocess.run(
  36. [ffprobe, "-v", "error", "-show_entries", "format=duration",
  37. "-of", "default=nw=1:nk=1", str(path)],
  38. capture_output=True, text=True)
  39. try:
  40. return float(proc.stdout.strip())
  41. except ValueError:
  42. return 0.0
  43. def detect_silence(ffmpeg: str, path: Path, noise: str, min_silence: float,
  44. duration: float) -> dict:
  45. proc = subprocess.run(
  46. [ffmpeg, "-hide_banner", "-nostats", "-i", str(path),
  47. "-af", f"silencedetect=noise={noise}:d={min_silence}",
  48. "-vn", "-f", "null", "-"],
  49. capture_output=True, text=True)
  50. if proc.returncode != 0:
  51. return {"_error": (proc.stderr.strip().splitlines() or ["unknown"])[-1]}
  52. starts = [float(m) for m in re.findall(r"silence_start:\s*(-?[\d.]+)", proc.stderr)]
  53. ends = [float(m) for m in re.findall(r"silence_end:\s*(-?[\d.]+)", proc.stderr)]
  54. # A silence running to EOF has a start but no end line.
  55. if len(starts) == len(ends) + 1:
  56. ends.append(duration)
  57. silences = [{"start": round(max(0.0, s), 3), "end": round(e, 3),
  58. "duration": round(e - s, 3)}
  59. for s, e in zip(starts, ends)]
  60. speech, cursor = [], 0.0
  61. for sil in silences:
  62. if sil["start"] > cursor + 0.01:
  63. speech.append({"start": round(cursor, 3), "end": sil["start"],
  64. "duration": round(sil["start"] - cursor, 3)})
  65. cursor = sil["end"]
  66. if duration > cursor + 0.01:
  67. speech.append({"start": round(cursor, 3), "end": round(duration, 3),
  68. "duration": round(duration - cursor, 3)})
  69. return {"silences": silences, "speech": speech}
  70. def detect_scenes(ffmpeg: str, path: Path, threshold: float, duration: float) -> dict:
  71. # metadata=print:file=- routes the per-frame report to STDOUT — a clean parse,
  72. # unlike silencedetect which only logs to stderr.
  73. proc = subprocess.run(
  74. [ffmpeg, "-hide_banner", "-nostats", "-i", str(path),
  75. "-vf", f"select='gt(scene,{threshold})',metadata=print:file=-",
  76. "-an", "-f", "null", "-"],
  77. capture_output=True, text=True)
  78. if proc.returncode != 0:
  79. return {"_error": (proc.stderr.strip().splitlines() or ["unknown"])[-1]}
  80. cuts, scores = [], []
  81. pts_re = re.compile(r"pts_time:(-?[\d.]+)")
  82. score_re = re.compile(r"lavfi\.scene_score=([\d.]+)")
  83. pending_pts = None
  84. for line in proc.stdout.splitlines():
  85. m = pts_re.search(line)
  86. if m:
  87. pending_pts = float(m.group(1))
  88. continue
  89. m = score_re.search(line)
  90. if m and pending_pts is not None:
  91. cuts.append(round(pending_pts, 3))
  92. scores.append(float(m.group(1)))
  93. pending_pts = None
  94. segments, cursor = [], 0.0
  95. for c in cuts:
  96. if c > cursor + 0.01:
  97. segments.append({"start": round(cursor, 3), "end": c,
  98. "duration": round(c - cursor, 3)})
  99. cursor = c
  100. if duration > cursor + 0.01:
  101. segments.append({"start": round(cursor, 3), "end": round(duration, 3),
  102. "duration": round(duration - cursor, 3)})
  103. return {"cuts": cuts, "scores": scores, "segments": segments}
  104. def main() -> int:
  105. ap = argparse.ArgumentParser(
  106. description="Detect silence or scene-change boundaries as JSON segments.",
  107. epilog="Examples:\n"
  108. " detect-segments.py --silence interview.mp4\n"
  109. " detect-segments.py --scenes --json in.mp4 | jq '.data.cuts'\n",
  110. formatter_class=argparse.RawDescriptionHelpFormatter)
  111. ap.add_argument("file", help="media file to analyze")
  112. mode = ap.add_mutually_exclusive_group()
  113. mode.add_argument("--silence", action="store_true",
  114. help="detect audio silence + derive speech segments (default)")
  115. mode.add_argument("--scenes", action="store_true",
  116. help="detect video scene changes")
  117. ap.add_argument("--noise", default="-30dB",
  118. help="silence threshold, e.g. -30dB (default) or -35dB")
  119. ap.add_argument("--min-silence", type=float, default=0.5,
  120. help="minimum silence duration in seconds (default 0.5)")
  121. ap.add_argument("--scene-threshold", type=float, default=0.4,
  122. help="scene-change score threshold 0..1 (default 0.4)")
  123. ap.add_argument("--json", action="store_true", help="emit JSON envelope on stdout")
  124. args = ap.parse_args()
  125. ffmpeg, ffprobe = shutil.which("ffmpeg"), shutil.which("ffprobe")
  126. if not ffmpeg or not ffprobe:
  127. err(args.json, "MISSING_DEPENDENCY", "ffmpeg/ffprobe not found on PATH",
  128. EXIT_MISSING_DEP)
  129. path = Path(args.file)
  130. if not path.is_file():
  131. err(args.json, "NOT_FOUND", f"file not found: {path}", EXIT_NOT_FOUND)
  132. duration = media_duration(ffprobe, path)
  133. mode_name = "scenes" if args.scenes else "silence"
  134. print(f"detecting {mode_name} in {path.name}...", file=sys.stderr)
  135. if args.scenes:
  136. result = detect_scenes(ffmpeg, path, args.scene_threshold, duration)
  137. params = {"scene_threshold": args.scene_threshold}
  138. else:
  139. result = detect_silence(ffmpeg, path, args.noise, args.min_silence, duration)
  140. params = {"noise": args.noise, "min_silence_s": args.min_silence}
  141. if "_error" in result:
  142. err(args.json, "VALIDATION",
  143. f"{mode_name} analysis failed (missing stream for mode?): {result['_error']}",
  144. EXIT_VALIDATION)
  145. data = {"file": str(path), "mode": mode_name, "duration_s": round(duration, 3),
  146. "params": params, **result}
  147. if args.json:
  148. print(json.dumps({"data": data, "meta": {"schema": SCHEMA}}, indent=2))
  149. return EXIT_OK
  150. if args.scenes:
  151. for seg in data["segments"]:
  152. print(f"scene\t{seg['start']}\t{seg['end']}\t{seg['duration']}")
  153. else:
  154. for seg in data["silences"]:
  155. print(f"silence\t{seg['start']}\t{seg['end']}\t{seg['duration']}")
  156. for seg in data["speech"]:
  157. print(f"speech\t{seg['start']}\t{seg['end']}\t{seg['duration']}")
  158. return EXIT_OK
  159. if __name__ == "__main__":
  160. sys.exit(main())