benchmark.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. #!/usr/bin/env python3
  2. """
  3. Benchmark: markitdown vs Jina Reader vs Firecrawl
  4. Compare speed, accuracy, formatting, and parallel execution
  5. """
  6. import subprocess
  7. import time
  8. import os
  9. import sys
  10. import concurrent.futures
  11. from pathlib import Path
  12. from urllib.parse import quote
  13. # Force UTF-8 encoding on Windows
  14. if sys.platform == "win32":
  15. import codecs
  16. sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, errors="replace")
  17. sys.stderr = codecs.getwriter("utf-8")(sys.stderr.buffer, errors="replace")
  18. # Test corpus - 10 URLs of varying complexity
  19. URLS = [
  20. # News articles - use stable landing pages
  21. ("guardian-tech", "https://www.theguardian.com/technology"),
  22. ("bbc-news", "https://www.bbc.com/news"),
  23. # Documentation
  24. ("python-docs", "https://docs.python.org/3/library/asyncio.html"),
  25. ("mdn-fetch", "https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API"),
  26. ("rust-book", "https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html"),
  27. # Feature-rich / Complex
  28. ("github-repo", "https://github.com/microsoft/markitdown"),
  29. ("hackernews", "https://news.ycombinator.com/"),
  30. ("wikipedia", "https://en.wikipedia.org/wiki/Markdown"),
  31. # Simple / Minimal
  32. ("example-com", "https://example.com"),
  33. ("httpbin", "https://httpbin.org/html"),
  34. ]
  35. OUTPUT_DIR = Path(__file__).parent / "output"
  36. def fetch_with_markitdown(url: str, name: str) -> dict:
  37. """Fetch URL with markitdown, return timing and output"""
  38. output_file = OUTPUT_DIR / f"{name}_markitdown.md"
  39. start = time.perf_counter()
  40. try:
  41. result = subprocess.run(
  42. ["markitdown", url],
  43. capture_output=True,
  44. text=True,
  45. timeout=60,
  46. encoding="utf-8",
  47. errors="replace"
  48. )
  49. elapsed = time.perf_counter() - start
  50. output = result.stdout or ""
  51. error = result.stderr or ""
  52. success = result.returncode == 0 and len(output) > 50
  53. except subprocess.TimeoutExpired:
  54. elapsed = 60.0
  55. output = ""
  56. error = "TIMEOUT"
  57. success = False
  58. except Exception as e:
  59. elapsed = time.perf_counter() - start
  60. output = ""
  61. error = str(e)
  62. success = False
  63. if success and output:
  64. output_file.write_text(output, encoding="utf-8")
  65. return {
  66. "tool": "markitdown",
  67. "name": name,
  68. "url": url,
  69. "time": elapsed,
  70. "success": success,
  71. "output_len": len(output),
  72. "error": error if not success else None,
  73. "output_file": str(output_file) if success else None
  74. }
  75. def fetch_with_jina(url: str, name: str) -> dict:
  76. """Fetch URL with Jina Reader, return timing and output"""
  77. output_file = OUTPUT_DIR / f"{name}_jina.md"
  78. jina_url = f"https://r.jina.ai/{url}"
  79. start = time.perf_counter()
  80. try:
  81. result = subprocess.run(
  82. ["curl", "-s", "-L", "--max-time", "60", jina_url],
  83. capture_output=True,
  84. text=True,
  85. timeout=65,
  86. encoding="utf-8",
  87. errors="replace"
  88. )
  89. elapsed = time.perf_counter() - start
  90. output = result.stdout or ""
  91. error = result.stderr
  92. success = result.returncode == 0 and len(output) > 100
  93. except subprocess.TimeoutExpired:
  94. elapsed = 60.0
  95. output = ""
  96. error = "TIMEOUT"
  97. success = False
  98. except Exception as e:
  99. elapsed = time.perf_counter() - start
  100. output = ""
  101. error = str(e)
  102. success = False
  103. if success and output:
  104. output_file.write_text(output, encoding="utf-8")
  105. return {
  106. "tool": "jina",
  107. "name": name,
  108. "url": url,
  109. "time": elapsed,
  110. "success": success,
  111. "output_len": len(output),
  112. "error": error if not success else None,
  113. "output_file": str(output_file) if success else None
  114. }
  115. def fetch_with_firecrawl(url: str, name: str) -> dict:
  116. """Fetch URL with Firecrawl, return timing and output"""
  117. output_file = OUTPUT_DIR / f"{name}_firecrawl.md"
  118. start = time.perf_counter()
  119. try:
  120. # On Windows, firecrawl is a .cmd script - need shell=True
  121. result = subprocess.run(
  122. f"firecrawl {url}",
  123. capture_output=True,
  124. text=True,
  125. timeout=90, # Firecrawl can be slower due to JS rendering
  126. encoding="utf-8",
  127. errors="replace",
  128. shell=True
  129. )
  130. elapsed = time.perf_counter() - start
  131. output = result.stdout or ""
  132. error = result.stderr
  133. success = result.returncode == 0 and len(output) > 100
  134. except subprocess.TimeoutExpired:
  135. elapsed = 90.0
  136. output = ""
  137. error = "TIMEOUT"
  138. success = False
  139. except Exception as e:
  140. elapsed = time.perf_counter() - start
  141. output = ""
  142. error = str(e)
  143. success = False
  144. if success and output:
  145. output_file.write_text(output, encoding="utf-8")
  146. return {
  147. "tool": "firecrawl",
  148. "name": name,
  149. "url": url,
  150. "time": elapsed,
  151. "success": success,
  152. "output_len": len(output),
  153. "error": error if not success else None,
  154. "output_file": str(output_file) if success else None
  155. }
  156. def run_sequential():
  157. """Run all tests sequentially"""
  158. print("\n" + "="*60)
  159. print("SEQUENTIAL EXECUTION")
  160. print("="*60)
  161. results = {"markitdown": [], "jina": [], "firecrawl": []}
  162. for name, url in URLS:
  163. print(f"\nTesting: {name}")
  164. print(f" URL: {url}")
  165. # markitdown
  166. r1 = fetch_with_markitdown(url, name)
  167. status1 = "OK" if r1["success"] else "FAIL"
  168. print(f" markitdown: {r1['time']:.2f}s, {r1['output_len']:,} chars - {status1}")
  169. results["markitdown"].append(r1)
  170. # jina
  171. r2 = fetch_with_jina(url, name)
  172. status2 = "OK" if r2["success"] else "FAIL"
  173. print(f" jina: {r2['time']:.2f}s, {r2['output_len']:,} chars - {status2}")
  174. results["jina"].append(r2)
  175. # firecrawl
  176. r3 = fetch_with_firecrawl(url, name)
  177. status3 = "OK" if r3["success"] else "FAIL"
  178. print(f" firecrawl: {r3['time']:.2f}s, {r3['output_len']:,} chars - {status3}")
  179. results["firecrawl"].append(r3)
  180. return results
  181. def run_parallel(tool: str, max_workers: int = 5):
  182. """Run all tests in parallel for a single tool"""
  183. print(f"\n{'='*60}")
  184. print(f"PARALLEL EXECUTION: {tool} (max_workers={max_workers})")
  185. print("="*60)
  186. fetch_fns = {
  187. "markitdown": fetch_with_markitdown,
  188. "jina": fetch_with_jina,
  189. "firecrawl": fetch_with_firecrawl
  190. }
  191. fetch_fn = fetch_fns[tool]
  192. start = time.perf_counter()
  193. with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  194. futures = {
  195. executor.submit(fetch_fn, url, name): name
  196. for name, url in URLS
  197. }
  198. results = []
  199. for future in concurrent.futures.as_completed(futures):
  200. name = futures[future]
  201. result = future.result()
  202. status = "OK" if result["success"] else f"FAIL"
  203. print(f" {name}: {result['time']:.2f}s - {status}")
  204. results.append(result)
  205. total_time = time.perf_counter() - start
  206. print(f"\nTotal parallel time: {total_time:.2f}s")
  207. return results, total_time
  208. def print_summary(seq_results: dict, par_results: dict):
  209. """Print comparison summary"""
  210. print("\n" + "="*60)
  211. print("SUMMARY")
  212. print("="*60)
  213. # Sequential times
  214. md_times = [r["time"] for r in seq_results["markitdown"] if r["success"]]
  215. jina_times = [r["time"] for r in seq_results["jina"] if r["success"]]
  216. fc_times = [r["time"] for r in seq_results["firecrawl"] if r["success"]]
  217. md_success = sum(1 for r in seq_results["markitdown"] if r["success"])
  218. jina_success = sum(1 for r in seq_results["jina"] if r["success"])
  219. fc_success = sum(1 for r in seq_results["firecrawl"] if r["success"])
  220. md_chars = sum(r["output_len"] for r in seq_results["markitdown"] if r["success"])
  221. jina_chars = sum(r["output_len"] for r in seq_results["jina"] if r["success"])
  222. fc_chars = sum(r["output_len"] for r in seq_results["firecrawl"] if r["success"])
  223. def safe_avg(times):
  224. return sum(times)/len(times) if times else 0
  225. print("\n## Speed (Sequential)")
  226. print(f"| Metric | markitdown | Jina | Firecrawl |")
  227. print(f"|--------|------------|------|-----------|")
  228. print(f"| Avg time | {safe_avg(md_times):.2f}s | {safe_avg(jina_times):.2f}s | {safe_avg(fc_times):.2f}s |")
  229. print(f"| Total time | {sum(md_times):.2f}s | {sum(jina_times):.2f}s | {sum(fc_times):.2f}s |")
  230. print(f"| Success rate | {md_success}/{len(URLS)} | {jina_success}/{len(URLS)} | {fc_success}/{len(URLS)} |")
  231. print("\n## Speed (Parallel, 5 workers)")
  232. print(f"| Metric | markitdown | Jina | Firecrawl |")
  233. print(f"|--------|------------|------|-----------|")
  234. print(f"| Total time | {par_results['markitdown'][1]:.2f}s | {par_results['jina'][1]:.2f}s | {par_results['firecrawl'][1]:.2f}s |")
  235. print("\n## Output Size")
  236. print(f"| Metric | markitdown | Jina | Firecrawl |")
  237. print(f"|--------|------------|------|-----------|")
  238. print(f"| Total chars | {md_chars:,} | {jina_chars:,} | {fc_chars:,} |")
  239. print(f"| Avg chars | {md_chars//max(md_success,1):,} | {jina_chars//max(jina_success,1):,} | {fc_chars//max(fc_success,1):,} |")
  240. print("\n## Per-URL Comparison")
  241. print(f"| URL | markitdown | Jina | Firecrawl | Winner |")
  242. print(f"|-----|------------|------|-----------|--------|")
  243. for i, (name, url) in enumerate(URLS):
  244. md = seq_results["markitdown"][i]
  245. jn = seq_results["jina"][i]
  246. fc = seq_results["firecrawl"][i]
  247. md_str = f"{md['time']:.1f}s" if md["success"] else "FAIL"
  248. jn_str = f"{jn['time']:.1f}s" if jn["success"] else "FAIL"
  249. fc_str = f"{fc['time']:.1f}s" if fc["success"] else "FAIL"
  250. # Determine winner by speed among successful tools
  251. successful = []
  252. if md["success"]: successful.append(("markitdown", md["time"]))
  253. if jn["success"]: successful.append(("Jina", jn["time"]))
  254. if fc["success"]: successful.append(("Firecrawl", fc["time"]))
  255. if successful:
  256. winner = min(successful, key=lambda x: x[1])[0]
  257. else:
  258. winner = "None"
  259. print(f"| {name} | {md_str} | {jn_str} | {fc_str} | {winner} |")
  260. print(f"\nOutput files saved to: {OUTPUT_DIR}")
  261. def main():
  262. # Create output directory
  263. OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  264. print("Benchmark: markitdown vs Jina Reader vs Firecrawl")
  265. print(f"Testing {len(URLS)} URLs")
  266. # Run sequential tests
  267. seq_results = run_sequential()
  268. # Run parallel tests
  269. par_results = {
  270. "markitdown": run_parallel("markitdown", max_workers=5),
  271. "jina": run_parallel("jina", max_workers=5),
  272. "firecrawl": run_parallel("firecrawl", max_workers=5),
  273. }
  274. # Print summary
  275. print_summary(seq_results, par_results)
  276. if __name__ == "__main__":
  277. main()