|
|
@@ -0,0 +1,325 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+"""
|
|
|
+Benchmark: markitdown vs Jina Reader vs Firecrawl
|
|
|
+Compare speed, accuracy, formatting, and parallel execution
|
|
|
+"""
|
|
|
+
|
|
|
+import subprocess
|
|
|
+import time
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import concurrent.futures
|
|
|
+from pathlib import Path
|
|
|
+from urllib.parse import quote
|
|
|
+
|
|
|
# Force UTF-8 encoding on Windows.
# The streams are reconfigured in place rather than replaced with
# codecs.getwriter() wrappers, so they remain ordinary text streams
# (newline translation and buffering behaviour are kept).
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        # A stream may be None or a replacement object without reconfigure();
        # in that case it is left untouched instead of crashing at import.
        _reconfigure = getattr(_stream, "reconfigure", None)
        if _reconfigure is not None:
            _reconfigure(encoding="utf-8", errors="replace")
|
|
|
+
|
|
|
# Test corpus - 10 (name, url) pairs of varying complexity.
# Written as a name -> url mapping (names must be unique) and flattened to a
# list of tuples, preserving the order below.
URLS = list({
    # News articles - use stable landing pages
    "guardian-tech": "https://www.theguardian.com/technology",
    "bbc-news": "https://www.bbc.com/news",

    # Documentation
    "python-docs": "https://docs.python.org/3/library/asyncio.html",
    "mdn-fetch": "https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API",
    "rust-book": "https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html",

    # Feature-rich / Complex
    "github-repo": "https://github.com/microsoft/markitdown",
    "hackernews": "https://news.ycombinator.com/",
    "wikipedia": "https://en.wikipedia.org/wiki/Markdown",

    # Simple / Minimal
    "example-com": "https://example.com",
    "httpbin": "https://httpbin.org/html",
}.items())

# Directory named "output" next to this script.
OUTPUT_DIR = Path(__file__).parent.joinpath("output")
|
|
|
+
|
|
|
def fetch_with_markitdown(url: str, name: str) -> dict:
    """Convert *url* with the ``markitdown`` CLI and report the outcome.

    Args:
        url: Page to convert.
        name: Short label; used as the stem of the saved output file.

    Returns:
        A dict with the keys ``tool``, ``name``, ``url``, ``time`` (seconds),
        ``success``, ``output_len``, ``error`` and ``output_file``. A run is
        successful when the process exits with code 0 and prints more than
        50 characters; only then is the output written under ``OUTPUT_DIR``.
        ``error`` is ``None`` on success and ``output_file`` is ``None`` on
        failure.
    """
    target = OUTPUT_DIR / f"{name}_markitdown.md"
    began = time.perf_counter()
    try:
        proc = subprocess.run(
            ["markitdown", url],
            capture_output=True,
            text=True,
            timeout=60,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        # A timed-out run is recorded at the full 60 second budget.
        seconds, markdown, problem, ok = 60.0, "", "TIMEOUT", False
    except Exception as exc:
        seconds = time.perf_counter() - began
        markdown, problem, ok = "", str(exc), False
    else:
        seconds = time.perf_counter() - began
        markdown = proc.stdout or ""
        problem = proc.stderr or ""
        ok = proc.returncode == 0 and len(markdown) > 50

    # ok already implies non-empty output (more than 50 characters).
    if ok:
        target.write_text(markdown, encoding="utf-8")

    return {
        "tool": "markitdown",
        "name": name,
        "url": url,
        "time": seconds,
        "success": ok,
        "output_len": len(markdown),
        "error": None if ok else problem,
        "output_file": str(target) if ok else None,
    }
|
|
|
+
|
|
|
def fetch_with_jina(url: str, name: str) -> dict:
    """Fetch *url* through Jina Reader (via ``curl``) and report the outcome.

    Args:
        url: Page to convert; it is appended to ``https://r.jina.ai/``.
        name: Short label; used as the stem of the saved output file.

    Returns:
        A dict with the keys ``tool``, ``name``, ``url``, ``time`` (seconds),
        ``success``, ``output_len``, ``error`` and ``output_file``. A run is
        successful when curl exits with code 0 and returns more than 100
        characters; only then is the output written under ``OUTPUT_DIR``.
        ``error`` is ``None`` on success and ``output_file`` is ``None`` on
        failure.
    """
    output_file = OUTPUT_DIR / f"{name}_jina.md"
    jina_url = f"https://r.jina.ai/{url}"
    curl_budget = 60  # seconds curl itself may spend (--max-time)
    start = time.perf_counter()
    try:
        result = subprocess.run(
            [
                "curl",
                "-sS",     # no progress meter, but keep error messages on stderr
                "--fail",  # HTTP 4xx/5xx -> non-zero exit, not a "successful" body
                "-L",
                "--max-time", str(curl_budget),
                jina_url,
            ],
            capture_output=True,
            text=True,
            # Slightly longer than curl's own limit so curl normally gets to
            # report its timeout itself.
            timeout=curl_budget + 5,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        # Report the time actually spent, not a nominal constant.
        elapsed = time.perf_counter() - start
        output = ""
        error = "TIMEOUT"
        success = False
    except Exception as e:
        elapsed = time.perf_counter() - start
        output = ""
        error = str(e)
        success = False
    else:
        elapsed = time.perf_counter() - start
        output = result.stdout or ""
        error = result.stderr or ""
        success = result.returncode == 0 and len(output) > 100

    if success:
        output_file.write_text(output, encoding="utf-8")

    return {
        "tool": "jina",
        "name": name,
        "url": url,
        "time": elapsed,
        "success": success,
        "output_len": len(output),
        "error": error if not success else None,
        "output_file": str(output_file) if success else None,
    }
|
|
|
+
|
|
|
def fetch_with_firecrawl(url: str, name: str) -> dict:
    """Fetch *url* with the ``firecrawl`` CLI and report the outcome.

    The executable is resolved with ``shutil.which`` and invoked with an
    argument list, so the URL is passed as a single argument and is never
    interpreted by a shell (characters such as ``&``, ``?`` or ``;`` in a
    URL are therefore safe).

    Args:
        url: Page to convert.
        name: Short label; used as the stem of the saved output file.

    Returns:
        A dict with the keys ``tool``, ``name``, ``url``, ``time`` (seconds),
        ``success``, ``output_len``, ``error`` and ``output_file``. A run is
        successful when the process exits with code 0 and prints more than
        100 characters; only then is the output written under ``OUTPUT_DIR``.
        ``error`` is ``None`` on success and ``output_file`` is ``None`` on
        failure (including when ``firecrawl`` is not found on PATH).
    """
    import shutil

    output_file = OUTPUT_DIR / f"{name}_firecrawl.md"
    timeout_s = 90  # Firecrawl can be slower due to JS rendering

    # On Windows, firecrawl is a .cmd script; which() applies PATHEXT and
    # returns its full path, which can be launched without shell=True.
    executable = shutil.which("firecrawl")

    start = time.perf_counter()
    try:
        if executable is None:
            raise FileNotFoundError("firecrawl executable not found on PATH")
        result = subprocess.run(
            [executable, url],
            capture_output=True,
            text=True,
            timeout=timeout_s,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        # Report the time actually spent, not a nominal constant.
        elapsed = time.perf_counter() - start
        output = ""
        error = "TIMEOUT"
        success = False
    except Exception as e:
        elapsed = time.perf_counter() - start
        output = ""
        error = str(e)
        success = False
    else:
        elapsed = time.perf_counter() - start
        output = result.stdout or ""
        error = result.stderr or ""
        success = result.returncode == 0 and len(output) > 100

    if success:
        output_file.write_text(output, encoding="utf-8")

    return {
        "tool": "firecrawl",
        "name": name,
        "url": url,
        "time": elapsed,
        "success": success,
        "output_len": len(output),
        "error": error if not success else None,
        "output_file": str(output_file) if success else None,
    }
|
|
|
+
|
|
|
def run_sequential():
    """Fetch every entry of URLS with each tool, one call at a time.

    Returns:
        A dict keyed by tool name ("markitdown", "jina", "firecrawl"); each
        value is the list of result dicts for that tool, in URLS order.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("SEQUENTIAL EXECUTION")
    print(banner)

    # Tools run in this order for every URL.
    fetchers = (
        ("markitdown", fetch_with_markitdown),
        ("jina", fetch_with_jina),
        ("firecrawl", fetch_with_firecrawl),
    )
    results = {tool: [] for tool, _ in fetchers}

    for name, url in URLS:
        print(f"\nTesting: {name}")
        print(f" URL: {url}")

        for tool, fetch in fetchers:
            outcome = fetch(url, name)
            verdict = "OK" if outcome["success"] else "FAIL"
            print(f" {tool}: {outcome['time']:.2f}s, {outcome['output_len']:,} chars - {verdict}")
            results[tool].append(outcome)

    return results
|
|
|
+
|
|
|
def run_parallel(tool: str, max_workers: int = 5):
    """Fetch every entry of URLS with one tool using a thread pool.

    Args:
        tool: One of "markitdown", "jina" or "firecrawl"; any other value
            raises KeyError.
        max_workers: Size of the thread pool.

    Returns:
        A ``(results, total_time)`` tuple: the result dicts in the order the
        fetches completed, and the wall-clock seconds for the whole batch.
    """
    print("\n" + "=" * 60)
    print(f"PARALLEL EXECUTION: {tool} (max_workers={max_workers})")
    print("=" * 60)

    fetch = {
        "markitdown": fetch_with_markitdown,
        "jina": fetch_with_jina,
        "firecrawl": fetch_with_firecrawl,
    }[tool]

    began = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which label each future belongs to for the progress line.
        pending = {}
        for name, url in URLS:
            pending[pool.submit(fetch, url, name)] = name

        collected = []
        for done in concurrent.futures.as_completed(pending):
            label = pending[done]
            outcome = done.result()
            status = "OK" if outcome["success"] else "FAIL"
            print(f" {label}: {outcome['time']:.2f}s - {status}")
            collected.append(outcome)

    total_time = time.perf_counter() - began
    print(f"\nTotal parallel time: {total_time:.2f}s")

    return collected, total_time
|
|
|
+
|
|
|
def print_summary(seq_results: dict, par_results: dict):
    """Print markdown-style tables comparing the three tools.

    Args:
        seq_results: Maps each tool name ("markitdown", "jina", "firecrawl")
            to its list of result dicts, indexed in the same order as URLS.
        par_results: Maps each tool name to a ``(results, total_seconds)``
            pair; only the total is printed.

    Timing and size aggregates count successful runs only. The per-URL
    winner is the fastest successful tool, or "None" when all failed.
    """
    tools = ("markitdown", "jina", "firecrawl")
    display = {"markitdown": "markitdown", "jina": "Jina", "firecrawl": "Firecrawl"}
    bar = "=" * 60
    metric_head = "| Metric | markitdown | Jina | Firecrawl |"
    metric_rule = "|--------|------------|------|-----------|"

    def row(title, cells):
        # One table line: "| title | cell | cell | ... |"
        return f"| {title} | " + " | ".join(cells) + " |"

    def mean(values):
        return sum(values) / len(values) if values else 0

    print("\n" + bar)
    print("SUMMARY")
    print(bar)

    # Per-tool aggregates over the successful sequential runs.
    good = {t: [r for r in seq_results[t] if r["success"]] for t in tools}
    durations = {t: [r["time"] for r in good[t]] for t in tools}
    wins = {t: len(good[t]) for t in tools}
    sizes = {t: sum(r["output_len"] for r in good[t]) for t in tools}

    print("\n## Speed (Sequential)")
    print(metric_head)
    print(metric_rule)
    print(row("Avg time", [f"{mean(durations[t]):.2f}s" for t in tools]))
    print(row("Total time", [f"{sum(durations[t]):.2f}s" for t in tools]))
    print(row("Success rate", [f"{wins[t]}/{len(URLS)}" for t in tools]))

    print("\n## Speed (Parallel, 5 workers)")
    print(metric_head)
    print(metric_rule)
    print(row("Total time", [f"{par_results[t][1]:.2f}s" for t in tools]))

    print("\n## Output Size")
    print(metric_head)
    print(metric_rule)
    print(row("Total chars", [f"{sizes[t]:,}" for t in tools]))
    # max(..., 1) avoids dividing by zero when a tool never succeeded.
    print(row("Avg chars", [f"{sizes[t] // max(wins[t], 1):,}" for t in tools]))

    print("\n## Per-URL Comparison")
    print("| URL | markitdown | Jina | Firecrawl | Winner |")
    print("|-----|------------|------|-----------|--------|")
    for i, (name, _url) in enumerate(URLS):
        runs = [seq_results[t][i] for t in tools]
        cells = [f"{r['time']:.1f}s" if r["success"] else "FAIL" for r in runs]

        # Determine winner by speed among successful tools
        finishers = [
            (display[t], r["time"]) for t, r in zip(tools, runs) if r["success"]
        ]
        if finishers:
            fastest = min(finishers, key=lambda pair: pair[1])[0]
        else:
            fastest = "None"

        print(row(name, cells + [fastest]))

    print(f"\nOutput files saved to: {OUTPUT_DIR}")
|
|
|
+
|
|
|
def main():
    """Run the sequential pass, one parallel pass per tool, then the summary."""
    # The fetchers write their output here, so it must exist first.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print("Benchmark: markitdown vs Jina Reader vs Firecrawl")
    print(f"Testing {len(URLS)} URLs")

    sequential = run_sequential()

    # Each tool gets its own parallel batch, run one after another.
    parallel = {
        tool: run_parallel(tool, max_workers=5)
        for tool in ("markitdown", "jina", "firecrawl")
    }

    print_summary(sequential, parallel)


if __name__ == "__main__":
    main()
|