| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325 |
- #!/usr/bin/env python3
- """
- Benchmark: markitdown vs Jina Reader vs Firecrawl
- Compare speed, accuracy, formatting, and parallel execution
- """
- import subprocess
- import time
- import os
- import sys
- import concurrent.futures
- from pathlib import Path
- from urllib.parse import quote
# Force UTF-8 output on Windows, where the default console encoding cannot
# represent much of the scraped text and print() would otherwise fail or
# mangle it.
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        # reconfigure() (Python 3.7+) switches the encoding of the existing
        # text stream in place instead of replacing it with a codecs writer
        # over the raw buffer, so buffering and newline handling are kept.
        # The hasattr guard skips streams that are None or have been replaced
        # by an object without reconfigure() (IDEs, test runners).
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(encoding="utf-8", errors="replace")
# Test corpus - 10 URLs of varying complexity.
# Each entry is a (name, url) pair: the name becomes the prefix of the saved
# output files and the row label in the per-URL summary table.
URLS = [
    # News articles - use stable landing pages
    ("guardian-tech", "https://www.theguardian.com/technology"),
    ("bbc-news", "https://www.bbc.com/news"),
    # Documentation
    ("python-docs", "https://docs.python.org/3/library/asyncio.html"),
    ("mdn-fetch", "https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API"),
    ("rust-book", "https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html"),
    # Feature-rich / Complex
    ("github-repo", "https://github.com/microsoft/markitdown"),
    ("hackernews", "https://news.ycombinator.com/"),
    ("wikipedia", "https://en.wikipedia.org/wiki/Markdown"),
    # Simple / Minimal
    ("example-com", "https://example.com"),
    ("httpbin", "https://httpbin.org/html"),
]

# Directory next to this script where each tool's Markdown output is written;
# main() creates it before any fetch runs.
OUTPUT_DIR = Path(__file__).parent / "output"
def fetch_with_markitdown(url: str, name: str) -> dict:
    """Convert *url* to Markdown with the ``markitdown`` CLI and time the run.

    A run counts as successful when the command exits with code 0 and prints
    more than 50 characters. Successful output is saved to
    ``OUTPUT_DIR / f"{name}_markitdown.md"``.

    Args:
        url: Page to convert.
        name: Short label for the page; used as the output file prefix.

    Returns:
        A dict with the keys ``tool``, ``name``, ``url``, ``time`` (seconds),
        ``success``, ``output_len`` (characters captured from stdout),
        ``error`` (``None`` on success) and ``output_file`` (``None`` on
        failure). A missing tool, a timeout or a non-zero exit is reported
        through ``success``/``error`` rather than raised.
    """
    timeout = 60
    output_file = OUTPUT_DIR / f"{name}_markitdown.md"
    output = ""
    start = time.perf_counter()
    try:
        result = subprocess.run(
            ["markitdown", url],
            capture_output=True,
            text=True,
            timeout=timeout,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        # Report the time actually spent rather than a hard-coded constant.
        elapsed = time.perf_counter() - start
        error = "TIMEOUT"
        success = False
    except Exception as e:
        # Typically FileNotFoundError when the CLI is not installed; kept
        # broad so one broken tool never aborts the whole benchmark.
        elapsed = time.perf_counter() - start
        error = str(e)
        success = False
    else:
        elapsed = time.perf_counter() - start
        output = result.stdout or ""
        success = result.returncode == 0 and len(output) > 50
        # A zero exit with too little output leaves stderr empty; fall back
        # to a descriptive message so a failure never has a blank error.
        error = result.stderr or (
            f"exit code {result.returncode}, {len(output)} chars of output"
        )

    if success:
        # Create the directory here as well so the function also works when
        # it is called without main() having run first.
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        output_file.write_text(output, encoding="utf-8")

    return {
        "tool": "markitdown",
        "name": name,
        "url": url,
        "time": elapsed,
        "success": success,
        "output_len": len(output),
        "error": error if not success else None,
        "output_file": str(output_file) if success else None,
    }
def fetch_with_jina(url: str, name: str) -> dict:
    """Fetch *url* through Jina Reader (via ``curl``) and time the request.

    The request goes to ``https://r.jina.ai/<url>``. A run counts as
    successful when curl exits with code 0 and returns more than 100
    characters. Successful output is saved to
    ``OUTPUT_DIR / f"{name}_jina.md"``.

    Args:
        url: Page to convert.
        name: Short label for the page; used as the output file prefix.

    Returns:
        A dict with the keys ``tool``, ``name``, ``url``, ``time`` (seconds),
        ``success``, ``output_len`` (characters captured from stdout),
        ``error`` (``None`` on success) and ``output_file`` (``None`` on
        failure). A missing curl, a timeout or a non-zero exit is reported
        through ``success``/``error`` rather than raised.
    """
    output_file = OUTPUT_DIR / f"{name}_jina.md"
    jina_url = f"https://r.jina.ai/{url}"
    output = ""
    start = time.perf_counter()
    try:
        result = subprocess.run(
            # --fail makes curl exit non-zero on HTTP 4xx/5xx, so an error
            # page is not mistaken for converted Markdown; -S keeps the
            # error message on stderr even though -s silences progress.
            ["curl", "-sS", "-L", "--fail", "--max-time", "60", jina_url],
            capture_output=True,
            text=True,
            # Slightly above curl's own --max-time so curl normally reports
            # the timeout itself.
            timeout=65,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        # Report the time actually spent rather than a hard-coded constant.
        elapsed = time.perf_counter() - start
        error = "TIMEOUT"
        success = False
    except Exception as e:
        # Typically FileNotFoundError when curl is not installed; kept broad
        # so one broken tool never aborts the whole benchmark.
        elapsed = time.perf_counter() - start
        error = str(e)
        success = False
    else:
        elapsed = time.perf_counter() - start
        output = result.stdout or ""
        success = result.returncode == 0 and len(output) > 100
        # Fall back to a descriptive message so a failure never has a blank
        # error when stderr is empty.
        error = result.stderr or (
            f"exit code {result.returncode}, {len(output)} chars of output"
        )

    if success:
        # Create the directory here as well so the function also works when
        # it is called without main() having run first.
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        output_file.write_text(output, encoding="utf-8")

    return {
        "tool": "jina",
        "name": name,
        "url": url,
        "time": elapsed,
        "success": success,
        "output_len": len(output),
        "error": error if not success else None,
        "output_file": str(output_file) if success else None,
    }
def fetch_with_firecrawl(url: str, name: str) -> dict:
    """Scrape *url* with the ``firecrawl`` CLI and time the run.

    A run counts as successful when the command exits with code 0 and prints
    more than 100 characters. Successful output is saved to
    ``OUTPUT_DIR / f"{name}_firecrawl.md"``.

    Args:
        url: Page to convert.
        name: Short label for the page; used as the output file prefix.

    Returns:
        A dict with the keys ``tool``, ``name``, ``url``, ``time`` (seconds),
        ``success``, ``output_len`` (characters captured from stdout),
        ``error`` (``None`` on success) and ``output_file`` (``None`` on
        failure). A missing tool, a timeout or a non-zero exit is reported
        through ``success``/``error`` rather than raised.
    """
    timeout = 90  # Firecrawl can be slower due to JS rendering
    output_file = OUTPUT_DIR / f"{name}_firecrawl.md"

    if sys.platform == "win32":
        # On Windows, firecrawl is a .cmd script - need shell=True. The URL
        # is double-quoted so cmd.exe treats characters such as & | < > as
        # part of the argument rather than as shell operators; a literal
        # quote in the URL is percent-encoded so it cannot end the quoting.
        safe_url = url.replace('"', "%22")
        command = f'firecrawl "{safe_url}"'
        use_shell = True
    else:
        # An argv list with no shell passes the URL through verbatim, so
        # nothing in it can be interpreted as shell syntax.
        command = ["firecrawl", url]
        use_shell = False

    output = ""
    start = time.perf_counter()
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
            encoding="utf-8",
            errors="replace",
            shell=use_shell,
        )
    except subprocess.TimeoutExpired:
        # Report the time actually spent rather than a hard-coded constant.
        elapsed = time.perf_counter() - start
        error = "TIMEOUT"
        success = False
    except Exception as e:
        # Typically FileNotFoundError when the CLI is not installed; kept
        # broad so one broken tool never aborts the whole benchmark.
        elapsed = time.perf_counter() - start
        error = str(e)
        success = False
    else:
        elapsed = time.perf_counter() - start
        output = result.stdout or ""
        success = result.returncode == 0 and len(output) > 100
        # Fall back to a descriptive message so a failure never has a blank
        # error when stderr is empty.
        error = result.stderr or (
            f"exit code {result.returncode}, {len(output)} chars of output"
        )

    if success:
        # Create the directory here as well so the function also works when
        # it is called without main() having run first.
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        output_file.write_text(output, encoding="utf-8")

    return {
        "tool": "firecrawl",
        "name": name,
        "url": url,
        "time": elapsed,
        "success": success,
        "output_len": len(output),
        "error": error if not success else None,
        "output_file": str(output_file) if success else None,
    }
def run_sequential():
    """Fetch every URL in ``URLS`` with each tool, one request at a time.

    For each page the tools run in the fixed order markitdown, jina,
    firecrawl, and a one-line status is printed per tool.

    Returns:
        A dict mapping tool name (``"markitdown"``, ``"jina"``,
        ``"firecrawl"``) to its list of result dicts, in ``URLS`` order.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("SEQUENTIAL EXECUTION")
    print(banner)

    # Order matters: it fixes both the execution order and the print order.
    runners = (
        ("markitdown", fetch_with_markitdown),
        ("jina", fetch_with_jina),
        ("firecrawl", fetch_with_firecrawl),
    )
    results = {tool: [] for tool, _ in runners}

    for name, url in URLS:
        print(f"\nTesting: {name}")
        print(f" URL: {url}")
        for tool, fetch in runners:
            outcome = fetch(url, name)
            verdict = "OK" if outcome["success"] else "FAIL"
            print(
                f" {tool}: {outcome['time']:.2f}s, "
                f"{outcome['output_len']:,} chars - {verdict}"
            )
            results[tool].append(outcome)

    return results
def run_parallel(tool: str, max_workers: int = 5):
    """Fetch every URL in ``URLS`` concurrently with a single tool.

    Args:
        tool: One of ``"markitdown"``, ``"jina"`` or ``"firecrawl"``; any
            other value raises ``KeyError``.
        max_workers: Size of the thread pool.

    Returns:
        A ``(results, total_time)`` tuple: the result dicts in completion
        order (not ``URLS`` order) and the wall-clock seconds for the batch.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"PARALLEL EXECUTION: {tool} (max_workers={max_workers})")
    print(rule)

    fetch_fn = {
        "markitdown": fetch_with_markitdown,
        "jina": fetch_with_jina,
        "firecrawl": fetch_with_firecrawl,
    }[tool]

    collected = []
    started = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its page label for the progress lines.
        pending = {}
        for name, url in URLS:
            pending[pool.submit(fetch_fn, url, name)] = name

        for done in concurrent.futures.as_completed(pending):
            label = pending[done]
            outcome = done.result()
            verdict = "OK" if outcome["success"] else "FAIL"
            print(f" {label}: {outcome['time']:.2f}s - {verdict}")
            collected.append(outcome)

    total_time = time.perf_counter() - started
    print(f"\nTotal parallel time: {total_time:.2f}s")
    return collected, total_time
def print_summary(seq_results: dict, par_results: dict):
    """Print the benchmark results as Markdown tables.

    Args:
        seq_results: Tool name -> list of result dicts in ``URLS`` order, as
            returned by ``run_sequential``.
        par_results: Tool name -> ``(results, total_time)`` tuple, as
            returned by ``run_parallel``; only the total time is used.

    Timing and size figures count successful runs only. The per-URL winner
    is the fastest tool among those that succeeded for that page.
    """
    def mean(values):
        return sum(values) / len(values) if values else 0

    def row(label, cells):
        print(f"| {label} | " + " | ".join(cells) + " |")

    tools = ("markitdown", "jina", "firecrawl")
    header = "| Metric | markitdown | Jina | Firecrawl |"
    divider = "|--------|------------|------|-----------|"

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    # Per-tool aggregates over the successful sequential runs.
    passed = {t: [r for r in seq_results[t] if r["success"]] for t in tools}
    times = {t: [r["time"] for r in passed[t]] for t in tools}
    wins = {t: len(passed[t]) for t in tools}
    chars = {t: sum(r["output_len"] for r in passed[t]) for t in tools}

    print("\n## Speed (Sequential)")
    print(header)
    print(divider)
    row("Avg time", [f"{mean(times[t]):.2f}s" for t in tools])
    row("Total time", [f"{sum(times[t]):.2f}s" for t in tools])
    row("Success rate", [f"{wins[t]}/{len(URLS)}" for t in tools])

    print("\n## Speed (Parallel, 5 workers)")
    print(header)
    print(divider)
    row("Total time", [f"{par_results[t][1]:.2f}s" for t in tools])

    print("\n## Output Size")
    print(header)
    print(divider)
    row("Total chars", [f"{chars[t]:,}" for t in tools])
    # max(..., 1) avoids dividing by zero when a tool never succeeded.
    row("Avg chars", [f"{chars[t] // max(wins[t], 1):,}" for t in tools])

    print("\n## Per-URL Comparison")
    print("| URL | markitdown | Jina | Firecrawl | Winner |")
    print("|-----|------------|------|-----------|--------|")
    display = {"markitdown": "markitdown", "jina": "Jina", "firecrawl": "Firecrawl"}
    for idx, (name, _url) in enumerate(URLS):
        runs = [(display[t], seq_results[t][idx]) for t in tools]
        cells = [f"{r['time']:.1f}s" if r["success"] else "FAIL" for _, r in runs]
        # Determine winner by speed among successful tools; on a tie the
        # earlier tool in the list wins.
        contenders = [(label, r["time"]) for label, r in runs if r["success"]]
        if contenders:
            winner = min(contenders, key=lambda pair: pair[1])[0]
        else:
            winner = "None"
        print(f"| {name} | {cells[0]} | {cells[1]} | {cells[2]} | {winner} |")

    print(f"\nOutput files saved to: {OUTPUT_DIR}")
def main():
    """Run the sequential pass, then one parallel pass per tool, then report."""
    # The fetch functions write their output into this directory.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print("Benchmark: markitdown vs Jina Reader vs Firecrawl")
    print(f"Testing {len(URLS)} URLs")

    sequential = run_sequential()

    # Dict comprehensions evaluate in iteration order, so the tools run as
    # markitdown, jina, firecrawl.
    parallel = {
        tool: run_parallel(tool, max_workers=5)
        for tool in ("markitdown", "jina", "firecrawl")
    }

    print_summary(sequential, parallel)


if __name__ == "__main__":
    main()
|