#!/usr/bin/env python3
"""
Pulse Fetch - Parallel URL fetching for Claude Code news digest.

Uses a ThreadPoolExecutor to fetch multiple URLs via Firecrawl simultaneously.
Outputs JSON with fetched content for LLM summarization.

Usage:
    python fetch.py                       # Fetch all sources
    python fetch.py --sources blogs      # Fetch only blogs
    python fetch.py --max-workers 20     # Increase parallelism
    python fetch.py --output pulse.json
    python fetch.py --discover-articles  # Extract recent articles from blog homepages
"""
import argparse
import json
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse
  22. # Try to import firecrawl
  23. try:
  24. from firecrawl import FirecrawlApp
  25. FIRECRAWL_AVAILABLE = True
  26. except ImportError:
  27. FIRECRAWL_AVAILABLE = False
  28. print("Warning: firecrawl not installed. Install with: pip install firecrawl-py")
  29. # Sources configuration
  30. SOURCES = {
  31. "official": [
  32. {"name": "Anthropic Engineering", "url": "https://www.anthropic.com/engineering", "type": "blog"},
  33. {"name": "Claude Blog", "url": "https://claude.ai/blog", "type": "blog"},
  34. {"name": "Claude Code Docs", "url": "https://code.claude.com", "type": "docs"},
  35. ],
  36. "blogs": [
  37. {"name": "Simon Willison", "url": "https://simonwillison.net", "type": "blog"},
  38. {"name": "Every", "url": "https://every.to", "type": "blog"},
  39. {"name": "SSHH Blog", "url": "https://blog.sshh.io", "type": "blog"},
  40. {"name": "Lee Han Chung", "url": "https://leehanchung.github.io", "type": "blog"},
  41. {"name": "Nick Nisi", "url": "https://nicknisi.com", "type": "blog"},
  42. {"name": "HumanLayer", "url": "https://www.humanlayer.dev/blog", "type": "blog"},
  43. {"name": "Chris Dzombak", "url": "https://www.dzombak.com/blog", "type": "blog"},
  44. {"name": "GitButler", "url": "https://blog.gitbutler.com", "type": "blog"},
  45. {"name": "Docker Blog", "url": "https://www.docker.com/blog", "type": "blog"},
  46. {"name": "Nx Blog", "url": "https://nx.dev/blog", "type": "blog"},
  47. {"name": "Yee Fei Ooi", "url": "https://medium.com/@ooi_yee_fei", "type": "blog"},
  48. ],
  49. "community": [
  50. {"name": "SkillsMP", "url": "https://skillsmp.com", "type": "marketplace"},
  51. {"name": "Awesome Claude AI", "url": "https://awesomeclaude.ai", "type": "directory"},
  52. ],
  53. }
  54. # Relevance keywords for filtering
  55. RELEVANCE_KEYWORDS = [
  56. "claude", "claude code", "anthropic", "mcp", "model context protocol",
  57. "agent", "skill", "subagent", "cli", "terminal", "prompt engineering",
  58. "cursor", "windsurf", "copilot", "aider", "coding assistant", "hooks"
  59. ]
  60. # Patterns to identify article links in markdown content
  61. ARTICLE_LINK_PATTERNS = [
  62. # Standard markdown links with date-like paths
  63. r'\[([^\]]+)\]\((https?://[^\)]+/\d{4}/[^\)]+)\)',
  64. # Links with /blog/, /posts/, /p/ paths
  65. r'\[([^\]]+)\]\((https?://[^\)]+/(?:blog|posts?|p|articles?)/[^\)]+)\)',
  66. # Links with slugified titles (word-word-word pattern)
  67. r'\[([^\]]+)\]\((https?://[^\)]+/[\w]+-[\w]+-[\w]+[^\)]*)\)',
  68. ]
  69. # Exclude patterns (navigation, categories, tags, etc.)
  70. EXCLUDE_PATTERNS = [
  71. r'/tag/', r'/category/', r'/author/', r'/page/', r'/archive/',
  72. r'/about', r'/contact', r'/subscribe', r'/newsletter', r'/feed',
  73. r'/search', r'/login', r'/signup', r'/privacy', r'/terms',
  74. r'\.xml$', r'\.rss$', r'\.atom$', r'#', r'\?',
  75. ]
  76. def fetch_url_firecrawl(app: 'FirecrawlApp', source: dict) -> dict:
  77. """Fetch a single URL using Firecrawl API."""
  78. url = source["url"]
  79. name = source["name"]
  80. try:
  81. result = app.scrape(url, formats=['markdown'])
  82. # Handle both dict and object responses
  83. if hasattr(result, 'markdown'):
  84. markdown = result.markdown or ''
  85. metadata = result.metadata.__dict__ if hasattr(result.metadata, '__dict__') else {}
  86. else:
  87. markdown = result.get('markdown', '')
  88. metadata = result.get('metadata', {})
  89. return {
  90. "name": name,
  91. "url": url,
  92. "type": source.get("type", "unknown"),
  93. "status": "success",
  94. "content": markdown[:50000], # Limit content size
  95. "title": metadata.get('title', name),
  96. "description": metadata.get('description', ''),
  97. "fetched_at": datetime.utcnow().isoformat() + "Z",
  98. }
  99. except Exception as e:
  100. return {
  101. "name": name,
  102. "url": url,
  103. "type": source.get("type", "unknown"),
  104. "status": "error",
  105. "error": str(e),
  106. "fetched_at": datetime.utcnow().isoformat() + "Z",
  107. }
  108. def get_firecrawl_api_key():
  109. """Get Firecrawl API key from env or config file."""
  110. import re
  111. # Try environment variable first
  112. key = os.getenv('FIRECRAWL_API_KEY')
  113. if key:
  114. return key
  115. # Try ~/.claude/delegate.yaml
  116. config_path = os.path.expanduser("~/.claude/delegate.yaml")
  117. if os.path.exists(config_path):
  118. try:
  119. with open(config_path, encoding="utf-8") as f:
  120. content = f.read()
  121. # Parse the api_keys block and find firecrawl
  122. in_api_keys = False
  123. for line in content.split('\n'):
  124. stripped = line.strip()
  125. if stripped.startswith('api_keys:'):
  126. in_api_keys = True
  127. continue
  128. if in_api_keys and stripped and not line.startswith(' ') and not line.startswith('\t'):
  129. if not stripped.startswith('#'):
  130. in_api_keys = False
  131. if in_api_keys and 'firecrawl:' in stripped.lower():
  132. match = re.search(r'firecrawl:\s*["\']?([^"\'\n#]+)', stripped, re.IGNORECASE)
  133. if match:
  134. return match.group(1).strip()
  135. except Exception:
  136. pass
  137. return None
  138. def fetch_all_parallel(sources: list, max_workers: int = 10) -> list:
  139. """Fetch all URLs in parallel using ThreadPoolExecutor."""
  140. if not FIRECRAWL_AVAILABLE:
  141. print("Error: firecrawl not available")
  142. return []
  143. api_key = get_firecrawl_api_key()
  144. if not api_key:
  145. print("Error: FIRECRAWL_API_KEY not set. Set env var or add to ~/.claude/delegate.yaml")
  146. return []
  147. app = FirecrawlApp(api_key=api_key)
  148. results = []
  149. total = len(sources)
  150. completed = 0
  151. print(f"Fetching {total} URLs with {max_workers} workers...")
  152. with ThreadPoolExecutor(max_workers=max_workers) as executor:
  153. # Submit all tasks
  154. future_to_source = {
  155. executor.submit(fetch_url_firecrawl, app, source): source
  156. for source in sources
  157. }
  158. # Process results as they complete
  159. for future in as_completed(future_to_source):
  160. source = future_to_source[future]
  161. completed += 1
  162. try:
  163. result = future.result()
  164. results.append(result)
  165. status = "OK" if result["status"] == "success" else "FAIL"
  166. print(f"[{completed}/{total}] {status}: {source['name']}")
  167. except Exception as e:
  168. print(f"[{completed}/{total}] ERROR: {source['name']} - {e}")
  169. results.append({
  170. "name": source["name"],
  171. "url": source["url"],
  172. "status": "error",
  173. "error": str(e),
  174. })
  175. return results
  176. def extract_article_links(content: str, base_url: str, max_articles: int = 5) -> list:
  177. """Extract article links from markdown content."""
  178. articles = []
  179. seen_urls = set()
  180. base_domain = urlparse(base_url).netloc
  181. for pattern in ARTICLE_LINK_PATTERNS:
  182. matches = re.findall(pattern, content)
  183. for title, url in matches:
  184. # Skip if already seen
  185. if url in seen_urls:
  186. continue
  187. # Skip excluded patterns
  188. if any(re.search(exc, url, re.IGNORECASE) for exc in EXCLUDE_PATTERNS):
  189. continue
  190. # Ensure same domain or relative URL
  191. parsed = urlparse(url)
  192. if parsed.netloc and parsed.netloc != base_domain:
  193. continue
  194. # Clean up title
  195. title = title.strip()
  196. if len(title) < 5 or len(title) > 200:
  197. continue
  198. # Skip generic link text
  199. if title.lower() in ['read more', 'continue reading', 'link', 'here', 'click here']:
  200. continue
  201. seen_urls.add(url)
  202. articles.append({
  203. "title": title,
  204. "url": url,
  205. })
  206. return articles[:max_articles]
  207. def discover_articles(sources: list, max_workers: int = 10, max_articles_per_source: int = 5) -> list:
  208. """Fetch blog homepages and extract recent article links."""
  209. if not FIRECRAWL_AVAILABLE:
  210. print("Error: firecrawl not available")
  211. return []
  212. api_key = get_firecrawl_api_key()
  213. if not api_key:
  214. print("Error: FIRECRAWL_API_KEY not set. Set env var or add to ~/.claude/delegate.yaml")
  215. return []
  216. # First, fetch all blog homepages
  217. print(f"Phase 1: Fetching {len(sources)} blog homepages...")
  218. homepage_results = fetch_all_parallel(sources, max_workers=max_workers)
  219. # Extract article links from each
  220. all_articles = []
  221. print(f"\nPhase 2: Extracting article links...")
  222. for result in homepage_results:
  223. if result["status"] != "success":
  224. continue
  225. content = result.get("content", "")
  226. base_url = result["url"]
  227. source_name = result["name"]
  228. articles = extract_article_links(content, base_url, max_articles=max_articles_per_source)
  229. print(f" {source_name}: found {len(articles)} articles")
  230. for article in articles:
  231. all_articles.append({
  232. "name": article["title"],
  233. "url": article["url"],
  234. "type": "article",
  235. "source_name": source_name,
  236. "source_url": base_url,
  237. })
  238. if not all_articles:
  239. print("No articles found to fetch")
  240. return homepage_results
  241. # Phase 3: Fetch individual articles
  242. print(f"\nPhase 3: Fetching {len(all_articles)} individual articles...")
  243. article_results = fetch_all_parallel(all_articles, max_workers=max_workers)
  244. # Add source info to results
  245. for i, result in enumerate(article_results):
  246. if i < len(all_articles):
  247. result["source_name"] = all_articles[i].get("source_name", "")
  248. result["source_url"] = all_articles[i].get("source_url", "")
  249. return article_results
  250. def filter_relevant_content(results: list) -> list:
  251. """Filter results to only those with Claude Code relevant content."""
  252. relevant = []
  253. for result in results:
  254. if result["status"] != "success":
  255. continue
  256. content = ((result.get("content") or "") + " " +
  257. (result.get("title") or "") + " " +
  258. (result.get("description") or "")).lower()
  259. # Check for relevance keywords
  260. for keyword in RELEVANCE_KEYWORDS:
  261. if keyword.lower() in content:
  262. result["relevant_keyword"] = keyword
  263. relevant.append(result)
  264. break
  265. return relevant
  266. def main():
  267. parser = argparse.ArgumentParser(description="Pulse Fetch - Parallel URL fetching")
  268. parser.add_argument("--sources", choices=["all", "official", "blogs", "community"],
  269. default="all", help="Source category to fetch")
  270. parser.add_argument("--max-workers", type=int, default=10,
  271. help="Maximum parallel workers (default: 10)")
  272. parser.add_argument("--output", "-o", type=str, default=None,
  273. help="Output JSON file (default: stdout)")
  274. parser.add_argument("--filter-relevant", action="store_true",
  275. help="Only include results with relevant keywords")
  276. parser.add_argument("--discover-articles", action="store_true",
  277. help="Extract and fetch individual articles from blog homepages")
  278. parser.add_argument("--max-articles-per-source", type=int, default=5,
  279. help="Max articles to fetch per source (default: 5)")
  280. args = parser.parse_args()
  281. # Collect sources based on selection
  282. if args.sources == "all":
  283. sources = []
  284. for category in SOURCES.values():
  285. sources.extend(category)
  286. else:
  287. sources = SOURCES.get(args.sources, [])
  288. if not sources:
  289. print(f"No sources found for category: {args.sources}")
  290. return 1
  291. # Fetch URLs - either discover articles or just fetch homepages
  292. if args.discover_articles:
  293. # Filter to only blog-type sources for article discovery
  294. blog_sources = [s for s in sources if s.get("type") == "blog"]
  295. if not blog_sources:
  296. print("No blog sources found for article discovery")
  297. return 1
  298. results = discover_articles(
  299. blog_sources,
  300. max_workers=args.max_workers,
  301. max_articles_per_source=args.max_articles_per_source
  302. )
  303. else:
  304. results = fetch_all_parallel(sources, max_workers=args.max_workers)
  305. # Filter if requested
  306. if args.filter_relevant:
  307. results = filter_relevant_content(results)
  308. print(f"\nFiltered to {len(results)} relevant results")
  309. # Prepare output
  310. output = {
  311. "fetched_at": datetime.utcnow().isoformat() + "Z",
  312. "total_sources": len(sources),
  313. "successful": len([r for r in results if r.get("status") == "success"]),
  314. "failed": len([r for r in results if r.get("status") != "success"]),
  315. "results": results,
  316. }
  317. # Output
  318. json_output = json.dumps(output, indent=2)
  319. if args.output:
  320. Path(args.output).write_text(json_output, encoding="utf-8")
  321. print(f"\nResults saved to: {args.output}")
  322. else:
  323. print("\n" + "=" * 60)
  324. print("RESULTS")
  325. print("=" * 60)
  326. print(json_output)
  327. # Summary
  328. print(f"\n{'=' * 60}")
  329. print(f"SUMMARY: {output['successful']}/{output['total_sources']} successful")
  330. print(f"{'=' * 60}")
  331. return 0
  332. if __name__ == "__main__":
  333. sys.exit(main())