| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401 |
- #!/usr/bin/env python3
- """
- Pulse Fetch - Parallel URL fetching for Claude Code news digest.
- Uses asyncio + ThreadPoolExecutor to fetch multiple URLs via Firecrawl simultaneously.
- Outputs JSON with fetched content for LLM summarization.
- Usage:
- python fetch.py # Fetch all sources
- python fetch.py --sources blogs # Fetch only blogs
- python fetch.py --max-workers 20 # Increase parallelism
- python fetch.py --output pulse.json
- python fetch.py --discover-articles # Extract recent articles from blog homepages
- """
- import os
- import sys
- import json
- import re
- from datetime import datetime, timezone
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from pathlib import Path
- from urllib.parse import urlparse, urljoin
- import argparse
- # Try to import firecrawl
- try:
- from firecrawl import FirecrawlApp
- FIRECRAWL_AVAILABLE = True
- except ImportError:
- FIRECRAWL_AVAILABLE = False
- print("Warning: firecrawl not installed. Install with: pip install firecrawl-py")
# Sources configuration: category name -> list of {"name", "url", "type"} dicts.
# Categories are selectable from the CLI via --sources.
SOURCES = {
    "official": [
        {"name": "Anthropic Engineering", "url": "https://www.anthropic.com/engineering", "type": "blog"},
        {"name": "Claude Blog", "url": "https://claude.ai/blog", "type": "blog"},
        {"name": "Claude Code Docs", "url": "https://code.claude.com", "type": "docs"},
    ],
    "blogs": [
        {"name": "Simon Willison", "url": "https://simonwillison.net", "type": "blog"},
        {"name": "Every", "url": "https://every.to", "type": "blog"},
        {"name": "SSHH Blog", "url": "https://blog.sshh.io", "type": "blog"},
        {"name": "Lee Han Chung", "url": "https://leehanchung.github.io", "type": "blog"},
        {"name": "Nick Nisi", "url": "https://nicknisi.com", "type": "blog"},
        {"name": "HumanLayer", "url": "https://www.humanlayer.dev/blog", "type": "blog"},
        {"name": "Chris Dzombak", "url": "https://www.dzombak.com/blog", "type": "blog"},
        {"name": "GitButler", "url": "https://blog.gitbutler.com", "type": "blog"},
        {"name": "Docker Blog", "url": "https://www.docker.com/blog", "type": "blog"},
        {"name": "Nx Blog", "url": "https://nx.dev/blog", "type": "blog"},
        {"name": "Yee Fei Ooi", "url": "https://medium.com/@ooi_yee_fei", "type": "blog"},
    ],
    "community": [
        {"name": "SkillsMP", "url": "https://skillsmp.com", "type": "marketplace"},
        {"name": "Awesome Claude AI", "url": "https://awesomeclaude.ai", "type": "directory"},
    ],
}

# Relevance keywords for filtering: a result is kept by
# filter_relevant_content() if any of these appears (case-insensitive)
# in its content, title, or description.
RELEVANCE_KEYWORDS = [
    "claude", "claude code", "anthropic", "mcp", "model context protocol",
    "agent", "skill", "subagent", "cli", "terminal", "prompt engineering",
    "cursor", "windsurf", "copilot", "aider", "coding assistant", "hooks"
]

# Patterns to identify article links in markdown content.
# Each pattern captures (link text, absolute URL).
ARTICLE_LINK_PATTERNS = [
    # Standard markdown links with date-like paths (e.g. /2024/...)
    r'\[([^\]]+)\]\((https?://[^\)]+/\d{4}/[^\)]+)\)',
    # Links with /blog/, /posts/, /p/, /articles/ path segments
    r'\[([^\]]+)\]\((https?://[^\)]+/(?:blog|posts?|p|articles?)/[^\)]+)\)',
    # Links with slugified titles (word-word-word pattern)
    r'\[([^\]]+)\]\((https?://[^\)]+/[\w]+-[\w]+-[\w]+[^\)]*)\)',
]

# Exclude patterns (navigation, categories, tags, feeds, fragments,
# query strings, etc.) applied with re.search against candidate URLs.
EXCLUDE_PATTERNS = [
    r'/tag/', r'/category/', r'/author/', r'/page/', r'/archive/',
    r'/about', r'/contact', r'/subscribe', r'/newsletter', r'/feed',
    r'/search', r'/login', r'/signup', r'/privacy', r'/terms',
    r'\.xml$', r'\.rss$', r'\.atom$', r'#', r'\?',
]
def fetch_url_firecrawl(app: 'FirecrawlApp', source: dict) -> dict:
    """Fetch a single URL via Firecrawl and normalize the response.

    Args:
        app: An initialized FirecrawlApp client.
        source: Dict with at least "name" and "url"; optional "type".

    Returns:
        A result dict: on success it carries "content" (capped at 50k chars),
        "title", "description"; on failure it carries "error". Never raises —
        failures are reported as data so one bad source cannot abort a batch.
    """
    url = source["url"]
    name = source["name"]
    try:
        result = app.scrape(url, formats=['markdown'])
        # Firecrawl SDK versions differ: newer releases return objects with
        # attributes, older ones plain dicts — handle both shapes.
        if hasattr(result, 'markdown'):
            markdown = result.markdown or ''
            metadata = result.metadata.__dict__ if hasattr(result.metadata, '__dict__') else {}
        else:
            # `or ''` also covers the key being present with a None value.
            markdown = result.get('markdown') or ''
            metadata = result.get('metadata') or {}
        return {
            "name": name,
            "url": url,
            "type": source.get("type", "unknown"),
            "status": "success",
            "content": markdown[:50000],  # Limit content size for downstream LLM use
            "title": metadata.get('title', name),
            "description": metadata.get('description', ''),
            # Timezone-aware replacement for deprecated datetime.utcnow();
            # output format ("...Z") is unchanged.
            "fetched_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        }
    except Exception as e:
        return {
            "name": name,
            "url": url,
            "type": source.get("type", "unknown"),
            "status": "error",
            "error": str(e),
            "fetched_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        }
def get_firecrawl_api_key():
    """Return the Firecrawl API key, or None if not configured.

    Lookup order:
      1. FIRECRAWL_API_KEY environment variable.
      2. A `firecrawl:` entry under the `api_keys:` block in
         ~/.claude/delegate.yaml, parsed with a minimal line scanner
         to avoid a YAML dependency.
    """
    # Environment variable wins over the config file.
    key = os.getenv('FIRECRAWL_API_KEY')
    if key:
        return key
    config_path = os.path.expanduser("~/.claude/delegate.yaml")
    if os.path.exists(config_path):
        try:
            with open(config_path, encoding="utf-8") as f:
                content = f.read()
            # Track whether we are inside the indented `api_keys:` block;
            # a non-indented, non-comment line terminates the block.
            in_api_keys = False
            for line in content.split('\n'):
                stripped = line.strip()
                if stripped.startswith('api_keys:'):
                    in_api_keys = True
                    continue
                if in_api_keys and stripped and not line.startswith((' ', '\t')):
                    if not stripped.startswith('#'):
                        in_api_keys = False
                if in_api_keys and 'firecrawl:' in stripped.lower():
                    # Accept optional quotes; stop at quote, newline, or comment.
                    match = re.search(r'firecrawl:\s*["\']?([^"\'\n#]+)', stripped, re.IGNORECASE)
                    if match:
                        return match.group(1).strip()
        except (OSError, UnicodeDecodeError):
            # An unreadable config is treated the same as no config.
            pass
    return None
def fetch_all_parallel(sources: list, max_workers: int = 10) -> list:
    """Fetch all source URLs in parallel using a ThreadPoolExecutor.

    Args:
        sources: List of source dicts ("name", "url", optional "type").
        max_workers: Upper bound on concurrent Firecrawl requests.

    Returns:
        A list of result dicts (success or error). NOTE: results arrive in
        *completion* order via as_completed, not in the order of `sources`.
    """
    if not FIRECRAWL_AVAILABLE:
        print("Error: firecrawl not available")
        return []
    api_key = get_firecrawl_api_key()
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set. Set env var or add to ~/.claude/delegate.yaml")
        return []
    app = FirecrawlApp(api_key=api_key)
    results = []
    total = len(sources)
    completed = 0
    print(f"Fetching {total} URLs with {max_workers} workers...")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks, remembering which source each future belongs to.
        future_to_source = {
            executor.submit(fetch_url_firecrawl, app, source): source
            for source in sources
        }
        # Process results as they complete
        for future in as_completed(future_to_source):
            source = future_to_source[future]
            completed += 1
            try:
                result = future.result()
                results.append(result)
                status = "OK" if result["status"] == "success" else "FAIL"
                print(f"[{completed}/{total}] {status}: {source['name']}")
            except Exception as e:
                # fetch_url_firecrawl catches its own errors, so this only
                # fires on executor-level failures. Keep the error-dict shape
                # consistent with fetch_url_firecrawl's error result
                # ("type" and "fetched_at" were previously missing here).
                print(f"[{completed}/{total}] ERROR: {source['name']} - {e}")
                results.append({
                    "name": source["name"],
                    "url": source["url"],
                    "type": source.get("type", "unknown"),
                    "status": "error",
                    "error": str(e),
                    "fetched_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                })
    return results
def extract_article_links(content: str, base_url: str, max_articles: int = 5) -> list:
    """Scan markdown *content* for article-looking links on the same domain.

    Candidate links are matched against ARTICLE_LINK_PATTERNS, filtered by
    EXCLUDE_PATTERNS, deduplicated by URL, and screened for useful titles.

    Returns:
        Up to *max_articles* dicts of the form {"title", "url"}.
    """
    generic_titles = ['read more', 'continue reading', 'link', 'here', 'click here']
    found = []
    seen = set()
    host = urlparse(base_url).netloc
    for pattern in ARTICLE_LINK_PATTERNS:
        for text, link in re.findall(pattern, content):
            # Deduplicate across all patterns.
            if link in seen:
                continue
            # Drop navigation/boilerplate URLs.
            if any(re.search(exc, link, re.IGNORECASE) for exc in EXCLUDE_PATTERNS):
                continue
            # Keep only same-domain (or host-less relative) links.
            link_host = urlparse(link).netloc
            if link_host and link_host != host:
                continue
            # Require a plausible, non-generic title.
            text = text.strip()
            if not (5 <= len(text) <= 200):
                continue
            if text.lower() in generic_titles:
                continue
            seen.add(link)
            found.append({"title": text, "url": link})
    return found[:max_articles]
def discover_articles(sources: list, max_workers: int = 10, max_articles_per_source: int = 5) -> list:
    """Fetch blog homepages, extract recent article links, then fetch the articles.

    Args:
        sources: Blog source dicts ("name", "url", "type").
        max_workers: Parallelism used for both fetch phases.
        max_articles_per_source: Cap on article links taken per homepage.

    Returns:
        Article fetch results annotated with "source_name"/"source_url",
        or the homepage results when no article links were found, or []
        on configuration errors.
    """
    if not FIRECRAWL_AVAILABLE:
        print("Error: firecrawl not available")
        return []
    api_key = get_firecrawl_api_key()
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set. Set env var or add to ~/.claude/delegate.yaml")
        return []
    # Phase 1: fetch all blog homepages in parallel.
    print(f"Phase 1: Fetching {len(sources)} blog homepages...")
    homepage_results = fetch_all_parallel(sources, max_workers=max_workers)
    # Phase 2: extract article links from each successful homepage.
    all_articles = []
    print(f"\nPhase 2: Extracting article links...")
    for result in homepage_results:
        if result["status"] != "success":
            continue
        content = result.get("content", "")
        base_url = result["url"]
        source_name = result["name"]
        articles = extract_article_links(content, base_url, max_articles=max_articles_per_source)
        print(f" {source_name}: found {len(articles)} articles")
        for article in articles:
            all_articles.append({
                "name": article["title"],
                "url": article["url"],
                "type": "article",
                "source_name": source_name,
                "source_url": base_url,
            })
    if not all_articles:
        print("No articles found to fetch")
        return homepage_results
    # Phase 3: fetch individual articles.
    print(f"\nPhase 3: Fetching {len(all_articles)} individual articles...")
    article_results = fetch_all_parallel(all_articles, max_workers=max_workers)
    # BUG FIX: fetch_all_parallel returns results in *completion* order
    # (as_completed), so the previous index-based zip against all_articles
    # attributed articles to the wrong sources. Match by URL instead.
    info_by_url = {a["url"]: a for a in all_articles}
    for result in article_results:
        info = info_by_url.get(result.get("url"), {})
        result["source_name"] = info.get("source_name", "")
        result["source_url"] = info.get("source_url", "")
    return article_results
def filter_relevant_content(results: list) -> list:
    """Keep only successful results mentioning a Claude Code relevance keyword.

    The first matching keyword (in RELEVANCE_KEYWORDS order) is recorded on
    the kept result under "relevant_keyword".
    """
    kept = []
    for item in results:
        if item["status"] != "success":
            continue
        # Search content, title, and description as one lowercase haystack.
        haystack = " ".join([
            item.get("content") or "",
            item.get("title") or "",
            item.get("description") or "",
        ]).lower()
        matched = next((kw for kw in RELEVANCE_KEYWORDS if kw.lower() in haystack), None)
        if matched is not None:
            item["relevant_keyword"] = matched
            kept.append(item)
    return kept
def main():
    """CLI entry point: parse args, fetch the selected sources, emit JSON.

    Returns:
        Process exit code: 0 on success, 1 on configuration errors
        (unknown category, no blog sources for discovery).
    """
    parser = argparse.ArgumentParser(description="Pulse Fetch - Parallel URL fetching")
    parser.add_argument("--sources", choices=["all", "official", "blogs", "community"],
                        default="all", help="Source category to fetch")
    parser.add_argument("--max-workers", type=int, default=10,
                        help="Maximum parallel workers (default: 10)")
    parser.add_argument("--output", "-o", type=str, default=None,
                        help="Output JSON file (default: stdout)")
    parser.add_argument("--filter-relevant", action="store_true",
                        help="Only include results with relevant keywords")
    parser.add_argument("--discover-articles", action="store_true",
                        help="Extract and fetch individual articles from blog homepages")
    parser.add_argument("--max-articles-per-source", type=int, default=5,
                        help="Max articles to fetch per source (default: 5)")
    args = parser.parse_args()
    # Collect sources based on selection.
    if args.sources == "all":
        sources = [src for category in SOURCES.values() for src in category]
    else:
        sources = SOURCES.get(args.sources, [])
    if not sources:
        print(f"No sources found for category: {args.sources}")
        return 1
    # Fetch URLs - either discover individual articles or just fetch homepages.
    if args.discover_articles:
        # Article discovery only makes sense for blog-type sources.
        blog_sources = [s for s in sources if s.get("type") == "blog"]
        if not blog_sources:
            print("No blog sources found for article discovery")
            return 1
        results = discover_articles(
            blog_sources,
            max_workers=args.max_workers,
            max_articles_per_source=args.max_articles_per_source
        )
    else:
        results = fetch_all_parallel(sources, max_workers=args.max_workers)
    # Filter if requested.
    if args.filter_relevant:
        results = filter_relevant_content(results)
        print(f"\nFiltered to {len(results)} relevant results")
    # Prepare output envelope. Timezone-aware timestamp replaces the
    # deprecated datetime.utcnow(); the "...Z" format is unchanged.
    output = {
        "fetched_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "total_sources": len(sources),
        "successful": len([r for r in results if r.get("status") == "success"]),
        "failed": len([r for r in results if r.get("status") != "success"]),
        "results": results,
    }
    # Emit to file or stdout.
    json_output = json.dumps(output, indent=2)
    if args.output:
        Path(args.output).write_text(json_output, encoding="utf-8")
        print(f"\nResults saved to: {args.output}")
    else:
        print("\n" + "=" * 60)
        print("RESULTS")
        print("=" * 60)
        print(json_output)
    # Summary
    print(f"\n{'=' * 60}")
    print(f"SUMMARY: {output['successful']}/{output['total_sources']} successful")
    print(f"{'=' * 60}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    sys.exit(main())
|