|
|
@@ -2,22 +2,28 @@
|
|
|
"""Estimate the token/$ cost of an outer loop by pattern × cadence × model.
|
|
|
|
|
|
A loop's cost is runs/day × tokens/run × price, and sub-agents multiply tokens/run.
|
|
|
-This computes that before you commit to a cadence. Pricing reads from
|
|
|
-assets/model-pricing.json (date-stamped; skills/claude-api-ops is the source of
|
|
|
-truth — run its check-model-table.py if you suspect drift).
|
|
|
+This computes that - and, crucially, models **prompt caching**: a loop re-sends the
|
|
|
+SAME run.md + system prefix every tick (the Ralph property), which is the textbook
|
|
|
+caching case. Whether caching helps depends on cadence vs cache TTL, so this picks the
|
|
|
+TTL and reports the cached projection alongside the naive one.
|
|
|
+
|
|
|
+Pricing reads from assets/model-pricing.json (date-stamped; skills/claude-api-ops is
|
|
|
+the source of truth - run its check-model-table.py if you suspect drift).
|
|
|
|
|
|
Usage: loop-cost.py --pattern P --cadence C --model M [OPTIONS]
|
|
|
Input: argv flags only (no stdin).
|
|
|
Output: stdout = the cost breakdown (plain rows, or --json envelope). Data only.
|
|
|
-Stderr: the assumptions note, errors.
|
|
|
+Stderr: the assumptions + caching note, errors.
|
|
|
Exit: 0 ok, 2 usage, 3 pricing file missing, 4 bad cadence/model/pattern
|
|
|
|
|
|
-Estimates, not guarantees — reconcile against the loop's run-log.md actuals. The
|
|
|
-cheapest lever is cadence (halving frequency halves cost); the next is model.
|
|
|
+Estimates, not guarantees - reconcile against the loop's run-log.md actuals. Levers in
|
|
|
+order of impact: cadence (halving frequency halves cost), prompt caching (modeled below),
|
|
|
+model tier.
|
|
|
|
|
|
Examples:
|
|
|
loop-cost.py --pattern pr-babysitter --cadence 10m --model claude-haiku-4-5
|
|
|
loop-cost.py --pattern ci-sweeper --cadence 15m --model claude-sonnet-4-6 --days 30 --json
|
|
|
+ loop-cost.py --pattern daily-triage --cadence 6h --model claude-opus-4-8 # too slow to cache
|
|
|
loop-cost.py --list-models
|
|
|
"""
|
|
|
from __future__ import annotations
|
|
|
@@ -36,11 +42,26 @@ EX_VALIDATION = 4
|
|
|
|
|
|
DEFAULT_PRICING = Path(__file__).resolve().parent.parent / "assets" / "model-pricing.json"
|
|
|
|
|
|
+# Prompt-caching multipliers vs base input price (claude-api-ops/references/caching-and-cost.md).
|
|
|
+CACHE_WRITE_5M = 1.25 # write a 5-minute-TTL entry
|
|
|
+CACHE_WRITE_1H = 2.0 # write a 1-hour-TTL entry
|
|
|
+CACHE_READ = 0.1 # read any cached entry
|
|
|
+
|
|
|
+# Minimum cacheable prefix (tokens) - below this the cache_control marker is silently
|
|
|
+# ignored (caching-and-cost.md). A loop whose static prefix is smaller can't cache.
|
|
|
+MIN_PREFIX = {
|
|
|
+ "claude-fable-5": 512,
|
|
|
+ "claude-opus-4-8": 1024,
|
|
|
+ "claude-sonnet-4-6": 1024,
|
|
|
+ "claude-haiku-4-5": 4096,
|
|
|
+}
|
|
|
+DEFAULT_MIN_PREFIX = 1024
|
|
|
+
|
|
|
|
|
|
class Term:
|
|
|
- """Minimal ANSI helper (term.sh is bash-only; per TERMINAL-DESIGN.md §9 the
|
|
|
- Python port is inline). Honors FORCE_COLOR / NO_COLOR / TERM_ASCII and the
|
|
|
- bound stream's TTY + encoding, so piped data stays plain ASCII."""
|
|
|
+ """Minimal ANSI helper (term.sh is bash-only; per TERMINAL-DESIGN.md §9 the Python
|
|
|
+ port is inline). Honors FORCE_COLOR / NO_COLOR / TERM_ASCII and the bound stream's
|
|
|
+ TTY + encoding, so piped data stays plain ASCII."""
|
|
|
|
|
|
_C = {"green": "\033[32m", "cyan": "\033[36m", "dim": "\033[2m", "off": "\033[0m"}
|
|
|
|
|
|
@@ -95,15 +116,63 @@ def runs_per_day(cadence: str, override: float | None) -> float:
|
|
|
if re.fullmatch(r"\d+ \* \* \* \*", s):
|
|
|
return 24.0
|
|
|
print(
|
|
|
- f"error: cannot derive runs/day from cadence '{cadence}' — "
|
|
|
+ f"error: cannot derive runs/day from cadence '{cadence}' - "
|
|
|
"use Nm/Nh/Nd, `*/N * * * *`, or pass --runs-per-day",
|
|
|
file=sys.stderr,
|
|
|
)
|
|
|
raise SystemExit(EX_VALIDATION)
|
|
|
|
|
|
|
|
|
def caching_projection(in_tok: int, out_tok: int, sub: int, in_price: float,
                       out_price: float, rpd: float, model: str,
                       prefix_frac: float, ttl_choice: str) -> dict[str, object]:
    """Project the daily cost of a loop whose static prompt prefix is cached.

    Args:
        in_tok: input tokens per run (per agent).
        out_tok: output tokens per run (per agent).
        sub: sub-agent fan-out multiplier applied to the whole bill.
        in_price: base input price in $ per million tokens.
        out_price: output price in $ per million tokens.
        rpd: runs per day; the tick interval is derived as 1440 / rpd minutes.
        model: model id, used to look up the minimum cacheable prefix size.
        prefix_frac: fraction of ``in_tok`` that is the static, cacheable prefix.
        ttl_choice: "5m" or "1h" to force a TTL; any other value (e.g. "auto")
            picks the smallest TTL that covers the tick interval.

    Returns:
        A dict that carries the same keys on every path:
        ``ttl`` ("5m", "1h", or None when auto finds no TTL that fits),
        ``beneficial`` (bool), ``reason`` (non-empty only when not beneficial),
        ``prefix_tokens``, ``write_mult``, ``cost_per_day`` and ``cost_per_run``
        (the last three are None when caching is not beneficial).

    Caching is reported as not beneficial when the prefix is below the model's
    minimum cacheable size, or when the tick interval exceeds the TTL so the
    entry has expired before the next tick arrives.
    """
    ttl_minutes = {"5m": 5, "1h": 60}  # insertion order = smallest TTL first

    interval_min = 1440.0 / rpd if rpd > 0 else 1e9
    prefix_tokens = round(in_tok * prefix_frac)
    variable_in = in_tok - prefix_tokens
    min_prefix = MIN_PREFIX.get(model, DEFAULT_MIN_PREFIX)

    # NOTE(review): `<=` treats a tick landing exactly on the TTL as a cache hit;
    # scheduler jitter at interval == TTL could make those ticks cold - confirm.
    if ttl_choice in ttl_minutes:
        ttl = ttl_choice
        warm = interval_min <= ttl_minutes[ttl]
    else:  # auto: smallest TTL that stays warm at this cadence, if any
        ttl = next((name for name, mins in ttl_minutes.items() if interval_min <= mins), None)
        warm = ttl is not None

    # Shared shape for every return so callers can read any key unconditionally.
    result = {"ttl": ttl, "beneficial": False, "reason": "",
              "prefix_tokens": prefix_tokens, "write_mult": None,
              "cost_per_day": None, "cost_per_run": None}

    if prefix_tokens < min_prefix:
        result["reason"] = (
            f"static prefix ~{prefix_tokens} tok < {model} minimum {min_prefix} tok "
            "- cache marker silently ignored; enlarge the run prompt/system or skip caching"
        )
        return result

    if not warm:
        if ttl == "5m" and interval_min <= ttl_minutes["1h"]:
            # A forced 5m TTL is too short here, but caching itself is still viable.
            result["reason"] = (
                f"tick interval ~{interval_min:.0f} min exceeds the 5m cache TTL "
                "- the entry expires between ticks, so every tick is a cold write; "
                "a 1h TTL would stay warm at this cadence"
            )
        else:
            result["reason"] = (
                f"tick interval ~{interval_min:.0f} min exceeds the cache TTL "
                "- the entry expires between ticks, so every tick is a cold write; caching won't help"
            )
        return result

    write_mult = CACHE_WRITE_5M if ttl == "5m" else CACHE_WRITE_1H
    # Per day, warm: ~1 cache write of the prefix + (rpd-1) reads; the variable
    # input and the output are billed at full price on every run.
    prefix_day = prefix_tokens / 1e6 * in_price * (write_mult + max(rpd - 1, 0) * CACHE_READ)
    variable_day = variable_in / 1e6 * in_price * rpd
    out_cost_day = out_tok / 1e6 * out_price * rpd
    cost_day = (prefix_day + variable_day + out_cost_day) * sub  # scaled by sub-agent fan-out

    result.update(
        beneficial=True,
        write_mult=write_mult,
        cost_per_day=cost_day,
        cost_per_run=cost_day / rpd if rpd else cost_day,
    )
    return result
|
|
|
+
|
|
|
+
|
|
|
def fmt_money(x: float) -> str:
    """Format a dollar amount for display.

    Amounts whose magnitude is below $1 are shown with 4 decimals so tiny
    per-run costs stay visible; everything else gets 2 decimals with thousands
    separators. A negative amount keeps the same precision as its positive
    counterpart and carries the sign in front of the dollar sign (``-$5.00``).
    """
    sign = "-" if x < 0 else ""
    magnitude = abs(x)
    if magnitude < 1:
        return f"{sign}${magnitude:.4f}"
    return f"{sign}${magnitude:,.2f}"
|
|
|
@@ -112,7 +181,7 @@ def fmt_money(x: float) -> str:
|
|
|
def main(argv: list[str]) -> int:
|
|
|
p = argparse.ArgumentParser(
|
|
|
prog="loop-cost.py",
|
|
|
- description="Estimate outer-loop cost by pattern × cadence × model.",
|
|
|
+ description="Estimate outer-loop cost by pattern × cadence × model, with prompt caching.",
|
|
|
)
|
|
|
p.add_argument("--pattern", default="custom", help="catalog pattern key (default: custom)")
|
|
|
p.add_argument("--cadence", default="1h", help="10m | 1h | 6h | 1d, or a cron string (default: 1h)")
|
|
|
@@ -122,6 +191,11 @@ def main(argv: list[str]) -> int:
|
|
|
p.add_argument("--input-tokens", type=int, default=None, help="override per-run input tokens")
|
|
|
p.add_argument("--output-tokens", type=int, default=None, help="override per-run output tokens")
|
|
|
p.add_argument("--subagents", type=int, default=None, help="override the sub-agent fan-out multiplier")
|
|
|
+ p.add_argument("--cache-prefix-frac", type=float, default=0.6,
|
|
|
+ help="fraction of input that is the static, cacheable run-prompt prefix (default: 0.6)")
|
|
|
+ p.add_argument("--cache-ttl", choices=["auto", "5m", "1h"], default="auto",
|
|
|
+ help="cache TTL to model (default: auto - pick by cadence)")
|
|
|
+ p.add_argument("--no-cache", action="store_true", help="report the uncached cost only")
|
|
|
p.add_argument("--pricing", default=str(DEFAULT_PRICING), help="path to model-pricing.json")
|
|
|
p.add_argument("--list-models", action="store_true", help="print the pricing table + as-of date, exit 0")
|
|
|
p.add_argument("--json", action="store_true", help="emit a JSON envelope")
|
|
|
@@ -135,7 +209,6 @@ def main(argv: list[str]) -> int:
|
|
|
as_of = pricing.get("_as_of", "unknown")
|
|
|
pattern_defaults = pricing.get("_pattern_defaults", {})
|
|
|
|
|
|
- # ── --list-models ──
|
|
|
if args.list_models:
|
|
|
if args.json:
|
|
|
print(json.dumps({"data": models, "meta": {"as_of": as_of, "schema": "claude-mods.loop-ops.pricing/v1"}}, indent=2))
|
|
|
@@ -149,26 +222,27 @@ def main(argv: list[str]) -> int:
|
|
|
if args.days <= 0:
|
|
|
print("error: --days must be positive", file=sys.stderr)
|
|
|
return EX_VALIDATION
|
|
|
+ if not (0.0 <= args.cache_prefix_frac <= 1.0):
|
|
|
+ print("error: --cache-prefix-frac must be between 0 and 1", file=sys.stderr)
|
|
|
+ return EX_VALIDATION
|
|
|
|
|
|
- # ── model ──
|
|
|
if args.model not in models:
|
|
|
- print(f"error: unknown model '{args.model}' — known: {', '.join(models) or '(none)'}", file=sys.stderr)
|
|
|
+ print(f"error: unknown model '{args.model}' - known: {', '.join(models) or '(none)'}", file=sys.stderr)
|
|
|
return EX_VALIDATION
|
|
|
in_price = float(models[args.model]["input_per_mtok"])
|
|
|
out_price = float(models[args.model]["output_per_mtok"])
|
|
|
|
|
|
- # ── tokens/run: overrides win, else pattern defaults ──
|
|
|
if args.input_tokens is not None and args.output_tokens is not None:
|
|
|
in_tok, out_tok = args.input_tokens, args.output_tokens
|
|
|
sub = args.subagents if args.subagents is not None else 1
|
|
|
- elif args.pattern in pattern_defaults:
|
|
|
+ elif args.pattern in pattern_defaults and not args.pattern.startswith("_"):
|
|
|
d = pattern_defaults[args.pattern]
|
|
|
in_tok = args.input_tokens if args.input_tokens is not None else int(d["input"])
|
|
|
out_tok = args.output_tokens if args.output_tokens is not None else int(d["output"])
|
|
|
sub = args.subagents if args.subagents is not None else int(d.get("subagents", 1))
|
|
|
else:
|
|
|
print(
|
|
|
- f"error: unknown pattern '{args.pattern}' — pass --input-tokens and "
|
|
|
+ f"error: unknown pattern '{args.pattern}' - pass --input-tokens and "
|
|
|
f"--output-tokens, or use one of: {', '.join(k for k in pattern_defaults if not k.startswith('_'))}",
|
|
|
file=sys.stderr,
|
|
|
)
|
|
|
@@ -180,7 +254,7 @@ def main(argv: list[str]) -> int:
|
|
|
|
|
|
rpd = runs_per_day(args.cadence, args.runs_per_day)
|
|
|
|
|
|
- # ── cost math ──
|
|
|
+ # ── uncached (naive) ──
|
|
|
cost_in = in_tok / 1_000_000 * in_price
|
|
|
cost_out = out_tok / 1_000_000 * out_price
|
|
|
cost_run = (cost_in + cost_out) * sub
|
|
|
@@ -188,25 +262,32 @@ def main(argv: list[str]) -> int:
|
|
|
cost_day = cost_run * rpd
|
|
|
cost_horizon = cost_day * args.days
|
|
|
|
|
|
+ # ── cached projection ──
|
|
|
+ cache = None
|
|
|
+ if not args.no_cache:
|
|
|
+ cache = caching_projection(in_tok, out_tok, sub, in_price, out_price, rpd,
|
|
|
+ args.model, args.cache_prefix_frac, args.cache_ttl)
|
|
|
+
|
|
|
if args.json:
|
|
|
- envelope = {
|
|
|
- "data": {
|
|
|
- "pattern": args.pattern,
|
|
|
- "model": args.model,
|
|
|
- "cadence": args.cadence,
|
|
|
- "runs_per_day": round(rpd, 3),
|
|
|
- "tokens_per_run": tokens_run,
|
|
|
- "input_tokens": in_tok,
|
|
|
- "output_tokens": out_tok,
|
|
|
- "subagents": sub,
|
|
|
- "cost_per_run": round(cost_run, 6),
|
|
|
- "cost_per_day": round(cost_day, 4),
|
|
|
- "days": args.days,
|
|
|
- "cost_per_horizon": round(cost_horizon, 2),
|
|
|
- },
|
|
|
- "meta": {"as_of": as_of, "schema": "claude-mods.loop-ops.cost/v1"},
|
|
|
+ data = {
|
|
|
+ "pattern": args.pattern, "model": args.model, "cadence": args.cadence,
|
|
|
+ "runs_per_day": round(rpd, 3), "tokens_per_run": tokens_run,
|
|
|
+ "input_tokens": in_tok, "output_tokens": out_tok, "subagents": sub,
|
|
|
+ "cost_per_run": round(cost_run, 6), "cost_per_day": round(cost_day, 4),
|
|
|
+ "days": args.days, "cost_per_horizon": round(cost_horizon, 2),
|
|
|
}
|
|
|
- print(json.dumps(envelope, indent=2))
|
|
|
+ if cache is not None:
|
|
|
+ if cache["beneficial"]:
|
|
|
+ cd = cache["cost_per_day"]
|
|
|
+ data["caching"] = {
|
|
|
+ "beneficial": True, "ttl": cache["ttl"], "prefix_tokens": cache["prefix_tokens"],
|
|
|
+ "cost_per_day": round(cd, 4), "cost_per_horizon": round(cd * args.days, 2),
|
|
|
+ "savings_pct": round((cost_day - cd) / cost_day * 100, 1) if cost_day else 0.0,
|
|
|
+ }
|
|
|
+ else:
|
|
|
+ data["caching"] = {"beneficial": False, "reason": cache["reason"],
|
|
|
+ "prefix_tokens": cache["prefix_tokens"]}
|
|
|
+ print(json.dumps({"data": data, "meta": {"as_of": as_of, "schema": "claude-mods.loop-ops.cost/v1"}}, indent=2))
|
|
|
return EX_OK
|
|
|
|
|
|
t = Term(sys.stderr)
|
|
|
@@ -216,12 +297,21 @@ def main(argv: list[str]) -> int:
|
|
|
print(f"{'tokens/run:':<16}{tokens_run:,} ({in_tok:,} in + {out_tok:,} out) x {sub} subagent(s)")
|
|
|
print(f"{'cost/run:':<16}{fmt_money(cost_run)}")
|
|
|
print(f"{'cost/day:':<16}{fmt_money(cost_day)}")
|
|
|
- print(f"{'cost/'+str(args.days)+'d:':<16}{t.c('cyan', fmt_money(cost_horizon))}")
|
|
|
- print(
|
|
|
- f"estimate (as of {as_of} pricing) - reconcile against run-log.md actuals; "
|
|
|
- "cadence is the biggest lever",
|
|
|
- file=sys.stderr,
|
|
|
- )
|
|
|
+ print(f"{'cost/'+str(args.days)+'d:':<16}{fmt_money(cost_horizon)} (uncached)")
|
|
|
+ if cache is not None:
|
|
|
+ if cache["beneficial"]:
|
|
|
+ cd, ch = cache["cost_per_day"], cache["cost_per_day"] * args.days
|
|
|
+ save = (cost_day - cd) / cost_day * 100 if cost_day else 0.0
|
|
|
+ print(f"{'cached/'+str(args.days)+'d:':<16}{t.c('cyan', fmt_money(ch))} "
|
|
|
+ f"({t.c('green', f'-{save:.0f}%')}, TTL {cache['ttl']}, prefix ~{cache['prefix_tokens']:,} tok)")
|
|
|
+ print(f"recommendation: cache the static run.md+system prefix at TTL {cache['ttl']} "
|
|
|
+ f"-> ~-{save:.0f}%/mo. Keep run.md BYTE-IDENTICAL every tick or the cache never hits.",
|
|
|
+ file=sys.stderr)
|
|
|
+ else:
|
|
|
+ print(f"caching: not beneficial here", file=sys.stderr)
|
|
|
+ print(f" why: {cache['reason']}", file=sys.stderr)
|
|
|
+ print(f"estimate (as of {as_of} pricing) - reconcile against run-log.md actuals; "
|
|
|
+ "cadence is the biggest lever, then caching, then model tier", file=sys.stderr)
|
|
|
return EX_OK
|
|
|
|
|
|
|