diff --git a/ops/pipeline-v2/lib/evaluate.py b/ops/pipeline-v2/lib/evaluate.py
index ff6dab8a9..104635ec2 100644
--- a/ops/pipeline-v2/lib/evaluate.py
+++ b/ops/pipeline-v2/lib/evaluate.py
@@ -493,6 +493,9 @@ async def _dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_iss
 async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
     """Evaluate a single PR. Returns result dict."""
+    from . import costs
+    pr_cost = 0.0
+
 
     # Check eval attempt budget before claiming
     row = conn.execute("SELECT eval_attempts FROM prs WHERE number = ?", (pr_number,)).fetchone()
     eval_attempts = (row["eval_attempts"] or 0) if row else 0
@@ -608,10 +611,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
             json.dumps({"pr": pr_number, "tier": tier}),
         )
     else:
-        tier, triage_usage = await triage_pr(diff)
-        # Record triage cost
-        from . import costs
-        costs.record_usage(
+        tier, triage_usage, _triage_reason = await triage_pr(diff)
+        pr_cost += costs.record_usage(
             conn, config.TRIAGE_MODEL, "eval_triage",
             input_tokens=triage_usage.get("prompt_tokens", 0),
             output_tokens=triage_usage.get("completion_tokens", 0),
@@ -674,6 +675,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
         # OpenRouter failure (timeout, error) — revert to open for retry.
         # NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
+        if pr_cost > 0:
+            conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
         return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}
 
     domain_verdict = _parse_verdict(domain_review, agent)
@@ -714,6 +717,15 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
         # Disposition: check if this PR should be terminated or kept open
         await _dispose_rejected_pr(conn, pr_number, eval_attempts, domain_issues)
 
+        if domain_verdict != "skipped":
+            pr_cost += costs.record_usage(
+                conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
+                input_tokens=domain_usage.get("prompt_tokens", 0),
+                output_tokens=domain_usage.get("completion_tokens", 0),
+                backend="openrouter",
+            )
+        if pr_cost > 0:
+            conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
         return {
             "pr": pr_number,
             "domain_verdict": domain_verdict,
@@ -731,6 +743,15 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
     if leo_review is None:
         # DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
+        if domain_verdict != "skipped":
+            pr_cost += costs.record_usage(
+                conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
+                input_tokens=domain_usage.get("prompt_tokens", 0),
+                output_tokens=domain_usage.get("completion_tokens", 0),
+                backend="openrouter",
+            )
+        if pr_cost > 0:
+            conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
         reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
         return {"pr": pr_number, "skipped": True, "reason": reason}
 
@@ -834,10 +855,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
     await _dispose_rejected_pr(conn, pr_number, eval_attempts, all_issues)
 
     # Record cost (only for reviews that actually ran)
-    from . import costs
-    if domain_verdict != "skipped":
-        costs.record_usage(
+    pr_cost += costs.record_usage(
         conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
         input_tokens=domain_usage.get("prompt_tokens", 0),
         output_tokens=domain_usage.get("completion_tokens", 0),
@@ -845,15 +864,23 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
     )
     if leo_verdict not in ("skipped",):
         if tier == "DEEP":
-            costs.record_usage(conn, config.EVAL_LEO_MODEL, "eval_leo", backend="max")
+            pr_cost += costs.record_usage(
+                conn, config.EVAL_LEO_MODEL, "eval_leo",
+                input_tokens=leo_usage.get("prompt_tokens", 0),
+                output_tokens=leo_usage.get("completion_tokens", 0),
+                backend="max",
+            )
         else:
-            costs.record_usage(
+            pr_cost += costs.record_usage(
                 conn, config.EVAL_LEO_STANDARD_MODEL, "eval_leo",
                 input_tokens=leo_usage.get("prompt_tokens", 0),
                 output_tokens=leo_usage.get("completion_tokens", 0),
                 backend="openrouter",
             )
+    if pr_cost > 0:
+        conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
+
 
     return {
         "pr": pr_number,
         "tier": tier,