epimetheus: fix eval crash + wire per-PR cost tracking
Three bugs fixed:

1. triage_pr() returns 3 values but line 611 unpacked 2 → ValueError on every non-deterministic PR (circuit breaker opened, 5 PRs stuck).
2. The costs import was inside the triage else-block → NameError on deterministic routes.
3. pr_cost was never written to prs.cost_usd → 0% cost tracking across 1,118 PRs.

Cost tracking now covers all 4 exit paths: domain failed, domain rejected, Leo failed, and normal completion. Uses an additive UPDATE (cost_usd + ?) so re-evals accumulate correctly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d8a64d479f
commit
154f36f2d3
1 changed file with 36 additions and 9 deletions
|
|
@@ -493,6 +493,9 @@ async def _dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_iss
|
||||||
|
|
||||||
async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
"""Evaluate a single PR. Returns result dict."""
|
"""Evaluate a single PR. Returns result dict."""
|
||||||
|
from . import costs
|
||||||
|
pr_cost = 0.0
|
||||||
|
|
||||||
# Check eval attempt budget before claiming
|
# Check eval attempt budget before claiming
|
||||||
row = conn.execute("SELECT eval_attempts FROM prs WHERE number = ?", (pr_number,)).fetchone()
|
row = conn.execute("SELECT eval_attempts FROM prs WHERE number = ?", (pr_number,)).fetchone()
|
||||||
eval_attempts = (row["eval_attempts"] or 0) if row else 0
|
eval_attempts = (row["eval_attempts"] or 0) if row else 0
|
||||||
|
|
@@ -608,10 +611,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
json.dumps({"pr": pr_number, "tier": tier}),
|
json.dumps({"pr": pr_number, "tier": tier}),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tier, triage_usage = await triage_pr(diff)
|
tier, triage_usage, _triage_reason = await triage_pr(diff)
|
||||||
# Record triage cost
|
pr_cost += costs.record_usage(
|
||||||
from . import costs
|
|
||||||
costs.record_usage(
|
|
||||||
conn, config.TRIAGE_MODEL, "eval_triage",
|
conn, config.TRIAGE_MODEL, "eval_triage",
|
||||||
input_tokens=triage_usage.get("prompt_tokens", 0),
|
input_tokens=triage_usage.get("prompt_tokens", 0),
|
||||||
output_tokens=triage_usage.get("completion_tokens", 0),
|
output_tokens=triage_usage.get("completion_tokens", 0),
|
||||||
|
|
@@ -674,6 +675,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
# OpenRouter failure (timeout, error) — revert to open for retry.
|
# OpenRouter failure (timeout, error) — revert to open for retry.
|
||||||
# NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
|
# NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
|
||||||
conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
|
conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
|
||||||
|
if pr_cost > 0:
|
||||||
|
conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
|
||||||
return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}
|
return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}
|
||||||
|
|
||||||
domain_verdict = _parse_verdict(domain_review, agent)
|
domain_verdict = _parse_verdict(domain_review, agent)
|
||||||
|
|
@@ -714,6 +717,15 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
# Disposition: check if this PR should be terminated or kept open
|
# Disposition: check if this PR should be terminated or kept open
|
||||||
await _dispose_rejected_pr(conn, pr_number, eval_attempts, domain_issues)
|
await _dispose_rejected_pr(conn, pr_number, eval_attempts, domain_issues)
|
||||||
|
|
||||||
|
if domain_verdict != "skipped":
|
||||||
|
pr_cost += costs.record_usage(
|
||||||
|
conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
|
||||||
|
input_tokens=domain_usage.get("prompt_tokens", 0),
|
||||||
|
output_tokens=domain_usage.get("completion_tokens", 0),
|
||||||
|
backend="openrouter",
|
||||||
|
)
|
||||||
|
if pr_cost > 0:
|
||||||
|
conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
|
||||||
return {
|
return {
|
||||||
"pr": pr_number,
|
"pr": pr_number,
|
||||||
"domain_verdict": domain_verdict,
|
"domain_verdict": domain_verdict,
|
||||||
|
|
@@ -731,6 +743,15 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
if leo_review is None:
|
if leo_review is None:
|
||||||
# DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
|
# DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
|
||||||
conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
|
conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
|
||||||
|
if domain_verdict != "skipped":
|
||||||
|
pr_cost += costs.record_usage(
|
||||||
|
conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
|
||||||
|
input_tokens=domain_usage.get("prompt_tokens", 0),
|
||||||
|
output_tokens=domain_usage.get("completion_tokens", 0),
|
||||||
|
backend="openrouter",
|
||||||
|
)
|
||||||
|
if pr_cost > 0:
|
||||||
|
conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
|
||||||
reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
|
reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
|
||||||
return {"pr": pr_number, "skipped": True, "reason": reason}
|
return {"pr": pr_number, "skipped": True, "reason": reason}
|
||||||
|
|
||||||
|
|
@@ -834,10 +855,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
await _dispose_rejected_pr(conn, pr_number, eval_attempts, all_issues)
|
await _dispose_rejected_pr(conn, pr_number, eval_attempts, all_issues)
|
||||||
|
|
||||||
# Record cost (only for reviews that actually ran)
|
# Record cost (only for reviews that actually ran)
|
||||||
from . import costs
|
|
||||||
|
|
||||||
if domain_verdict != "skipped":
|
if domain_verdict != "skipped":
|
||||||
costs.record_usage(
|
pr_cost += costs.record_usage(
|
||||||
conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
|
conn, config.EVAL_DOMAIN_MODEL, "eval_domain",
|
||||||
input_tokens=domain_usage.get("prompt_tokens", 0),
|
input_tokens=domain_usage.get("prompt_tokens", 0),
|
||||||
output_tokens=domain_usage.get("completion_tokens", 0),
|
output_tokens=domain_usage.get("completion_tokens", 0),
|
||||||
|
|
@@ -845,15 +864,23 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
)
|
)
|
||||||
if leo_verdict not in ("skipped",):
|
if leo_verdict not in ("skipped",):
|
||||||
if tier == "DEEP":
|
if tier == "DEEP":
|
||||||
costs.record_usage(conn, config.EVAL_LEO_MODEL, "eval_leo", backend="max")
|
pr_cost += costs.record_usage(
|
||||||
|
conn, config.EVAL_LEO_MODEL, "eval_leo",
|
||||||
|
input_tokens=leo_usage.get("prompt_tokens", 0),
|
||||||
|
output_tokens=leo_usage.get("completion_tokens", 0),
|
||||||
|
backend="max",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
costs.record_usage(
|
pr_cost += costs.record_usage(
|
||||||
conn, config.EVAL_LEO_STANDARD_MODEL, "eval_leo",
|
conn, config.EVAL_LEO_STANDARD_MODEL, "eval_leo",
|
||||||
input_tokens=leo_usage.get("prompt_tokens", 0),
|
input_tokens=leo_usage.get("prompt_tokens", 0),
|
||||||
output_tokens=leo_usage.get("completion_tokens", 0),
|
output_tokens=leo_usage.get("completion_tokens", 0),
|
||||||
backend="openrouter",
|
backend="openrouter",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if pr_cost > 0:
|
||||||
|
conn.execute("UPDATE prs SET cost_usd = cost_usd + ? WHERE number = ?", (pr_cost, pr_number))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"pr": pr_number,
|
"pr": pr_number,
|
||||||
"tier": tier,
|
"tier": tier,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue