ganymede: extract lib/llm.py from evaluate.py (Phase 3c)

- What: LLM transport (OpenRouter, Claude CLI), prompt templates
  (triage/domain/Leo), and review runner functions moved to lib/llm.py.
  evaluate.py retains PR lifecycle orchestration, SQLite state, Forgejo
  posting, rate limit backoff, and evaluate_cycle.
- Why: evaluate.py was 734 lines mixing orchestration with LLM concerns.
  Now 455 lines orchestration + 250 lines LLM transport. Each module has
  a single responsibility.
- Connections: completes Phase 3 structural refactor (forgejo.py + domains.py
  + llm.py). teleo-pipeline.py updated to import kill_active_subprocesses
  from lib.llm.
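The resulting import boundary, straight from the diff below:

# lib/evaluate.py keeps only the three review entry points
from .llm import run_domain_review, run_leo_review, triage_pr

# teleo-pipeline.py takes the shutdown hook from the new module
from lib.llm import kill_active_subprocesses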

Pentagon-Agent: Ganymede <F99EBFA6-547B-4096-BEEA-1D59C3E4028A>
m3taversal committed 2026-03-13 15:40:18 +00:00
parent ff5162d5ba, commit 85b86a918a
3 changed files with 324 additions and 288 deletions

lib/evaluate.py

@@ -1,4 +1,4 @@
"""Evaluate stage — triage + domain review + Leo review.
"""Evaluate stage — PR lifecycle orchestration.
Ported from eval-worker.sh. Key architectural change: domain-first, Leo-last.
Sonnet (domain review) filters before Opus (Leo review) to maximize value per
@@ -13,9 +13,9 @@ Flow per PR:
6. If both approve status = 'approved' (merge module picks it up)
Design reviewed by Ganymede, Rhea, Vida, Theseus.
LLM transport and prompts extracted to lib/llm.py (Phase 3c).
"""
import asyncio
import json
import logging
import re
@@ -25,228 +25,10 @@ from . import config, db
from .domains import agent_for_domain, detect_domain_from_diff
from .forgejo import api as forgejo_api
from .forgejo import get_agent_token, get_pr_diff, repo_path
from .llm import run_domain_review, run_leo_review, triage_pr
logger = logging.getLogger("pipeline.evaluate")
# Track active Claude CLI subprocesses for graceful shutdown (Ganymede #8)
_active_subprocesses: set = set()
async def kill_active_subprocesses():
"""Kill all tracked Claude CLI subprocesses. Called during graceful shutdown."""
for proc in list(_active_subprocesses):
if proc.returncode is None:
logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid)
try:
proc.kill()
await proc.wait()
except ProcessLookupError:
pass
_active_subprocesses.clear()
REVIEW_STYLE_GUIDE = (
"Be concise. Only mention what fails or is interesting. "
"Do not summarize what the PR does — the diff speaks for itself. "
"If everything passes, say so in one line and approve."
)
# ─── Prompt templates ──────────────────────────────────────────────────────
TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT.
DEEP - use when ANY of these apply:
- PR adds or modifies claims rated "likely" or higher confidence
- PR touches agent beliefs or creates cross-domain wiki links
- PR challenges an existing claim (has "challenged_by" or contradicts existing)
- PR modifies axiom-level beliefs
- PR is a cross-domain synthesis claim
STANDARD - use when:
- New claims in established domain areas
- Enrichments to existing claims (confirm/extend)
- New hypothesis-level beliefs
- Source archives with extraction results
LIGHT - use ONLY when ALL changes fit these categories:
- Entity attribute updates (factual corrections, new data points)
- Source archiving without extraction
- Formatting fixes, typo corrections
- Status field changes
IMPORTANT: When uncertain, classify UP, not down. Always err toward more review.
Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line.
--- PR DIFF ---
{diff}"""
DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
Review this PR from your domain expertise:
1. Technical accuracy: are the claims factually correct in your domain?
2. Domain duplicates: does your domain already have substantially similar claims?
3. Missing context: is important domain context absent that would change interpretation?
4. Confidence calibration: from your domain expertise, is the confidence level right?
5. Enrichment opportunities: should this connect to existing claims via wiki links?
{style_guide}
If you are requesting changes, tag the specific issues:
<!-- ISSUES: tag1, tag2 -->
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
End your review with exactly one of:
<!-- VERDICT:{agent_upper}:APPROVE -->
<!-- VERDICT:{agent_upper}:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR against the quality criteria:
1. Schema compliance: YAML frontmatter, prose-as-title, required fields
2. Duplicate check: does this claim already exist?
3. Confidence calibration: appropriate for the evidence?
4. Wiki link validity: references real claims?
5. Source quality: credible for the claim?
6. Domain assignment: correct domain?
7. Epistemic hygiene: specific enough to be wrong?
{style_guide}
If requesting changes, tag the issues:
<!-- ISSUES: tag1, tag2 -->
End your review with exactly one of:
<!-- VERDICT:LEO:APPROVE -->
<!-- VERDICT:LEO:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check:
1. Cross-domain implications: does this claim affect beliefs in other domains?
2. Confidence calibration: is the confidence level justified by the evidence?
3. Contradiction check: does this contradict any existing claims without explicit argument?
4. Wiki link validity: do all wiki links reference real, existing claims?
5. Axiom integrity: if touching axiom-level beliefs, is the justification extraordinary?
6. Source quality: is the source credible for the claim being made?
7. Duplicate check: does a substantially similar claim already exist?
8. Enrichment vs new claim: should this be an enrichment to an existing claim instead?
9. Domain assignment: is the claim in the correct domain?
10. Schema compliance: YAML frontmatter, prose-as-title format, required fields
11. Epistemic hygiene: is the claim specific enough to be wrong?
{style_guide}
If requesting changes, tag the issues:
<!-- ISSUES: tag1, tag2 -->
End your review with exactly one of:
<!-- VERDICT:LEO:APPROVE -->
<!-- VERDICT:LEO:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
# ─── API helpers ───────────────────────────────────────────────────────────
async def _openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
"""Call OpenRouter API. Returns response text or None on failure."""
import aiohttp
key_file = config.SECRETS_DIR / "openrouter-key"
if not key_file.exists():
logger.error("OpenRouter key file not found")
return None
key = key_file.read_text().strip()
payload = {
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 4096,
"temperature": 0.2,
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(
config.OPENROUTER_URL,
headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
json=payload,
timeout=aiohttp.ClientTimeout(total=timeout_sec),
) as resp:
if resp.status >= 400:
text = await resp.text()
logger.error("OpenRouter %s%d: %s", model, resp.status, text[:200])
return None
data = await resp.json()
return data.get("choices", [{}])[0].get("message", {}).get("content")
except Exception as e:
logger.error("OpenRouter error: %s%s", model, e)
return None
async def _claude_cli_call(model: str, prompt: str, timeout_sec: int = 600, cwd: str | None = None) -> str | None:
"""Call Claude via CLI (Claude Max subscription). Returns response or None."""
proc = await asyncio.create_subprocess_exec(
str(config.CLAUDE_CLI),
"-p",
"--model",
model,
"--output-format",
"text",
cwd=cwd or str(config.REPO_DIR),
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
_active_subprocesses.add(proc) # Track for graceful shutdown (Ganymede #8)
try:
stdout, stderr = await asyncio.wait_for(
proc.communicate(input=prompt.encode()),
timeout=timeout_sec,
)
except asyncio.TimeoutError:
proc.kill()
await proc.wait()
logger.error("Claude CLI timed out after %ds", timeout_sec)
return None
finally:
_active_subprocesses.discard(proc)
out_text = (stdout or b"").decode()
err_text = (stderr or b"").decode()
# Check for rate limit REGARDLESS of exit code — CLI sometimes exits 0 with limit message
combined_lower = (out_text + err_text).lower()
if "hit your limit" in combined_lower or "rate limit" in combined_lower:
logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
return "RATE_LIMITED"
if proc.returncode != 0:
logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s", proc.returncode, err_text[:200], out_text[:200])
return None
return out_text.strip()
# ─── Diff helpers ──────────────────────────────────────────────────────────
@@ -321,69 +103,6 @@ def _parse_issues(review_text: str) -> list[str]:
return [tag.strip() for tag in match.group(1).split(",") if tag.strip()]
# ─── Review execution ─────────────────────────────────────────────────────
async def _triage_pr(diff: str) -> str:
"""Triage PR via Haiku → DEEP/STANDARD/LIGHT."""
prompt = TRIAGE_PROMPT.format(diff=diff[:50000]) # Cap diff size for triage
result = await _openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
if not result:
logger.warning("Triage failed, defaulting to STANDARD")
return "STANDARD"
tier = result.split("\n")[0].strip().upper()
if tier in ("DEEP", "STANDARD", "LIGHT"):
reason = result.split("\n")[1].strip() if "\n" in result else ""
logger.info("Triage: %s%s", tier, reason[:100])
return tier
logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
return "STANDARD"
async def _run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
"""Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
prompt = DOMAIN_PROMPT.format(
agent=agent,
agent_upper=agent.upper(),
domain=domain,
style_guide=REVIEW_STYLE_GUIDE,
diff=diff,
files=files,
)
# Try Claude Max Sonnet first
result = await _claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
# Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
if policy == "overflow":
logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
result = await _openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
else:
logger.info("Claude Max rate limited, queuing domain review")
return None
return result
async def _run_leo_review(diff: str, files: str, tier: str) -> str | None:
"""Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
result = await _claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
# Leo review queues — don't waste Opus calls (never overflow)
logger.info("Claude Max Opus rate limited, queuing Leo review")
return None
return result
async def _post_formal_approvals(pr_number: int, pr_author: str):
"""Submit formal Forgejo reviews from 2 agents (not the PR author)."""
approvals = 0
@@ -461,7 +180,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
# Step 1: Triage (if not already triaged)
if tier is None:
tier = await _triage_pr(diff)
tier = await triage_pr(diff)
conn.execute("UPDATE prs SET tier = ? WHERE number = ?", (tier, pr_number))
# Update last_attempt timestamp (status already set to 'reviewing' by atomic claim above)
@@ -483,7 +202,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
logger.info("PR #%d: domain review already done (%s), skipping to Leo", pr_number, domain_verdict)
else:
logger.info("PR #%d: domain review (%s/%s, tier=%s)", pr_number, agent, domain, tier)
domain_review = await _run_domain_review(review_diff, files, domain or "general", agent)
domain_review = await run_domain_review(review_diff, files, domain or "general", agent)
if domain_review is None:
# Rate limited, couldn't overflow — revert to open for retry
@@ -521,7 +240,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
leo_verdict = "skipped"
if tier != "LIGHT":
logger.info("PR #%d: Leo review (tier=%s)", pr_number, tier)
leo_review = await _run_leo_review(review_diff, files, tier)
leo_review = await run_leo_review(review_diff, files, tier)
if leo_review is None:
# Opus rate limited — revert to open for retry (keep domain verdict)
@@ -678,6 +397,22 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
for row in rows:
try:
# During Opus backoff, skip PRs that already completed domain review
# (they'd just hit the Opus limit again). Only process PRs still
# needing triage or domain review.
if opus_backoff:
existing = conn.execute(
"SELECT domain_verdict FROM prs WHERE number = ?",
(row["number"],),
).fetchone()
if existing and existing["domain_verdict"] not in ("pending", None):
logger.debug(
"PR #%d: skipping during Opus backoff (domain already %s)",
row["number"],
existing["domain_verdict"],
)
continue
result = await evaluate_pr(conn, row["number"], tier=row["tier"])
if result.get("skipped"):
reason = result.get("reason", "")
@@ -691,6 +426,7 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
_rate_limit_backoff_until = datetime.now(timezone.utc) + timedelta(
minutes=_RATE_LIMIT_BACKOFF_MINUTES
)
opus_backoff = True # Update local flag so in-loop guard kicks in
logger.info(
"Opus rate limited — backing off Opus for %d min, continuing triage+domain",
_RATE_LIMIT_BACKOFF_MINUTES,
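For orientation, a minimal sketch of the backoff state this hunk drives. The names _rate_limit_backoff_until and _RATE_LIMIT_BACKOFF_MINUTES appear in the hunk above; the cycle-start check and the constant's value are not shown in this diff, so both are assumptions:

from datetime import datetime, timezone

_RATE_LIMIT_BACKOFF_MINUTES = 30  # assumed value; only the name appears in this diff
_rate_limit_backoff_until: datetime | None = None  # set when Opus returns RATE_LIMITED

def _opus_backoff_active() -> bool:
    # Hypothetical cycle-start check: True while the backoff window set in
    # evaluate_cycle (see the hunk above) has not yet expired.
    return (
        _rate_limit_backoff_until is not None
        and datetime.now(timezone.utc) < _rate_limit_backoff_until
    )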

lib/llm.py (new file, 299 lines)

@@ -0,0 +1,299 @@
"""LLM transport and review prompts — shared by all evaluation stages.
Extracted from evaluate.py (Phase 3c refactor). This module owns:
- Prompt templates (triage, domain, Leo)
- OpenRouter API transport
- Claude CLI transport with subprocess tracking
- Review runner functions (triage, domain, Leo)
Orchestration (PR lifecycle, SQLite state, Forgejo posting) stays in evaluate.py.
"""
import asyncio
import logging
import aiohttp
from . import config
logger = logging.getLogger("pipeline.llm")
# Track active Claude CLI subprocesses for graceful shutdown (Ganymede #8)
_active_subprocesses: set = set()
async def kill_active_subprocesses():
"""Kill all tracked Claude CLI subprocesses. Called during graceful shutdown."""
for proc in list(_active_subprocesses):
if proc.returncode is None:
logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid)
try:
proc.kill()
await proc.wait()
except ProcessLookupError:
pass
_active_subprocesses.clear()
REVIEW_STYLE_GUIDE = (
"Be concise. Only mention what fails or is interesting. "
"Do not summarize what the PR does — the diff speaks for itself. "
"If everything passes, say so in one line and approve."
)
# ─── Prompt templates ──────────────────────────────────────────────────────
TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT.
DEEP - use when ANY of these apply:
- PR adds or modifies claims rated "likely" or higher confidence
- PR touches agent beliefs or creates cross-domain wiki links
- PR challenges an existing claim (has "challenged_by" or contradicts existing)
- PR modifies axiom-level beliefs
- PR is a cross-domain synthesis claim
STANDARD - use when:
- New claims in established domain areas
- Enrichments to existing claims (confirm/extend)
- New hypothesis-level beliefs
- Source archives with extraction results
LIGHT - use ONLY when ALL changes fit these categories:
- Entity attribute updates (factual corrections, new data points)
- Source archiving without extraction
- Formatting fixes, typo corrections
- Status field changes
IMPORTANT: When uncertain, classify UP, not down. Always err toward more review.
Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line.
--- PR DIFF ---
{diff}"""
DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
Review this PR from your domain expertise:
1. Technical accuracy: are the claims factually correct in your domain?
2. Domain duplicates: does your domain already have substantially similar claims?
3. Missing context: is important domain context absent that would change interpretation?
4. Confidence calibration: from your domain expertise, is the confidence level right?
5. Enrichment opportunities: should this connect to existing claims via wiki links?
{style_guide}
If you are requesting changes, tag the specific issues:
<!-- ISSUES: tag1, tag2 -->
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
End your review with exactly one of:
<!-- VERDICT:{agent_upper}:APPROVE -->
<!-- VERDICT:{agent_upper}:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR against the quality criteria:
1. Schema compliance: YAML frontmatter, prose-as-title, required fields
2. Duplicate check: does this claim already exist?
3. Confidence calibration: appropriate for the evidence?
4. Wiki link validity: references real claims?
5. Source quality: credible for the claim?
6. Domain assignment: correct domain?
7. Epistemic hygiene: specific enough to be wrong?
{style_guide}
If requesting changes, tag the issues:
<!-- ISSUES: tag1, tag2 -->
End your review with exactly one of:
<!-- VERDICT:LEO:APPROVE -->
<!-- VERDICT:LEO:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check:
1. Cross-domain implications: does this claim affect beliefs in other domains?
2. Confidence calibration: is the confidence level justified by the evidence?
3. Contradiction check: does this contradict any existing claims without explicit argument?
4. Wiki link validity: do all wiki links reference real, existing claims?
5. Axiom integrity: if touching axiom-level beliefs, is the justification extraordinary?
6. Source quality: is the source credible for the claim being made?
7. Duplicate check: does a substantially similar claim already exist?
8. Enrichment vs new claim: should this be an enrichment to an existing claim instead?
9. Domain assignment: is the claim in the correct domain?
10. Schema compliance: YAML frontmatter, prose-as-title format, required fields
11. Epistemic hygiene: is the claim specific enough to be wrong?
{style_guide}
If requesting changes, tag the issues:
<!-- ISSUES: tag1, tag2 -->
End your review with exactly one of:
<!-- VERDICT:LEO:APPROVE -->
<!-- VERDICT:LEO:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
# ─── API helpers ───────────────────────────────────────────────────────────
async def openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
"""Call OpenRouter API. Returns response text or None on failure."""
key_file = config.SECRETS_DIR / "openrouter-key"
if not key_file.exists():
logger.error("OpenRouter key file not found")
return None
key = key_file.read_text().strip()
payload = {
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 4096,
"temperature": 0.2,
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(
config.OPENROUTER_URL,
headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
json=payload,
timeout=aiohttp.ClientTimeout(total=timeout_sec),
) as resp:
if resp.status >= 400:
text = await resp.text()
logger.error("OpenRouter %s%d: %s", model, resp.status, text[:200])
return None
data = await resp.json()
return data.get("choices", [{}])[0].get("message", {}).get("content")
except Exception as e:
logger.error("OpenRouter error: %s%s", model, e)
return None
async def claude_cli_call(model: str, prompt: str, timeout_sec: int = 600, cwd: str | None = None) -> str | None:
"""Call Claude via CLI (Claude Max subscription). Returns response or None."""
proc = await asyncio.create_subprocess_exec(
str(config.CLAUDE_CLI),
"-p",
"--model",
model,
"--output-format",
"text",
cwd=cwd or str(config.REPO_DIR),
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
_active_subprocesses.add(proc) # Track for graceful shutdown (Ganymede #8)
try:
stdout, stderr = await asyncio.wait_for(
proc.communicate(input=prompt.encode()),
timeout=timeout_sec,
)
except asyncio.TimeoutError:
proc.kill()
await proc.wait()
logger.error("Claude CLI timed out after %ds", timeout_sec)
return None
finally:
_active_subprocesses.discard(proc)
out_text = (stdout or b"").decode()
err_text = (stderr or b"").decode()
# Check for rate limit REGARDLESS of exit code — CLI sometimes exits 0 with limit message
combined_lower = (out_text + err_text).lower()
if "hit your limit" in combined_lower or "rate limit" in combined_lower:
logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
return "RATE_LIMITED"
if proc.returncode != 0:
logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s", proc.returncode, err_text[:200], out_text[:200])
return None
return out_text.strip()
# ─── Review execution ─────────────────────────────────────────────────────
async def triage_pr(diff: str) -> str:
"""Triage PR via Haiku → DEEP/STANDARD/LIGHT."""
prompt = TRIAGE_PROMPT.format(diff=diff[:50000]) # Cap diff size for triage
result = await openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
if not result:
logger.warning("Triage failed, defaulting to STANDARD")
return "STANDARD"
tier = result.split("\n")[0].strip().upper()
if tier in ("DEEP", "STANDARD", "LIGHT"):
reason = result.split("\n")[1].strip() if "\n" in result else ""
logger.info("Triage: %s%s", tier, reason[:100])
return tier
logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
return "STANDARD"
async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
"""Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
prompt = DOMAIN_PROMPT.format(
agent=agent,
agent_upper=agent.upper(),
domain=domain,
style_guide=REVIEW_STYLE_GUIDE,
diff=diff,
files=files,
)
# Try Claude Max Sonnet first
result = await claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
# Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
if policy == "overflow":
logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
result = await openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
else:
logger.info("Claude Max rate limited, queuing domain review")
return None
return result
async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
"""Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
# Leo review queues — don't waste Opus calls (never overflow)
logger.info("Claude Max Opus rate limited, queuing Leo review")
return None
return result
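A condensed usage sketch of the module's public surface, mirroring the call order evaluate_pr follows in the hunks above. The diff_text, files, domain, and agent values here are placeholders, not names from this commit:

from lib.llm import run_domain_review, run_leo_review, triage_pr

async def review_once(diff_text: str, files: str) -> None:
    # Triage first (Haiku via OpenRouter), then domain review, then Leo.
    tier = await triage_pr(diff_text)  # "DEEP", "STANDARD", or "LIGHT"
    domain_review = await run_domain_review(diff_text, files, "general", "Rhea")
    if domain_review is None:
        return  # rate limited under queue policy; evaluate.py reverts the PR for retry
    if tier != "LIGHT":
        leo_review = await run_leo_review(diff_text, files, tier)
        if leo_review is None:
            return  # Opus rate limited; Leo reviews always queue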

teleo-pipeline.py

@@ -18,8 +18,9 @@ sys.path.insert(0, str(Path(__file__).parent))
from lib import config, db
from lib import log as logmod
from lib.breaker import CircuitBreaker
from lib.evaluate import evaluate_cycle, kill_active_subprocesses
from lib.evaluate import evaluate_cycle
from lib.health import start_health_server, stop_health_server
from lib.llm import kill_active_subprocesses
from lib.merge import merge_cycle
from lib.validate import validate_cycle
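The shutdown path in teleo-pipeline.py is not shown in this diff; a minimal sketch of how the relocated kill_active_subprocesses could be wired in (the signal handling below is an assumption, not code from this commit):

import asyncio
import signal

from lib.llm import kill_active_subprocesses

def install_shutdown_handler(loop: asyncio.AbstractEventLoop) -> None:
    # Hypothetical wiring: on SIGTERM, reap any tracked Claude CLI
    # subprocesses before the event loop stops (Ganymede #8).
    loop.add_signal_handler(
        signal.SIGTERM,
        lambda: asyncio.ensure_future(kill_active_subprocesses()),
    )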