ganymede: extract lib/llm.py from evaluate.py (Phase 3c)
Some checks failed
CI / lint-and-test (pull_request) Has been cancelled
Some checks failed
CI / lint-and-test (pull_request) Has been cancelled
- What: LLM transport (OpenRouter, Claude CLI), prompt templates (triage/domain/Leo), and review runner functions moved to lib/llm.py. evaluate.py retains PR lifecycle orchestration, SQLite state, Forgejo posting, rate limit backoff, and evaluate_cycle. - Why: evaluate.py was 734 lines mixing orchestration with LLM concerns. Now 455 lines orchestration + 250 lines LLM transport. Each module has a single responsibility. - Connections: completes Phase 3 structural refactor (forgejo.py + domains.py + llm.py). teleo-pipeline.py updated to import kill_active_subprocesses from lib.llm. Pentagon-Agent: Ganymede <F99EBFA6-547B-4096-BEEA-1D59C3E4028A>
This commit is contained in:
parent
ff5162d5ba
commit
85b86a918a
3 changed files with 324 additions and 288 deletions
310
lib/evaluate.py
310
lib/evaluate.py
|
|
@ -1,4 +1,4 @@
|
|||
"""Evaluate stage — triage + domain review + Leo review.
|
||||
"""Evaluate stage — PR lifecycle orchestration.
|
||||
|
||||
Ported from eval-worker.sh. Key architectural change: domain-first, Leo-last.
|
||||
Sonnet (domain review) filters before Opus (Leo review) to maximize value per
|
||||
|
|
@ -13,9 +13,9 @@ Flow per PR:
|
|||
6. If both approve → status = 'approved' (merge module picks it up)
|
||||
|
||||
Design reviewed by Ganymede, Rhea, Vida, Theseus.
|
||||
LLM transport and prompts extracted to lib/llm.py (Phase 3c).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
|
@ -25,228 +25,10 @@ from . import config, db
|
|||
from .domains import agent_for_domain, detect_domain_from_diff
|
||||
from .forgejo import api as forgejo_api
|
||||
from .forgejo import get_agent_token, get_pr_diff, repo_path
|
||||
from .llm import run_domain_review, run_leo_review, triage_pr
|
||||
|
||||
logger = logging.getLogger("pipeline.evaluate")
|
||||
|
||||
# Track active Claude CLI subprocesses for graceful shutdown (Ganymede #8)
|
||||
_active_subprocesses: set = set()
|
||||
|
||||
|
||||
async def kill_active_subprocesses():
|
||||
"""Kill all tracked Claude CLI subprocesses. Called during graceful shutdown."""
|
||||
for proc in list(_active_subprocesses):
|
||||
if proc.returncode is None:
|
||||
logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid)
|
||||
try:
|
||||
proc.kill()
|
||||
await proc.wait()
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
_active_subprocesses.clear()
|
||||
|
||||
|
||||
REVIEW_STYLE_GUIDE = (
|
||||
"Be concise. Only mention what fails or is interesting. "
|
||||
"Do not summarize what the PR does — the diff speaks for itself. "
|
||||
"If everything passes, say so in one line and approve."
|
||||
)
|
||||
|
||||
|
||||
# ─── Prompt templates ──────────────────────────────────────────────────────
|
||||
|
||||
TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT.
|
||||
|
||||
DEEP — use when ANY of these apply:
|
||||
- PR adds or modifies claims rated "likely" or higher confidence
|
||||
- PR touches agent beliefs or creates cross-domain wiki links
|
||||
- PR challenges an existing claim (has "challenged_by" or contradicts existing)
|
||||
- PR modifies axiom-level beliefs
|
||||
- PR is a cross-domain synthesis claim
|
||||
|
||||
STANDARD — use when:
|
||||
- New claims in established domain areas
|
||||
- Enrichments to existing claims (confirm/extend)
|
||||
- New hypothesis-level beliefs
|
||||
- Source archives with extraction results
|
||||
|
||||
LIGHT — use ONLY when ALL changes fit these categories:
|
||||
- Entity attribute updates (factual corrections, new data points)
|
||||
- Source archiving without extraction
|
||||
- Formatting fixes, typo corrections
|
||||
- Status field changes
|
||||
|
||||
IMPORTANT: When uncertain, classify UP, not down. Always err toward more review.
|
||||
|
||||
Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line.
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}"""
|
||||
|
||||
DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
|
||||
|
||||
Review this PR from your domain expertise:
|
||||
1. Technical accuracy — are the claims factually correct in your domain?
|
||||
2. Domain duplicates — does your domain already have substantially similar claims?
|
||||
3. Missing context — is important domain context absent that would change interpretation?
|
||||
4. Confidence calibration — from your domain expertise, is the confidence level right?
|
||||
5. Enrichment opportunities — should this connect to existing claims via wiki links?
|
||||
|
||||
{style_guide}
|
||||
|
||||
If you are requesting changes, tag the specific issues:
|
||||
<!-- ISSUES: tag1, tag2 -->
|
||||
|
||||
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
|
||||
|
||||
End your review with exactly one of:
|
||||
<!-- VERDICT:{agent_upper}:APPROVE -->
|
||||
<!-- VERDICT:{agent_upper}:REQUEST_CHANGES -->
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}
|
||||
|
||||
--- CHANGED FILES ---
|
||||
{files}"""
|
||||
|
||||
LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
|
||||
|
||||
Review this PR against the quality criteria:
|
||||
1. Schema compliance — YAML frontmatter, prose-as-title, required fields
|
||||
2. Duplicate check — does this claim already exist?
|
||||
3. Confidence calibration — appropriate for the evidence?
|
||||
4. Wiki link validity — references real claims?
|
||||
5. Source quality — credible for the claim?
|
||||
6. Domain assignment — correct domain?
|
||||
7. Epistemic hygiene — specific enough to be wrong?
|
||||
|
||||
{style_guide}
|
||||
|
||||
If requesting changes, tag the issues:
|
||||
<!-- ISSUES: tag1, tag2 -->
|
||||
|
||||
End your review with exactly one of:
|
||||
<!-- VERDICT:LEO:APPROVE -->
|
||||
<!-- VERDICT:LEO:REQUEST_CHANGES -->
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}
|
||||
|
||||
--- CHANGED FILES ---
|
||||
{files}"""
|
||||
|
||||
LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
|
||||
|
||||
Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check:
|
||||
1. Cross-domain implications — does this claim affect beliefs in other domains?
|
||||
2. Confidence calibration — is the confidence level justified by the evidence?
|
||||
3. Contradiction check — does this contradict any existing claims without explicit argument?
|
||||
4. Wiki link validity — do all wiki links reference real, existing claims?
|
||||
5. Axiom integrity — if touching axiom-level beliefs, is the justification extraordinary?
|
||||
6. Source quality — is the source credible for the claim being made?
|
||||
7. Duplicate check — does a substantially similar claim already exist?
|
||||
8. Enrichment vs new claim — should this be an enrichment to an existing claim instead?
|
||||
9. Domain assignment — is the claim in the correct domain?
|
||||
10. Schema compliance — YAML frontmatter, prose-as-title format, required fields
|
||||
11. Epistemic hygiene — is the claim specific enough to be wrong?
|
||||
|
||||
{style_guide}
|
||||
|
||||
If requesting changes, tag the issues:
|
||||
<!-- ISSUES: tag1, tag2 -->
|
||||
|
||||
End your review with exactly one of:
|
||||
<!-- VERDICT:LEO:APPROVE -->
|
||||
<!-- VERDICT:LEO:REQUEST_CHANGES -->
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}
|
||||
|
||||
--- CHANGED FILES ---
|
||||
{files}"""
|
||||
|
||||
|
||||
# ─── API helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
|
||||
"""Call OpenRouter API. Returns response text or None on failure."""
|
||||
import aiohttp
|
||||
|
||||
key_file = config.SECRETS_DIR / "openrouter-key"
|
||||
if not key_file.exists():
|
||||
logger.error("OpenRouter key file not found")
|
||||
return None
|
||||
key = key_file.read_text().strip()
|
||||
|
||||
payload = {
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.2,
|
||||
}
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(
|
||||
config.OPENROUTER_URL,
|
||||
headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
|
||||
json=payload,
|
||||
timeout=aiohttp.ClientTimeout(total=timeout_sec),
|
||||
) as resp:
|
||||
if resp.status >= 400:
|
||||
text = await resp.text()
|
||||
logger.error("OpenRouter %s → %d: %s", model, resp.status, text[:200])
|
||||
return None
|
||||
data = await resp.json()
|
||||
return data.get("choices", [{}])[0].get("message", {}).get("content")
|
||||
except Exception as e:
|
||||
logger.error("OpenRouter error: %s → %s", model, e)
|
||||
return None
|
||||
|
||||
|
||||
async def _claude_cli_call(model: str, prompt: str, timeout_sec: int = 600, cwd: str = None) -> str | None:
|
||||
"""Call Claude via CLI (Claude Max subscription). Returns response or None."""
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
str(config.CLAUDE_CLI),
|
||||
"-p",
|
||||
"--model",
|
||||
model,
|
||||
"--output-format",
|
||||
"text",
|
||||
cwd=cwd or str(config.REPO_DIR),
|
||||
stdin=asyncio.subprocess.PIPE,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
_active_subprocesses.add(proc) # Track for graceful shutdown (Ganymede #8)
|
||||
try:
|
||||
stdout, stderr = await asyncio.wait_for(
|
||||
proc.communicate(input=prompt.encode()),
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
await proc.wait()
|
||||
logger.error("Claude CLI timed out after %ds", timeout_sec)
|
||||
return None
|
||||
finally:
|
||||
_active_subprocesses.discard(proc)
|
||||
|
||||
out_text = (stdout or b"").decode()
|
||||
err_text = (stderr or b"").decode()
|
||||
|
||||
# Check for rate limit REGARDLESS of exit code — CLI sometimes exits 0 with limit message
|
||||
combined_lower = (out_text + err_text).lower()
|
||||
if "hit your limit" in combined_lower or "rate limit" in combined_lower:
|
||||
logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
|
||||
return "RATE_LIMITED"
|
||||
|
||||
if proc.returncode != 0:
|
||||
logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s", proc.returncode, err_text[:200], out_text[:200])
|
||||
return None
|
||||
|
||||
return out_text.strip()
|
||||
|
||||
|
||||
# ─── Diff helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -321,69 +103,6 @@ def _parse_issues(review_text: str) -> list[str]:
|
|||
return [tag.strip() for tag in match.group(1).split(",") if tag.strip()]
|
||||
|
||||
|
||||
# ─── Review execution ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _triage_pr(diff: str) -> str:
|
||||
"""Triage PR via Haiku → DEEP/STANDARD/LIGHT."""
|
||||
prompt = TRIAGE_PROMPT.format(diff=diff[:50000]) # Cap diff size for triage
|
||||
result = await _openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
|
||||
if not result:
|
||||
logger.warning("Triage failed, defaulting to STANDARD")
|
||||
return "STANDARD"
|
||||
|
||||
tier = result.split("\n")[0].strip().upper()
|
||||
if tier in ("DEEP", "STANDARD", "LIGHT"):
|
||||
reason = result.split("\n")[1].strip() if "\n" in result else ""
|
||||
logger.info("Triage: %s — %s", tier, reason[:100])
|
||||
return tier
|
||||
|
||||
logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
|
||||
return "STANDARD"
|
||||
|
||||
|
||||
async def _run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
|
||||
"""Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
|
||||
prompt = DOMAIN_PROMPT.format(
|
||||
agent=agent,
|
||||
agent_upper=agent.upper(),
|
||||
domain=domain,
|
||||
style_guide=REVIEW_STYLE_GUIDE,
|
||||
diff=diff,
|
||||
files=files,
|
||||
)
|
||||
|
||||
# Try Claude Max Sonnet first
|
||||
result = await _claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
|
||||
|
||||
if result == "RATE_LIMITED":
|
||||
# Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
|
||||
policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
|
||||
if policy == "overflow":
|
||||
logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
|
||||
result = await _openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
|
||||
else:
|
||||
logger.info("Claude Max rate limited, queuing domain review")
|
||||
return None
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _run_leo_review(diff: str, files: str, tier: str) -> str | None:
|
||||
"""Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
|
||||
prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
|
||||
prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
|
||||
|
||||
result = await _claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
|
||||
|
||||
if result == "RATE_LIMITED":
|
||||
# Leo review queues — don't waste Opus calls (never overflow)
|
||||
logger.info("Claude Max Opus rate limited, queuing Leo review")
|
||||
return None
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _post_formal_approvals(pr_number: int, pr_author: str):
|
||||
"""Submit formal Forgejo reviews from 2 agents (not the PR author)."""
|
||||
approvals = 0
|
||||
|
|
@ -461,7 +180,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
|||
|
||||
# Step 1: Triage (if not already triaged)
|
||||
if tier is None:
|
||||
tier = await _triage_pr(diff)
|
||||
tier = await triage_pr(diff)
|
||||
conn.execute("UPDATE prs SET tier = ? WHERE number = ?", (tier, pr_number))
|
||||
|
||||
# Update last_attempt timestamp (status already set to 'reviewing' by atomic claim above)
|
||||
|
|
@ -483,7 +202,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
|||
logger.info("PR #%d: domain review already done (%s), skipping to Leo", pr_number, domain_verdict)
|
||||
else:
|
||||
logger.info("PR #%d: domain review (%s/%s, tier=%s)", pr_number, agent, domain, tier)
|
||||
domain_review = await _run_domain_review(review_diff, files, domain or "general", agent)
|
||||
domain_review = await run_domain_review(review_diff, files, domain or "general", agent)
|
||||
|
||||
if domain_review is None:
|
||||
# Rate limited, couldn't overflow — revert to open for retry
|
||||
|
|
@ -521,7 +240,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
|||
leo_verdict = "skipped"
|
||||
if tier != "LIGHT":
|
||||
logger.info("PR #%d: Leo review (tier=%s)", pr_number, tier)
|
||||
leo_review = await _run_leo_review(review_diff, files, tier)
|
||||
leo_review = await run_leo_review(review_diff, files, tier)
|
||||
|
||||
if leo_review is None:
|
||||
# Opus rate limited — revert to open for retry (keep domain verdict)
|
||||
|
|
@ -678,6 +397,22 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
|
|||
|
||||
for row in rows:
|
||||
try:
|
||||
# During Opus backoff, skip PRs that already completed domain review
|
||||
# (they'd just hit the Opus limit again). Only process PRs still
|
||||
# needing triage or domain review.
|
||||
if opus_backoff:
|
||||
existing = conn.execute(
|
||||
"SELECT domain_verdict FROM prs WHERE number = ?",
|
||||
(row["number"],),
|
||||
).fetchone()
|
||||
if existing and existing["domain_verdict"] not in ("pending", None):
|
||||
logger.debug(
|
||||
"PR #%d: skipping during Opus backoff (domain already %s)",
|
||||
row["number"],
|
||||
existing["domain_verdict"],
|
||||
)
|
||||
continue
|
||||
|
||||
result = await evaluate_pr(conn, row["number"], tier=row["tier"])
|
||||
if result.get("skipped"):
|
||||
reason = result.get("reason", "")
|
||||
|
|
@ -691,6 +426,7 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
|
|||
_rate_limit_backoff_until = datetime.now(timezone.utc) + timedelta(
|
||||
minutes=_RATE_LIMIT_BACKOFF_MINUTES
|
||||
)
|
||||
opus_backoff = True # Update local flag so in-loop guard kicks in
|
||||
logger.info(
|
||||
"Opus rate limited — backing off Opus for %d min, continuing triage+domain",
|
||||
_RATE_LIMIT_BACKOFF_MINUTES,
|
||||
|
|
|
|||
299
lib/llm.py
Normal file
299
lib/llm.py
Normal file
|
|
@ -0,0 +1,299 @@
|
|||
"""LLM transport and review prompts — shared by all evaluation stages.
|
||||
|
||||
Extracted from evaluate.py (Phase 3c refactor). This module owns:
|
||||
- Prompt templates (triage, domain, Leo)
|
||||
- OpenRouter API transport
|
||||
- Claude CLI transport with subprocess tracking
|
||||
- Review runner functions (triage, domain, Leo)
|
||||
|
||||
Orchestration (PR lifecycle, SQLite state, Forgejo posting) stays in evaluate.py.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import aiohttp
|
||||
|
||||
from . import config
|
||||
|
||||
logger = logging.getLogger("pipeline.llm")
|
||||
|
||||
# Claude CLI subprocesses currently in flight, tracked so a graceful
# shutdown can terminate them before the event loop closes (Ganymede #8).
_active_subprocesses: set = set()


async def kill_active_subprocesses():
    """Kill every tracked Claude CLI subprocess; used during graceful shutdown."""
    for proc in list(_active_subprocesses):
        if proc.returncode is not None:
            # Already exited — nothing to do.
            continue
        logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid)
        try:
            proc.kill()
            await proc.wait()
        except ProcessLookupError:
            # Process died between the returncode check and the kill.
            pass
    _active_subprocesses.clear()
|
||||
|
||||
|
||||
REVIEW_STYLE_GUIDE = (
|
||||
"Be concise. Only mention what fails or is interesting. "
|
||||
"Do not summarize what the PR does — the diff speaks for itself. "
|
||||
"If everything passes, say so in one line and approve."
|
||||
)
|
||||
|
||||
|
||||
# ─── Prompt templates ──────────────────────────────────────────────────────
|
||||
|
||||
TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT.
|
||||
|
||||
DEEP — use when ANY of these apply:
|
||||
- PR adds or modifies claims rated "likely" or higher confidence
|
||||
- PR touches agent beliefs or creates cross-domain wiki links
|
||||
- PR challenges an existing claim (has "challenged_by" or contradicts existing)
|
||||
- PR modifies axiom-level beliefs
|
||||
- PR is a cross-domain synthesis claim
|
||||
|
||||
STANDARD — use when:
|
||||
- New claims in established domain areas
|
||||
- Enrichments to existing claims (confirm/extend)
|
||||
- New hypothesis-level beliefs
|
||||
- Source archives with extraction results
|
||||
|
||||
LIGHT — use ONLY when ALL changes fit these categories:
|
||||
- Entity attribute updates (factual corrections, new data points)
|
||||
- Source archiving without extraction
|
||||
- Formatting fixes, typo corrections
|
||||
- Status field changes
|
||||
|
||||
IMPORTANT: When uncertain, classify UP, not down. Always err toward more review.
|
||||
|
||||
Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line.
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}"""
|
||||
|
||||
DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
|
||||
|
||||
Review this PR from your domain expertise:
|
||||
1. Technical accuracy — are the claims factually correct in your domain?
|
||||
2. Domain duplicates — does your domain already have substantially similar claims?
|
||||
3. Missing context — is important domain context absent that would change interpretation?
|
||||
4. Confidence calibration — from your domain expertise, is the confidence level right?
|
||||
5. Enrichment opportunities — should this connect to existing claims via wiki links?
|
||||
|
||||
{style_guide}
|
||||
|
||||
If you are requesting changes, tag the specific issues:
|
||||
<!-- ISSUES: tag1, tag2 -->
|
||||
|
||||
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
|
||||
|
||||
End your review with exactly one of:
|
||||
<!-- VERDICT:{agent_upper}:APPROVE -->
|
||||
<!-- VERDICT:{agent_upper}:REQUEST_CHANGES -->
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}
|
||||
|
||||
--- CHANGED FILES ---
|
||||
{files}"""
|
||||
|
||||
LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
|
||||
|
||||
Review this PR against the quality criteria:
|
||||
1. Schema compliance — YAML frontmatter, prose-as-title, required fields
|
||||
2. Duplicate check — does this claim already exist?
|
||||
3. Confidence calibration — appropriate for the evidence?
|
||||
4. Wiki link validity — references real claims?
|
||||
5. Source quality — credible for the claim?
|
||||
6. Domain assignment — correct domain?
|
||||
7. Epistemic hygiene — specific enough to be wrong?
|
||||
|
||||
{style_guide}
|
||||
|
||||
If requesting changes, tag the issues:
|
||||
<!-- ISSUES: tag1, tag2 -->
|
||||
|
||||
End your review with exactly one of:
|
||||
<!-- VERDICT:LEO:APPROVE -->
|
||||
<!-- VERDICT:LEO:REQUEST_CHANGES -->
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}
|
||||
|
||||
--- CHANGED FILES ---
|
||||
{files}"""
|
||||
|
||||
LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
|
||||
|
||||
Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check:
|
||||
1. Cross-domain implications — does this claim affect beliefs in other domains?
|
||||
2. Confidence calibration — is the confidence level justified by the evidence?
|
||||
3. Contradiction check — does this contradict any existing claims without explicit argument?
|
||||
4. Wiki link validity — do all wiki links reference real, existing claims?
|
||||
5. Axiom integrity — if touching axiom-level beliefs, is the justification extraordinary?
|
||||
6. Source quality — is the source credible for the claim being made?
|
||||
7. Duplicate check — does a substantially similar claim already exist?
|
||||
8. Enrichment vs new claim — should this be an enrichment to an existing claim instead?
|
||||
9. Domain assignment — is the claim in the correct domain?
|
||||
10. Schema compliance — YAML frontmatter, prose-as-title format, required fields
|
||||
11. Epistemic hygiene — is the claim specific enough to be wrong?
|
||||
|
||||
{style_guide}
|
||||
|
||||
If requesting changes, tag the issues:
|
||||
<!-- ISSUES: tag1, tag2 -->
|
||||
|
||||
End your review with exactly one of:
|
||||
<!-- VERDICT:LEO:APPROVE -->
|
||||
<!-- VERDICT:LEO:REQUEST_CHANGES -->
|
||||
|
||||
--- PR DIFF ---
|
||||
{diff}
|
||||
|
||||
--- CHANGED FILES ---
|
||||
{files}"""
|
||||
|
||||
|
||||
# ─── API helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
    """Send *prompt* to an OpenRouter chat model and return the reply text.

    Returns None when the API key file is missing, the HTTP call fails,
    or the response cannot be parsed — callers treat None as "no result".
    """
    key_path = config.SECRETS_DIR / "openrouter-key"
    if not key_path.exists():
        logger.error("OpenRouter key file not found")
        return None
    api_key = key_path.read_text().strip()

    request_body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 4096,
        "temperature": 0.2,
    }
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                config.OPENROUTER_URL,
                headers=request_headers,
                json=request_body,
                timeout=aiohttp.ClientTimeout(total=timeout_sec),
            ) as resp:
                if resp.status >= 400:
                    body = await resp.text()
                    logger.error("OpenRouter %s → %d: %s", model, resp.status, body[:200])
                    return None
                data = await resp.json()
                # Defensive extraction — a missing/empty "choices" list is
                # caught by the enclosing except and reported as None.
                return data.get("choices", [{}])[0].get("message", {}).get("content")
    except Exception as e:
        logger.error("OpenRouter error: %s → %s", model, e)
        return None
|
||||
|
||||
|
||||
async def claude_cli_call(model: str, prompt: str, timeout_sec: int = 600, cwd: str | None = None) -> str | None:
    """Call Claude via CLI (Claude Max subscription).

    Args:
        model: Model name passed to the CLI's ``--model`` flag.
        prompt: Full prompt text, fed to the CLI on stdin.
        timeout_sec: Wall-clock limit before the subprocess is killed.
        cwd: Working directory for the CLI; defaults to ``config.REPO_DIR``.

    Returns:
        The CLI's stdout (stripped) on success, the sentinel string
        ``"RATE_LIMITED"`` when a Claude Max usage limit is detected, or
        None on timeout / non-zero exit.
    """
    proc = await asyncio.create_subprocess_exec(
        str(config.CLAUDE_CLI),
        "-p",
        "--model",
        model,
        "--output-format",
        "text",
        cwd=cwd or str(config.REPO_DIR),
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _active_subprocesses.add(proc)  # Track for graceful shutdown (Ganymede #8)
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=prompt.encode()),
            timeout=timeout_sec,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
            await proc.wait()
        except ProcessLookupError:
            # Process exited between the timeout firing and the kill —
            # don't let the cleanup itself blow up the error path.
            pass
        logger.error("Claude CLI timed out after %ds", timeout_sec)
        return None
    finally:
        _active_subprocesses.discard(proc)

    out_text = (stdout or b"").decode()
    err_text = (stderr or b"").decode()

    # Check for rate limit REGARDLESS of exit code — CLI sometimes exits 0 with limit message
    combined_lower = (out_text + err_text).lower()
    if "hit your limit" in combined_lower or "rate limit" in combined_lower:
        logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
        return "RATE_LIMITED"

    if proc.returncode != 0:
        logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s", proc.returncode, err_text[:200], out_text[:200])
        return None

    return out_text.strip()
|
||||
|
||||
|
||||
# ─── Review execution ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def triage_pr(diff: str) -> str:
    """Triage PR via Haiku → DEEP/STANDARD/LIGHT.

    Defaults to STANDARD (err toward more review, never less) whenever the
    triage call fails or the model's reply cannot be parsed.
    """
    prompt = TRIAGE_PROMPT.format(diff=diff[:50000])  # Cap diff size for triage
    result = await openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
    if not result:
        logger.warning("Triage failed, defaulting to STANDARD")
        return "STANDARD"

    # Strip before splitting: models sometimes prepend blank lines, which
    # would make the raw first line empty and force a spurious STANDARD
    # fallback. The `or [""]` guard covers an all-whitespace reply.
    lines = result.strip().splitlines() or [""]
    tier = lines[0].strip().upper()
    if tier in ("DEEP", "STANDARD", "LIGHT"):
        reason = lines[1].strip() if len(lines) > 1 else ""
        logger.info("Triage: %s — %s", tier, reason[:100])
        return tier

    logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
    return "STANDARD"
|
||||
|
||||
|
||||
async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
    """Run the domain-expert review for a PR.

    Prefers Claude Max Sonnet. On a rate limit, either overflows to the
    OpenRouter fallback model or queues (returns None), as directed by
    ``config.OVERFLOW_POLICY["eval_domain"]``.
    """
    review_prompt = DOMAIN_PROMPT.format(
        agent=agent,
        agent_upper=agent.upper(),
        domain=domain,
        style_guide=REVIEW_STYLE_GUIDE,
        diff=diff,
        files=files,
    )

    # First choice: Claude Max Sonnet.
    review = await claude_cli_call(config.EVAL_DOMAIN_MODEL, review_prompt, timeout_sec=config.EVAL_TIMEOUT)
    if review != "RATE_LIMITED":
        return review

    # Rate limited. Domain review is the volume filter (Rhea) — don't
    # bottleneck it unless the policy explicitly says to queue.
    if config.OVERFLOW_POLICY.get("eval_domain", "overflow") != "overflow":
        logger.info("Claude Max rate limited, queuing domain review")
        return None

    logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
    return await openrouter_call(config.EVAL_DEEP_MODEL, review_prompt, timeout_sec=config.EVAL_TIMEOUT)
|
||||
|
||||
|
||||
async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
    """Run Leo's lead review via Claude Max Opus.

    DEEP-tier PRs get the maximum-scrutiny prompt; all other tiers use the
    standard prompt. Returns None when Opus is rate limited — Leo reviews
    queue rather than overflow, so Opus calls are never spent elsewhere.
    """
    template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
    review = await claude_cli_call(
        config.EVAL_LEO_MODEL,
        template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files),
        timeout_sec=config.EVAL_TIMEOUT,
    )

    if review == "RATE_LIMITED":
        logger.info("Claude Max Opus rate limited, queuing Leo review")
        return None
    return review
|
||||
|
|
@ -18,8 +18,9 @@ sys.path.insert(0, str(Path(__file__).parent))
|
|||
from lib import config, db
|
||||
from lib import log as logmod
|
||||
from lib.breaker import CircuitBreaker
|
||||
from lib.evaluate import evaluate_cycle, kill_active_subprocesses
|
||||
from lib.evaluate import evaluate_cycle
|
||||
from lib.health import start_health_server, stop_health_server
|
||||
from lib.llm import kill_active_subprocesses
|
||||
from lib.merge import merge_cycle
|
||||
from lib.validate import validate_cycle
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue