"""LLM transport and review prompts — shared by all evaluation stages. Extracted from evaluate.py (Phase 3c refactor). This module owns: - Prompt templates (triage, domain, Leo) - OpenRouter API transport - Claude CLI transport with subprocess tracking - Review runner functions (triage, domain, Leo) Orchestration (PR lifecycle, SQLite state, Forgejo posting) stays in evaluate.py. """ import asyncio import logging import aiohttp from . import config logger = logging.getLogger("pipeline.llm") # Track active Claude CLI subprocesses for graceful shutdown (Ganymede #8) _active_subprocesses: set = set() async def kill_active_subprocesses(): """Kill all tracked Claude CLI subprocesses. Called during graceful shutdown.""" for proc in list(_active_subprocesses): if proc.returncode is None: logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid) try: proc.kill() await proc.wait() except ProcessLookupError: pass _active_subprocesses.clear() REVIEW_STYLE_GUIDE = ( "You MUST show your work. For each criterion, write one sentence with your finding. " "Do not summarize what the PR does — evaluate it. " "If a criterion passes, say what you checked and why it passes. " "If a criterion fails, explain the specific problem. " "Responses like 'Everything passes' with no evidence of checking will be treated as review failures. " "Be concise but substantive — one sentence per criterion, not one sentence total." ) # ─── Prompt templates ────────────────────────────────────────────────────── TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT. DEEP — use when ANY of these apply: - PR adds or modifies claims rated "likely" or higher confidence - PR touches agent beliefs or creates cross-domain wiki links - PR challenges an existing claim (has "challenged_by" or contradicts existing) - PR modifies axiom-level beliefs - PR is a cross-domain synthesis claim STANDARD — use when: - New claims in established domain areas - Enrichments to existing claims (confirm/extend) - New hypothesis-level beliefs - Source archives with extraction results LIGHT — use ONLY when ALL changes fit these categories: - Entity attribute updates (factual corrections, new data points) - Source archiving without extraction - Formatting fixes, typo corrections - Status field changes IMPORTANT: When uncertain, classify UP, not down. Always err toward more review. Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line. --- PR DIFF --- {diff}""" DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base. Review this PR. For EACH criterion below, write one sentence stating what you found: 1. **Factual accuracy** — Are the claims factually correct? Name any specific errors. 2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files. 3. **Confidence calibration** — Is the confidence level right for the evidence provided? Name the level and say if it matches. 4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? Flag any that look broken. VERDICT RULES — read carefully: - APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible. 

DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.

Review this PR. For EACH criterion below, write one sentence stating what you found:

1. **Factual accuracy** — Are the claims factually correct? Name any specific errors.
2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files.
3. **Confidence calibration** — Is the confidence level right for the evidence provided? Name the level and say if it matches.
4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? Flag any that look broken.

VERDICT RULES — read carefully:
- APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible.
- REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong (e.g. "proven" with no evidence).
- Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE.
- Do NOT invent problems. If a criterion passes, say it passes.

{style_guide}

If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags):
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error

End your review with exactly one of:
{agent_upper}: APPROVE
{agent_upper}: REQUEST_CHANGES

--- PR DIFF ---
{diff}

--- CHANGED FILES ---
{files}"""

LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.

Review this PR. For EACH criterion below, write one sentence stating what you found:

1. **Schema** — Does YAML frontmatter have type, domain, confidence, source, created? Is the title a prose proposition (not a label)?
2. **Duplicate/redundancy** — Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim?
3. **Confidence** — Name the confidence level. Does the evidence justify it? (proven needs strong evidence, speculative is fine for theories)
4. **Wiki links** — Do [[links]] in the diff point to real files? Flag any that look invented.
5. **Source quality** — Is the source credible for this claim?
6. **Specificity** — Could someone disagree with this claim? If it's too vague to be wrong, flag it.

{style_guide}

If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags):
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error

End your review with exactly one of:
LEO: APPROVE
LEO: REQUEST_CHANGES

--- PR DIFF ---
{diff}

--- CHANGED FILES ---
{files}"""
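
# A minimal sketch of how a caller might pull the verdict off a review,
# assuming the "<AGENT>: APPROVE / <AGENT>: REQUEST_CHANGES" closing line in
# the templates above. Hypothetical helper; the real parsing lives in
# evaluate.py and is not shown here:
#
#   def parse_verdict(review: str, agent_upper: str) -> str | None:
#       prefix = f"{agent_upper}:"
#       for line in reversed(review.strip().splitlines()):
#           if line.strip().startswith(prefix):
#               return line.strip().removeprefix(prefix).strip()
#       return None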

LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.

Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check:

1. Cross-domain implications — does this claim affect beliefs in other domains?
2. Confidence calibration — is the confidence level justified by the evidence?
3. Contradiction check — does this contradict any existing claims without explicit argument?
4. Wiki link validity — do all wiki links reference real, existing claims?
5. Axiom integrity — if touching axiom-level beliefs, is the justification extraordinary?
6. Source quality — is the source credible for the claim being made?
7. Duplicate check — does a substantially similar claim already exist?
8. Enrichment vs new claim — should this be an enrichment to an existing claim instead?
9. Domain assignment — is the claim in the correct domain?
10. Schema compliance — YAML frontmatter, prose-as-title format, required fields
11. Epistemic hygiene — is the claim specific enough to be wrong?

{style_guide}

If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags):
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error

End your review with exactly one of:
LEO: APPROVE
LEO: REQUEST_CHANGES

--- PR DIFF ---
{diff}

--- CHANGED FILES ---
{files}"""

# ─── API helpers ───────────────────────────────────────────────────────────

async def openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
    """Call OpenRouter API. Returns response text or None on failure."""
    key_file = config.SECRETS_DIR / "openrouter-key"
    if not key_file.exists():
        logger.error("OpenRouter key file not found")
        return None
    key = key_file.read_text().strip()
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 4096,
        "temperature": 0.2,
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                config.OPENROUTER_URL,
                headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
                json=payload,
                timeout=aiohttp.ClientTimeout(total=timeout_sec),
            ) as resp:
                if resp.status >= 400:
                    text = await resp.text()
                    logger.error("OpenRouter %s → %d: %s", model, resp.status, text[:200])
                    return None
                data = await resp.json()
                return data.get("choices", [{}])[0].get("message", {}).get("content")
    except Exception as e:
        logger.error("OpenRouter error: %s → %s", model, e)
        return None


async def claude_cli_call(model: str, prompt: str, timeout_sec: int = 600, cwd: str | None = None) -> str | None:
    """Call Claude via CLI (Claude Max subscription). Returns response or None."""
    proc = await asyncio.create_subprocess_exec(
        str(config.CLAUDE_CLI),
        "-p",
        "--model", model,
        "--output-format", "text",
        cwd=cwd or str(config.REPO_DIR),
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _active_subprocesses.add(proc)  # Track for graceful shutdown (Ganymede #8)
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=prompt.encode()),
            timeout=timeout_sec,
        )
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()
        logger.error("Claude CLI timed out after %ds", timeout_sec)
        return None
    finally:
        _active_subprocesses.discard(proc)

    out_text = (stdout or b"").decode()
    err_text = (stderr or b"").decode()

    # Check for rate limit REGARDLESS of exit code — CLI sometimes exits 0 with limit message
    combined_lower = (out_text + err_text).lower()
    if "hit your limit" in combined_lower or "rate limit" in combined_lower:
        logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
        return "RATE_LIMITED"

    if proc.returncode != 0:
        logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s",
                     proc.returncode, err_text[:200], out_text[:200])
        return None
    return out_text.strip()

# ─── Review execution ──────────────────────────────────────────────────────

async def triage_pr(diff: str) -> str:
    """Triage PR via Haiku → DEEP/STANDARD/LIGHT."""
    prompt = TRIAGE_PROMPT.format(diff=diff[:50000])  # Cap diff size for triage
    result = await openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
    if not result:
        logger.warning("Triage failed, defaulting to STANDARD")
        return "STANDARD"
    tier = result.split("\n")[0].strip().upper()
    if tier in ("DEEP", "STANDARD", "LIGHT"):
        reason = result.split("\n")[1].strip() if "\n" in result else ""
        logger.info("Triage: %s — %s", tier, reason[:100])
        return tier
    logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
    return "STANDARD"
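
# How these runners are expected to compose, per the module docstring. A rough
# sketch only; the real orchestration (PR lifecycle, SQLite state, Forgejo
# posting) lives in evaluate.py, and the wrapper name here is illustrative:
#
#   async def review_pr(diff: str, files: str, domain: str, agent: str):
#       tier = await triage_pr(diff)
#       domain_review = await run_domain_review(diff, files, domain, agent)
#       leo_review = await run_leo_review(diff, files, tier)
#       return tier, domain_review, leo_review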

async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
    """Run domain review via OpenRouter GPT-4o.

    Decoupled from Claude Max to avoid account-level rate limits blocking
    domain reviews. Different model lineage also reduces correlated blind spots.
    """
    prompt = DOMAIN_PROMPT.format(
        agent=agent,
        agent_upper=agent.upper(),
        domain=domain,
        style_guide=REVIEW_STYLE_GUIDE,
        diff=diff,
        files=files,
    )
    result = await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
    return result


async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
    """Run Leo review. DEEP → Opus (Claude Max, queue if limited). STANDARD → Sonnet (OpenRouter).

    Opus is scarce — reserved for DEEP eval and overnight research sessions.
    STANDARD goes straight to Sonnet. Domain review is the primary gate; Leo
    review is a quality check that doesn't need Opus for routine claims.
    """
    prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
    prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)

    if tier == "DEEP":
        # DEEP: Opus only, queue if rate limited. Opus is scarce — reserve for high-stakes.
        result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
        if result == "RATE_LIMITED":
            logger.info("Claude Max Opus rate limited, queuing DEEP Leo review")
            return None
        return result
    else:
        # STANDARD/LIGHT: Sonnet via OpenRouter. Different model family from
        # domain review (GPT-4o) = no correlated blind spots. Keeps Claude Max
        # rate limit untouched for Opus DEEP + overnight research.
        result = await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
        return result
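
# Shutdown wiring sketch. kill_active_subprocesses() at the top of this module
# is designed to be awaited from the service's signal handler; the entry-point
# code below is hypothetical and not part of this module:
#
#   import signal
#
#   def install_shutdown_handler(loop: asyncio.AbstractEventLoop) -> None:
#       for sig in (signal.SIGINT, signal.SIGTERM):
#           loop.add_signal_handler(
#               sig, lambda: asyncio.ensure_future(kill_active_subprocesses())
#           )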