"""LLM transport and review prompts — shared by all evaluation stages.

Extracted from evaluate.py (Phase 3c refactor). This module owns:

- Prompt templates (triage, domain, Leo)
- OpenRouter API transport
- Claude CLI transport with subprocess tracking
- Review runner functions (triage, domain, Leo)

Orchestration (PR lifecycle, SQLite state, Forgejo posting) stays in evaluate.py.
"""

import asyncio
import json
import logging

import aiohttp

from . import config

logger = logging.getLogger("pipeline.llm")

# Track active Claude CLI subprocesses for graceful shutdown (Ganymede #8)
_active_subprocesses: set = set()

# Tier names the triage model is allowed to answer with.
_TRIAGE_TIERS = ("DEEP", "STANDARD", "LIGHT")


async def _terminate_subprocess(proc) -> None:
    """Kill *proc* if it is still running and wait until it has been reaped.

    No-op when the process has already exited (``returncode`` is set).
    """
    if proc.returncode is not None:
        return
    try:
        proc.kill()
    except ProcessLookupError:
        # The process exited between the returncode check and the kill.
        pass
    await proc.wait()


async def kill_active_subprocesses():
    """Kill all tracked Claude CLI subprocesses. Called during graceful shutdown."""
    # Iterate over a snapshot: claude_cli_call() discards entries from the set
    # concurrently as its subprocesses finish.
    for proc in list(_active_subprocesses):
        if proc.returncode is None:
            logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid)
            await _terminate_subprocess(proc)
    _active_subprocesses.clear()


REVIEW_STYLE_GUIDE = (
    "You MUST show your work. For each criterion, write one sentence with your finding. "
    "Do not summarize what the PR does — evaluate it. "
    "If a criterion passes, say what you checked and why it passes. "
    "If a criterion fails, explain the specific problem. "
    "Responses like 'Everything passes' with no evidence of checking will be treated as review failures. "
    "Be concise but substantive — one sentence per criterion, not one sentence total."
)


# ─── Prompt templates ──────────────────────────────────────────────────────
#
# The template text is runtime data sent to the models and is kept exactly as
# it appears in this file (including whitespace). Placeholders in braces are
# filled in with str.format() by the review runners below.
#
# NOTE(review): several templates say "End your review with exactly one of:"
# but no options follow in the text as it stands here — confirm the verdict
# markers are present in the deployed file.
# NOTE(review): BATCH_DOMAIN_PROMPT lists "genuinely broken wiki links" as a
# blocking issue, while DOMAIN_PROMPT and both Leo prompts say broken wiki
# links must never block. Confirm which rule is intended.

TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT. 
DEEP — use ONLY when the PR could change the knowledge graph structure: - PR modifies files in core/ or foundations/ (structural KB changes) - PR challenges an existing claim (has "challenged_by" field or explicitly argues against an existing claim) - PR modifies axiom-level beliefs in agents/*/beliefs.md - PR is a cross-domain synthesis claim that draws conclusions across 2+ domains DEEP is rare — most new claims are STANDARD even if they have high confidence or cross-domain wiki links. Adding a new "likely" claim about futarchy is STANDARD. Arguing that an existing claim is wrong is DEEP. STANDARD — the DEFAULT for most PRs: - New claims in any domain at any confidence level - Enrichments to existing claims (adding evidence, extending arguments) - New hypothesis-level beliefs - Source archives with extraction results - Claims with cross-domain wiki links (this is normal, not exceptional) LIGHT — use ONLY when ALL changes fit these categories: - Entity attribute updates (factual corrections, new data points) - Source archiving without extraction - Formatting fixes, typo corrections - Status field changes IMPORTANT: When uncertain between DEEP and STANDARD, choose STANDARD. Most claims are STANDARD. DEEP is reserved for structural changes to the knowledge base, not for complex or important-sounding claims. Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line. --- PR DIFF --- {diff}"""

DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base. IMPORTANT — This PR may contain different content types: - **Claims** (type: claim): arguable assertions with confidence levels. Review fully. - **Entities** (type: entity, files in entities/): descriptive records of projects, people, protocols. Do NOT reject entities for missing confidence or source fields — they have a different schema. - **Sources** (files in inbox/): archive metadata. Auto-approve these. 
Review this PR. For EACH criterion below, write one sentence stating what you found: 1. **Factual accuracy** — Are the claims/entities factually correct? Name any specific errors. 2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files. Shared entity files (like metadao.md or futardio.md) appearing in multiple PRs are NOT duplicates — they are expected enrichments. 3. **Confidence calibration** — For claims only. Is the confidence level right for the evidence? Entities don't have confidence levels. 4. **Wiki links** — Note any broken [[wiki links]], but do NOT let them affect your verdict. Broken links are expected — linked claims often exist in other open PRs that haven't merged yet. ALWAYS APPROVE even if wiki links are broken. VERDICT RULES — read carefully: - APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible. - APPROVE entity files (type: entity) unless they contain factual errors. - APPROVE even if wiki links are broken — this is NEVER a reason to REQUEST_CHANGES. - REQUEST_CHANGES only for these BLOCKING issues: factual errors, copy-pasted duplicate evidence, or confidence that is clearly wrong (e.g. "proven" with no evidence). - If the ONLY issues you find are broken wiki links: you MUST APPROVE. - Do NOT invent problems. If a criterion passes, say it passes. {style_guide} If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags): Valid tags: frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error End your review with exactly one of: --- PR DIFF --- {diff} --- CHANGED FILES --- {files}"""

LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base. 
IMPORTANT — Content types have DIFFERENT schemas: - **Claims** (type: claim): require type, domain, confidence, source, created, description. Title must be a prose proposition. - **Entities** (type: entity, files in entities/): require ONLY type, domain, description. NO confidence, NO source, NO created date. Short filenames like "metadao.md" are correct — entities are NOT claims. - **Sources** (files in inbox/): different schema entirely. Do NOT flag sources for missing claim fields. Do NOT flag entity files for missing confidence, source, or created fields. Do NOT flag entity filenames for being too short or not prose propositions. These are different content types with different rules. Review this PR. For EACH criterion below, write one sentence stating what you found: 1. **Schema** — Does each file have valid frontmatter FOR ITS TYPE? (Claims need full schema. Entities need only type+domain+description.) 2. **Duplicate/redundancy** — Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim? 3. **Confidence** — For claims only: name the confidence level. Does the evidence justify it? 4. **Wiki links** — Note any broken [[links]], but do NOT let them affect your verdict. Broken links are expected — linked claims often exist in other open PRs. ALWAYS APPROVE even if wiki links are broken. 5. **Source quality** — Is the source credible for this claim? 6. **Specificity** — For claims only: could someone disagree? If it's too vague to be wrong, flag it. VERDICT: APPROVE if the claims are factually correct and evidence supports them. Broken wiki links are NEVER a reason to REQUEST_CHANGES. If broken links are the ONLY issue, you MUST APPROVE. 
{style_guide} If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags): Valid tags: frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error End your review with exactly one of: --- PR DIFF --- {diff} --- CHANGED FILES --- {files}"""

LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base. Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check: 1. Cross-domain implications — does this claim affect beliefs in other domains? 2. Confidence calibration — is the confidence level justified by the evidence? 3. Contradiction check — does this contradict any existing claims without explicit argument? 4. Wiki link validity — note any broken links, but do NOT let them affect your verdict. Broken links are expected (linked claims may be in other PRs). NEVER REQUEST_CHANGES for broken wiki links alone. 5. Axiom integrity — if touching axiom-level beliefs, is the justification extraordinary? 6. Source quality — is the source credible for the claim being made? 7. Duplicate check — does a substantially similar claim already exist? 8. Enrichment vs new claim — should this be an enrichment to an existing claim instead? 9. Domain assignment — is the claim in the correct domain? 10. Schema compliance — YAML frontmatter, prose-as-title format, required fields 11. Epistemic hygiene — is the claim specific enough to be wrong? {style_guide} If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags): Valid tags: frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error End your review with exactly one of: --- PR DIFF --- {diff} --- CHANGED FILES --- {files}"""

BATCH_DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base. You are reviewing {n_prs} PRs in a single batch. 
For EACH PR, apply all criteria INDEPENDENTLY. Do not mix content between PRs. Each PR is a separate evaluation. For EACH PR, check these criteria (one sentence each): 1. **Factual accuracy** — Are the claims factually correct? Name any specific errors. 2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? 3. **Confidence calibration** — Is the confidence level right for the evidence provided? 4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? VERDICT RULES — read carefully: - APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible. - REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong. - Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE. - Do NOT invent problems. If a criterion passes, say it passes. {style_guide} For EACH PR, write your full review, then end that PR's section with the verdict tag. If requesting changes, tag the specific issues: Valid tags: frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error {pr_sections} IMPORTANT: You MUST provide a verdict for every PR listed above. For each PR, end with exactly one of: where NUMBER is the PR number shown in the section header."""


# ─── API helpers ───────────────────────────────────────────────────────────


async def openrouter_call(
    model: str,
    prompt: str,
    timeout_sec: int = 120,
    max_tokens: int = 4096,
) -> tuple[str | None, dict]:
    """Call the OpenRouter API with a single user message.

    Args:
        model: Model identifier placed in the request payload.
        prompt: Text sent as the only (user) message.
        timeout_sec: Total request timeout in seconds.
        max_tokens: Completion token cap placed in the request payload.

    Returns:
        ``(response_text, usage_dict)``. ``response_text`` is ``None`` on any
        failure (missing key file, HTTP status >= 400, transport or decoding
        error) or when the response carries no message content. ``usage_dict``
        always has the keys ``prompt_tokens`` and ``completion_tokens``
        (0 on failure), plus whatever else the API reported.
    """
    empty_usage = {"prompt_tokens": 0, "completion_tokens": 0}

    key_file = config.SECRETS_DIR / "openrouter-key"
    try:
        # EAFP: read directly instead of exists()-then-read, which races.
        key = key_file.read_text().strip()
    except FileNotFoundError:
        logger.error("OpenRouter key file not found")
        return None, empty_usage

    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.2,
    }
    headers = {"Authorization": f"Bearer {key}", "Content-Type": "application/json"}

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                config.OPENROUTER_URL,
                headers=headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=timeout_sec),
            ) as resp:
                if resp.status >= 400:
                    text = await resp.text()
                    logger.error("OpenRouter %s → %d: %s", model, resp.status, text[:200])
                    return None, empty_usage
                data = await resp.json()
    except Exception as e:
        # Deliberately broad: this is the transport boundary and callers rely
        # on a (None, usage) result instead of an exception.
        logger.error("OpenRouter error: %s → %s", model, e)
        return None, empty_usage

    try:
        # `or` guards cover keys that are present but null / empty, so a
        # malformed body still yields the reported usage instead of raising.
        usage = {**empty_usage, **(data.get("usage") or {})}
        choices = data.get("choices") or [{}]
        message = choices[0].get("message") or {}
        content = message.get("content")
    except (AttributeError, TypeError, IndexError, KeyError) as e:
        logger.error("OpenRouter error: %s → %s", model, e)
        return None, empty_usage
    return content, usage


async def claude_cli_call(
    model: str,
    prompt: str,
    timeout_sec: int = 600,
    cwd: str | None = None,
) -> tuple[str | None, dict]:
    """Call Claude via CLI (Claude Max subscription). Returns (response, usage).

    Uses --output-format json to capture token usage. Subscription calls cost $0
    but tokens are tracked for compute metrics (Cory: capture tokens/time, note
    subscription).

    Args:
        model: Value passed to the CLI's ``--model`` flag.
        prompt: Text written to the CLI's stdin.
        timeout_sec: Seconds to wait for the CLI before killing it.
        cwd: Working directory for the subprocess; defaults to
            ``config.REPO_DIR``.

    Returns:
        ``(text, usage)`` where ``text`` is:

        - the ``result`` field of the CLI's JSON output on success,
        - the raw stripped stdout if the output is not a JSON object,
        - the sentinel string ``"RATE_LIMITED"`` if the output mentions a
          rate limit,
        - ``None`` if the CLI could not be started, timed out, or exited
          non-zero.

        ``usage`` is a dict of token/timing telemetry (all zero/empty unless
        the JSON output was parsed).
    """
    empty_usage = {
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "cache_read_tokens": 0,
        "cache_write_tokens": 0,
        "duration_ms": 0,
        "duration_api_ms": 0,
        "cost_estimate_usd": 0.0,
        "stop_reason": "",
        "num_turns": 0,
        "service_tier": "",
        "speed": "",
    }

    try:
        proc = await asyncio.create_subprocess_exec(
            str(config.CLAUDE_CLI),
            "-p",
            "--model",
            model,
            "--output-format",
            "json",
            cwd=cwd or str(config.REPO_DIR),
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except OSError as e:
        # Missing/non-executable binary or bad cwd: follow the same
        # (None, usage) failure contract as every other failure path.
        logger.error("Claude CLI failed to start: %s", e)
        return None, empty_usage

    _active_subprocesses.add(proc)  # Track for graceful shutdown (Ganymede #8)
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=prompt.encode()),
            timeout=timeout_sec,
        )
    except asyncio.TimeoutError:
        logger.error("Claude CLI timed out after %ds", timeout_sec)
        return None, empty_usage
    finally:
        # Runs on success (no-op: process already exited), on timeout, and on
        # task cancellation. Without the kill here a cancelled call would
        # untrack a still-running subprocess and leak it past shutdown.
        try:
            await _terminate_subprocess(proc)
        finally:
            _active_subprocesses.discard(proc)

    out_text = (stdout or b"").decode()
    err_text = (stderr or b"").decode()

    # Check for rate limit REGARDLESS of exit code — CLI sometimes exits 0 with limit message
    # NOTE(review): stdout also carries the model's answer, so a review that
    # merely mentions "rate limit" would be classified as rate-limited too.
    combined_lower = (out_text + err_text).lower()
    if "hit your limit" in combined_lower or "rate limit" in combined_lower:
        logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
        return "RATE_LIMITED", empty_usage

    if proc.returncode != 0:
        logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s", proc.returncode, err_text[:200], out_text[:200])
        return None, empty_usage

    # Parse JSON output to extract full usage telemetry
    try:
        data = json.loads(out_text)
    except json.JSONDecodeError:
        data = None
    if not isinstance(data, dict):
        # Covers both undecodable output and valid JSON that is not an object.
        logger.warning("Claude CLI returned non-JSON output, token tracking unavailable")
        return out_text.strip(), empty_usage.copy()

    raw_usage = data.get("usage")
    if not isinstance(raw_usage, dict):
        raw_usage = {}
    usage = {
        "prompt_tokens": raw_usage.get("input_tokens", 0),
        "completion_tokens": raw_usage.get("output_tokens", 0),
        "cache_read_tokens": raw_usage.get("cache_read_input_tokens", 0),
        "cache_write_tokens": raw_usage.get("cache_creation_input_tokens", 0),
        "duration_ms": data.get("duration_ms", 0),
        "duration_api_ms": data.get("duration_api_ms", 0),
        "cost_estimate_usd": data.get("total_cost_usd", 0.0),
        "stop_reason": data.get("stop_reason", ""),
        "num_turns": data.get("num_turns", 0),
        "service_tier": raw_usage.get("service_tier", ""),
        "speed": raw_usage.get("speed", ""),
    }
    return data.get("result", ""), usage


# ─── Review execution ─────────────────────────────────────────────────────


def _parse_triage_response(result: str) -> tuple[str, str]:
    """Split a triage reply into ``(tier_candidate, reason)``.

    The first non-blank line is upper-cased and stripped of surrounding
    markdown/punctuation (``**DEEP**``, ``STANDARD.``) to form the tier
    candidate; the next non-blank line, if any, is the reason. The candidate
    is NOT validated here.
    """
    lines = [ln.strip() for ln in result.splitlines() if ln.strip()]
    if not lines:
        return "", ""
    tier = lines[0].strip("*_`#.: ").upper()
    reason = lines[1] if len(lines) > 1 else ""
    return tier, reason


async def triage_pr(diff: str) -> tuple[str, dict, str]:
    """Triage PR via Haiku → (tier, usage, reason). tier is DEEP/STANDARD/LIGHT.

    Falls back to ``"STANDARD"`` when the call fails or the reply does not
    start with a recognised tier name.
    """
    prompt = TRIAGE_PROMPT.format(diff=diff[:50000])  # Cap diff size for triage
    result, usage = await openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
    if not result:
        logger.warning("Triage failed, defaulting to STANDARD")
        return "STANDARD", usage, "triage failed, default"

    tier, reason = _parse_triage_response(result)
    if tier in _TRIAGE_TIERS:
        logger.info("Triage: %s — %s", tier, reason[:100])
        return tier, usage, reason[:500]

    logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
    return "STANDARD", usage, f"unparseable response, default (got: {tier[:20]})"


async def run_batch_domain_review(
    pr_diffs: list[dict],
    domain: str,
    agent: str,
) -> tuple[str | None, dict]:
    """Run batched domain review for multiple PRs in one LLM call.

    Args:
        pr_diffs: list of ``{"number": int, "label": str, "diff": str,
            "files": str}`` dicts, optionally with ``"file_count"``, which is
            shown in the section header when present.
        domain: Domain name substituted into the prompt.
        agent: Reviewer agent name substituted into the prompt.

    Returns:
        ``(raw_response_text, usage)`` or ``(None, usage)`` on failure. An
        empty ``pr_diffs`` returns ``(None, zero usage)`` without calling the
        API.
    """
    if not pr_diffs:
        # Nothing to review; avoid a request with max_tokens=0.
        logger.warning("Batch domain review called with no PRs")
        return None, {"prompt_tokens": 0, "completion_tokens": 0}

    # Build per-PR sections with anchoring labels
    sections = []
    for pr in pr_diffs:
        header = f"=== PR #{pr['number']}: {pr['label']}"
        if "file_count" in pr:
            header += f" ({pr['file_count']} files)"
        sections.append(
            f"{header} ===\n"
            f"--- PR DIFF ---\n{pr['diff']}\n\n"
            f"--- CHANGED FILES ---\n{pr['files']}\n"
        )

    # str.format ignores keyword arguments a template does not reference, so
    # agent_upper is harmless for template text that has no such placeholder.
    prompt = BATCH_DOMAIN_PROMPT.format(
        agent=agent,
        agent_upper=agent.upper(),
        domain=domain,
        n_prs=len(pr_diffs),
        style_guide=REVIEW_STYLE_GUIDE,
        pr_sections="\n".join(sections),
    )

    # Scale max_tokens with batch size: ~3K tokens per PR review
    max_tokens = min(3000 * len(pr_diffs), 16384)
    return await openrouter_call(
        config.EVAL_DOMAIN_MODEL,
        prompt,
        timeout_sec=config.EVAL_TIMEOUT,
        max_tokens=max_tokens,
    )


async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> tuple[str | None, dict]:
    """Run domain review via OpenRouter.

    Decoupled from Claude Max to avoid account-level rate limits blocking
    domain reviews. Different model lineage also reduces correlated blind spots.

    Returns (review_text, usage).
    """
    # str.format ignores keyword arguments a template does not reference, so
    # agent_upper is harmless for template text that has no such placeholder.
    prompt = DOMAIN_PROMPT.format(
        agent=agent,
        agent_upper=agent.upper(),
        domain=domain,
        style_guide=REVIEW_STYLE_GUIDE,
        diff=diff,
        files=files,
    )
    return await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)


async def run_leo_review(diff: str, files: str, tier: str) -> tuple[str | None, dict]:
    """Run Leo review via OpenRouter.

    The tier selects the prompt only: ``"DEEP"`` uses ``LEO_PROMPT_DEEP``,
    anything else uses ``LEO_PROMPT_STANDARD``. Every tier is currently sent to
    ``config.EVAL_LEO_STANDARD_MODEL`` with ``config.EVAL_TIMEOUT``; the Opus
    (Claude Max CLI) path for DEEP reviews is disabled — see the note in the
    body.

    Returns (review_text, usage).
    """
    prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
    prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)

    # Opus skipped — route all Leo reviews through Sonnet until backlog clears.
    # Opus via Claude Max CLI is consistently unavailable (rate limited or hanging).
    # (Cory, Mar 14: "yes lets skip opus")
    #
    # --- Re-enable Opus later for tier == "DEEP" (uses EVAL_TIMEOUT_OPUS for longer reasoning): ---
    # if tier == "DEEP":
    #     result, usage = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT_OPUS)
    #     if result == "RATE_LIMITED" or result is None:
    #         logger.info("Opus unavailable for DEEP Leo review — overflowing to Sonnet")
    #         result, usage = await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT_OPUS)
    #     return result, usage
    return await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)