teleo-infrastructure/lib/llm.py
m3taversal 93e6f16144 leo: constrain issue tags — do not invent new tags
Opus was ignoring the valid tag list and generating custom tags like
schema-enrichment-slug-mismatch, which fall through to 'unknown' in
disposition logic. All three prompts (domain, Leo standard, Leo deep)
now explicitly say "do not invent new tags" alongside the valid tag list.

Pentagon-Agent: Leo <294C3CA1-0205-4668-82FA-B984D54F48AD>
2026-03-13 17:27:40 +00:00


"""LLM transport and review prompts — shared by all evaluation stages.
Extracted from evaluate.py (Phase 3c refactor). This module owns:
- Prompt templates (triage, domain, Leo)
- OpenRouter API transport
- Claude CLI transport with subprocess tracking
- Review runner functions (triage, domain, Leo)
Orchestration (PR lifecycle, SQLite state, Forgejo posting) stays in evaluate.py.
"""
import asyncio
import logging

import aiohttp

from . import config

logger = logging.getLogger("pipeline.llm")

# Track active Claude CLI subprocesses for graceful shutdown (Ganymede #8)
_active_subprocesses: set = set()


async def kill_active_subprocesses():
    """Kill all tracked Claude CLI subprocesses. Called during graceful shutdown."""
    for proc in list(_active_subprocesses):
        if proc.returncode is None:
            logger.warning("Killing lingering Claude CLI subprocess PID %d", proc.pid)
            try:
                proc.kill()
                await proc.wait()
            except ProcessLookupError:
                pass
    _active_subprocesses.clear()
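
# Usage sketch (hypothetical wiring; the real signal handling lives in
# evaluate.py, which owns orchestration): a graceful-shutdown path would
# drain tracked subprocesses before closing the event loop, e.g.
#
#     loop.add_signal_handler(
#         signal.SIGTERM,
#         lambda: asyncio.ensure_future(kill_active_subprocesses()),
#     )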

REVIEW_STYLE_GUIDE = (
    "You MUST show your work. For each criterion, write one sentence with your finding. "
    "Do not summarize what the PR does — evaluate it. "
    "If a criterion passes, say what you checked and why it passes. "
    "If a criterion fails, explain the specific problem. "
    "Responses like 'Everything passes' with no evidence of checking will be treated as review failures. "
    "Be concise but substantive — one sentence per criterion, not one sentence total."
)
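
# The domain and Leo prompt templates below interpolate this text via their
# {style_guide} placeholder; the triage prompt does not use it.
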
# ─── Prompt templates ──────────────────────────────────────────────────────
TRIAGE_PROMPT = """Classify this pull request diff into exactly one tier: DEEP, STANDARD, or LIGHT.
DEEP — use when ANY of these apply:
- PR adds or modifies claims rated "likely" or higher confidence
- PR touches agent beliefs or creates cross-domain wiki links
- PR challenges an existing claim (has "challenged_by" or contradicts existing)
- PR modifies axiom-level beliefs
- PR is a cross-domain synthesis claim
STANDARD — use when:
- New claims in established domain areas
- Enrichments to existing claims (confirm/extend)
- New hypothesis-level beliefs
- Source archives with extraction results
LIGHT — use ONLY when ALL changes fit these categories:
- Entity attribute updates (factual corrections, new data points)
- Source archiving without extraction
- Formatting fixes, typo corrections
- Status field changes
IMPORTANT: When uncertain, classify UP, not down. Always err toward more review.
Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, followed by a one-line reason on the second line.
--- PR DIFF ---
{diff}"""
DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
Review this PR. For EACH criterion below, write one sentence stating what you found:
1. **Factual accuracy** — Are the claims factually correct? Name any specific errors.
2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files.
3. **Confidence calibration** — Is the confidence level right for the evidence provided? Name the level and say if it matches.
4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? Flag any that look broken.
VERDICT RULES — read carefully:
- APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible.
- REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong (e.g. "proven" with no evidence).
- Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE.
- Do NOT invent problems. If a criterion passes, say it passes.
{style_guide}
If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags):
<!-- ISSUES: tag1, tag2 -->
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error
End your review with exactly one of:
<!-- VERDICT:{agent_upper}:APPROVE -->
<!-- VERDICT:{agent_upper}:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR. For EACH criterion below, write one sentence stating what you found:
1. **Schema** — Does YAML frontmatter have type, domain, confidence, source, created? Is the title a prose proposition (not a label)?
2. **Duplicate/redundancy** — Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim?
3. **Confidence** — Name the confidence level. Does the evidence justify it? (proven needs strong evidence, speculative is fine for theories)
4. **Wiki links** — Do [[links]] in the diff point to real files? Flag any that look invented.
5. **Source quality** — Is the source credible for this claim?
6. **Specificity** — Could someone disagree with this claim? If it's too vague to be wrong, flag it.
{style_guide}
If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags):
<!-- ISSUES: tag1, tag2 -->
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error
End your review with exactly one of:
<!-- VERDICT:LEO:APPROVE -->
<!-- VERDICT:LEO:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
LEO_PROMPT_DEEP = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR with MAXIMUM scrutiny. This PR may trigger belief cascades. Check:
1. Cross-domain implications — does this claim affect beliefs in other domains?
2. Confidence calibration — is the confidence level justified by the evidence?
3. Contradiction check — does this contradict any existing claims without explicit argument?
4. Wiki link validity — do all wiki links reference real, existing claims?
5. Axiom integrity — if touching axiom-level beliefs, is the justification extraordinary?
6. Source quality — is the source credible for the claim being made?
7. Duplicate check — does a substantially similar claim already exist?
8. Enrichment vs new claim — should this be an enrichment to an existing claim instead?
9. Domain assignment — is the claim in the correct domain?
10. Schema compliance — YAML frontmatter, prose-as-title format, required fields
11. Epistemic hygiene — is the claim specific enough to be wrong?
{style_guide}
If requesting changes, tag the specific issues using ONLY these tags (do not invent new tags):
<!-- ISSUES: tag1, tag2 -->
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error
End your review with exactly one of:
<!-- VERDICT:LEO:APPROVE -->
<!-- VERDICT:LEO:REQUEST_CHANGES -->
--- PR DIFF ---
{diff}
--- CHANGED FILES ---
{files}"""
# ─── API helpers ───────────────────────────────────────────────────────────

async def openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
    """Call OpenRouter API. Returns response text or None on failure."""
    key_file = config.SECRETS_DIR / "openrouter-key"
    if not key_file.exists():
        logger.error("OpenRouter key file not found")
        return None
    key = key_file.read_text().strip()
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 4096,
        "temperature": 0.2,
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                config.OPENROUTER_URL,
                headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
                json=payload,
                timeout=aiohttp.ClientTimeout(total=timeout_sec),
            ) as resp:
                if resp.status >= 400:
                    text = await resp.text()
                    logger.error("OpenRouter %s HTTP %d: %s", model, resp.status, text[:200])
                    return None
                data = await resp.json()
                # Guard against an empty choices list, not just a missing key
                return (data.get("choices") or [{}])[0].get("message", {}).get("content")
    except Exception as e:
        logger.error("OpenRouter error (%s): %s", model, e)
        return None
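
# Response shape indexed above (OpenRouter mirrors the OpenAI chat completions
# format):
#
#     {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}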


async def claude_cli_call(model: str, prompt: str, timeout_sec: int = 600, cwd: str | None = None) -> str | None:
    """Call Claude via CLI (Claude Max subscription). Returns response or None."""
    proc = await asyncio.create_subprocess_exec(
        str(config.CLAUDE_CLI),
        "-p",
        "--model",
        model,
        "--output-format",
        "text",
        cwd=cwd or str(config.REPO_DIR),
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _active_subprocesses.add(proc)  # Track for graceful shutdown (Ganymede #8)
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=prompt.encode()),
            timeout=timeout_sec,
        )
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()
        logger.error("Claude CLI timed out after %ds", timeout_sec)
        return None
    finally:
        _active_subprocesses.discard(proc)
    out_text = (stdout or b"").decode()
    err_text = (stderr or b"").decode()
    # Check for rate limit REGARDLESS of exit code — the CLI sometimes exits 0 with a limit message
    combined_lower = (out_text + err_text).lower()
    if "hit your limit" in combined_lower or "rate limit" in combined_lower:
        logger.warning("Claude Max rate limited (rc=%d, stdout: %s)", proc.returncode, out_text[:200])
        return "RATE_LIMITED"
    if proc.returncode != 0:
        logger.error("Claude CLI failed (rc=%d): stderr=%s stdout=%s", proc.returncode, err_text[:200], out_text[:200])
        return None
    return out_text.strip()
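
# Shell equivalent of the invocation above (the prompt arrives on stdin since
# -p is given no positional argument; model name illustrative):
#
#     echo "$PROMPT" | claude -p --model claude-opus-4 --output-format text
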
# ─── Review execution ─────────────────────────────────────────────────────

async def triage_pr(diff: str) -> str:
    """Triage PR via Haiku → DEEP/STANDARD/LIGHT."""
    prompt = TRIAGE_PROMPT.format(diff=diff[:50000])  # Cap diff size for triage
    result = await openrouter_call(config.TRIAGE_MODEL, prompt, timeout_sec=30)
    if not result:
        logger.warning("Triage failed, defaulting to STANDARD")
        return "STANDARD"
    tier = result.split("\n")[0].strip().upper()
    if tier in ("DEEP", "STANDARD", "LIGHT"):
        reason = result.split("\n")[1].strip() if "\n" in result else ""
        logger.info("Triage: %s (%s)", tier, reason[:100])
        return tier
    logger.warning("Triage returned unparseable '%s', defaulting to STANDARD", tier[:20])
    return "STANDARD"


async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
    """Run domain review via OpenRouter GPT-4o.

    Decoupled from Claude Max to avoid account-level rate limits blocking
    domain reviews. Different model lineage also reduces correlated blind spots.
    """
    prompt = DOMAIN_PROMPT.format(
        agent=agent,
        agent_upper=agent.upper(),
        domain=domain,
        style_guide=REVIEW_STYLE_GUIDE,
        diff=diff,
        files=files,
    )
    return await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
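
# Call sketch (agent and domain names illustrative):
#
#     review = await run_domain_review(diff, files, domain="economics", agent="Ada")
#     # review ends with "<!-- VERDICT:ADA:APPROVE -->" or
#     # "<!-- VERDICT:ADA:REQUEST_CHANGES -->"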


async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
    """Run Leo review. DEEP → Opus (Claude Max, queue if limited). STANDARD → Sonnet (OpenRouter).

    Opus is scarce — reserved for DEEP eval and overnight research sessions.
    STANDARD goes straight to Sonnet via OpenRouter. Domain review is the primary
    gate; Leo review is a quality check that doesn't need Opus for routine claims.
    """
    prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
    prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
    if tier == "DEEP":
        # DEEP: Opus only, queue if rate limited. Opus is scarce — reserve for high-stakes.
        result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
        if result == "RATE_LIMITED":
            logger.info("Claude Max Opus rate limited, queuing DEEP Leo review")
            return None
        return result
    # STANDARD/LIGHT: Sonnet via OpenRouter. Different model family from
    # domain review (GPT-4o) = no correlated blind spots. Keeps Claude Max
    # rate limit untouched for Opus DEEP + overnight research.
    return await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
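
# End-to-end call order (a sketch; orchestration, PR lifecycle, SQLite state,
# and Forgejo posting live in evaluate.py; variable names illustrative):
#
#     tier = await triage_pr(diff)
#     domain_review = await run_domain_review(diff, files, domain_name, agent_name)
#     leo_review = await run_leo_review(diff, files, tier)
#     # leo_review is None on DEEP when Opus is rate limited; the caller queues a retry.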