From c0a6adf9ede95971286583efbab1d24f3cce1fcc Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Fri, 13 Mar 2026 17:10:30 +0000
Subject: [PATCH] leo: model diversity + calibrated review prompts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Domain review → GPT-4o (OpenRouter), Leo STANDARD → Sonnet (OpenRouter),
  Leo DEEP → Opus (Claude Max). Two model families = no correlated blind spots.
- Opus reserved for DEEP eval only — protects rate limit for overnight research.
- Review prompts calibrated: require per-criterion evidence, blocking-vs-observation
  verdict rules. Moved from 100% rubber-stamp approval to a 12% pass rate.
- OpenRouter failures classified as openrouter_failed (not rate_limited) to avoid
  a spurious 15-min Opus backoff.
- merge.py: pre-check PR state before the merge API call (prevents 405 on re-merge).

Pentagon-Agent: Leo <294C3CA1-0205-4668-82FA-B984D54F48AD>
---
 lib/config.py   | 22 +++++++-----
 lib/evaluate.py | 25 ++++++-------
 lib/llm.py      | 95 +++++++++++++++++++++++++++----------------------
 lib/merge.py    | 10 ++++++
 4 files changed, 89 insertions(+), 63 deletions(-)

diff --git a/lib/config.py b/lib/config.py
index c24d65c..0e8e2d2 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -28,19 +28,22 @@ MODEL_OPUS = "opus"
 MODEL_SONNET = "sonnet"
 MODEL_HAIKU = "anthropic/claude-3.5-haiku"
 MODEL_GPT4O = "openai/gpt-4o"
+MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5" # OpenRouter Sonnet (paid, not Claude Max)
 
 # --- Model assignment per stage ---
-# Principle: Opus is a scarce resource. Use it only where judgment quality matters.
-# Sonnet handles volume. Haiku handles routing. Opus handles synthesis + critical eval.
+# Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research.
+# Model diversity: domain (GPT-4o) + Leo (Sonnet) = two model families, no correlated blind spots.
+# Both on OpenRouter = Claude Max rate limit untouched for Opus.
 #
 # Pipeline eval ordering (domain-first, Leo-last):
-# 1. Domain review → Sonnet (catches domain issues, evidence gaps — high volume filter)
-# 2. Leo review → Opus (cross-domain synthesis, confidence calibration — only pre-filtered PRs)
-# 3. DEEP cross-family → GPT-4o (adversarial blind-spot check — paid, highest-value claims only)
-EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work
-TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest
-EVAL_DOMAIN_MODEL = MODEL_SONNET # domain review: high-volume filter
-EVAL_LEO_MODEL = MODEL_OPUS # Leo review: scarce, high-value
+# 1. Domain review → GPT-4o (OpenRouter) — different family from Leo
+# 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain
+# 3. Leo DEEP → Opus (Claude Max) — highest judgment, scarce
+EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work (Claude Max)
+TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest (OpenRouter)
+EVAL_DOMAIN_MODEL = MODEL_GPT4O # domain review: OpenRouter GPT-4o
+EVAL_LEO_MODEL = MODEL_OPUS # Leo DEEP review: Claude Max Opus
+EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR # Leo STANDARD review: OpenRouter Sonnet
 EVAL_DEEP_MODEL = MODEL_GPT4O # DEEP cross-family: paid, adversarial
 
 # --- Model backends ---
@@ -65,6 +68,7 @@ MODEL_COSTS = {
     "sonnet": {"input": 0.003, "output": 0.015},
     MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
     MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
+    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
 }
 
 # --- Concurrency ---
diff --git a/lib/evaluate.py b/lib/evaluate.py
index be855d0..2957efd 100644
--- a/lib/evaluate.py
+++ b/lib/evaluate.py
@@ -205,9 +205,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
 
     domain_review = await run_domain_review(review_diff, files, domain or "general", agent)
     if domain_review is None:
-        # Rate limited, couldn't overflow — revert to open for retry
+        # OpenRouter failure (timeout, error) — revert to open for retry.
+        # NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
-        return {"pr": pr_number, "skipped": True, "reason": "rate_limited"}
+        return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}
 
     domain_verdict = _parse_verdict(domain_review, agent)
     conn.execute(
@@ -243,9 +244,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
 
     leo_review = await run_leo_review(review_diff, files, tier)
     if leo_review is None:
-        # Opus rate limited — revert to open for retry (keep domain verdict)
+        # DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
-        return {"pr": pr_number, "skipped": True, "reason": "opus_rate_limited"}
+        reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
+        return {"pr": pr_number, "skipped": True, "reason": reason}
 
     leo_verdict = _parse_verdict(leo_review, "LEO")
     conn.execute("UPDATE prs SET leo_verdict = ? WHERE number = ?", (leo_verdict, pr_number))
@@ -360,11 +362,11 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
     # - status = 'open'
     # - tier0_pass = 1 (passed validation)
     # - leo_verdict = 'pending' OR domain_verdict = 'pending'
-    # During Opus backoff: only fetch PRs needing triage or domain review
-    # (skip PRs already domain-reviewed that are waiting for Leo/Opus)
+    # During Opus backoff: skip DEEP PRs waiting for Leo (they need Opus).
+    # STANDARD PRs run Leo review on OpenRouter Sonnet, so let them through.
     # Skip PRs attempted within last 10 minutes (backoff during rate limits)
     if opus_backoff:
-        verdict_filter = "AND p.domain_verdict = 'pending'"
+        verdict_filter = "AND (p.domain_verdict = 'pending' OR (p.leo_verdict = 'pending' AND p.tier != 'DEEP'))"
     else:
         verdict_filter = "AND (p.leo_verdict = 'pending' OR p.domain_verdict = 'pending')"
@@ -397,17 +399,16 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
     for row in rows:
         try:
-            # During Opus backoff, skip PRs that already completed domain review
-            # (they'd just hit the Opus limit again). Only process PRs still
-            # needing triage or domain review.
-            if opus_backoff:
+            # During Opus backoff, skip DEEP PRs that already completed domain review
+            # (they need Opus, which is rate limited). STANDARD PRs proceed on OpenRouter Sonnet.
+            if opus_backoff and row["tier"] == "DEEP":
                 existing = conn.execute(
                     "SELECT domain_verdict FROM prs WHERE number = ?",
                     (row["number"],),
                 ).fetchone()
                 if existing and existing["domain_verdict"] not in ("pending", None):
                     logger.debug(
-                        "PR #%d: skipping during Opus backoff (domain already %s)",
+                        "PR #%d: skipping DEEP during Opus backoff (domain already %s)",
                         row["number"],
                         existing["domain_verdict"],
                     )
diff --git a/lib/llm.py b/lib/llm.py
index b7079e3..12d16f0 100644
--- a/lib/llm.py
+++ b/lib/llm.py
@@ -36,9 +36,12 @@ async def kill_active_subprocesses():
 
 
 REVIEW_STYLE_GUIDE = (
-    "Be concise. Only mention what fails or is interesting. "
-    "Do not summarize what the PR does — the diff speaks for itself. "
-    "If everything passes, say so in one line and approve."
+    "You MUST show your work. For each criterion, write one sentence with your finding. "
+    "Do not summarize what the PR does — evaluate it. "
+    "If a criterion passes, say what you checked and why it passes. "
+    "If a criterion fails, explain the specific problem. "
+    "Responses like 'Everything passes' with no evidence of checking will be treated as review failures. "
+    "Be concise but substantive — one sentence per criterion, not one sentence total."
 )
@@ -74,19 +77,25 @@ Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, fo
 
 DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
 
-Review this PR from your domain expertise:
-1. Technical accuracy — are the claims factually correct in your domain?
-2. Domain duplicates — does your domain already have substantially similar claims?
-3. Missing context — is important domain context absent that would change interpretation?
-4. Confidence calibration — from your domain expertise, is the confidence level right?
-5. Enrichment opportunities — should this connect to existing claims via wiki links?
+Review this PR. For EACH criterion below, write one sentence stating what you found:
+
+1. **Factual accuracy** — Are the claims factually correct? Name any specific errors.
+2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files.
+3. **Confidence calibration** — Is the confidence level right for the evidence provided? Name the level and say if it matches.
+4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? Flag any that look broken.
+
+VERDICT RULES — read carefully:
+- APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible.
+- REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong (e.g. "proven" with no evidence).
+- Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE.
+- Do NOT invent problems. If a criterion passes, say it passes.
 
 {style_guide}
 
-If you are requesting changes, tag the specific issues:
+If requesting changes, tag the specific issues:
 
-Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
+Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error
 
 End your review with exactly one of:
@@ -100,14 +109,14 @@ End your review with exactly one of:
 
 LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
 
-Review this PR against the quality criteria:
-1. Schema compliance — YAML frontmatter, prose-as-title, required fields
-2. Duplicate check — does this claim already exist?
-3. Confidence calibration — appropriate for the evidence?
-4. Wiki link validity — references real claims?
-5. Source quality — credible for the claim?
-6. Domain assignment — correct domain?
-7. Epistemic hygiene — specific enough to be wrong?
+Review this PR. For EACH criterion below, write one sentence stating what you found:
+
+1. **Schema** — Does YAML frontmatter have type, domain, confidence, source, created? Is the title a prose proposition (not a label)?
+2. **Duplicate/redundancy** — Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim?
+3. **Confidence** — Name the confidence level. Does the evidence justify it? (proven needs strong evidence, speculative is fine for theories)
+4. **Wiki links** — Do [[links]] in the diff point to real files? Flag any that look invented.
+5. **Source quality** — Is the source credible for this claim?
+6. **Specificity** — Could someone disagree with this claim? If it's too vague to be wrong, flag it.
 
 {style_guide}
@@ -258,7 +267,11 @@ async def triage_pr(diff: str) -> str:
 
 
 async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
-    """Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
+    """Run domain review via OpenRouter GPT-4o.
+
+    Decoupled from Claude Max to avoid account-level rate limits blocking
+    domain reviews. Different model lineage also reduces correlated blind spots.
+    """
     prompt = DOMAIN_PROMPT.format(
         agent=agent,
         agent_upper=agent.upper(),
@@ -268,32 +281,30 @@ async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> s
         files=files,
     )
 
-    # Try Claude Max Sonnet first
-    result = await claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-
-    if result == "RATE_LIMITED":
-        # Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
-        policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
-        if policy == "overflow":
-            logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
-            result = await openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-        else:
-            logger.info("Claude Max rate limited, queuing domain review")
-            return None
-
+    result = await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
     return result
 
 
 async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
-    """Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
+    """Run Leo review. DEEP → Opus (Claude Max, queue if limited). STANDARD → Sonnet (OpenRouter).
+
+    Opus is scarce — reserved for DEEP eval and overnight research sessions.
+    STANDARD goes straight to Sonnet on OpenRouter. Domain review is the primary
+    gate; Leo review is a quality check that doesn't need Opus for routine claims.
+    """
     prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
     prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
 
-    result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-
-    if result == "RATE_LIMITED":
-        # Leo review queues — don't waste Opus calls (never overflow)
-        logger.info("Claude Max Opus rate limited, queuing Leo review")
-        return None
-
-    return result
+    if tier == "DEEP":
+        # DEEP: Opus only, queue if rate limited. Opus is scarce — reserve for high-stakes.
+        result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
+        if result == "RATE_LIMITED":
+            logger.info("Claude Max Opus rate limited, queuing DEEP Leo review")
+            return None
+        return result
+    else:
+        # STANDARD/LIGHT: Sonnet via OpenRouter. Different model family from
+        # domain review (GPT-4o) = no correlated blind spots. Keeps Claude Max
+        # rate limit untouched for Opus DEEP + overnight research.
+        result = await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
+        return result
diff --git a/lib/merge.py b/lib/merge.py
index 40f4f97..3c85cd0 100644
--- a/lib/merge.py
+++ b/lib/merge.py
@@ -243,6 +243,16 @@ async def _rebase_and_push(branch: str) -> tuple[bool, str]:
 
 async def _merge_pr(pr_number: int) -> tuple[bool, str]:
     """Merge PR via Forgejo API. Preserves PR metadata and reviewer attribution."""
+    # Check if already merged/closed on Forgejo (prevents 405 on re-merge attempts)
+    pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
+    if pr_info:
+        if pr_info.get("merged"):
+            logger.info("PR #%d already merged on Forgejo, syncing status", pr_number)
+            return True, "already merged"
+        if pr_info.get("state") == "closed":
+            logger.warning("PR #%d closed on Forgejo but not merged", pr_number)
+            return False, "PR closed without merge"
+
     result = await forgejo_api(
         "POST",
         repo_path(f"pulls/{pr_number}/merge"),
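
The merge pre-check in the lib/merge.py hunk above reduces to a small guard that
runs before the POST to Forgejo's merge endpoint. Below is a minimal standalone
sketch of that pattern, assuming the pull response is a dict carrying "merged"
and "state" fields; forgejo_api and repo_path here are hypothetical stubs
standing in for this repo's real helpers, not the actual implementations.

import asyncio
import logging

logger = logging.getLogger(__name__)


async def forgejo_api(method: str, path: str) -> dict | None:
    # Hypothetical stub: the real helper makes an authenticated HTTP request.
    return {"merged": True, "state": "closed"}


def repo_path(suffix: str) -> str:
    # Hypothetical stub: the real helper prefixes the owner/repo API route.
    return f"repos/owner/repo/{suffix}"


async def merge_precheck(pr_number: int) -> tuple[bool, str] | None:
    """Return a final (ok, reason) if a merge POST would 405, else None to proceed."""
    pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
    if not pr_info:
        return None  # lookup failed; let the merge POST surface its own error
    if pr_info.get("merged"):
        return True, "already merged"  # re-merge attempt: treat as success, sync status
    if pr_info.get("state") == "closed":
        return False, "PR closed without merge"  # closed unmerged: report failure
    return None  # still open: safe to POST the merge


if __name__ == "__main__":
    # With the canned stub response, the guard short-circuits before any merge call.
    print(asyncio.run(merge_precheck(42)))  # (True, 'already merged')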