From c0a6adf9ede95971286583efbab1d24f3cce1fcc Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Fri, 13 Mar 2026 17:10:30 +0000
Subject: [PATCH] leo: model diversity + calibrated review prompts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Domain review → GPT-4o (OpenRouter), Leo STANDARD → Sonnet (OpenRouter),
  Leo DEEP → Opus (Claude Max). Two model families = no correlated blind spots.
- Opus reserved for DEEP eval only — protects rate limit for overnight research.
- Review prompts calibrated: require per-criterion evidence, blocking-vs-observation
  verdict rules. Moved from 100% rubber-stamp approval to a 12% pass rate.
- OpenRouter failures classified as openrouter_failed (not rate_limited) to avoid
  a spurious 15-min Opus backoff.
- merge.py: pre-check PR state before the merge API call (prevents 405 on re-merge).

Pentagon-Agent: Leo <294C3CA1-0205-4668-82FA-B984D54F48AD>
---
 lib/config.py   | 22 +++++++-----
 lib/evaluate.py | 25 ++++++-------
 lib/llm.py      | 95 +++++++++++++++++++++++++++----------------------
 lib/merge.py    | 10 ++++++
 4 files changed, 89 insertions(+), 63 deletions(-)

diff --git a/lib/config.py b/lib/config.py
index c24d65c..0e8e2d2 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -28,19 +28,22 @@ MODEL_OPUS = "opus"
 MODEL_SONNET = "sonnet"
 MODEL_HAIKU = "anthropic/claude-3.5-haiku"
 MODEL_GPT4O = "openai/gpt-4o"
+MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5" # OpenRouter Sonnet (paid, not Claude Max)
 
 # --- Model assignment per stage ---
-# Principle: Opus is a scarce resource. Use it only where judgment quality matters.
-# Sonnet handles volume. Haiku handles routing. Opus handles synthesis + critical eval.
+# Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research.
+# Model diversity: domain (GPT-4o) + Leo (Sonnet) = two model families, no correlated blind spots.
+# Both on OpenRouter = Claude Max rate limit untouched for Opus.
 #
 # Pipeline eval ordering (domain-first, Leo-last):
-# 1. Domain review → Sonnet (catches domain issues, evidence gaps — high volume filter)
-# 2. Leo review → Opus (cross-domain synthesis, confidence calibration — only pre-filtered PRs)
-# 3. DEEP cross-family → GPT-4o (adversarial blind-spot check — paid, highest-value claims only)
-EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work
-TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest
-EVAL_DOMAIN_MODEL = MODEL_SONNET # domain review: high-volume filter
-EVAL_LEO_MODEL = MODEL_OPUS # Leo review: scarce, high-value
+# 1. Domain review → GPT-4o (OpenRouter) — different family from Leo
+# 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain
+# 3. Leo DEEP → Opus (Claude Max) — highest judgment, scarce
+EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work (Claude Max)
+TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest (OpenRouter)
+EVAL_DOMAIN_MODEL = MODEL_GPT4O # domain review: OpenRouter GPT-4o
+EVAL_LEO_MODEL = MODEL_OPUS # Leo DEEP review: Claude Max Opus
+EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR # Leo STANDARD review: OpenRouter Sonnet
 EVAL_DEEP_MODEL = MODEL_GPT4O # DEEP cross-family: paid, adversarial
 
 # --- Model backends ---
@@ -65,6 +68,7 @@ MODEL_COSTS = {
     "sonnet": {"input": 0.003, "output": 0.015},
     MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
     MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
+    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
 }
 
 # --- Concurrency ---
diff --git a/lib/evaluate.py b/lib/evaluate.py
index be855d0..2957efd 100644
--- a/lib/evaluate.py
+++ b/lib/evaluate.py
@@ -205,9 +205,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
 
     domain_review = await run_domain_review(review_diff, files, domain or "general", agent)
     if domain_review is None:
-        # Rate limited, couldn't overflow — revert to open for retry
+        # OpenRouter failure (timeout, error) — revert to open for retry.
+        # NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
-        return {"pr": pr_number, "skipped": True, "reason": "rate_limited"}
+        return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}
 
     domain_verdict = _parse_verdict(domain_review, agent)
     conn.execute(
@@ -243,9 +244,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
 
     leo_review = await run_leo_review(review_diff, files, tier)
     if leo_review is None:
-        # Opus rate limited — revert to open for retry (keep domain verdict)
+        # DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
-        return {"pr": pr_number, "skipped": True, "reason": "opus_rate_limited"}
+        reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
+        return {"pr": pr_number, "skipped": True, "reason": reason}
 
     leo_verdict = _parse_verdict(leo_review, "LEO")
     conn.execute("UPDATE prs SET leo_verdict = ? WHERE number = ?", (leo_verdict, pr_number))
@@ -360,11 +362,11 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
     # - status = 'open'
     # - tier0_pass = 1 (passed validation)
     # - leo_verdict = 'pending' OR domain_verdict = 'pending'
-    # During Opus backoff: only fetch PRs needing triage or domain review
-    # (skip PRs already domain-reviewed that are waiting for Leo/Opus)
+    # During Opus backoff: skip DEEP PRs waiting for Leo (they need Opus).
+    # STANDARD PRs run Leo review on OpenRouter Sonnet, so let them through.
     # Skip PRs attempted within last 10 minutes (backoff during rate limits)
     if opus_backoff:
-        verdict_filter = "AND p.domain_verdict = 'pending'"
+        verdict_filter = "AND (p.domain_verdict = 'pending' OR (p.leo_verdict = 'pending' AND p.tier != 'DEEP'))"
     else:
         verdict_filter = "AND (p.leo_verdict = 'pending' OR p.domain_verdict = 'pending')"
@@ -397,17 +399,16 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
     for row in rows:
         try:
-            # During Opus backoff, skip PRs that already completed domain review
-            # (they'd just hit the Opus limit again). Only process PRs still
-            # needing triage or domain review.
-            if opus_backoff:
+            # During Opus backoff, skip DEEP PRs that already completed domain review
+            # (they need Opus, which is rate limited). STANDARD PRs proceed on OpenRouter Sonnet.
+            if opus_backoff and row["tier"] == "DEEP":
                 existing = conn.execute(
                     "SELECT domain_verdict FROM prs WHERE number = ?",
                     (row["number"],),
                 ).fetchone()
                 if existing and existing["domain_verdict"] not in ("pending", None):
                     logger.debug(
-                        "PR #%d: skipping during Opus backoff (domain already %s)",
+                        "PR #%d: skipping DEEP during Opus backoff (domain already %s)",
                         row["number"],
                         existing["domain_verdict"],
                     )
diff --git a/lib/llm.py b/lib/llm.py
index b7079e3..12d16f0 100644
--- a/lib/llm.py
+++ b/lib/llm.py
@@ -36,9 +36,12 @@ async def kill_active_subprocesses():
 
 
 REVIEW_STYLE_GUIDE = (
-    "Be concise. Only mention what fails or is interesting. "
-    "Do not summarize what the PR does — the diff speaks for itself. "
-    "If everything passes, say so in one line and approve."
+    "You MUST show your work. For each criterion, write one sentence with your finding. "
+    "Do not summarize what the PR does — evaluate it. "
+    "If a criterion passes, say what you checked and why it passes. "
+    "If a criterion fails, explain the specific problem. "
+    "Responses like 'Everything passes' with no evidence of checking will be treated as review failures. "
+    "Be concise but substantive — one sentence per criterion, not one sentence total."
 )
@@ -74,19 +77,25 @@ Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, fo
 
 DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
 
-Review this PR from your domain expertise:
-1. Technical accuracy — are the claims factually correct in your domain?
-2. Domain duplicates — does your domain already have substantially similar claims?
-3. Missing context — is important domain context absent that would change interpretation?
-4. Confidence calibration — from your domain expertise, is the confidence level right?
-5. Enrichment opportunities — should this connect to existing claims via wiki links?
+Review this PR. For EACH criterion below, write one sentence stating what you found:
+
+1. **Factual accuracy** — Are the claims factually correct? Name any specific errors.
+2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files.
+3. **Confidence calibration** — Is the confidence level right for the evidence provided? Name the level and say if it matches.
+4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? Flag any that look broken.
+
+VERDICT RULES — read carefully:
+- APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible.
+- REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong (e.g. "proven" with no evidence).
+- Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE.
+- Do NOT invent problems. If a criterion passes, say it passes.
 
 {style_guide}
 
-If you are requesting changes, tag the specific issues:
+If requesting changes, tag the specific issues:
 
-Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
+Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error
 
 End your review with exactly one of:
@@ -100,14 +109,14 @@ End your review with exactly one of:
 
 LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
 
-Review this PR against the quality criteria:
-1. Schema compliance — YAML frontmatter, prose-as-title, required fields
-2. Duplicate check — does this claim already exist?
-3. Confidence calibration — appropriate for the evidence?
-4. Wiki link validity — references real claims?
-5. Source quality — credible for the claim?
-6. Domain assignment — correct domain?
-7. Epistemic hygiene — specific enough to be wrong?
+Review this PR. For EACH criterion below, write one sentence stating what you found:
+
+1. **Schema** — Does YAML frontmatter have type, domain, confidence, source, created? Is the title a prose proposition (not a label)?
+2. **Duplicate/redundancy** — Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim?
+3. **Confidence** — Name the confidence level. Does the evidence justify it? (proven needs strong evidence, speculative is fine for theories)
+4. **Wiki links** — Do [[links]] in the diff point to real files? Flag any that look invented.
+5. **Source quality** — Is the source credible for this claim?
+6. **Specificity** — Could someone disagree with this claim? If it's too vague to be wrong, flag it.
 
 {style_guide}
@@ -258,7 +267,11 @@ async def triage_pr(diff: str) -> str:
 
 
 async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
-    """Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
+    """Run domain review via OpenRouter GPT-4o.
+
+    Decoupled from Claude Max to avoid account-level rate limits blocking
+    domain reviews. Different model lineage also reduces correlated blind spots.
+    """
     prompt = DOMAIN_PROMPT.format(
         agent=agent,
         agent_upper=agent.upper(),
@@ -268,32 +281,30 @@ async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> s
         files=files,
     )
 
-    # Try Claude Max Sonnet first
-    result = await claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-
-    if result == "RATE_LIMITED":
-        # Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
-        policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
-        if policy == "overflow":
-            logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
-            result = await openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-        else:
-            logger.info("Claude Max rate limited, queuing domain review")
-            return None
-
+    result = await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
     return result
 
 
 async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
-    """Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
+    """Run Leo review. DEEP → Opus (Claude Max, queue if limited). STANDARD → Sonnet (OpenRouter).
+
+    Opus is scarce — reserved for DEEP eval and overnight research sessions.
+    STANDARD goes straight to Sonnet on OpenRouter. Domain review is the primary
+    gate; Leo review is a quality check that doesn't need Opus for routine claims.
+    """
     prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
     prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
 
-    result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-
-    if result == "RATE_LIMITED":
-        # Leo review queues — don't waste Opus calls (never overflow)
-        logger.info("Claude Max Opus rate limited, queuing Leo review")
-        return None
-
-    return result
+    if tier == "DEEP":
+        # DEEP: Opus only, queue if rate limited. Opus is scarce — reserve for high-stakes.
+        result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
+        if result == "RATE_LIMITED":
+            logger.info("Claude Max Opus rate limited, queuing DEEP Leo review")
+            return None
+        return result
+    else:
+        # STANDARD/LIGHT: Sonnet via OpenRouter. Different model family from
+        # domain review (GPT-4o) = no correlated blind spots. Keeps Claude Max
+        # rate limit untouched for Opus DEEP + overnight research.
+        result = await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
+        return result
diff --git a/lib/merge.py b/lib/merge.py
index 40f4f97..3c85cd0 100644
--- a/lib/merge.py
+++ b/lib/merge.py
@@ -243,6 +243,16 @@ async def _rebase_and_push(branch: str) -> tuple[bool, str]:
 
 async def _merge_pr(pr_number: int) -> tuple[bool, str]:
     """Merge PR via Forgejo API. Preserves PR metadata and reviewer attribution."""
+    # Check if already merged/closed on Forgejo (prevents 405 on re-merge attempts)
+    pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
+    if pr_info:
+        if pr_info.get("merged"):
+            logger.info("PR #%d already merged on Forgejo, syncing status", pr_number)
+            return True, "already merged"
+        if pr_info.get("state") == "closed":
+            logger.warning("PR #%d closed on Forgejo but not merged", pr_number)
+            return False, "PR closed without merge"
+
     result = await forgejo_api(
         "POST",
         repo_path(f"pulls/{pr_number}/merge"),
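
The merge pre-check in the lib/merge.py hunk above reduces to a small guard that
runs before the POST to Forgejo's merge endpoint. Below is a minimal standalone
sketch of that pattern, assuming the pull response is a dict carrying "merged"
and "state" fields; forgejo_api and repo_path here are hypothetical stubs
standing in for this repo's real helpers, not the actual implementations.

import asyncio
import logging

logger = logging.getLogger(__name__)


async def forgejo_api(method: str, path: str) -> dict | None:
    # Hypothetical stub: the real helper makes an authenticated HTTP request.
    return {"merged": True, "state": "closed"}


def repo_path(suffix: str) -> str:
    # Hypothetical stub: the real helper prefixes the owner/repo API route.
    return f"repos/owner/repo/{suffix}"


async def merge_precheck(pr_number: int) -> tuple[bool, str] | None:
    """Return a final (ok, reason) if a merge POST would 405, else None to proceed."""
    pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
    if not pr_info:
        return None  # lookup failed; let the merge POST surface its own error
    if pr_info.get("merged"):
        return True, "already merged"  # re-merge attempt: treat as success, sync status
    if pr_info.get("state") == "closed":
        return False, "PR closed without merge"  # closed unmerged: report failure
    return None  # still open: safe to POST the merge


if __name__ == "__main__":
    # With the canned stub response, the guard short-circuits before any merge call.
    print(asyncio.run(merge_precheck(42)))  # (True, 'already merged')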