leo: model diversity + calibrated review prompts

- Domain review → GPT-4o (OpenRouter), Leo STANDARD → Sonnet (OpenRouter),
  Leo DEEP → Opus (Claude Max). Two model families = no correlated blind spots.
- Opus reserved for DEEP eval only — protects rate limit for overnight research.
- Review prompts calibrated: require per-criterion evidence and explicit
  blocking-vs-observation verdict rules. Approval moved from 100% rubber-stamping
  to a 12% pass rate.
- OpenRouter failures classified as openrouter_failed (not rate_limited) so they
  never trigger a spurious 15-min Opus backoff (see the sketch after this list).
- merge.py: pre-check PR state before merge API call (prevents 405 on re-merge).
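
For context, a minimal sketch of the 15-minute Opus backoff gate these bullets
refer to (the gate itself is outside this commit; the helper names are hypothetical):

    import time

    OPUS_BACKOFF_SEC = 15 * 60
    _opus_backoff_until = 0.0

    def note_opus_rate_limit() -> None:
        # Called only when Claude Max Opus actually reports a rate limit.
        global _opus_backoff_until
        _opus_backoff_until = time.monotonic() + OPUS_BACKOFF_SEC

    def opus_in_backoff() -> bool:
        # OpenRouter failures are classified openrouter_failed and never call
        # note_opus_rate_limit(), so they cannot trip this gate.
        return time.monotonic() < _opus_backoff_until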

Pentagon-Agent: Leo <294C3CA1-0205-4668-82FA-B984D54F48AD>
m3taversal 2026-03-13 17:10:30 +00:00
parent 85b86a918a
commit c0a6adf9ed
4 changed files with 89 additions and 63 deletions


@@ -28,19 +28,22 @@ MODEL_OPUS = "opus"
MODEL_SONNET = "sonnet"
MODEL_HAIKU = "anthropic/claude-3.5-haiku"
MODEL_GPT4O = "openai/gpt-4o"
MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5" # OpenRouter Sonnet (paid, not Claude Max)
# --- Model assignment per stage ---
# Principle: Opus is a scarce resource. Use it only where judgment quality matters.
# Sonnet handles volume. Haiku handles routing. Opus handles synthesis + critical eval.
# Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research.
# Model diversity: domain (GPT-4o) + Leo (Sonnet) = two model families, no correlated blind spots.
# Both on OpenRouter = Claude Max rate limit untouched for Opus.
#
# Pipeline eval ordering (domain-first, Leo-last):
# 1. Domain review → Sonnet (catches domain issues, evidence gaps — high volume filter)
# 2. Leo review → Opus (cross-domain synthesis, confidence calibration — only pre-filtered PRs)
# 3. DEEP cross-family → GPT-4o (adversarial blind-spot check — paid, highest-value claims only)
EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work
TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest
EVAL_DOMAIN_MODEL = MODEL_SONNET # domain review: high-volume filter
EVAL_LEO_MODEL = MODEL_OPUS # Leo review: scarce, high-value
# 1. Domain review → GPT-4o (OpenRouter) — different family from Leo
# 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain
# 3. Leo DEEP → Opus (Claude Max) — highest judgment, scarce
EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work (Claude Max)
TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest (OpenRouter)
EVAL_DOMAIN_MODEL = MODEL_GPT4O # domain review: OpenRouter GPT-4o
EVAL_LEO_MODEL = MODEL_OPUS # Leo DEEP review: Claude Max Opus
EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR # Leo STANDARD review: OpenRouter Sonnet
EVAL_DEEP_MODEL = MODEL_GPT4O # DEEP cross-family: paid, adversarial
# --- Model backends ---
@@ -65,6 +68,7 @@ MODEL_COSTS = {
"sonnet": {"input": 0.003, "output": 0.015},
MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
}
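# Hypothetical helper (not in this commit) showing how these rates would price
# a single call, assuming the values are USD per 1K tokens:
def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
    rates = MODEL_COSTS[model]
    return (input_tokens / 1000) * rates["input"] + (output_tokens / 1000) * rates["output"]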
# --- Concurrency ---


@@ -205,9 +205,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
domain_review = await run_domain_review(review_diff, files, domain or "general", agent)
if domain_review is None:
# Rate limited, couldn't overflow — revert to open for retry
# OpenRouter failure (timeout, error) — revert to open for retry.
# NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
return {"pr": pr_number, "skipped": True, "reason": "rate_limited"}
return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}
domain_verdict = _parse_verdict(domain_review, agent)
conn.execute(
@@ -243,9 +244,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
leo_review = await run_leo_review(review_diff, files, tier)
if leo_review is None:
# Opus rate limited — revert to open for retry (keep domain verdict)
# DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
return {"pr": pr_number, "skipped": True, "reason": "opus_rate_limited"}
reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
return {"pr": pr_number, "skipped": True, "reason": reason}
leo_verdict = _parse_verdict(leo_review, "LEO")
conn.execute("UPDATE prs SET leo_verdict = ? WHERE number = ?", (leo_verdict, pr_number))
@@ -360,11 +362,11 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
# - status = 'open'
# - tier0_pass = 1 (passed validation)
# - leo_verdict = 'pending' OR domain_verdict = 'pending'
# During Opus backoff: only fetch PRs needing triage or domain review
# (skip PRs already domain-reviewed that are waiting for Leo/Opus)
# During Opus backoff: skip DEEP PRs waiting for Leo (they need Opus).
# STANDARD PRs run Leo review on OpenRouter Sonnet, so let them through.
# Skip PRs attempted within last 10 minutes (backoff during rate limits)
if opus_backoff:
verdict_filter = "AND p.domain_verdict = 'pending'"
verdict_filter = "AND (p.domain_verdict = 'pending' OR (p.leo_verdict = 'pending' AND p.tier != 'DEEP'))"
else:
verdict_filter = "AND (p.leo_verdict = 'pending' OR p.domain_verdict = 'pending')"
@@ -397,17 +399,16 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
for row in rows:
try:
# During Opus backoff, skip PRs that already completed domain review
# (they'd just hit the Opus limit again). Only process PRs still
# needing triage or domain review.
if opus_backoff:
# During Opus backoff, skip DEEP PRs that already completed domain review
# (they need Opus, which is rate limited). STANDARD PRs run on OpenRouter Sonnet.
if opus_backoff and row["tier"] == "DEEP":
existing = conn.execute(
"SELECT domain_verdict FROM prs WHERE number = ?",
(row["number"],),
).fetchone()
if existing and existing["domain_verdict"] not in ("pending", None):
logger.debug(
"PR #%d: skipping during Opus backoff (domain already %s)",
"PR #%d: skipping DEEP during Opus backoff (domain already %s)",
row["number"],
existing["domain_verdict"],
)


@@ -36,9 +36,12 @@ async def kill_active_subprocesses():
REVIEW_STYLE_GUIDE = (
"Be concise. Only mention what fails or is interesting. "
"Do not summarize what the PR does — the diff speaks for itself. "
"If everything passes, say so in one line and approve."
"You MUST show your work. For each criterion, write one sentence with your finding. "
"Do not summarize what the PR does — evaluate it. "
"If a criterion passes, say what you checked and why it passes. "
"If a criterion fails, explain the specific problem. "
"Responses like 'Everything passes' with no evidence of checking will be treated as review failures. "
"Be concise but substantive — one sentence per criterion, not one sentence total."
)
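To illustrate the shape the guide now demands, a compliant review might read
(hypothetical content):

    1. Schema: frontmatter has type, domain, confidence, source, created; title is a prose proposition. Pass.
    2. Duplicates: the evidence paragraph appears once in this PR; nothing copy-pasted. Pass.
    3. Confidence: 'plausible', backed by one secondary source; level matches. Pass.
    4. Wiki links: both [[links]] resolve to existing files. Pass.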
@@ -74,19 +77,25 @@ Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, fo
DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.
Review this PR from your domain expertise:
1. Technical accuracy: are the claims factually correct in your domain?
2. Domain duplicates: does your domain already have substantially similar claims?
3. Missing context: is important domain context absent that would change interpretation?
4. Confidence calibration: from your domain expertise, is the confidence level right?
5. Enrichment opportunities: should this connect to existing claims via wiki links?
Review this PR. For EACH criterion below, write one sentence stating what you found:
1. **Factual accuracy**: Are the claims factually correct? Name any specific errors.
2. **Intra-PR duplicates**: Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files.
3. **Confidence calibration**: Is the confidence level right for the evidence provided? Name the level and say if it matches.
4. **Wiki links**: Do [[wiki links]] in the diff reference files that exist? Flag any that look broken.
VERDICT RULES, read carefully:
- APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible.
- REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong (e.g. "proven" with no evidence).
- Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE.
- Do NOT invent problems. If a criterion passes, say it passes.
{style_guide}
If you are requesting changes, tag the specific issues:
If requesting changes, tag the specific issues:
<!-- ISSUES: tag1, tag2 -->
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error
End your review with exactly one of:
<!-- VERDICT:{agent_upper}:APPROVE -->
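A minimal sketch of how _parse_verdict (referenced in evaluate_pr but not shown
in this commit) might read these markers; the signature and 'pending' fallback
are assumptions:

    import re

    def _parse_verdict(review: str, agent: str) -> str:
        m = re.search(
            rf"<!-- VERDICT:{re.escape(agent.upper())}:(APPROVE|REQUEST_CHANGES) -->",
            review,
        )
        return m.group(1).lower() if m else "pending"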
@@ -100,14 +109,14 @@ End your review with exactly one of:
LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.
Review this PR against the quality criteria:
1. Schema compliance: YAML frontmatter, prose-as-title, required fields
2. Duplicate check: does this claim already exist?
3. Confidence calibration: appropriate for the evidence?
4. Wiki link validity: references real claims?
5. Source quality: credible for the claim?
6. Domain assignment: correct domain?
7. Epistemic hygiene: specific enough to be wrong?
Review this PR. For EACH criterion below, write one sentence stating what you found:
1. **Schema**: Does YAML frontmatter have type, domain, confidence, source, created? Is the title a prose proposition (not a label)?
2. **Duplicate/redundancy**: Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim?
3. **Confidence**: Name the confidence level. Does the evidence justify it? (proven needs strong evidence, speculative is fine for theories)
4. **Wiki links**: Do [[links]] in the diff point to real files? Flag any that look invented.
5. **Source quality**: Is the source credible for this claim?
6. **Specificity**: Could someone disagree with this claim? If it's too vague to be wrong, flag it.
{style_guide}
@@ -258,7 +267,11 @@ async def triage_pr(diff: str) -> str:
async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
"""Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
"""Run domain review via OpenRouter GPT-4o.
Decoupled from Claude Max to avoid account-level rate limits blocking
domain reviews. Different model lineage also reduces correlated blind spots.
"""
prompt = DOMAIN_PROMPT.format(
agent=agent,
agent_upper=agent.upper(),
@@ -268,32 +281,30 @@ async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> s
files=files,
)
# Try Claude Max Sonnet first
result = await claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
# Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
if policy == "overflow":
logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
result = await openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
else:
logger.info("Claude Max rate limited, queuing domain review")
return None
result = await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
return result
async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
"""Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
"""Run Leo review. DEEP → Opus (Claude Max, queue if limited). STANDARD → GPT-4o (OpenRouter).
Opus is scarce reserved for DEEP eval and overnight research sessions.
STANDARD goes straight to GPT-4o. Domain review is the primary gate;
Leo review is a quality check that doesn't need Opus for routine claims.
"""
prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)
result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
# Leo review queues — don't waste Opus calls (never overflow)
logger.info("Claude Max Opus rate limited, queuing Leo review")
return None
return result
if tier == "DEEP":
# DEEP: Opus only, queue if rate limited. Opus is scarce — reserve for high-stakes.
result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
if result == "RATE_LIMITED":
logger.info("Claude Max Opus rate limited, queuing DEEP Leo review")
return None
return result
else:
# STANDARD/LIGHT: Sonnet via OpenRouter. Different model family from
# domain review (GPT-4o) = no correlated blind spots. Keeps Claude Max
# rate limit untouched for Opus DEEP + overnight research.
result = await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
return result
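
For reference, an openrouter_call consistent with the failure semantics above
(None on timeout or HTTP error, never the RATE_LIMITED sentinel reserved for
Claude Max) might look like this sketch; the env var name, default timeout, and
error handling are assumptions:

    import asyncio
    import os

    import aiohttp

    async def openrouter_call(model: str, prompt: str, timeout_sec: int = 120) -> str | None:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
                    json={"model": model, "messages": [{"role": "user", "content": prompt}]},
                    timeout=aiohttp.ClientTimeout(total=timeout_sec),
                ) as resp:
                    if resp.status != 200:
                        return None  # HTTP error → openrouter_failed upstream
                    data = await resp.json()
                    return data["choices"][0]["message"]["content"]
        except (asyncio.TimeoutError, aiohttp.ClientError):
            return None  # timeout/transport error, not a rate-limit backoff trigger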


@@ -243,6 +243,16 @@ async def _rebase_and_push(branch: str) -> tuple[bool, str]:
async def _merge_pr(pr_number: int) -> tuple[bool, str]:
"""Merge PR via Forgejo API. Preserves PR metadata and reviewer attribution."""
# Check if already merged/closed on Forgejo (prevents 405 on re-merge attempts)
pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
if pr_info:
if pr_info.get("merged"):
logger.info("PR #%d already merged on Forgejo, syncing status", pr_number)
return True, "already merged"
if pr_info.get("state") == "closed":
logger.warning("PR #%d closed on Forgejo but not merged", pr_number)
return False, "PR closed without merge"
result = await forgejo_api(
"POST",
repo_path(f"pulls/{pr_number}/merge"),