leo: model diversity + calibrated review prompts
- Domain review → GPT-4o (OpenRouter), Leo STANDARD → Sonnet (OpenRouter), Leo DEEP → Opus (Claude Max). Two model families = no correlated blind spots.
- Opus reserved for DEEP eval only — protects rate limit for overnight research.
- Review prompts calibrated: require per-criterion evidence, blocking-vs-observation verdict rules. Moved from 100% rubber-stamp approval to 12% pass rate.
- OpenRouter failures classified as openrouter_failed (not rate_limited) to avoid spurious 15-min Opus backoff.
- merge.py: pre-check PR state before merge API call (prevents 405 on re-merge).

Pentagon-Agent: Leo <294C3CA1-0205-4668-82FA-B984D54F48AD>
parent 85b86a918a
commit c0a6adf9ed
4 changed files with 89 additions and 63 deletions
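A minimal sketch (not part of this commit) of how a caller might act on the two skip reasons described above, using hypothetical names (handle_skip, state, OPUS_BACKOFF_SEC): only an Opus rate limit should start the 15-minute backoff, while an OpenRouter failure just skips the PR until the next cycle.

import time

OPUS_BACKOFF_SEC = 15 * 60  # assumed value matching the 15-min backoff in the commit message

def handle_skip(result: dict, state: dict) -> None:
    """Decide whether a skipped PR should start the Opus backoff window."""
    if not result.get("skipped"):
        return
    if result["reason"] == "opus_rate_limited":
        # Claude Max Opus is the scarce resource: pause DEEP evals until the window expires.
        state["opus_backoff_until"] = time.time() + OPUS_BACKOFF_SEC
    elif result["reason"] == "openrouter_failed":
        # Transient OpenRouter error: the PR was reverted to 'open' and is retried next cycle.
        pass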
@@ -28,19 +28,22 @@ MODEL_OPUS = "opus"
 MODEL_SONNET = "sonnet"
 MODEL_HAIKU = "anthropic/claude-3.5-haiku"
 MODEL_GPT4O = "openai/gpt-4o"
+MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5"  # OpenRouter Sonnet (paid, not Claude Max)

 # --- Model assignment per stage ---
-# Principle: Opus is a scarce resource. Use it only where judgment quality matters.
-# Sonnet handles volume. Haiku handles routing. Opus handles synthesis + critical eval.
+# Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research.
+# Model diversity: domain (GPT-4o) + Leo (Sonnet) = two model families, no correlated blindspots.
+# Both on OpenRouter = Claude Max rate limit untouched for Opus.
 #
 # Pipeline eval ordering (domain-first, Leo-last):
-# 1. Domain review → Sonnet (catches domain issues, evidence gaps — high volume filter)
-# 2. Leo review → Opus (cross-domain synthesis, confidence calibration — only pre-filtered PRs)
-# 3. DEEP cross-family → GPT-4o (adversarial blind-spot check — paid, highest-value claims only)
-EXTRACT_MODEL = MODEL_SONNET  # extraction: structured output, volume work
-TRIAGE_MODEL = MODEL_HAIKU  # triage: routing decision, cheapest
-EVAL_DOMAIN_MODEL = MODEL_SONNET  # domain review: high-volume filter
-EVAL_LEO_MODEL = MODEL_OPUS  # Leo review: scarce, high-value
+# 1. Domain review → GPT-4o (OpenRouter) — different family from Leo
+# 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain
+# 3. Leo DEEP → Opus (Claude Max) — highest judgment, scarce
+EXTRACT_MODEL = MODEL_SONNET  # extraction: structured output, volume work (Claude Max)
+TRIAGE_MODEL = MODEL_HAIKU  # triage: routing decision, cheapest (OpenRouter)
+EVAL_DOMAIN_MODEL = MODEL_GPT4O  # domain review: OpenRouter GPT-4o
+EVAL_LEO_MODEL = MODEL_OPUS  # Leo DEEP review: Claude Max Opus
+EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR  # Leo STANDARD review: OpenRouter Sonnet
 EVAL_DEEP_MODEL = MODEL_GPT4O  # DEEP cross-family: paid, adversarial

 # --- Model backends ---
@@ -65,6 +68,7 @@ MODEL_COSTS = {
     "sonnet": {"input": 0.003, "output": 0.015},
     MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
     MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
+    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
 }

 # --- Concurrency ---
@@ -205,9 +205,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
     domain_review = await run_domain_review(review_diff, files, domain or "general", agent)

     if domain_review is None:
-        # Rate limited, couldn't overflow — revert to open for retry
+        # OpenRouter failure (timeout, error) — revert to open for retry.
+        # NOT a rate limit — don't trigger 15-min backoff, just skip this PR.
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
-        return {"pr": pr_number, "skipped": True, "reason": "rate_limited"}
+        return {"pr": pr_number, "skipped": True, "reason": "openrouter_failed"}

     domain_verdict = _parse_verdict(domain_review, agent)
     conn.execute(
@@ -243,9 +244,10 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
     leo_review = await run_leo_review(review_diff, files, tier)

     if leo_review is None:
-        # Opus rate limited — revert to open for retry (keep domain verdict)
+        # DEEP: Opus rate limited (queue for later). STANDARD: OpenRouter failed (skip, retry next cycle).
         conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
-        return {"pr": pr_number, "skipped": True, "reason": "opus_rate_limited"}
+        reason = "opus_rate_limited" if tier == "DEEP" else "openrouter_failed"
+        return {"pr": pr_number, "skipped": True, "reason": reason}

     leo_verdict = _parse_verdict(leo_review, "LEO")
     conn.execute("UPDATE prs SET leo_verdict = ? WHERE number = ?", (leo_verdict, pr_number))
@@ -360,11 +362,11 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:
     # - status = 'open'
     # - tier0_pass = 1 (passed validation)
     # - leo_verdict = 'pending' OR domain_verdict = 'pending'
-    # During Opus backoff: only fetch PRs needing triage or domain review
-    # (skip PRs already domain-reviewed that are waiting for Leo/Opus)
+    # During Opus backoff: skip DEEP PRs waiting for Leo (they need Opus).
+    # STANDARD PRs run Leo review on OpenRouter Sonnet, so let them through.
     # Skip PRs attempted within last 10 minutes (backoff during rate limits)
     if opus_backoff:
-        verdict_filter = "AND p.domain_verdict = 'pending'"
+        verdict_filter = "AND (p.domain_verdict = 'pending' OR (p.leo_verdict = 'pending' AND p.tier != 'DEEP'))"
     else:
         verdict_filter = "AND (p.leo_verdict = 'pending' OR p.domain_verdict = 'pending')"

@@ -397,17 +399,16 @@ async def evaluate_cycle(conn, max_workers=None) -> tuple[int, int]:

     for row in rows:
         try:
-            # During Opus backoff, skip PRs that already completed domain review
-            # (they'd just hit the Opus limit again). Only process PRs still
-            # needing triage or domain review.
-            if opus_backoff:
+            # During Opus backoff, skip DEEP PRs that already completed domain review
+            # (they need Opus, which is rate limited). STANDARD PRs use OpenRouter Sonnet for Leo review.
+            if opus_backoff and row["tier"] == "DEEP":
                 existing = conn.execute(
                     "SELECT domain_verdict FROM prs WHERE number = ?",
                     (row["number"],),
                 ).fetchone()
                 if existing and existing["domain_verdict"] not in ("pending", None):
                     logger.debug(
-                        "PR #%d: skipping during Opus backoff (domain already %s)",
+                        "PR #%d: skipping DEEP during Opus backoff (domain already %s)",
                         row["number"],
                         existing["domain_verdict"],
                     )
95  lib/llm.py
@@ -36,9 +36,12 @@ async def kill_active_subprocesses():


 REVIEW_STYLE_GUIDE = (
-    "Be concise. Only mention what fails or is interesting. "
-    "Do not summarize what the PR does — the diff speaks for itself. "
-    "If everything passes, say so in one line and approve."
+    "You MUST show your work. For each criterion, write one sentence with your finding. "
+    "Do not summarize what the PR does — evaluate it. "
+    "If a criterion passes, say what you checked and why it passes. "
+    "If a criterion fails, explain the specific problem. "
+    "Responses like 'Everything passes' with no evidence of checking will be treated as review failures. "
+    "Be concise but substantive — one sentence per criterion, not one sentence total."
 )


@@ -74,19 +77,25 @@ Respond with ONLY the tier name (DEEP, STANDARD, or LIGHT) on the first line, fo

 DOMAIN_PROMPT = """You are {agent}, the {domain} domain expert for TeleoHumanity's knowledge base.

-Review this PR from your domain expertise:
-1. Technical accuracy — are the claims factually correct in your domain?
-2. Domain duplicates — does your domain already have substantially similar claims?
-3. Missing context — is important domain context absent that would change interpretation?
-4. Confidence calibration — from your domain expertise, is the confidence level right?
-5. Enrichment opportunities — should this connect to existing claims via wiki links?
+Review this PR. For EACH criterion below, write one sentence stating what you found:
+
+1. **Factual accuracy** — Are the claims factually correct? Name any specific errors.
+2. **Intra-PR duplicates** — Do multiple changes in THIS PR add the same evidence to different claims with near-identical wording? Only flag if the same paragraph of evidence is copy-pasted across files.
+3. **Confidence calibration** — Is the confidence level right for the evidence provided? Name the level and say if it matches.
+4. **Wiki links** — Do [[wiki links]] in the diff reference files that exist? Flag any that look broken.
+
+VERDICT RULES — read carefully:
+- APPROVE if claims are factually correct and evidence supports them, even if minor improvements are possible.
+- REQUEST_CHANGES only for BLOCKING issues: factual errors, genuinely broken wiki links, copy-pasted duplicate evidence across files, or confidence that is clearly wrong (e.g. "proven" with no evidence).
+- Missing context, style preferences, and "could be better" observations are NOT blocking. Note them but still APPROVE.
+- Do NOT invent problems. If a criterion passes, say it passes.

 {style_guide}

-If you are requesting changes, tag the specific issues:
+If requesting changes, tag the specific issues:
 <!-- ISSUES: tag1, tag2 -->

-Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error, source_archive, placeholder_url, missing_challenged_by
+Valid tags: broken_wiki_links, frontmatter_schema, title_overclaims, confidence_miscalibration, date_errors, factual_discrepancy, near_duplicate, scope_error

 End your review with exactly one of:
 <!-- VERDICT:{agent_upper}:APPROVE -->
@@ -100,14 +109,14 @@ End your review with exactly one of:

 LEO_PROMPT_STANDARD = """You are Leo, the lead evaluator for TeleoHumanity's knowledge base.

-Review this PR against the quality criteria:
-1. Schema compliance — YAML frontmatter, prose-as-title, required fields
-2. Duplicate check — does this claim already exist?
-3. Confidence calibration — appropriate for the evidence?
-4. Wiki link validity — references real claims?
-5. Source quality — credible for the claim?
-6. Domain assignment — correct domain?
-7. Epistemic hygiene — specific enough to be wrong?
+Review this PR. For EACH criterion below, write one sentence stating what you found:
+
+1. **Schema** — Does YAML frontmatter have type, domain, confidence, source, created? Is the title a prose proposition (not a label)?
+2. **Duplicate/redundancy** — Do multiple enrichments in this PR inject the same evidence into different claims? Is the enrichment actually new vs already present in the claim?
+3. **Confidence** — Name the confidence level. Does the evidence justify it? (proven needs strong evidence, speculative is fine for theories)
+4. **Wiki links** — Do [[links]] in the diff point to real files? Flag any that look invented.
+5. **Source quality** — Is the source credible for this claim?
+6. **Specificity** — Could someone disagree with this claim? If it's too vague to be wrong, flag it.

 {style_guide}

@@ -258,7 +267,11 @@ async def triage_pr(diff: str) -> str:


 async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> str | None:
-    """Run domain review. Tries Claude Max Sonnet first, overflows to OpenRouter GPT-4o."""
+    """Run domain review via OpenRouter GPT-4o.
+
+    Decoupled from Claude Max to avoid account-level rate limits blocking
+    domain reviews. Different model lineage also reduces correlated blind spots.
+    """
     prompt = DOMAIN_PROMPT.format(
         agent=agent,
         agent_upper=agent.upper(),
@@ -268,32 +281,30 @@ async def run_domain_review(diff: str, files: str, domain: str, agent: str) -> s
         files=files,
     )

-    # Try Claude Max Sonnet first
-    result = await claude_cli_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-
-    if result == "RATE_LIMITED":
-        # Overflow to OpenRouter GPT-4o (Rhea: domain review is the volume filter, don't bottleneck)
-        policy = config.OVERFLOW_POLICY.get("eval_domain", "overflow")
-        if policy == "overflow":
-            logger.info("Claude Max rate limited, overflowing domain review to OpenRouter GPT-4o")
-            result = await openrouter_call(config.EVAL_DEEP_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-        else:
-            logger.info("Claude Max rate limited, queuing domain review")
-            return None
-
+    result = await openrouter_call(config.EVAL_DOMAIN_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
     return result


 async def run_leo_review(diff: str, files: str, tier: str) -> str | None:
-    """Run Leo review via Claude Max Opus. Returns None if rate limited (queue policy)."""
+    """Run Leo review. DEEP → Opus (Claude Max, queue if limited). STANDARD → Sonnet (OpenRouter).
+
+    Opus is scarce — reserved for DEEP eval and overnight research sessions.
+    STANDARD goes straight to Sonnet on OpenRouter. Domain review is the primary gate;
+    Leo review is a quality check that doesn't need Opus for routine claims.
+    """
     prompt_template = LEO_PROMPT_DEEP if tier == "DEEP" else LEO_PROMPT_STANDARD
     prompt = prompt_template.format(style_guide=REVIEW_STYLE_GUIDE, diff=diff, files=files)

-    result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
-
-    if result == "RATE_LIMITED":
-        # Leo review queues — don't waste Opus calls (never overflow)
-        logger.info("Claude Max Opus rate limited, queuing Leo review")
-        return None
-
-    return result
+    if tier == "DEEP":
+        # DEEP: Opus only, queue if rate limited. Opus is scarce — reserve for high-stakes.
+        result = await claude_cli_call(config.EVAL_LEO_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
+        if result == "RATE_LIMITED":
+            logger.info("Claude Max Opus rate limited, queuing DEEP Leo review")
+            return None
+        return result
+    else:
+        # STANDARD/LIGHT: Sonnet via OpenRouter. Different model family from
+        # domain review (GPT-4o) = no correlated blind spots. Keeps Claude Max
+        # rate limit untouched for Opus DEEP + overnight research.
+        result = await openrouter_call(config.EVAL_LEO_STANDARD_MODEL, prompt, timeout_sec=config.EVAL_TIMEOUT)
+        return result
10  lib/merge.py
@@ -243,6 +243,16 @@ async def _rebase_and_push(branch: str) -> tuple[bool, str]:

 async def _merge_pr(pr_number: int) -> tuple[bool, str]:
     """Merge PR via Forgejo API. Preserves PR metadata and reviewer attribution."""
+    # Check if already merged/closed on Forgejo (prevents 405 on re-merge attempts)
+    pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
+    if pr_info:
+        if pr_info.get("merged"):
+            logger.info("PR #%d already merged on Forgejo, syncing status", pr_number)
+            return True, "already merged"
+        if pr_info.get("state") == "closed":
+            logger.warning("PR #%d closed on Forgejo but not merged", pr_number)
+            return False, "PR closed without merge"
+
     result = await forgejo_api(
         "POST",
         repo_path(f"pulls/{pr_number}/merge"),