teleo-infrastructure/scripts/prove_phase1b_local.py
2026-05-29 15:08:09 +02:00

346 lines
12 KiB
Python

#!/usr/bin/env python3
"""No-network local proof for Phase 1b agent routing.
This script exercises the real evaluate cycle against an in-memory migrated DB
while replacing only external network/LLM edges with deterministic fakes.
"""
# ruff: noqa: E402,I001
from __future__ import annotations
import argparse
import asyncio
import json
import re
import sqlite3
import sys
from pathlib import Path
from typing import Any
# Resolve the repository root (the parent of this script's directory) and put
# it at the front of sys.path so the project-local `lib` package imported
# below resolves when the script is executed directly.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
from lib import config, db
from lib import evaluate as evaluate_mod
# One proof PR per single-domain route. For every case:
#   number          - PR number seeded into the `prs` table
#   domain          - label echoed into the proof's case results
#   branch          - branch name stored on the PR row
#   paths           - changed files used to build the fake diff
#   expected_agents - reviewers the proof expects to be recorded for the PR
SINGLE_DOMAIN_CASES = [
    {
        "number": 101,
        "domain": "grand-strategy",
        "branch": "leo/grand-strategy",
        "paths": ["domains/grand-strategy/strategy.md"],
        "expected_agents": ["Leo"],
    },
    {
        "number": 102,
        "domain": "ai-alignment",
        "branch": "theseus/alignment",
        "paths": ["domains/ai-alignment/systems.md"],
        "expected_agents": ["Theseus"],
    },
    {
        "number": 103,
        "domain": "internet-finance",
        "branch": "rio/x402",
        "paths": ["domains/internet-finance/x402.md"],
        "expected_agents": ["Rio"],
    },
    {
        "number": 104,
        "domain": "health",
        "branch": "vida/health",
        "paths": ["domains/health/clinical.md"],
        "expected_agents": ["Vida"],
    },
    {
        "number": 105,
        "domain": "entertainment",
        "branch": "clay/games",
        "paths": ["domains/entertainment/games.md"],
        "expected_agents": ["Clay"],
    },
    {
        "number": 106,
        "domain": "space-development",
        "branch": "astra/robotics",
        "paths": ["domains/space-development/robotics.md"],
        "expected_agents": ["Astra"],
    },
]
# A PR whose diff touches two domain directories; the proof expects both
# listed agents to be recorded as reviewers.
CROSS_DOMAIN_CASE = {
    "number": 107,
    "domain": "cross-ai-finance",
    "branch": "rio/ai-x402",
    "paths": [
        "domains/ai-systems/agent-wallets.md",
        "domains/internet-finance/x402.md",
    ],
    "expected_agents": ["Theseus", "Rio"],
}
# The rejection path: the fake reviewer returns REQUEST_CHANGES for Vida when
# the diff mentions `incorrect-health-claim.md`, which this case's path does.
FEEDBACK_CASE = {
    "number": 108,
    "domain": "health-feedback",
    "branch": "vida/reject-health",
    "paths": ["domains/health/incorrect-health-claim.md"],
    "expected_agents": ["Vida"],
}
def _diff_for(paths: list[str]) -> str:
chunks = []
for path in paths:
chunks.append(
"\n".join(
[
f"diff --git a/{path} b/{path}",
"--- a/file.md",
"+++ b/file.md",
"+type: claim",
"+description: local phase 1b proof claim",
]
)
)
return "\n".join(chunks)
def _insert_pr(conn: sqlite3.Connection, case: dict[str, Any]) -> None:
source_path = f"inbox/archive/phase1b-{case['number']}.md"
conn.execute(
"INSERT INTO sources (path, status, priority) VALUES (?, 'extracted', 'medium')",
(source_path,),
)
conn.execute(
"""INSERT INTO prs
(number, source_path, branch, status, tier, tier0_pass,
leo_verdict, domain_verdict, eval_attempts, priority)
VALUES (?, ?, ?, 'open', 'STANDARD', 1, 'pending', 'pending', 0, 'medium')""",
(case["number"], source_path, case["branch"]),
)
def _pr_number_from_path(path: str) -> int | None:
match = re.search(r"(?:issues|pulls)/(\d+)", path)
return int(match.group(1)) if match else None
async def run_phase1b_local_proof() -> dict[str, Any]:
    """Run one evaluate cycle over the proof PRs and return the proof record.

    An in-memory SQLite database is migrated with ``db.migrate`` and seeded
    with one PR per case. The external entry points on ``lib.evaluate`` (diff
    fetch, agent review, Forgejo API, formal approvals, eval-complete hook and
    rejected-PR disposal) are replaced with deterministic in-process fakes,
    ``config.PHASE1B_AGENT_ROUTING_ENABLED`` is forced on, and
    ``evaluate_cycle`` is awaited once. Every patched attribute is restored and
    the connection is closed even when the cycle raises.

    Returns:
        The proof dict (the value ``main`` serialises to JSON), built from the
        database state and from what the fakes recorded.

    Raises:
        AssertionError: propagated from ``_assert_phase1b_proof`` when the
            collected results do not match the expected routing outcome.
    """
    cases = [*SINGLE_DOMAIN_CASES, CROSS_DOMAIN_CASE, FEEDBACK_CASE]
    diffs = {case["number"]: _diff_for(case["paths"]) for case in cases}

    # Recorders filled in by the fakes below; they end up in the proof.
    comments: dict[int, list[str]] = {}
    formal_approvals: list[int] = []
    eval_feedback: list[dict[str, Any]] = []
    dispositions: list[dict[str, Any]] = []
    agent_review_calls: list[dict[str, Any]] = []

    async def fake_get_pr_diff(pr_number: int) -> str:
        # Serve the prebuilt diff; an unknown PR number raises KeyError.
        return diffs[pr_number]

    async def fake_run_agent_review(
        diff: str,
        files: str,
        agent: str,
        route_context: str = "",
        tier: str = "STANDARD",
    ) -> tuple[str, dict[str, int]]:
        # Only Vida reviewing the deliberately wrong health claim rejects;
        # every other review approves.
        verdict = "REQUEST_CHANGES" if "incorrect-health-claim.md" in diff and agent == "Vida" else "APPROVE"
        issues = "\n<!-- ISSUES: factual_discrepancy -->" if verdict == "REQUEST_CHANGES" else ""
        agent_review_calls.append(
            {
                "agent": agent,
                "tier": tier,
                "files": files.splitlines(),
                # json.loads raises on an empty or non-JSON route_context, so
                # a call without route context surfaces as an error.
                "route": json.loads(route_context),
                "verdict": verdict,
            }
        )
        return (
            f"{agent} local Phase 1b review{issues}\n<!-- VERDICT:{agent.upper()}:{verdict} -->",
            {"prompt_tokens": 10, "completion_tokens": 5},
        )

    async def fake_forgejo_api(method: str, path: str, body: dict | None = None, token: str | None = None):
        # Comments are stored per PR number parsed from the path; paths
        # without a number share the -1 bucket.
        pr_number = _pr_number_from_path(path)
        if method == "GET" and "comments" in path:
            return [{"body": body_text} for body_text in comments.get(pr_number or -1, [])]
        if method == "POST" and "comments" in path:
            comments.setdefault(pr_number or -1, []).append((body or {}).get("body", ""))
            return {"id": len(comments[pr_number or -1])}
        if method == "GET" and "pulls/" in path:
            return {"user": {"login": "phase1b-local-proof"}}
        return {"ok": True, "token": bool(token)}

    async def fake_post_formal_approvals(pr_number: int, pr_author: str) -> None:
        formal_approvals.append(pr_number)

    async def fake_on_eval_complete(
        conn: sqlite3.Connection,
        pr_number: int,
        *,
        outcome: str,
        review_text: str,
        issues: list[str] | None = None,
    ) -> None:
        eval_feedback.append({"pr": pr_number, "outcome": outcome, "issues": issues or []})

    async def fake_dispose_rejected_pr(
        conn: sqlite3.Connection,
        pr_number: int,
        eval_attempts: int,
        issues: list[str],
    ) -> None:
        dispositions.append({"pr": pr_number, "eval_attempts": eval_attempts, "issues": issues})

    # One table drives both patching and restoring so the two cannot drift.
    replacements: dict[str, Any] = {
        "_rate_limit_backoff_until": None,
        "get_pr_diff": fake_get_pr_diff,
        "run_agent_review": fake_run_agent_review,
        "forgejo_api": fake_forgejo_api,
        "post_formal_approvals": fake_post_formal_approvals,
        "on_eval_complete": fake_on_eval_complete,
        "dispose_rejected_pr": fake_dispose_rejected_pr,
    }

    conn = sqlite3.connect(":memory:")
    try:
        conn.row_factory = sqlite3.Row
        db.migrate(conn)
        for case in cases:
            _insert_pr(conn, case)

        saved_flag = config.PHASE1B_AGENT_ROUTING_ENABLED
        saved_attrs = {name: getattr(evaluate_mod, name) for name in replacements}
        try:
            config.PHASE1B_AGENT_ROUTING_ENABLED = True
            for name, replacement in replacements.items():
                setattr(evaluate_mod, name, replacement)
            succeeded, failed = await evaluate_mod.evaluate_cycle(conn, max_workers=len(cases))
        finally:
            # Always undo the monkey-patching, even if the cycle raised.
            config.PHASE1B_AGENT_ROUTING_ENABLED = saved_flag
            for name, original in saved_attrs.items():
                setattr(evaluate_mod, name, original)

        # Read everything needed from the database before it is closed.
        pr_rows = {
            row["number"]: dict(row)
            for row in conn.execute(
                """SELECT number, status, branch, domain, domain_agent, leo_verdict,
                          domain_verdict, auto_merge, eval_issues
                   FROM prs
                   ORDER BY number"""
            ).fetchall()
        }
        review_rows = [dict(row) for row in conn.execute("SELECT * FROM review_records ORDER BY pr_number, agent")]
        route_events = [
            json.loads(row["detail"])
            for row in conn.execute(
                "SELECT detail FROM audit_log WHERE stage = 'evaluate' AND event = 'phase1b_route' ORDER BY id"
            ).fetchall()
        ]
        source_feedback = {
            row["path"]: row["feedback"]
            for row in conn.execute("SELECT path, feedback FROM sources WHERE feedback IS NOT NULL ORDER BY path")
        }
    finally:
        # The original never closed the connection; release it explicitly.
        conn.close()

    marker_pattern = re.compile(r"<!-- PHASE1B_REVIEW:PR=\d+:AGENT=[A-Z]+ -->")
    case_results = []
    for case in cases:
        number = case["number"]
        reviewers = sorted(row["agent"] for row in review_rows if row["pr_number"] == number)
        posted = comments.get(number, [])
        case_results.append(
            {
                "number": number,
                "domain": case["domain"],
                "expected_agents": sorted(case["expected_agents"]),
                "reviewers": reviewers,
                "status": pr_rows[number]["status"],
                "domain_agent": pr_rows[number]["domain_agent"],
                "domain_verdict": pr_rows[number]["domain_verdict"],
                "comments": len(posted),
                "markers": [marker for body in posted for marker in marker_pattern.findall(body)],
            }
        )

    proof = {
        "ok": True,
        "scope": "local_no_network_phase1b_eval_cycle",
        "schema_version": db.SCHEMA_VERSION,
        "feature_flag": "PHASE1B_AGENT_ROUTING_ENABLED",
        "succeeded": succeeded,
        "failed": failed,
        "cases_total": len(cases),
        "case_results": case_results,
        "agents_seen": sorted({call["agent"] for call in agent_review_calls}),
        "agent_review_calls": agent_review_calls,
        "formal_approvals": sorted(formal_approvals),
        "eval_feedback": sorted(eval_feedback, key=lambda item: item["pr"]),
        "rejection_dispositions": dispositions,
        "route_events": route_events,
        "source_feedback_paths": sorted(source_feedback),
    }
    # "ok" is only meaningful because this raises before the proof is returned.
    _assert_phase1b_proof(proof)
    return proof
def _assert_phase1b_proof(proof: dict[str, Any]) -> None:
    """Validate *proof* against the expected Phase 1b routing outcome.

    The checks cover the cycle totals, the set of agents seen, one route event
    per case, approval of every single-domain and cross-domain PR with the
    expected reviewers and comment counts, and the rejection path of the
    feedback PR (still open, ``request_changes``, one disposition, at least
    one rejected eval-feedback entry).

    Args:
        proof: Proof dict as built by ``run_phase1b_local_proof``.

    Raises:
        AssertionError: if any expectation is not met. The checks are explicit
            ``raise`` statements rather than ``assert`` so they still run
            under ``python -O``, which strips ``assert``.
        KeyError: if *proof* lacks a required key or a case result is missing.
    """

    def require(condition: object, message: str) -> None:
        # Explicit raise: unlike `assert`, this is never optimised away.
        if not condition:
            raise AssertionError(message)

    expected_agents = ["Astra", "Clay", "Leo", "Rio", "Theseus", "Vida"]

    require(
        proof["succeeded"] == proof["cases_total"],
        f"succeeded={proof['succeeded']!r} does not match cases_total={proof['cases_total']!r}",
    )
    require(proof["failed"] == 0, f"expected no failed cases, got failed={proof['failed']!r}")
    require(
        proof["agents_seen"] == expected_agents,
        f"agents_seen={proof['agents_seen']!r}, expected {expected_agents!r}",
    )
    require(
        len(proof["route_events"]) == proof["cases_total"],
        f"{len(proof['route_events'])} route events for {proof['cases_total']!r} cases",
    )

    by_number = {case["number"]: case for case in proof["case_results"]}

    for case in SINGLE_DOMAIN_CASES:
        number = case["number"]
        result = by_number[number]
        require(result["status"] == "approved", f"PR {number}: status={result['status']!r}, expected 'approved'")
        require(
            result["reviewers"] == sorted(case["expected_agents"]),
            f"PR {number}: reviewers={result['reviewers']!r}, expected {sorted(case['expected_agents'])!r}",
        )
        require(
            result["comments"] == len(case["expected_agents"]),
            f"PR {number}: comments={result['comments']!r}, expected {len(case['expected_agents'])}",
        )

    cross_number = CROSS_DOMAIN_CASE["number"]
    cross = by_number[cross_number]
    require(cross["status"] == "approved", f"PR {cross_number}: status={cross['status']!r}, expected 'approved'")
    require(
        cross["reviewers"] == sorted(CROSS_DOMAIN_CASE["expected_agents"]),
        f"PR {cross_number}: reviewers={cross['reviewers']!r}, "
        f"expected {sorted(CROSS_DOMAIN_CASE['expected_agents'])!r}",
    )
    require(cross["comments"] == 2, f"PR {cross_number}: comments={cross['comments']!r}, expected 2")

    feedback_number = FEEDBACK_CASE["number"]
    feedback = by_number[feedback_number]
    require(feedback["status"] == "open", f"PR {feedback_number}: status={feedback['status']!r}, expected 'open'")
    require(
        feedback["reviewers"] == ["Vida"],
        f"PR {feedback_number}: reviewers={feedback['reviewers']!r}, expected ['Vida']",
    )
    require(
        feedback["domain_verdict"] == "request_changes",
        f"PR {feedback_number}: domain_verdict={feedback['domain_verdict']!r}, expected 'request_changes'",
    )

    expected_dispositions = [
        {"pr": feedback_number, "eval_attempts": 1, "issues": ["factual_discrepancy"]}
    ]
    require(
        proof["rejection_dispositions"] == expected_dispositions,
        f"rejection_dispositions={proof['rejection_dispositions']!r}, expected {expected_dispositions!r}",
    )

    # Every single-domain PR plus the cross-domain PR should be formally approved.
    expected_approvals = len(SINGLE_DOMAIN_CASES) + 1
    require(
        len(proof["formal_approvals"]) == expected_approvals,
        f"{len(proof['formal_approvals'])} formal approvals, expected {expected_approvals}",
    )
    require(
        any(item["outcome"] == "rejected" for item in proof["eval_feedback"]),
        "no eval_feedback entry with outcome 'rejected'",
    )
def main() -> None:
    """Command-line entry point: run the proof and write it out as JSON.

    Accepts ``--output`` (default ``proof/phase1b-local-e2e-proof.json``),
    creates the output directory if needed, writes the proof as indented,
    key-sorted JSON with a trailing newline, then prints a one-line JSON
    summary to stdout.
    """
    cli = argparse.ArgumentParser(description="Run local no-network Phase 1b proof")
    cli.add_argument(
        "--output",
        default="proof/phase1b-local-e2e-proof.json",
        help="JSON proof output path",
    )
    options = cli.parse_args()

    proof = asyncio.run(run_phase1b_local_proof())

    destination = Path(options.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(proof, indent=2, sort_keys=True)
    destination.write_text(serialized + "\n")

    summary = {"ok": True, "output": str(destination), "cases_total": proof["cases_total"]}
    print(json.dumps(summary, sort_keys=True))


if __name__ == "__main__":
    main()