#!/usr/bin/env python3 """No-network local proof for Phase 1b agent routing. This script exercises the real evaluate cycle against an in-memory migrated DB while replacing only external network/LLM edges with deterministic fakes. """ # ruff: noqa: E402,I001 from __future__ import annotations import argparse import asyncio import json import re import sqlite3 import sys from pathlib import Path from typing import Any REPO_ROOT = Path(__file__).resolve().parents[1] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) from lib import config, db from lib import evaluate as evaluate_mod SINGLE_DOMAIN_CASES = [ { "number": 101, "domain": "grand-strategy", "branch": "leo/grand-strategy", "paths": ["domains/grand-strategy/strategy.md"], "expected_agents": ["Leo"], }, { "number": 102, "domain": "ai-alignment", "branch": "theseus/alignment", "paths": ["domains/ai-alignment/systems.md"], "expected_agents": ["Theseus"], }, { "number": 103, "domain": "internet-finance", "branch": "rio/x402", "paths": ["domains/internet-finance/x402.md"], "expected_agents": ["Rio"], }, { "number": 104, "domain": "health", "branch": "vida/health", "paths": ["domains/health/clinical.md"], "expected_agents": ["Vida"], }, { "number": 105, "domain": "entertainment", "branch": "clay/games", "paths": ["domains/entertainment/games.md"], "expected_agents": ["Clay"], }, { "number": 106, "domain": "space-development", "branch": "astra/robotics", "paths": ["domains/space-development/robotics.md"], "expected_agents": ["Astra"], }, ] CROSS_DOMAIN_CASE = { "number": 107, "domain": "cross-ai-finance", "branch": "rio/ai-x402", "paths": ["domains/ai-systems/agent-wallets.md", "domains/internet-finance/x402.md"], "expected_agents": ["Theseus", "Rio"], } FEEDBACK_CASE = { "number": 108, "domain": "health-feedback", "branch": "vida/reject-health", "paths": ["domains/health/incorrect-health-claim.md"], "expected_agents": ["Vida"], } def _diff_for(paths: list[str]) -> str: chunks = [] for path in paths: chunks.append( "\n".join( [ f"diff --git a/{path} b/{path}", "--- a/file.md", "+++ b/file.md", "+type: claim", "+description: local phase 1b proof claim", ] ) ) return "\n".join(chunks) def _insert_pr(conn: sqlite3.Connection, case: dict[str, Any]) -> None: source_path = f"inbox/archive/phase1b-{case['number']}.md" conn.execute( "INSERT INTO sources (path, status, priority) VALUES (?, 'extracted', 'medium')", (source_path,), ) conn.execute( """INSERT INTO prs (number, source_path, branch, status, tier, tier0_pass, leo_verdict, domain_verdict, eval_attempts, priority) VALUES (?, ?, ?, 'open', 'STANDARD', 1, 'pending', 'pending', 0, 'medium')""", (case["number"], source_path, case["branch"]), ) def _pr_number_from_path(path: str) -> int | None: match = re.search(r"(?:issues|pulls)/(\d+)", path) return int(match.group(1)) if match else None async def run_phase1b_local_proof() -> dict[str, Any]: conn = sqlite3.connect(":memory:") conn.row_factory = sqlite3.Row db.migrate(conn) cases = [*SINGLE_DOMAIN_CASES, CROSS_DOMAIN_CASE, FEEDBACK_CASE] diffs = {case["number"]: _diff_for(case["paths"]) for case in cases} for case in cases: _insert_pr(conn, case) comments: dict[int, list[str]] = {} formal_approvals: list[int] = [] eval_feedback: list[dict[str, Any]] = [] dispositions: list[dict[str, Any]] = [] agent_review_calls: list[dict[str, Any]] = [] async def fake_get_pr_diff(pr_number: int) -> str: return diffs[pr_number] async def fake_run_agent_review( diff: str, files: str, agent: str, route_context: str = "", tier: str = "STANDARD", ) -> tuple[str, dict[str, int]]: verdict = "REQUEST_CHANGES" if "incorrect-health-claim.md" in diff and agent == "Vida" else "APPROVE" issues = "\n" if verdict == "REQUEST_CHANGES" else "" agent_review_calls.append( { "agent": agent, "tier": tier, "files": files.splitlines(), "route": json.loads(route_context), "verdict": verdict, } ) return ( f"{agent} local Phase 1b review{issues}\n", {"prompt_tokens": 10, "completion_tokens": 5}, ) async def fake_forgejo_api(method: str, path: str, body: dict | None = None, token: str | None = None): pr_number = _pr_number_from_path(path) if method == "GET" and "comments" in path: return [{"body": body_text} for body_text in comments.get(pr_number or -1, [])] if method == "POST" and "comments" in path: comments.setdefault(pr_number or -1, []).append((body or {}).get("body", "")) return {"id": len(comments[pr_number or -1])} if method == "GET" and "pulls/" in path: return {"user": {"login": "phase1b-local-proof"}} return {"ok": True, "token": bool(token)} async def fake_post_formal_approvals(pr_number: int, pr_author: str) -> None: formal_approvals.append(pr_number) async def fake_on_eval_complete( conn: sqlite3.Connection, pr_number: int, *, outcome: str, review_text: str, issues: list[str] | None = None, ) -> None: eval_feedback.append({"pr": pr_number, "outcome": outcome, "issues": issues or []}) async def fake_dispose_rejected_pr( conn: sqlite3.Connection, pr_number: int, eval_attempts: int, issues: list[str], ) -> None: dispositions.append({"pr": pr_number, "eval_attempts": eval_attempts, "issues": issues}) originals = { "flag": config.PHASE1B_AGENT_ROUTING_ENABLED, "backoff": evaluate_mod._rate_limit_backoff_until, "get_pr_diff": evaluate_mod.get_pr_diff, "run_agent_review": evaluate_mod.run_agent_review, "forgejo_api": evaluate_mod.forgejo_api, "post_formal_approvals": evaluate_mod.post_formal_approvals, "on_eval_complete": evaluate_mod.on_eval_complete, "dispose_rejected_pr": evaluate_mod.dispose_rejected_pr, } try: config.PHASE1B_AGENT_ROUTING_ENABLED = True evaluate_mod._rate_limit_backoff_until = None evaluate_mod.get_pr_diff = fake_get_pr_diff evaluate_mod.run_agent_review = fake_run_agent_review evaluate_mod.forgejo_api = fake_forgejo_api evaluate_mod.post_formal_approvals = fake_post_formal_approvals evaluate_mod.on_eval_complete = fake_on_eval_complete evaluate_mod.dispose_rejected_pr = fake_dispose_rejected_pr succeeded, failed = await evaluate_mod.evaluate_cycle(conn, max_workers=len(cases)) finally: config.PHASE1B_AGENT_ROUTING_ENABLED = originals["flag"] evaluate_mod._rate_limit_backoff_until = originals["backoff"] evaluate_mod.get_pr_diff = originals["get_pr_diff"] evaluate_mod.run_agent_review = originals["run_agent_review"] evaluate_mod.forgejo_api = originals["forgejo_api"] evaluate_mod.post_formal_approvals = originals["post_formal_approvals"] evaluate_mod.on_eval_complete = originals["on_eval_complete"] evaluate_mod.dispose_rejected_pr = originals["dispose_rejected_pr"] pr_rows = { row["number"]: dict(row) for row in conn.execute( """SELECT number, status, branch, domain, domain_agent, leo_verdict, domain_verdict, auto_merge, eval_issues FROM prs ORDER BY number""" ).fetchall() } review_rows = [dict(row) for row in conn.execute("SELECT * FROM review_records ORDER BY pr_number, agent")] route_events = [ json.loads(row["detail"]) for row in conn.execute( "SELECT detail FROM audit_log WHERE stage = 'evaluate' AND event = 'phase1b_route' ORDER BY id" ).fetchall() ] source_feedback = { row["path"]: row["feedback"] for row in conn.execute("SELECT path, feedback FROM sources WHERE feedback IS NOT NULL ORDER BY path") } case_results = [] for case in cases: number = case["number"] reviewers = sorted(row["agent"] for row in review_rows if row["pr_number"] == number) posted = comments.get(number, []) case_results.append( { "number": number, "domain": case["domain"], "expected_agents": sorted(case["expected_agents"]), "reviewers": reviewers, "status": pr_rows[number]["status"], "domain_agent": pr_rows[number]["domain_agent"], "domain_verdict": pr_rows[number]["domain_verdict"], "comments": len(posted), "markers": [ marker for body in posted for marker in re.findall(r"", body) ], } ) proof = { "ok": True, "scope": "local_no_network_phase1b_eval_cycle", "schema_version": db.SCHEMA_VERSION, "feature_flag": "PHASE1B_AGENT_ROUTING_ENABLED", "succeeded": succeeded, "failed": failed, "cases_total": len(cases), "case_results": case_results, "agents_seen": sorted({call["agent"] for call in agent_review_calls}), "agent_review_calls": agent_review_calls, "formal_approvals": sorted(formal_approvals), "eval_feedback": sorted(eval_feedback, key=lambda item: item["pr"]), "rejection_dispositions": dispositions, "route_events": route_events, "source_feedback_paths": sorted(source_feedback), } _assert_phase1b_proof(proof) return proof def _assert_phase1b_proof(proof: dict[str, Any]) -> None: expected_agents = ["Astra", "Clay", "Leo", "Rio", "Theseus", "Vida"] assert proof["succeeded"] == proof["cases_total"] assert proof["failed"] == 0 assert proof["agents_seen"] == expected_agents assert len(proof["route_events"]) == proof["cases_total"] by_number = {case["number"]: case for case in proof["case_results"]} for case in SINGLE_DOMAIN_CASES: result = by_number[case["number"]] assert result["status"] == "approved" assert result["reviewers"] == sorted(case["expected_agents"]) assert result["comments"] == len(case["expected_agents"]) cross = by_number[CROSS_DOMAIN_CASE["number"]] assert cross["status"] == "approved" assert cross["reviewers"] == sorted(CROSS_DOMAIN_CASE["expected_agents"]) assert cross["comments"] == 2 feedback = by_number[FEEDBACK_CASE["number"]] assert feedback["status"] == "open" assert feedback["reviewers"] == ["Vida"] assert feedback["domain_verdict"] == "request_changes" assert proof["rejection_dispositions"] == [ {"pr": FEEDBACK_CASE["number"], "eval_attempts": 1, "issues": ["factual_discrepancy"]} ] assert len(proof["formal_approvals"]) == len(SINGLE_DOMAIN_CASES) + 1 assert [item for item in proof["eval_feedback"] if item["outcome"] == "rejected"] def main() -> None: parser = argparse.ArgumentParser(description="Run local no-network Phase 1b proof") parser.add_argument( "--output", default="proof/phase1b-local-e2e-proof.json", help="JSON proof output path", ) args = parser.parse_args() proof = asyncio.run(run_phase1b_local_proof()) output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(json.dumps(proof, indent=2, sort_keys=True) + "\n") print(json.dumps({"ok": True, "output": str(output_path), "cases_total": proof["cases_total"]}, sort_keys=True)) if __name__ == "__main__": main()