#!/usr/bin/env python3
|
|
"""No-network local proof for Phase 1b agent routing.

This script exercises the real evaluate cycle against an in-memory migrated DB
while replacing only external network/LLM edges with deterministic fakes.
"""
|
|
|
|
# ruff: noqa: E402,I001
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Parent of the directory that contains this script (parents[0] is the script's
# own directory, parents[1] is one level above it).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Put that directory at the front of sys.path before the `from lib import ...`
# statements below run; skip the insert if it is already present.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
|
|
|
|
from lib import config, db
|
|
from lib import evaluate as evaluate_mod
|
|
|
|
|
|
# One fixture PR per domain. Each entry touches a single file and names exactly
# one expected reviewer.
#   number          - PR number inserted into the `prs` table
#   domain          - label copied into the proof's per-case results
#   branch          - branch name stored on the PR row
#   paths           - files that appear in the fake diff for the PR
#   expected_agents - reviewers the proof asserts were recorded for the PR
SINGLE_DOMAIN_CASES: list[dict[str, Any]] = [
    {
        "number": 101,
        "domain": "grand-strategy",
        "branch": "leo/grand-strategy",
        "paths": ["domains/grand-strategy/strategy.md"],
        "expected_agents": ["Leo"],
    },
    {
        "number": 102,
        "domain": "ai-alignment",
        "branch": "theseus/alignment",
        "paths": ["domains/ai-alignment/systems.md"],
        "expected_agents": ["Theseus"],
    },
    {
        "number": 103,
        "domain": "internet-finance",
        "branch": "rio/x402",
        "paths": ["domains/internet-finance/x402.md"],
        "expected_agents": ["Rio"],
    },
    {
        "number": 104,
        "domain": "health",
        "branch": "vida/health",
        "paths": ["domains/health/clinical.md"],
        "expected_agents": ["Vida"],
    },
    {
        "number": 105,
        "domain": "entertainment",
        "branch": "clay/games",
        "paths": ["domains/entertainment/games.md"],
        "expected_agents": ["Clay"],
    },
    {
        "number": 106,
        "domain": "space-development",
        "branch": "astra/robotics",
        "paths": ["domains/space-development/robotics.md"],
        "expected_agents": ["Astra"],
    },
]
|
|
|
|
# Fixture PR whose diff touches files under two different domain directories and
# is therefore expected to be reviewed by two agents.
# NOTE(review): the first path lives under domains/ai-systems/, whereas the
# single-domain Theseus fixture uses domains/ai-alignment/ - confirm that this
# directory is meant to route to Theseus.
CROSS_DOMAIN_CASE: dict[str, Any] = {
    "number": 107,
    "domain": "cross-ai-finance",
    "branch": "rio/ai-x402",
    "paths": ["domains/ai-systems/agent-wallets.md", "domains/internet-finance/x402.md"],
    "expected_agents": ["Theseus", "Rio"],
}
|
|
|
|
# Fixture PR for the rejection path. The fake agent review returns
# REQUEST_CHANGES when "incorrect-health-claim.md" appears in the diff and the
# reviewing agent is Vida, so the file name below must stay in sync with it.
FEEDBACK_CASE: dict[str, Any] = {
    "number": 108,
    "domain": "health-feedback",
    "branch": "vida/reject-health",
    "paths": ["domains/health/incorrect-health-claim.md"],
    "expected_agents": ["Vida"],
}
|
|
|
|
|
|
def _diff_for(paths: list[str]) -> str:
    """Build a minimal fake diff that adds a claim under each of *paths*.

    Only the ``diff --git`` header line carries the real path; the ``---`` and
    ``+++`` lines are fixed ``file.md`` placeholders, followed by two added
    lines (``type: claim`` and a description).

    Args:
        paths: Repository-relative file paths to include, in order.

    Returns:
        The per-file chunks joined with newlines, without a trailing newline.
        An empty *paths* list yields an empty string.
    """
    return "\n".join(
        "\n".join(
            [
                f"diff --git a/{path} b/{path}",
                "--- a/file.md",
                "+++ b/file.md",
                "+type: claim",
                "+description: local phase 1b proof claim",
            ]
        )
        for path in paths
    )
|
|
|
|
|
|
def _insert_pr(conn: sqlite3.Connection, case: dict[str, Any]) -> None:
    """Insert the source row and the open PR row for one fixture *case*.

    The source path is derived from the case number
    (``inbox/archive/phase1b-<number>.md``). The PR is inserted as an open
    STANDARD-tier PR with ``tier0_pass = 1``, both verdicts ``pending`` and
    zero eval attempts. No commit is issued here.

    Args:
        conn: Connection to a database that already has the ``sources`` and
            ``prs`` tables.
        case: Fixture dict; only ``number`` and ``branch`` are read.
    """
    source_path = f"inbox/archive/phase1b-{case['number']}.md"
    # The source row goes in first; the PR row refers to it by path.
    conn.execute(
        "INSERT INTO sources (path, status, priority) VALUES (?, 'extracted', 'medium')",
        (source_path,),
    )
    conn.execute(
        """INSERT INTO prs
               (number, source_path, branch, status, tier, tier0_pass,
                leo_verdict, domain_verdict, eval_attempts, priority)
           VALUES (?, ?, ?, 'open', 'STANDARD', 1, 'pending', 'pending', 0, 'medium')""",
        (case["number"], source_path, case["branch"]),
    )
|
|
|
|
|
|
def _pr_number_from_path(path: str) -> int | None:
|
|
match = re.search(r"(?:issues|pulls)/(\d+)", path)
|
|
return int(match.group(1)) if match else None
|
|
|
|
|
|
async def run_phase1b_local_proof() -> dict[str, Any]:
    """Run one evaluate cycle on an in-memory DB with faked edges and return the proof.

    A fresh ``:memory:`` SQLite database is migrated with ``db.migrate``, the
    fixture PRs are inserted, and ``evaluate_mod.evaluate_cycle`` is run while
    selected module attributes are swapped for local fakes. The collected
    results are validated with ``_assert_phase1b_proof`` before being returned.

    Returns:
        A JSON-serialisable dict with the cycle counters, per-case results,
        the recorded fake calls and the route events read from ``audit_log``.

    Raises:
        AssertionError: If the collected proof does not match expectations.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.row_factory = sqlite3.Row
        db.migrate(conn)
        proof = await _collect_phase1b_proof(conn)
    finally:
        # The payload is fully materialised as plain dicts/lists by now, so the
        # connection can be released instead of being left open.
        conn.close()

    _assert_phase1b_proof(proof)
    return proof


async def _collect_phase1b_proof(conn: sqlite3.Connection) -> dict[str, Any]:
    """Seed the fixtures, run one faked evaluate cycle on *conn*, gather the proof payload."""
    cases = [*SINGLE_DOMAIN_CASES, CROSS_DOMAIN_CASE, FEEDBACK_CASE]
    diffs = {case["number"]: _diff_for(case["paths"]) for case in cases}
    for case in cases:
        _insert_pr(conn, case)

    # In-memory recorders filled in by the fakes below.
    comments: dict[int, list[str]] = {}
    formal_approvals: list[int] = []
    eval_feedback: list[dict[str, Any]] = []
    dispositions: list[dict[str, Any]] = []
    agent_review_calls: list[dict[str, Any]] = []

    async def fake_get_pr_diff(pr_number: int) -> str:
        return diffs[pr_number]

    async def fake_run_agent_review(
        diff: str,
        files: str,
        agent: str,
        route_context: str = "",
        tier: str = "STANDARD",
    ) -> tuple[str, dict[str, int]]:
        # Only Vida reviewing the feedback fixture rejects; everything else approves.
        rejects = "incorrect-health-claim.md" in diff and agent == "Vida"
        verdict = "REQUEST_CHANGES" if rejects else "APPROVE"
        issues = "\n<!-- ISSUES: factual_discrepancy -->" if rejects else ""
        agent_review_calls.append(
            {
                "agent": agent,
                "tier": tier,
                "files": files.splitlines(),
                # json.loads raises on an empty or non-JSON route_context.
                "route": json.loads(route_context),
                "verdict": verdict,
            }
        )
        return (
            f"{agent} local Phase 1b review{issues}\n<!-- VERDICT:{agent.upper()}:{verdict} -->",
            {"prompt_tokens": 10, "completion_tokens": 5},
        )

    async def fake_forgejo_api(method: str, path: str, body: dict | None = None, token: str | None = None):
        pr_number = _pr_number_from_path(path)
        # Paths without a PR number share the -1 bucket.
        bucket = pr_number or -1
        if method == "GET" and "comments" in path:
            return [{"body": body_text} for body_text in comments.get(bucket, [])]
        if method == "POST" and "comments" in path:
            comments.setdefault(bucket, []).append((body or {}).get("body", ""))
            return {"id": len(comments[bucket])}
        if method == "GET" and "pulls/" in path:
            return {"user": {"login": "phase1b-local-proof"}}
        return {"ok": True, "token": bool(token)}

    async def fake_post_formal_approvals(pr_number: int, pr_author: str) -> None:
        formal_approvals.append(pr_number)

    async def fake_on_eval_complete(
        conn: sqlite3.Connection,
        pr_number: int,
        *,
        outcome: str,
        review_text: str,
        issues: list[str] | None = None,
    ) -> None:
        eval_feedback.append({"pr": pr_number, "outcome": outcome, "issues": issues or []})

    async def fake_dispose_rejected_pr(
        conn: sqlite3.Connection,
        pr_number: int,
        eval_attempts: int,
        issues: list[str],
    ) -> None:
        dispositions.append({"pr": pr_number, "eval_attempts": eval_attempts, "issues": issues})

    # (owner, attribute, temporary value). One table drives both the patching and
    # the restore so the two cannot drift apart.
    patches: list[tuple[Any, str, Any]] = [
        (config, "PHASE1B_AGENT_ROUTING_ENABLED", True),
        (evaluate_mod, "_rate_limit_backoff_until", None),
        (evaluate_mod, "get_pr_diff", fake_get_pr_diff),
        (evaluate_mod, "run_agent_review", fake_run_agent_review),
        (evaluate_mod, "forgejo_api", fake_forgejo_api),
        (evaluate_mod, "post_formal_approvals", fake_post_formal_approvals),
        (evaluate_mod, "on_eval_complete", fake_on_eval_complete),
        (evaluate_mod, "dispose_rejected_pr", fake_dispose_rejected_pr),
    ]
    # Capture every original before changing anything, so a missing attribute
    # fails here with nothing patched yet.
    originals = [(owner, name, getattr(owner, name)) for owner, name, _ in patches]

    try:
        for owner, name, replacement in patches:
            setattr(owner, name, replacement)
        succeeded, failed = await evaluate_mod.evaluate_cycle(conn, max_workers=len(cases))
    finally:
        for owner, name, original in originals:
            setattr(owner, name, original)

    pr_rows = {
        row["number"]: dict(row)
        for row in conn.execute(
            """SELECT number, status, branch, domain, domain_agent, leo_verdict,
                      domain_verdict, auto_merge, eval_issues
               FROM prs
               ORDER BY number"""
        ).fetchall()
    }
    review_rows = [dict(row) for row in conn.execute("SELECT * FROM review_records ORDER BY pr_number, agent")]
    route_events = [
        json.loads(row["detail"])
        for row in conn.execute(
            "SELECT detail FROM audit_log WHERE stage = 'evaluate' AND event = 'phase1b_route' ORDER BY id"
        ).fetchall()
    ]
    source_feedback = {
        row["path"]: row["feedback"]
        for row in conn.execute("SELECT path, feedback FROM sources WHERE feedback IS NOT NULL ORDER BY path")
    }

    case_results = []
    for case in cases:
        number = case["number"]
        pr_row = pr_rows[number]
        posted = comments.get(number, [])
        case_results.append(
            {
                "number": number,
                "domain": case["domain"],
                "expected_agents": sorted(case["expected_agents"]),
                "reviewers": sorted(row["agent"] for row in review_rows if row["pr_number"] == number),
                "status": pr_row["status"],
                "domain_agent": pr_row["domain_agent"],
                "domain_verdict": pr_row["domain_verdict"],
                "comments": len(posted),
                "markers": [
                    marker
                    for body in posted
                    for marker in re.findall(r"<!-- PHASE1B_REVIEW:PR=\d+:AGENT=[A-Z]+ -->", body)
                ],
            }
        )

    return {
        "ok": True,
        "scope": "local_no_network_phase1b_eval_cycle",
        "schema_version": db.SCHEMA_VERSION,
        "feature_flag": "PHASE1B_AGENT_ROUTING_ENABLED",
        "succeeded": succeeded,
        "failed": failed,
        "cases_total": len(cases),
        "case_results": case_results,
        "agents_seen": sorted({call["agent"] for call in agent_review_calls}),
        "agent_review_calls": agent_review_calls,
        "formal_approvals": sorted(formal_approvals),
        "eval_feedback": sorted(eval_feedback, key=lambda item: item["pr"]),
        "rejection_dispositions": dispositions,
        "route_events": route_events,
        # Only the paths are reported; the feedback text itself is not included.
        "source_feedback_paths": sorted(source_feedback),
    }
|
|
|
|
|
|
def _assert_phase1b_proof(proof: dict[str, Any]) -> None:
    """Validate the collected *proof* against the fixture expectations.

    Checks are raised explicitly rather than written as ``assert`` statements,
    so they still run under ``python -O``.

    Args:
        proof: Payload produced by the local proof run.

    Raises:
        AssertionError: On the first expectation that does not hold.
        KeyError: If a fixture PR number is missing from ``case_results``.
    """
    expected_agents = ["Astra", "Clay", "Leo", "Rio", "Theseus", "Vida"]
    _require(
        proof["succeeded"] == proof["cases_total"],
        f"succeeded={proof['succeeded']!r} != cases_total={proof['cases_total']!r}",
    )
    _require(proof["failed"] == 0, f"failed={proof['failed']!r}, expected 0")
    _require(
        proof["agents_seen"] == expected_agents,
        f"agents_seen={proof['agents_seen']!r}, expected {expected_agents!r}",
    )
    _require(
        len(proof["route_events"]) == proof["cases_total"],
        f"{len(proof['route_events'])} route events for {proof['cases_total']!r} cases",
    )

    by_number = {case["number"]: case for case in proof["case_results"]}
    for case in SINGLE_DOMAIN_CASES:
        number = case["number"]
        result = by_number[number]
        _require(result["status"] == "approved", f"PR {number}: status={result['status']!r}, expected 'approved'")
        _require(
            result["reviewers"] == sorted(case["expected_agents"]),
            f"PR {number}: reviewers={result['reviewers']!r}, expected {sorted(case['expected_agents'])!r}",
        )
        _require(
            result["comments"] == len(case["expected_agents"]),
            f"PR {number}: {result['comments']!r} comments, expected {len(case['expected_agents'])}",
        )

    cross = by_number[CROSS_DOMAIN_CASE["number"]]
    _require(cross["status"] == "approved", f"cross-domain PR: status={cross['status']!r}, expected 'approved'")
    _require(
        cross["reviewers"] == sorted(CROSS_DOMAIN_CASE["expected_agents"]),
        f"cross-domain PR: reviewers={cross['reviewers']!r}",
    )
    _require(cross["comments"] == 2, f"cross-domain PR: {cross['comments']!r} comments, expected 2")

    feedback = by_number[FEEDBACK_CASE["number"]]
    _require(feedback["status"] == "open", f"feedback PR: status={feedback['status']!r}, expected 'open'")
    _require(feedback["reviewers"] == ["Vida"], f"feedback PR: reviewers={feedback['reviewers']!r}")
    _require(
        feedback["domain_verdict"] == "request_changes",
        f"feedback PR: domain_verdict={feedback['domain_verdict']!r}",
    )
    expected_dispositions = [
        {"pr": FEEDBACK_CASE["number"], "eval_attempts": 1, "issues": ["factual_discrepancy"]}
    ]
    _require(
        proof["rejection_dispositions"] == expected_dispositions,
        f"rejection_dispositions={proof['rejection_dispositions']!r}",
    )
    # Every single-domain PR plus the cross-domain PR is expected to be approved.
    _require(
        len(proof["formal_approvals"]) == len(SINGLE_DOMAIN_CASES) + 1,
        f"{len(proof['formal_approvals'])} formal approvals, expected {len(SINGLE_DOMAIN_CASES) + 1}",
    )
    _require(
        any(item["outcome"] == "rejected" for item in proof["eval_feedback"]),
        "no eval_feedback entry with outcome 'rejected'",
    )


def _require(condition: object, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is truthy."""
    if not condition:
        raise AssertionError(message)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: run the proof and write it out as JSON.

    Accepts ``--output`` (default ``proof/phase1b-local-e2e-proof.json``, taken
    relative to the current working directory). Missing parent directories are
    created, the proof is written as indented, key-sorted JSON with a trailing
    newline, and a one-line JSON summary is printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Run local no-network Phase 1b proof")
    parser.add_argument(
        "--output",
        default="proof/phase1b-local-e2e-proof.json",
        help="JSON proof output path",
    )
    args = parser.parse_args()

    proof = asyncio.run(run_phase1b_local_proof())
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the file does not depend on the platform default.
    output_path.write_text(json.dumps(proof, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    print(json.dumps({"ok": True, "output": str(output_path), "cases_total": proof["cases_total"]}, sort_keys=True))


if __name__ == "__main__":
    main()
|