# teleo-infrastructure/tests/test_evaluate_agent_routing.py
# 2026-05-29 14:16:12 +02:00
#
# 238 lines
# 8.6 KiB
# Python

"""Tests for Phase 1b eval integration."""
import sqlite3
from unittest.mock import AsyncMock
import pytest
from lib import config
from lib.evaluate import _evaluate_pr_phase1b, _post_phase1b_review_comment, evaluate_pr
# Schema executed by the ``phase1b_conn`` fixture: the prs, sources, audit_log,
# review_records and costs tables used by the tests in this module.
_PHASE1B_SCHEMA = """
CREATE TABLE prs (
number INTEGER PRIMARY KEY,
source_path TEXT,
branch TEXT,
status TEXT NOT NULL DEFAULT 'open',
domain TEXT,
agent TEXT,
tier TEXT,
tier0_pass INTEGER,
leo_verdict TEXT DEFAULT 'pending',
domain_verdict TEXT DEFAULT 'pending',
domain_agent TEXT,
domain_model TEXT,
eval_attempts INTEGER DEFAULT 0,
eval_issues TEXT DEFAULT '[]',
merge_cycled INTEGER DEFAULT 0,
last_error TEXT,
last_attempt TEXT,
cost_usd REAL DEFAULT 0,
auto_merge INTEGER DEFAULT 0,
created_at TEXT DEFAULT (datetime('now')),
merged_at TEXT
);
CREATE TABLE sources (
path TEXT PRIMARY KEY,
status TEXT DEFAULT 'extracted',
feedback TEXT
);
CREATE TABLE audit_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
stage TEXT,
event TEXT,
detail TEXT
);
CREATE TABLE review_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
pr_number INTEGER NOT NULL,
claim_path TEXT,
domain TEXT,
agent TEXT,
reviewer TEXT,
reviewer_model TEXT,
outcome TEXT NOT NULL,
rejection_reason TEXT,
disagreement_type TEXT,
notes TEXT,
batch_id TEXT,
claims_in_batch INTEGER,
reviewed_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE costs (
date TEXT,
model TEXT,
stage TEXT,
calls INTEGER DEFAULT 0,
input_tokens INTEGER DEFAULT 0,
output_tokens INTEGER DEFAULT 0,
cost_usd REAL DEFAULT 0,
duration_ms INTEGER DEFAULT 0,
cache_read_tokens INTEGER DEFAULT 0,
cache_write_tokens INTEGER DEFAULT 0,
cost_estimate_usd REAL DEFAULT 0,
PRIMARY KEY (date, model, stage)
);
"""


@pytest.fixture
def phase1b_conn():
    """Yield an in-memory SQLite connection with the schema these tests use.

    Rows are returned as ``sqlite3.Row`` so tests can index columns by name.
    The connection is closed on teardown, and also when schema creation
    itself raises, so a broken schema does not leak the handle.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.row_factory = sqlite3.Row
        conn.executescript(_PHASE1B_SCHEMA)
        yield conn
    finally:
        conn.close()
def _diff_for(*paths: str) -> str:
return "\n".join(f"diff --git a/{path} b/{path}\n+type: claim\n+description: test" for path in paths)
def _insert_pr(conn, number=1, branch="rio/test", source_path="inbox/archive/test.md"):
conn.execute("INSERT INTO sources (path, status) VALUES (?, ?)", (source_path, "extracted"))
conn.execute(
"""INSERT INTO prs
(number, source_path, branch, status, tier, tier0_pass, leo_verdict, domain_verdict, eval_attempts)
VALUES (?, ?, ?, 'open', 'STANDARD', 1, 'pending', 'pending', 0)""",
(number, source_path, branch),
)
async def _fake_agent_review(_diff, _files, agent, _route_context, tier="STANDARD"):
return f"{agent} review\n<!-- VERDICT:{agent.upper()}:APPROVE -->", {
"prompt_tokens": 10,
"completion_tokens": 5,
}
async def _fake_agent_review_reject_vida(_diff, _files, agent, _route_context, tier="STANDARD"):
verdict = "REQUEST_CHANGES" if agent == "Vida" else "APPROVE"
issues = "\n<!-- ISSUES: factual_discrepancy -->" if verdict == "REQUEST_CHANGES" else ""
return f"{agent} review{issues}\n<!-- VERDICT:{agent.upper()}:{verdict} -->", {
"prompt_tokens": 10,
"completion_tokens": 5,
}
async def _fake_forgejo_api(method, path, body=None, token=None):
if method == "GET" and "comments" in path:
return []
if method == "GET" and "pulls/" in path:
return {"user": {"login": "contributor"}}
return {"id": 1}
@pytest.mark.asyncio
async def test_phase1b_cross_domain_approves_after_all_required_agents(phase1b_conn, monkeypatch):
    """A PR touching two domains is approved once both routed agents approve.

    The diff touches ``domains/ai-systems`` and ``domains/internet-finance``;
    the test expects verdicts from exactly Theseus and Rio, an ``approved``
    PR row marked ``domain = 'multi'`` with ``leo_verdict = 'skipped'``, one
    review record per agent, and a single formal-approval post.
    """
    conn = phase1b_conn
    _insert_pr(conn, branch="rio/ai-x402")

    # Replace every outbound dependency of lib.evaluate with a local fake.
    monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review)
    monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api)
    post_formal = AsyncMock()
    monkeypatch.setattr("lib.evaluate.post_formal_approvals", post_formal)
    monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock())

    diff = _diff_for("domains/ai-systems/agent-wallets.md", "domains/internet-finance/x402.md")
    result = await _evaluate_pr_phase1b(
        conn,
        1,
        tier="STANDARD",
        diff=diff,
        review_diff=diff,
        files="domains/ai-systems/agent-wallets.md\ndomains/internet-finance/x402.md",
        branch_name="rio/ai-x402",
        eval_attempts=1,
        pr_cost=0,
    )

    assert result["approved"] is True
    assert set(result["agent_verdicts"]) == {"Theseus", "Rio"}

    row = conn.execute(
        "SELECT status, domain, domain_agent, leo_verdict, domain_verdict FROM prs WHERE number = 1"
    ).fetchone()
    assert row["status"] == "approved"
    assert row["domain"] == "multi"
    assert row["leo_verdict"] == "skipped"
    assert row["domain_verdict"] == "approve"
    # Either agent may be recorded as the domain agent for a multi-domain PR.
    assert row["domain_agent"] in {"Theseus", "Rio"}

    review_count = conn.execute(
        "SELECT COUNT(*) AS n FROM review_records WHERE pr_number = 1"
    ).fetchone()["n"]
    assert review_count == 2
    reviewers = {
        record["agent"]
        for record in conn.execute("SELECT agent FROM review_records WHERE pr_number = 1").fetchall()
    }
    assert reviewers == {"Theseus", "Rio"}
    post_formal.assert_awaited_once()
@pytest.mark.asyncio
async def test_phase1b_request_changes_blocks_merge(phase1b_conn, monkeypatch):
    """A REQUEST_CHANGES verdict from the routed agent leaves the PR unapproved.

    With Vida rejecting a ``domains/health`` claim, the test expects the
    result to be not approved, the PR row to stay ``open`` with the
    rejection and its issue tag recorded, and ``dispose_rejected_pr`` to be
    awaited exactly once.
    """
    conn = phase1b_conn
    _insert_pr(conn, branch="vida/health")

    monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review_reject_vida)
    monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api)
    monkeypatch.setattr("lib.evaluate.post_formal_approvals", AsyncMock())
    dispose = AsyncMock()
    monkeypatch.setattr("lib.evaluate.dispose_rejected_pr", dispose)
    monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock())

    diff = _diff_for("domains/health/claim.md")
    result = await _evaluate_pr_phase1b(
        conn,
        1,
        tier="STANDARD",
        diff=diff,
        review_diff=diff,
        files="domains/health/claim.md",
        branch_name="vida/health",
        eval_attempts=1,
        pr_cost=0,
    )

    assert result["approved"] is False
    assert result["agent_verdicts"] == {"Vida": "request_changes"}

    row = conn.execute(
        "SELECT status, domain_agent, domain_verdict, eval_issues FROM prs WHERE number = 1"
    ).fetchone()
    assert row["status"] == "open"
    assert row["domain_agent"] == "Vida"
    assert row["domain_verdict"] == "request_changes"
    # The issue tag comes from the ISSUES marker in the fake review text.
    assert "factual_discrepancy" in row["eval_issues"]
    dispose.assert_awaited_once()
@pytest.mark.asyncio
async def test_evaluate_pr_flag_uses_phase1b_and_not_legacy_reviewers(phase1b_conn, monkeypatch):
    """With the Phase 1b flag on, ``evaluate_pr`` bypasses the legacy reviewers.

    The test enables ``config.PHASE1B_AGENT_ROUTING_ENABLED`` and expects the
    result to be flagged ``phase1b`` with Rio's approval, while neither
    ``run_domain_review`` nor ``run_leo_review`` is awaited.
    """
    conn = phase1b_conn
    _insert_pr(conn, branch="rio/x402")

    monkeypatch.setattr(config, "PHASE1B_AGENT_ROUTING_ENABLED", True)
    monkeypatch.setattr(
        "lib.evaluate.get_pr_diff",
        AsyncMock(return_value=_diff_for("domains/internet-finance/x402.md")),
    )
    monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review)

    # Legacy reviewers are mocked only so the test can prove they stay unused.
    legacy_domain = AsyncMock()
    legacy_leo = AsyncMock()
    monkeypatch.setattr("lib.evaluate.run_domain_review", legacy_domain)
    monkeypatch.setattr("lib.evaluate.run_leo_review", legacy_leo)

    monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api)
    monkeypatch.setattr("lib.evaluate.post_formal_approvals", AsyncMock())
    monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock())

    result = await evaluate_pr(conn, 1, tier="STANDARD")

    assert result["phase1b"] is True
    assert result["agent_verdicts"] == {"Rio": "approve"}
    legacy_domain.assert_not_awaited()
    legacy_leo.assert_not_awaited()
@pytest.mark.asyncio
async def test_phase1b_review_comment_is_idempotent(monkeypatch):
    """No new comment is posted when the PR already has this agent's review.

    The fake API answers every ``GET`` with a comment carrying the
    ``PHASE1B_REVIEW:PR=7:AGENT=RIO`` marker. The test expects
    ``_post_phase1b_review_comment`` to return ``False`` and to make the
    ``GET`` call only.
    """
    calls = []

    async def fake_api(method, path, body=None, token=None):
        # Record every call so the test can assert that nothing was posted.
        calls.append((method, path, body))
        if method == "GET":
            return [{"body": "<!-- PHASE1B_REVIEW:PR=7:AGENT=RIO -->\nold review"}]
        return {"id": 1}

    monkeypatch.setattr("lib.evaluate.forgejo_api", fake_api)

    posted = await _post_phase1b_review_comment(7, "Rio", "new review\n<!-- VERDICT:RIO:APPROVE -->")

    assert posted is False
    assert [call[0] for call in calls] == ["GET"]