"""Tests for Phase 1b eval integration.""" import sqlite3 from unittest.mock import AsyncMock import pytest from lib import config from lib.evaluate import _evaluate_pr_phase1b, _post_phase1b_review_comment, evaluate_pr @pytest.fixture def phase1b_conn(): conn = sqlite3.connect(":memory:") conn.row_factory = sqlite3.Row conn.executescript( """ CREATE TABLE prs ( number INTEGER PRIMARY KEY, source_path TEXT, branch TEXT, status TEXT NOT NULL DEFAULT 'open', domain TEXT, agent TEXT, tier TEXT, tier0_pass INTEGER, leo_verdict TEXT DEFAULT 'pending', domain_verdict TEXT DEFAULT 'pending', domain_agent TEXT, domain_model TEXT, eval_attempts INTEGER DEFAULT 0, eval_issues TEXT DEFAULT '[]', merge_cycled INTEGER DEFAULT 0, last_error TEXT, last_attempt TEXT, cost_usd REAL DEFAULT 0, auto_merge INTEGER DEFAULT 0, created_at TEXT DEFAULT (datetime('now')), merged_at TEXT ); CREATE TABLE sources ( path TEXT PRIMARY KEY, status TEXT DEFAULT 'extracted', feedback TEXT ); CREATE TABLE audit_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, stage TEXT, event TEXT, detail TEXT ); CREATE TABLE review_records ( id INTEGER PRIMARY KEY AUTOINCREMENT, pr_number INTEGER NOT NULL, claim_path TEXT, domain TEXT, agent TEXT, reviewer TEXT, reviewer_model TEXT, outcome TEXT NOT NULL, rejection_reason TEXT, disagreement_type TEXT, notes TEXT, batch_id TEXT, claims_in_batch INTEGER, reviewed_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE costs ( date TEXT, model TEXT, stage TEXT, calls INTEGER DEFAULT 0, input_tokens INTEGER DEFAULT 0, output_tokens INTEGER DEFAULT 0, cost_usd REAL DEFAULT 0, duration_ms INTEGER DEFAULT 0, cache_read_tokens INTEGER DEFAULT 0, cache_write_tokens INTEGER DEFAULT 0, cost_estimate_usd REAL DEFAULT 0, PRIMARY KEY (date, model, stage) ); """ ) yield conn conn.close() def _diff_for(*paths: str) -> str: return "\n".join(f"diff --git a/{path} b/{path}\n+type: claim\n+description: test" for path in paths) def _insert_pr(conn, number=1, branch="rio/test", source_path="inbox/archive/test.md"): conn.execute("INSERT INTO sources (path, status) VALUES (?, ?)", (source_path, "extracted")) conn.execute( """INSERT INTO prs (number, source_path, branch, status, tier, tier0_pass, leo_verdict, domain_verdict, eval_attempts) VALUES (?, ?, ?, 'open', 'STANDARD', 1, 'pending', 'pending', 0)""", (number, source_path, branch), ) async def _fake_agent_review(_diff, _files, agent, _route_context, tier="STANDARD"): return f"{agent} review\n", { "prompt_tokens": 10, "completion_tokens": 5, } async def _fake_agent_review_reject_vida(_diff, _files, agent, _route_context, tier="STANDARD"): verdict = "REQUEST_CHANGES" if agent == "Vida" else "APPROVE" issues = "\n" if verdict == "REQUEST_CHANGES" else "" return f"{agent} review{issues}\n", { "prompt_tokens": 10, "completion_tokens": 5, } async def _fake_forgejo_api(method, path, body=None, token=None): if method == "GET" and "comments" in path: return [] if method == "GET" and "pulls/" in path: return {"user": {"login": "contributor"}} return {"id": 1} @pytest.mark.asyncio async def test_phase1b_cross_domain_approves_after_all_required_agents(phase1b_conn, monkeypatch): conn = phase1b_conn _insert_pr(conn, branch="rio/ai-x402") monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review) monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api) post_formal = AsyncMock() monkeypatch.setattr("lib.evaluate.post_formal_approvals", post_formal) monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock()) diff = _diff_for("domains/ai-systems/agent-wallets.md", "domains/internet-finance/x402.md") result = await _evaluate_pr_phase1b( conn, 1, tier="STANDARD", diff=diff, review_diff=diff, files="domains/ai-systems/agent-wallets.md\ndomains/internet-finance/x402.md", branch_name="rio/ai-x402", eval_attempts=1, pr_cost=0, ) assert result["approved"] is True assert set(result["agent_verdicts"]) == {"Theseus", "Rio"} row = conn.execute("SELECT status, domain, domain_agent, leo_verdict, domain_verdict FROM prs WHERE number = 1").fetchone() assert row["status"] == "approved" assert row["domain"] == "multi" assert row["leo_verdict"] == "skipped" assert row["domain_verdict"] == "approve" assert row["domain_agent"] in {"Theseus", "Rio"} review_count = conn.execute("SELECT COUNT(*) AS n FROM review_records WHERE pr_number = 1").fetchone()["n"] assert review_count == 2 reviewers = { row["agent"] for row in conn.execute("SELECT agent FROM review_records WHERE pr_number = 1").fetchall() } assert reviewers == {"Theseus", "Rio"} post_formal.assert_awaited_once() @pytest.mark.asyncio async def test_phase1b_request_changes_blocks_merge(phase1b_conn, monkeypatch): conn = phase1b_conn _insert_pr(conn, branch="vida/health") monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review_reject_vida) monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api) monkeypatch.setattr("lib.evaluate.post_formal_approvals", AsyncMock()) dispose = AsyncMock() monkeypatch.setattr("lib.evaluate.dispose_rejected_pr", dispose) monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock()) diff = _diff_for("domains/health/claim.md") result = await _evaluate_pr_phase1b( conn, 1, tier="STANDARD", diff=diff, review_diff=diff, files="domains/health/claim.md", branch_name="vida/health", eval_attempts=1, pr_cost=0, ) assert result["approved"] is False assert result["agent_verdicts"] == {"Vida": "request_changes"} row = conn.execute("SELECT status, domain_agent, domain_verdict, eval_issues FROM prs WHERE number = 1").fetchone() assert row["status"] == "open" assert row["domain_agent"] == "Vida" assert row["domain_verdict"] == "request_changes" assert "factual_discrepancy" in row["eval_issues"] dispose.assert_awaited_once() @pytest.mark.asyncio async def test_evaluate_pr_flag_uses_phase1b_and_not_legacy_reviewers(phase1b_conn, monkeypatch): conn = phase1b_conn _insert_pr(conn, branch="rio/x402") monkeypatch.setattr(config, "PHASE1B_AGENT_ROUTING_ENABLED", True) monkeypatch.setattr("lib.evaluate.get_pr_diff", AsyncMock(return_value=_diff_for("domains/internet-finance/x402.md"))) monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review) legacy_domain = AsyncMock() legacy_leo = AsyncMock() monkeypatch.setattr("lib.evaluate.run_domain_review", legacy_domain) monkeypatch.setattr("lib.evaluate.run_leo_review", legacy_leo) monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api) monkeypatch.setattr("lib.evaluate.post_formal_approvals", AsyncMock()) monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock()) result = await evaluate_pr(conn, 1, tier="STANDARD") assert result["phase1b"] is True assert result["agent_verdicts"] == {"Rio": "approve"} legacy_domain.assert_not_awaited() legacy_leo.assert_not_awaited() @pytest.mark.asyncio async def test_phase1b_review_comment_is_idempotent(monkeypatch): calls = [] async def fake_api(method, path, body=None, token=None): calls.append((method, path, body)) if method == "GET": return [{"body": "\nold review"}] return {"id": 1} monkeypatch.setattr("lib.evaluate.forgejo_api", fake_api) posted = await _post_phase1b_review_comment(7, "Rio", "new review\n") assert posted is False assert [call[0] for call in calls] == ["GET"]