238 lines
8.6 KiB
Python
238 lines
8.6 KiB
Python
"""Tests for Phase 1b eval integration."""
|
|
|
|
import sqlite3
|
|
from unittest.mock import AsyncMock
|
|
|
|
import pytest
|
|
|
|
from lib import config
|
|
from lib.evaluate import _evaluate_pr_phase1b, _post_phase1b_review_comment, evaluate_pr
|
|
|
|
|
|
@pytest.fixture
def phase1b_conn():
    """Yield an in-memory SQLite connection preloaded with the test schema.

    The connection uses ``sqlite3.Row`` as its row factory so tests can read
    columns by name. Five tables are created: ``prs``, ``sources``,
    ``audit_log``, ``review_records`` and ``costs``. The connection is closed
    after the ``yield`` returns control to this fixture.
    """
    schema = """
        CREATE TABLE prs (
            number INTEGER PRIMARY KEY,
            source_path TEXT,
            branch TEXT,
            status TEXT NOT NULL DEFAULT 'open',
            domain TEXT,
            agent TEXT,
            tier TEXT,
            tier0_pass INTEGER,
            leo_verdict TEXT DEFAULT 'pending',
            domain_verdict TEXT DEFAULT 'pending',
            domain_agent TEXT,
            domain_model TEXT,
            eval_attempts INTEGER DEFAULT 0,
            eval_issues TEXT DEFAULT '[]',
            merge_cycled INTEGER DEFAULT 0,
            last_error TEXT,
            last_attempt TEXT,
            cost_usd REAL DEFAULT 0,
            auto_merge INTEGER DEFAULT 0,
            created_at TEXT DEFAULT (datetime('now')),
            merged_at TEXT
        );
        CREATE TABLE sources (
            path TEXT PRIMARY KEY,
            status TEXT DEFAULT 'extracted',
            feedback TEXT
        );
        CREATE TABLE audit_log (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            stage TEXT,
            event TEXT,
            detail TEXT
        );
        CREATE TABLE review_records (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pr_number INTEGER NOT NULL,
            claim_path TEXT,
            domain TEXT,
            agent TEXT,
            reviewer TEXT,
            reviewer_model TEXT,
            outcome TEXT NOT NULL,
            rejection_reason TEXT,
            disagreement_type TEXT,
            notes TEXT,
            batch_id TEXT,
            claims_in_batch INTEGER,
            reviewed_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE costs (
            date TEXT,
            model TEXT,
            stage TEXT,
            calls INTEGER DEFAULT 0,
            input_tokens INTEGER DEFAULT 0,
            output_tokens INTEGER DEFAULT 0,
            cost_usd REAL DEFAULT 0,
            duration_ms INTEGER DEFAULT 0,
            cache_read_tokens INTEGER DEFAULT 0,
            cache_write_tokens INTEGER DEFAULT 0,
            cost_estimate_usd REAL DEFAULT 0,
            PRIMARY KEY (date, model, stage)
        );
        """

    db = sqlite3.connect(":memory:")
    db.row_factory = sqlite3.Row
    db.executescript(schema)

    yield db

    # Teardown: release the in-memory database once the test has finished.
    db.close()
|
|
|
|
|
|
def _diff_for(*paths: str) -> str:
    """Build a minimal fake git diff that touches each of ``paths``.

    Every path contributes one ``diff --git`` header followed by two added
    lines (``+type: claim`` and ``+description: test``). Sections are joined
    with a newline; with no paths the result is the empty string.
    """
    sections = [
        f"diff --git a/{p} b/{p}\n+type: claim\n+description: test"
        for p in paths
    ]
    return "\n".join(sections)
|
|
|
|
|
|
def _insert_pr(conn, number=1, branch="rio/test", source_path="inbox/archive/test.md"):
    """Seed one open PR row, and its source row, into the test database.

    Args:
        conn: SQLite connection that already has the ``sources`` and ``prs``
            tables.
        number: PR number, used as the primary key of the ``prs`` row.
        branch: Branch name stored on the PR.
        source_path: Path stored on the PR and used as the ``sources`` key.

    The PR is inserted with status ``'open'``, tier ``'STANDARD'``,
    ``tier0_pass = 1``, both verdicts ``'pending'`` and ``eval_attempts = 0``.
    The helper may be called repeatedly with different ``number`` values that
    share one ``source_path``; the source row is only created the first time.
    """
    # OR IGNORE: ``sources.path`` is a primary key, so a plain INSERT would
    # raise IntegrityError as soon as a second PR reuses the same source
    # (which is what happens with the default ``source_path``).
    conn.execute(
        "INSERT OR IGNORE INTO sources (path, status) VALUES (?, ?)",
        (source_path, "extracted"),
    )
    conn.execute(
        """INSERT INTO prs
           (number, source_path, branch, status, tier, tier0_pass, leo_verdict, domain_verdict, eval_attempts)
           VALUES (?, ?, ?, 'open', 'STANDARD', 1, 'pending', 'pending', 0)""",
        (number, source_path, branch),
    )
|
|
|
|
|
|
async def _fake_agent_review(_diff, _files, agent, _route_context, tier="STANDARD"):
    """Fake agent review that always approves.

    Only ``agent`` is used; the other arguments are accepted and ignored.

    Returns:
        A ``(review_text, usage)`` tuple. ``review_text`` ends with an
        ``APPROVE`` verdict marker carrying the upper-cased agent name, and
        ``usage`` is a fixed token-count dict.
    """
    usage = {"prompt_tokens": 10, "completion_tokens": 5}
    review_text = f"{agent} review\n<!-- VERDICT:{agent.upper()}:APPROVE -->"
    return review_text, usage
|
|
|
|
|
|
async def _fake_agent_review_reject_vida(_diff, _files, agent, _route_context, tier="STANDARD"):
    """Fake agent review in which only the agent named ``"Vida"`` objects.

    ``"Vida"`` (exact, case-sensitive match) returns a ``REQUEST_CHANGES``
    verdict together with a ``factual_discrepancy`` issues marker; every
    other agent approves with no issues marker. Only ``agent`` is used.

    Returns:
        A ``(review_text, usage)`` tuple with a fixed token-count dict.
    """
    if agent == "Vida":
        verdict = "REQUEST_CHANGES"
        issues = "\n<!-- ISSUES: factual_discrepancy -->"
    else:
        verdict = "APPROVE"
        issues = ""

    usage = {"prompt_tokens": 10, "completion_tokens": 5}
    review_text = f"{agent} review{issues}\n<!-- VERDICT:{agent.upper()}:{verdict} -->"
    return review_text, usage
|
|
|
|
|
|
async def _fake_forgejo_api(method, path, body=None, token=None):
    """Fake Forgejo API call returning canned responses.

    - ``GET`` with ``"comments"`` in the path: an empty list.
    - any other ``GET`` with ``"pulls/"`` in the path: a dict whose user
      login is ``"contributor"``.
    - everything else (including all non-GET methods): ``{"id": 1}``.

    ``body`` and ``token`` are accepted and ignored.
    """
    if method != "GET":
        return {"id": 1}
    # The comments check comes first: a comments URL may also contain "pulls/".
    if "comments" in path:
        return []
    if "pulls/" in path:
        return {"user": {"login": "contributor"}}
    return {"id": 1}
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_phase1b_cross_domain_approves_after_all_required_agents(phase1b_conn, monkeypatch):
    """A PR touching two domains is approved once both domain agents approve.

    The diff covers ``ai-systems`` and ``internet-finance``; with the
    always-approve fake reviewer the test expects verdicts from Theseus and
    Rio, an ``approved`` PR row with domain ``multi``, a skipped Leo verdict,
    two review records, and exactly one formal-approval call.
    """
    db = phase1b_conn
    _insert_pr(db, branch="rio/ai-x402")

    formal_approvals = AsyncMock()
    monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review)
    monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api)
    monkeypatch.setattr("lib.evaluate.post_formal_approvals", formal_approvals)
    monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock())

    changed_files = (
        "domains/ai-systems/agent-wallets.md",
        "domains/internet-finance/x402.md",
    )
    fake_diff = _diff_for(*changed_files)

    outcome = await _evaluate_pr_phase1b(
        db,
        1,
        tier="STANDARD",
        diff=fake_diff,
        review_diff=fake_diff,
        files="\n".join(changed_files),
        branch_name="rio/ai-x402",
        eval_attempts=1,
        pr_cost=0,
    )

    expected_agents = {"Theseus", "Rio"}

    # Returned result.
    assert outcome["approved"] is True
    assert set(outcome["agent_verdicts"]) == expected_agents

    # Persisted PR state.
    pr_row = db.execute(
        "SELECT status, domain, domain_agent, leo_verdict, domain_verdict FROM prs WHERE number = 1"
    ).fetchone()
    assert pr_row["status"] == "approved"
    assert pr_row["domain"] == "multi"
    assert pr_row["leo_verdict"] == "skipped"
    assert pr_row["domain_verdict"] == "approve"
    assert pr_row["domain_agent"] in expected_agents

    # One review record per agent.
    count_row = db.execute(
        "SELECT COUNT(*) AS n FROM review_records WHERE pr_number = 1"
    ).fetchone()
    assert count_row["n"] == 2

    record_rows = db.execute("SELECT agent FROM review_records WHERE pr_number = 1").fetchall()
    recorded_agents = {record["agent"] for record in record_rows}
    assert recorded_agents == expected_agents

    formal_approvals.assert_awaited_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_phase1b_request_changes_blocks_merge(phase1b_conn, monkeypatch):
    """A ``REQUEST_CHANGES`` verdict from the domain agent blocks approval.

    With a health-domain diff and a fake reviewer in which Vida rejects, the
    test expects an unapproved result, the PR row left ``open`` with Vida's
    ``request_changes`` verdict and the reported issue tag, and one call to
    the rejected-PR disposal hook.
    """
    db = phase1b_conn
    _insert_pr(db, branch="vida/health")

    dispose_hook = AsyncMock()
    monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review_reject_vida)
    monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api)
    monkeypatch.setattr("lib.evaluate.post_formal_approvals", AsyncMock())
    monkeypatch.setattr("lib.evaluate.dispose_rejected_pr", dispose_hook)
    monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock())

    claim_path = "domains/health/claim.md"
    fake_diff = _diff_for(claim_path)

    outcome = await _evaluate_pr_phase1b(
        db,
        1,
        tier="STANDARD",
        diff=fake_diff,
        review_diff=fake_diff,
        files=claim_path,
        branch_name="vida/health",
        eval_attempts=1,
        pr_cost=0,
    )

    # Returned result.
    assert outcome["approved"] is False
    assert outcome["agent_verdicts"] == {"Vida": "request_changes"}

    # Persisted PR state.
    pr_row = db.execute(
        "SELECT status, domain_agent, domain_verdict, eval_issues FROM prs WHERE number = 1"
    ).fetchone()
    assert pr_row["status"] == "open"
    assert pr_row["domain_agent"] == "Vida"
    assert pr_row["domain_verdict"] == "request_changes"
    assert "factual_discrepancy" in pr_row["eval_issues"]

    dispose_hook.assert_awaited_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_evaluate_pr_flag_uses_phase1b_and_not_legacy_reviewers(phase1b_conn, monkeypatch):
    """With the Phase 1b flag on, ``evaluate_pr`` skips the legacy reviewers.

    The config flag ``PHASE1B_AGENT_ROUTING_ENABLED`` is forced to ``True``
    and the legacy domain/Leo reviewers are replaced with mocks. The test
    expects a result marked ``phase1b`` with Rio's approval, and neither
    legacy mock awaited.
    """
    db = phase1b_conn
    _insert_pr(db, branch="rio/x402")

    monkeypatch.setattr(config, "PHASE1B_AGENT_ROUTING_ENABLED", True)

    fetch_diff = AsyncMock(return_value=_diff_for("domains/internet-finance/x402.md"))
    monkeypatch.setattr("lib.evaluate.get_pr_diff", fetch_diff)
    monkeypatch.setattr("lib.evaluate.run_agent_review", _fake_agent_review)

    # Legacy review entry points: these must stay untouched on the Phase 1b path.
    legacy_domain_review = AsyncMock()
    legacy_leo_review = AsyncMock()
    monkeypatch.setattr("lib.evaluate.run_domain_review", legacy_domain_review)
    monkeypatch.setattr("lib.evaluate.run_leo_review", legacy_leo_review)

    monkeypatch.setattr("lib.evaluate.forgejo_api", _fake_forgejo_api)
    monkeypatch.setattr("lib.evaluate.post_formal_approvals", AsyncMock())
    monkeypatch.setattr("lib.evaluate.on_eval_complete", AsyncMock())

    outcome = await evaluate_pr(db, 1, tier="STANDARD")

    assert outcome["phase1b"] is True
    assert outcome["agent_verdicts"] == {"Rio": "approve"}
    legacy_domain_review.assert_not_awaited()
    legacy_leo_review.assert_not_awaited()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_phase1b_review_comment_is_idempotent(monkeypatch):
    """An existing Phase 1b review comment prevents a second post.

    The fake API answers every ``GET`` with one comment that already carries
    the ``PHASE1B_REVIEW:PR=7:AGENT=RIO`` marker. The test expects
    ``_post_phase1b_review_comment`` to return ``False`` and to have issued
    only that single ``GET``.
    """
    recorded = []

    async def fake_api(method, path, body=None, token=None):
        # Log every call so the test can check which HTTP methods were used.
        recorded.append((method, path, body))
        if method != "GET":
            return {"id": 1}
        return [{"body": "<!-- PHASE1B_REVIEW:PR=7:AGENT=RIO -->\nold review"}]

    monkeypatch.setattr("lib.evaluate.forgejo_api", fake_api)

    was_posted = await _post_phase1b_review_comment(
        7, "Rio", "new review\n<!-- VERDICT:RIO:APPROVE -->"
    )

    assert was_posted is False
    methods_used = [entry[0] for entry in recorded]
    assert methods_used == ["GET"]
|