teleo-infrastructure/tests/test_eval_actions.py
m3taversal f46e14dfae
Some checks are pending
CI / lint-and-test (push) Waiting to run
refactor: Phase 4 — extract eval_actions.py, drop underscore prefixes in eval_parse
Three changes:

1. Drop underscore prefixes in eval_parse.py — functions are now the public
   API of the module (filter_diff, parse_verdict, classify_issues, etc.).
   All 12 functions renamed, imports updated in evaluate.py and tests.

2. Extract eval_actions.py from evaluate.py — 3 async PR disposition functions:
   - post_formal_approvals: submit Forgejo reviews from 2 agents
   - terminate_pr: close PR, post rejection comment, requeue source
   - dispose_rejected_pr: disposition logic for rejected PRs on attempt 2+
   evaluate.py drops from ~1140 to 911 lines.

3. 14 new tests in test_eval_actions.py covering all three functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 12:57:51 +01:00

290 lines
11 KiB
Python

"""Tests for lib/eval_actions.py — async PR disposition actions."""
import asyncio
import json
import sqlite3
import sys
import os
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
# Mock heavy dependencies before importing
sys.modules.setdefault("aiohttp", MagicMock())
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from lib.eval_actions import dispose_rejected_pr, post_formal_approvals, terminate_pr
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_db():
"""Create a minimal in-memory DB with prs + sources tables."""
conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("""
CREATE TABLE prs (
number INTEGER PRIMARY KEY,
source_path TEXT,
branch TEXT,
status TEXT NOT NULL DEFAULT 'open',
domain TEXT,
agent TEXT,
auto_merge INTEGER DEFAULT 0,
leo_verdict TEXT DEFAULT 'pending',
domain_verdict TEXT DEFAULT 'pending',
eval_attempts INTEGER DEFAULT 0,
eval_issues TEXT DEFAULT '[]',
merge_cycled INTEGER DEFAULT 0,
merge_failures INTEGER DEFAULT 0,
conflict_rebase_attempts INTEGER DEFAULT 0,
last_error TEXT,
merged_at TEXT,
last_attempt TEXT,
cost_usd REAL DEFAULT 0,
created_at TEXT DEFAULT (datetime('now'))
)
""")
conn.execute("""
CREATE TABLE sources (
path TEXT PRIMARY KEY,
status TEXT DEFAULT 'extracted',
feedback TEXT,
updated_at TEXT
)
""")
conn.execute("""
CREATE TABLE audit_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
stage TEXT,
event TEXT,
detail TEXT,
ts TEXT DEFAULT (datetime('now'))
)
""")
return conn
def _insert_pr(conn, number=100, status="open", source_path=None, eval_issues=None, **kwargs):
"""Insert a test PR row."""
cols = ["number", "status"]
vals = [number, status]
if source_path is not None:
cols.append("source_path")
vals.append(source_path)
if eval_issues is not None:
cols.append("eval_issues")
vals.append(eval_issues)
for k, v in kwargs.items():
cols.append(k)
vals.append(v)
placeholders = ", ".join("?" * len(cols))
conn.execute(f"INSERT INTO prs ({', '.join(cols)}) VALUES ({placeholders})", vals)
conn.commit()
def _run(coro):
"""Run async coroutine in sync test."""
return asyncio.run(coro)
# ---------------------------------------------------------------------------
# post_formal_approvals
# ---------------------------------------------------------------------------
class TestPostFormalApprovals:
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
@patch("lib.eval_actions.get_agent_token")
def test_posts_two_approvals(self, mock_token, mock_api):
mock_token.return_value = "fake-token"
mock_api.return_value = {"id": 1}
_run(post_formal_approvals(100, "epimetheus"))
# Should have called forgejo_api exactly 2 times (first 2 agents that aren't author)
assert mock_api.call_count == 2
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
@patch("lib.eval_actions.get_agent_token")
def test_skips_pr_author(self, mock_token, mock_api):
mock_token.return_value = "fake-token"
mock_api.return_value = {"id": 1}
_run(post_formal_approvals(100, "leo"))
# Leo is first in the list, should be skipped, next 2 agents used
calls = mock_api.call_args_list
for call in calls:
assert "leo" not in str(call).lower() or "token" in str(call)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
@patch("lib.eval_actions.get_agent_token")
def test_handles_no_tokens(self, mock_token, mock_api):
mock_token.return_value = None
_run(post_formal_approvals(100, "someone"))
# No tokens available — no API calls
mock_api.assert_not_called()
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
@patch("lib.eval_actions.get_agent_token")
def test_continues_on_api_failure(self, mock_token, mock_api):
mock_token.return_value = "fake-token"
# First call fails (returns None), subsequent succeed
mock_api.side_effect = [None, {"id": 1}, {"id": 2}, {"id": 3}]
_run(post_formal_approvals(100, "nobody"))
# Should keep trying until 2 successes
assert mock_api.call_count >= 3
# ---------------------------------------------------------------------------
# terminate_pr
# ---------------------------------------------------------------------------
class TestTerminatePr:
@patch("lib.eval_actions.close_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_closes_pr_and_requeues_source(self, mock_api, mock_close):
mock_api.return_value = {"id": 1}
mock_close.return_value = True
conn = _make_db()
_insert_pr(conn, 100, source_path="inbox/queue/test.md",
eval_issues='["factual_discrepancy"]')
conn.execute("INSERT INTO sources (path, status) VALUES (?, ?)",
("inbox/queue/test.md", "extracted"))
conn.commit()
_run(terminate_pr(conn, 100, "test reason"))
# close_pr called
mock_close.assert_called_once()
# Source requeued
src = conn.execute("SELECT status FROM sources WHERE path = ?",
("inbox/queue/test.md",)).fetchone()
assert src["status"] == "needs_reextraction"
# Audit logged
audit = conn.execute("SELECT * FROM audit_log WHERE event = 'pr_terminated'").fetchone()
assert audit is not None
@patch("lib.eval_actions.close_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_skips_requeue_on_close_failure(self, mock_api, mock_close):
mock_api.return_value = {"id": 1}
mock_close.return_value = False
conn = _make_db()
_insert_pr(conn, 100, source_path="inbox/queue/test.md")
conn.execute("INSERT INTO sources (path, status) VALUES (?, ?)",
("inbox/queue/test.md", "extracted"))
conn.commit()
_run(terminate_pr(conn, 100, "test reason"))
# Source NOT requeued
src = conn.execute("SELECT status FROM sources WHERE path = ?",
("inbox/queue/test.md",)).fetchone()
assert src["status"] == "extracted"
# No audit for pr_terminated
audit = conn.execute("SELECT * FROM audit_log WHERE event = 'pr_terminated'").fetchone()
assert audit is None
@patch("lib.eval_actions.close_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_posts_structured_feedback_with_issues(self, mock_api, mock_close):
mock_api.return_value = {"id": 1}
mock_close.return_value = True
conn = _make_db()
_insert_pr(conn, 100, eval_issues='["frontmatter_schema", "near_duplicate"]')
_run(terminate_pr(conn, 100, "budget exhausted"))
# Comment posted with structured feedback
comment_call = mock_api.call_args_list[0]
body = comment_call[0][2]["body"]
assert "Closed by eval pipeline" in body
@patch("lib.eval_actions.close_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_handles_no_source_path(self, mock_api, mock_close):
mock_api.return_value = {"id": 1}
mock_close.return_value = True
conn = _make_db()
_insert_pr(conn, 100) # No source_path
_run(terminate_pr(conn, 100, "no source"))
# Should not crash, audit should still be logged
audit = conn.execute("SELECT * FROM audit_log WHERE event = 'pr_terminated'").fetchone()
assert audit is not None
# ---------------------------------------------------------------------------
# dispose_rejected_pr
# ---------------------------------------------------------------------------
class TestDisposeRejectedPr:
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_attempt_1_posts_feedback_only(self, mock_api):
mock_api.return_value = {"id": 1}
conn = _make_db()
_insert_pr(conn, 100)
_run(dispose_rejected_pr(conn, 100, eval_attempts=1, all_issues=["factual_discrepancy"]))
# Should post feedback comment but NOT close
assert mock_api.call_count == 1 # Just the comment
pr = conn.execute("SELECT status FROM prs WHERE number = 100").fetchone()
assert pr["status"] == "open" # Still open
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_attempt_1_no_issues_no_action(self, mock_api):
conn = _make_db()
_insert_pr(conn, 100)
_run(dispose_rejected_pr(conn, 100, eval_attempts=1, all_issues=[]))
mock_api.assert_not_called()
@patch("lib.eval_actions.terminate_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_attempt_max_terminates(self, mock_api, mock_terminate):
conn = _make_db()
_insert_pr(conn, 100)
_run(dispose_rejected_pr(conn, 100, eval_attempts=3, all_issues=["factual_discrepancy"]))
mock_terminate.assert_called_once()
assert "eval budget exhausted" in mock_terminate.call_args[0][2]
@patch("lib.eval_actions.terminate_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_attempt_2_mechanical_keeps_open(self, mock_api, mock_terminate):
conn = _make_db()
_insert_pr(conn, 100)
_run(dispose_rejected_pr(conn, 100, eval_attempts=2, all_issues=["frontmatter_schema"]))
# Should NOT terminate — mechanical issues get one more try
mock_terminate.assert_not_called()
# Should log audit
audit = conn.execute("SELECT * FROM audit_log WHERE event = 'mechanical_retry'").fetchone()
assert audit is not None
@patch("lib.eval_actions.terminate_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_attempt_2_substantive_terminates(self, mock_api, mock_terminate):
conn = _make_db()
_insert_pr(conn, 100)
_run(dispose_rejected_pr(conn, 100, eval_attempts=2, all_issues=["factual_discrepancy"]))
mock_terminate.assert_called_once()
assert "substantive issues" in mock_terminate.call_args[0][2]
@patch("lib.eval_actions.terminate_pr", new_callable=AsyncMock)
@patch("lib.eval_actions.forgejo_api", new_callable=AsyncMock)
def test_attempt_2_mixed_terminates(self, mock_api, mock_terminate):
conn = _make_db()
_insert_pr(conn, 100)
_run(dispose_rejected_pr(conn, 100, eval_attempts=2,
all_issues=["frontmatter_schema", "factual_discrepancy"]))
mock_terminate.assert_called_once()