Some checks are pending
CI / lint-and-test (push) Waiting to run
Three changes: 1. Drop underscore prefixes in eval_parse.py — functions are now the public API of the module (filter_diff, parse_verdict, classify_issues, etc.). All 12 functions renamed, imports updated in evaluate.py and tests. 2. Extract eval_actions.py from evaluate.py — 3 async PR disposition functions: - post_formal_approvals: submit Forgejo reviews from 2 agents - terminate_pr: close PR, post rejection comment, requeue source - dispose_rejected_pr: disposition logic for rejected PRs on attempt 2+ evaluate.py drops from ~1140 to 911 lines. 3. 14 new tests in test_eval_actions.py covering all three functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
315 lines
12 KiB
Python
315 lines
12 KiB
Python
"""Tests for lib/eval_parse.py — pure parsing functions extracted from evaluate.py."""
|
|
|
|
import sys
|
|
import os
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
# Mock heavy dependencies before importing
|
|
sys.modules.setdefault("aiohttp", MagicMock())
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from lib.eval_parse import (
|
|
VALID_ISSUE_TAGS,
|
|
classify_issues,
|
|
deterministic_tier,
|
|
diff_contains_claim_type,
|
|
extract_changed_files,
|
|
filter_diff,
|
|
infer_issues_from_prose,
|
|
is_musings_only,
|
|
normalize_tag,
|
|
parse_batch_response,
|
|
parse_issues,
|
|
parse_verdict,
|
|
validate_batch_fanout,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# filter_diff
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestFilterDiff:
|
|
def test_strips_inbox_files(self):
|
|
diff = (
|
|
"diff --git a/inbox/archive/foo.md b/inbox/archive/foo.md\n"
|
|
"+some content\n"
|
|
"diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
|
|
"+real content\n"
|
|
)
|
|
review_diff, entity_diff = filter_diff(diff)
|
|
assert "inbox" not in review_diff
|
|
assert "claim.md" in review_diff
|
|
|
|
def test_separates_entity_diffs(self):
|
|
diff = (
|
|
"diff --git a/entities/people/alice.md b/entities/people/alice.md\n"
|
|
"+entity content\n"
|
|
"diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
|
|
"+claim content\n"
|
|
)
|
|
review_diff, entity_diff = filter_diff(diff)
|
|
assert "alice.md" in entity_diff
|
|
assert "claim.md" in review_diff
|
|
|
|
def test_core_domain_entities_stay_in_review(self):
|
|
diff = (
|
|
"diff --git a/entities/living-agents/agent.md b/entities/living-agents/agent.md\n"
|
|
"+core entity\n"
|
|
)
|
|
review_diff, entity_diff = filter_diff(diff)
|
|
assert "agent.md" in review_diff
|
|
assert entity_diff == ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# extract_changed_files
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestExtractChangedFiles:
|
|
def test_extracts_paths(self):
|
|
diff = (
|
|
"diff --git a/domains/foo.md b/domains/foo.md\n"
|
|
"+content\n"
|
|
"diff --git a/entities/bar.md b/entities/bar.md\n"
|
|
"+more\n"
|
|
)
|
|
result = extract_changed_files(diff)
|
|
assert "domains/foo.md" in result
|
|
assert "entities/bar.md" in result
|
|
|
|
def test_empty_diff(self):
|
|
assert extract_changed_files("") == ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# is_musings_only
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestIsMusingsOnly:
|
|
def test_musings_only(self):
|
|
diff = "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
|
|
assert is_musings_only(diff) is True
|
|
|
|
def test_mixed_files(self):
|
|
diff = (
|
|
"diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
|
|
"diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"
|
|
)
|
|
assert is_musings_only(diff) is False
|
|
|
|
def test_no_musings(self):
|
|
diff = "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"
|
|
assert is_musings_only(diff) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# diff_contains_claim_type
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestDiffContainsClaimType:
|
|
def test_detects_claim_type(self):
|
|
diff = "+type: claim\n+confidence: 0.8\n"
|
|
assert diff_contains_claim_type(diff) is True
|
|
|
|
def test_ignores_non_claim(self):
|
|
diff = "+type: entity\n"
|
|
assert diff_contains_claim_type(diff) is False
|
|
|
|
def test_ignores_file_header(self):
|
|
diff = "+++ b/type: claim\n"
|
|
assert diff_contains_claim_type(diff) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# deterministic_tier
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestDeterministicTier:
|
|
def test_entities_only_is_light(self):
|
|
diff = "diff --git a/entities/people/alice.md b/entities/people/alice.md\n+data\n"
|
|
assert deterministic_tier(diff) == "LIGHT"
|
|
|
|
def test_inbox_only_is_light(self):
|
|
diff = "diff --git a/inbox/archive/src.md b/inbox/archive/src.md\n+data\n"
|
|
assert deterministic_tier(diff) == "LIGHT"
|
|
|
|
def test_core_is_deep(self):
|
|
diff = "diff --git a/core/schema.md b/core/schema.md\n+structural change\n"
|
|
assert deterministic_tier(diff) == "DEEP"
|
|
|
|
def test_challenged_by_is_deep(self):
|
|
diff = "diff --git a/domains/x/claim.md b/domains/x/claim.md\n+challenged_by: some-claim\n"
|
|
assert deterministic_tier(diff) == "DEEP"
|
|
|
|
def test_domain_claim_returns_none(self):
|
|
diff = "diff --git a/domains/finance/new-claim.md b/domains/finance/new-claim.md\n--- /dev/null\n+++ b/domains/finance/new-claim.md\n+type: claim\n"
|
|
assert deterministic_tier(diff) is None
|
|
|
|
def test_empty_diff_returns_none(self):
|
|
assert deterministic_tier("") is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_verdict
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestParseVerdict:
|
|
def test_approve(self):
|
|
assert parse_verdict("VERDICT:LEO:APPROVE", "LEO") == "approve"
|
|
|
|
def test_request_changes(self):
|
|
assert parse_verdict("VERDICT:LEO:REQUEST_CHANGES", "LEO") == "request_changes"
|
|
|
|
def test_no_verdict_defaults_to_request_changes(self):
|
|
assert parse_verdict("some review text", "LEO") == "request_changes"
|
|
|
|
def test_case_insensitive_reviewer(self):
|
|
assert parse_verdict("VERDICT:LEO:APPROVE", "leo") == "approve"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# normalize_tag
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestNormalizeTag:
|
|
def test_valid_tag_passes_through(self):
|
|
assert normalize_tag("broken_wiki_links") == "broken_wiki_links"
|
|
|
|
def test_alias_normalizes(self):
|
|
assert normalize_tag("schema_violation") == "frontmatter_schema"
|
|
assert normalize_tag("duplicate") == "near_duplicate"
|
|
assert normalize_tag("factual_error") == "factual_discrepancy"
|
|
|
|
def test_hyphen_to_underscore(self):
|
|
assert normalize_tag("unverified-wiki-links") == "broken_wiki_links"
|
|
|
|
def test_unknown_returns_none(self):
|
|
assert normalize_tag("totally_made_up") is None
|
|
|
|
def test_fuzzy_substring_match(self):
|
|
assert normalize_tag("wiki_links") == "broken_wiki_links"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_issues
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestParseIssues:
|
|
def test_structured_comment(self):
|
|
text = "Review text. <!-- ISSUES: schema_violation, duplicate -->"
|
|
result = parse_issues(text)
|
|
assert "frontmatter_schema" in result
|
|
assert "near_duplicate" in result
|
|
|
|
def test_fallback_to_prose(self):
|
|
text = "The frontmatter is missing required fields."
|
|
result = parse_issues(text)
|
|
assert "frontmatter_schema" in result
|
|
|
|
def test_empty_structured_falls_back(self):
|
|
text = "Review. <!-- ISSUES: totally_unknown_tag -->"
|
|
# All tags unrecognizable, falls through to prose
|
|
result = parse_issues(text)
|
|
# With no prose keywords either, should return empty
|
|
assert isinstance(result, list)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# infer_issues_from_prose
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestInferIssuesFromProse:
|
|
def test_detects_frontmatter_issues(self):
|
|
result = infer_issues_from_prose("The YAML frontmatter is invalid.")
|
|
assert "frontmatter_schema" in result
|
|
|
|
def test_detects_wiki_link_issues(self):
|
|
result = infer_issues_from_prose("Found broken wiki links in the claim.")
|
|
assert "broken_wiki_links" in result
|
|
|
|
def test_detects_near_duplicate(self):
|
|
result = infer_issues_from_prose("This is a near-duplicate of an existing claim.")
|
|
assert "near_duplicate" in result
|
|
|
|
def test_no_matches(self):
|
|
result = infer_issues_from_prose("This claim is well-structured and accurate.")
|
|
assert result == []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# classify_issues
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestClassifyIssues:
|
|
def test_mechanical_only(self):
|
|
assert classify_issues(["frontmatter_schema", "near_duplicate"]) == "mechanical"
|
|
|
|
def test_substantive_only(self):
|
|
assert classify_issues(["factual_discrepancy"]) == "substantive"
|
|
|
|
def test_mixed(self):
|
|
assert classify_issues(["frontmatter_schema", "factual_discrepancy"]) == "mixed"
|
|
|
|
def test_empty(self):
|
|
assert classify_issues([]) == "unknown"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_batch_response
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestParseBatchResponse:
|
|
def test_parses_multiple_prs(self):
|
|
response = (
|
|
"=== PR #100\nReview for 100\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
|
|
"=== PR #200\nReview for 200\n<!-- PR:200 VERDICT:AGENT:REQUEST_CHANGES -->\n"
|
|
)
|
|
result = parse_batch_response(response, [100, 200], "agent")
|
|
assert 100 in result
|
|
assert 200 in result
|
|
assert "APPROVE" in result[100]
|
|
assert "REQUEST_CHANGES" in result[200]
|
|
|
|
def test_missing_pr(self):
|
|
response = "=== PR #100\nReview\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
|
|
result = parse_batch_response(response, [100, 200], "agent")
|
|
assert 100 in result
|
|
assert 200 not in result
|
|
|
|
def test_empty_response(self):
|
|
result = parse_batch_response("no verdicts here", [100], "agent")
|
|
assert result == {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# validate_batch_fanout
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestValidateBatchFanout:
|
|
def test_missing_pr_falls_back(self):
|
|
parsed = {100: "review for 100"}
|
|
pr_diffs = [
|
|
{"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
|
|
{"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
|
|
]
|
|
valid, fallback = validate_batch_fanout(parsed, pr_diffs, "agent")
|
|
assert 200 in fallback
|
|
assert 200 not in valid
|
|
|
|
def test_all_present_passes(self):
|
|
parsed = {
|
|
100: "review mentioning domains/long-enough-path.md",
|
|
200: "review mentioning domains/other-long-path.md",
|
|
}
|
|
pr_diffs = [
|
|
{"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
|
|
{"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
|
|
]
|
|
valid, fallback = validate_batch_fanout(parsed, pr_diffs, "agent")
|
|
assert 100 in valid
|
|
assert 200 in valid
|
|
assert fallback == []
|