From 376b77999f68155f7f22eb6348000807ba106897 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Thu, 16 Apr 2026 12:40:17 +0100
Subject: [PATCH] =?UTF-8?q?refactor:=20Phase=203=20=E2=80=94=20fix=20close?=
 =?UTF-8?q?=5Fpr=20ghost=20bug,=20wire=20stale=5Fpr,=20extract=20eval=5Fpa?=
 =?UTF-8?q?rse?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Critical bug fix: close_pr now checks forgejo_api return value and
skips DB update on Forgejo failure, preventing ghost PRs (DB closed,
Forgejo open). Returns bool so callers can handle failures.

_terminate_pr checks return value — skips source requeue on failure.
stale_pr.py migrated from raw Forgejo+DB to close_pr (last raw close
transition eliminated).

eval_parse.py: 15 pure parsing functions extracted from evaluate.py
(~370 lines removed). Zero I/O, zero async, independently testable.
evaluate.py drops from ~1510 to ~1140 lines.

Tests: 295 passed (42 new eval_parse + 2 new close_pr), zero regressions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 lib/eval_parse.py        | 434 +++++++++++++++++++++++++++++++++++++++
 lib/evaluate.py          | 433 ++------------------------------------
 lib/pr_state.py          |  14 +-
 lib/stale_pr.py          |  16 +-
 tests/test_eval_parse.py | 315 ++++++++++++++++++++++++++++
 tests/test_pr_state.py   |  33 ++-
 6 files changed, 815 insertions(+), 430 deletions(-)
 create mode 100644 lib/eval_parse.py
 create mode 100644 tests/test_eval_parse.py

diff --git a/lib/eval_parse.py b/lib/eval_parse.py
new file mode 100644
index 0000000..cc94753
--- /dev/null
+++ b/lib/eval_parse.py
@@ -0,0 +1,434 @@
+"""Pure parsing functions for the eval stage — zero I/O, zero async.
+
+Extracted from evaluate.py to isolate testable parsing logic from
+orchestration, DB, and Forgejo API calls.
+
+Contents:
+- Diff helpers: filter, classify, tier routing
+- Verdict/issue parsing: structured tags + prose inference
+- Batch response parsing: fan-out validation
+
+All functions are pure (input → output). The only external dependency
+is config.MECHANICAL_ISSUE_TAGS / config.SUBSTANTIVE_ISSUE_TAGS for
+_classify_issues.
+"""
+
+import logging
+import re
+
+from . import config
+
+logger = logging.getLogger("pipeline.eval_parse")
+
+
+# ─── Diff helpers ──────────────────────────────────────────────────────────
+
+
+def _filter_diff(diff: str) -> tuple[str, str]:
+    """Filter diff to only review-relevant files.
+
+    Returns (review_diff, entity_diff).
+    Strips: inbox/, schemas/, skills/, agents/*/musings/
+    """
+    sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE)
+    skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"]
+    core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"}
+
+    claim_sections = []
+    entity_sections = []
+
+    for section in sections:
+        if not section.strip():
+            continue
+        if any(re.match(p, section) for p in skip_patterns):
+            continue
+        entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section)
+        if entity_match and entity_match.group(1) not in core_domains:
+            entity_sections.append(section)
+            continue
+        claim_sections.append(section)
+
+    return "".join(claim_sections), "".join(entity_sections)
+
+
+def _extract_changed_files(diff: str) -> str:
+    """Extract changed file paths from diff."""
+    return "\n".join(
+        line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git")
+    )
+
+
+def _is_musings_only(diff: str) -> bool:
+    """Check if PR only modifies musing files."""
+    has_musings = False
+    has_other = False
+    for line in diff.split("\n"):
+        if line.startswith("diff --git"):
+            if "agents/" in line and "/musings/" in line:
+                has_musings = True
+            else:
+                has_other = True
+    return has_musings and not has_other
+
+
+def _diff_contains_claim_type(diff: str) -> bool:
+    """Claim-shape detector: check if any file in diff has type: claim in frontmatter.
+
+    Mechanical check ($0). If YAML declares type: claim, this is a factual claim —
+    not an entity update or formatting fix. Must be classified STANDARD minimum
+    regardless of Haiku triage. Catches factual claims disguised as LIGHT content.
+    (Theseus: converts semantic problem to mechanical check)
+    """
+    for line in diff.split("\n"):
+        if line.startswith("+") and not line.startswith("+++"):
+            stripped = line[1:].strip()
+            if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"):
+                return True
+    return False
+
+
+def _deterministic_tier(diff: str) -> str | None:
+    """Deterministic tier routing — skip Haiku triage for obvious cases.
+
+    Checks diff file patterns before calling the LLM. Returns tier string
+    if deterministic, None if Haiku triage is needed.
+
+    Rules (Leo-calibrated):
+    - All files in entities/ only → LIGHT
+    - All files in inbox/ only (queue, archive, null-result) → LIGHT
+    - Any file in core/ or foundations/ → DEEP (structural KB changes)
+    - Has challenged_by field → DEEP (challenges existing claims)
+    - Modifies existing file (not new) in domains/ → DEEP (enrichment/change)
+    - Otherwise → None (needs Haiku triage)
+
+    NOTE: Cross-domain wiki links are NOT a DEEP signal — most claims link
+    across domains, that's the whole point of the knowledge graph (Leo).
+    """
+    changed_files = []
+    for line in diff.split("\n"):
+        if line.startswith("diff --git a/"):
+            path = line.replace("diff --git a/", "").split(" b/")[0]
+            changed_files.append(path)
+
+    if not changed_files:
+        return None
+
+    # All entities/ only → LIGHT
+    if all(f.startswith("entities/") for f in changed_files):
+        logger.info("Deterministic tier: LIGHT (all files in entities/)")
+        return "LIGHT"
+
+    # All inbox/ only (queue, archive, null-result) → LIGHT
+    if all(f.startswith("inbox/") for f in changed_files):
+        logger.info("Deterministic tier: LIGHT (all files in inbox/)")
+        return "LIGHT"
+
+    # Any file in core/ or foundations/ → DEEP (structural KB changes)
+    if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files):
+        logger.info("Deterministic tier: DEEP (touches core/ or foundations/)")
+        return "DEEP"
+
+    # Check diff content for DEEP signals
+    has_challenged_by = False
+    new_files: set[str] = set()
+
+    lines = diff.split("\n")
+    for i, line in enumerate(lines):
+        # Detect new files
+        if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
+            new_files.add(lines[i + 1][6:])
+        # Check for challenged_by field
+        if line.startswith("+") and not line.startswith("+++"):
+            stripped = line[1:].strip()
+            if stripped.startswith("challenged_by:"):
+                has_challenged_by = True
+
+    if has_challenged_by:
+        logger.info("Deterministic tier: DEEP (has challenged_by field)")
+        return "DEEP"
+
+    # NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments
+    # (appending evidence) are common and should be STANDARD. Let Haiku triage
+    # distinguish enrichments from structural changes.
+
+    return None
+
+
+# ─── Verdict parsing ──────────────────────────────────────────────────────
+
+
+def _parse_verdict(review_text: str, reviewer: str) -> str:
+    """Parse VERDICT tag from review. Returns 'approve' or 'request_changes'."""
+    upper = reviewer.upper()
+    if f"VERDICT:{upper}:APPROVE" in review_text:
+        return "approve"
+    elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text:
+        return "request_changes"
+    else:
+        logger.warning("No parseable verdict from %s — treating as request_changes", reviewer)
+        return "request_changes"
+
+
+# Map model-invented tags to valid tags. Models consistently ignore the valid
+# tag list and invent their own. This normalizes them. (Ganymede, Mar 14)
+_TAG_ALIASES: dict[str, str] = {
+    "schema_violation": "frontmatter_schema",
+    "missing_schema_fields": "frontmatter_schema",
+    "missing_schema": "frontmatter_schema",
+    "schema": "frontmatter_schema",
+    "missing_frontmatter": "frontmatter_schema",
+    "redundancy": "near_duplicate",
+    "duplicate": "near_duplicate",
+    "missing_confidence": "confidence_miscalibration",
+    "confidence_error": "confidence_miscalibration",
+    "vague_claims": "scope_error",
+    "unfalsifiable": "scope_error",
+    "unverified_wiki_links": "broken_wiki_links",
+    "unverified-wiki-links": "broken_wiki_links",
+    "missing_wiki_links": "broken_wiki_links",
+    "invalid_wiki_links": "broken_wiki_links",
+    "wiki_link_errors": "broken_wiki_links",
+    "overclaiming": "title_overclaims",
+    "title_overclaim": "title_overclaims",
+    "date_error": "date_errors",
+    "factual_error": "factual_discrepancy",
+    "factual_inaccuracy": "factual_discrepancy",
+}
+
+VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims",
+                    "confidence_miscalibration", "date_errors", "factual_discrepancy",
+                    "near_duplicate", "scope_error"}
+
+
+def _normalize_tag(tag: str) -> str | None:
+    """Normalize a model-generated tag to a valid tag, or None if unrecognizable."""
+    tag = tag.strip().lower().replace("-", "_")
+    if tag in VALID_ISSUE_TAGS:
+        return tag
+    if tag in _TAG_ALIASES:
+        return _TAG_ALIASES[tag]
+    # Fuzzy: check if any valid tag is a substring or vice versa
+    for valid in VALID_ISSUE_TAGS:
+        if valid in tag or tag in valid:
+            return valid
+    return None
+
+
+# ─── Issue parsing ─────────────────────────────────────────────────────────
+
+
+# Keyword patterns for inferring issue tags from unstructured review prose.
+# Conservative: only match unambiguous indicators. Order doesn't matter.
+_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = {
+    "frontmatter_schema": [
+        re.compile(r"frontmatter", re.IGNORECASE),
+        re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE),
+        re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE),
+        re.compile(r"required field", re.IGNORECASE),
+        re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE),
+        re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE),
+        re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE),
+    ],
+    "broken_wiki_links": [
+        re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE),
+        re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE),
+        re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE),
+        re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE),
+    ],
+    "factual_discrepancy": [
+        re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE),
+        re.compile(r"misrepresent", re.IGNORECASE),
+    ],
+    "confidence_miscalibration": [
+        re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE),
+        re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE),
+    ],
+    "scope_error": [
+        re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE),
+        re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE),
+        re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE),
+        re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE),
+    ],
+    "title_overclaims": [
+        re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE),
+        re.compile(r"overclaim", re.IGNORECASE),
+    ],
+    "near_duplicate": [
+        re.compile(r"near.?duplicate", re.IGNORECASE),
+        re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE),
+        re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE),
+        re.compile(r"redundan", re.IGNORECASE),
+    ],
+}
+
+
+def _parse_issues(review_text: str) -> list[str]:
+    """Extract issue tags from review.
+
+    First tries structured <!-- ISSUES: tag1, tag2 --> comment with tag normalization.
+    Falls back to keyword inference from prose.
+    """
+    match = re.search(r"<!-- ISSUES: ([^>]+) -->", review_text)
+    if match:
+        raw_tags = [tag.strip() for tag in match.group(1).split(",") if tag.strip()]
+        normalized = []
+        for tag in raw_tags:
+            norm = _normalize_tag(tag)
+            if norm and norm not in normalized:
+                normalized.append(norm)
+            else:
+                logger.debug("Unrecognized issue tag '%s' — dropped", tag)
+        if normalized:
+            return normalized
+    # Fallback: infer tags from review prose
+    return _infer_issues_from_prose(review_text)
+
+
+def _infer_issues_from_prose(review_text: str) -> list[str]:
+    """Infer issue tags from unstructured review text via keyword matching.
+
+    Fallback for reviews that reject without structured <!-- ISSUES: --> tags.
+    Conservative: requires at least one unambiguous keyword match per tag.
+    """
+    inferred = []
+    for tag, patterns in _PROSE_TAG_PATTERNS.items():
+        if any(p.search(review_text) for p in patterns):
+            inferred.append(tag)
+    return inferred
+
+
+def _classify_issues(issues: list[str]) -> str:
+    """Classify issue tags as 'mechanical', 'substantive', or 'mixed'."""
+    if not issues:
+        return "unknown"
+    mechanical = set(issues) & config.MECHANICAL_ISSUE_TAGS
+    substantive = set(issues) & config.SUBSTANTIVE_ISSUE_TAGS
+    if substantive and not mechanical:
+        return "substantive"
+    if mechanical and not substantive:
+        return "mechanical"
+    if mechanical and substantive:
+        return "mixed"
+    return "unknown"  # tags not in either set
+
+
+# ─── Batch response parsing ───────────────────────────────────────────────
+
+
+def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]:
+    """Parse batched domain review into per-PR review sections.
+
+    Returns {pr_number: review_text} for each PR found in the response.
+    Missing PRs are omitted — caller handles fallback.
+    """
+    agent_upper = agent.upper()
+    result: dict[int, str] = {}
+
+    # Split by PR verdict markers: <!-- PR:NNN VERDICT:AGENT:... -->
+    # Each marker terminates the previous PR's section
+    pattern = re.compile(
+        r"<!-- PR:(\d+) VERDICT:" + re.escape(agent_upper) + r":(APPROVE|REQUEST_CHANGES) -->"
+    )
+
+    matches = list(pattern.finditer(response))
+    if not matches:
+        return result
+
+    for i, match in enumerate(matches):
+        pr_num = int(match.group(1))
+        marker_end = match.end()
+
+        # Find the start of this PR's section by looking for the section header
+        # or the end of the previous verdict
+        section_header = f"=== PR #{pr_num}"
+        header_pos = response.rfind(section_header, 0, match.start())
+
+        if header_pos >= 0:
+            # Extract from header to end of verdict marker
+            section_text = response[header_pos:marker_end].strip()
+        else:
+            # No header found — extract from previous marker end to this marker end
+            prev_end = matches[i - 1].end() if i > 0 else 0
+            section_text = response[prev_end:marker_end].strip()
+
+        # Re-format as individual review comment
+        # Strip the batch section header, keep just the review content
+        # Add batch label for traceability
+        pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers)
+        review_text = (
+            f"*(batch review with PRs {pr_nums_str})*\n\n"
+            f"{section_text}\n"
+        )
+        result[pr_num] = review_text
+
+    return result
+
+
+def _validate_batch_fanout(
+    parsed: dict[int, str],
+    pr_diffs: list[dict],
+    agent: str,
+) -> tuple[dict[int, str], list[int]]:
+    """Validate batch fan-out for completeness and cross-contamination.
+
+    Returns (valid_reviews, fallback_pr_numbers).
+    - valid_reviews: reviews that passed validation
+    - fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated)
+    """
+    valid: dict[int, str] = {}
+    fallback: list[int] = []
+
+    # Build file map: pr_number → set of path segments for matching.
+    # Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames
+    # to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3).
+    pr_files: dict[int, set[str]] = {}
+    for pr in pr_diffs:
+        files = set()
+        for line in pr["diff"].split("\n"):
+            if line.startswith("diff --git a/"):
+                path = line.replace("diff --git a/", "").split(" b/")[0]
+                files.add(path)
+                # Also add the last 2 path segments (e.g., "internet-finance/dao.md")
+                # for models that abbreviate paths
+                parts = path.split("/")
+                if len(parts) >= 2:
+                    files.add("/".join(parts[-2:]))
+        pr_files[pr["number"]] = files
+
+    for pr in pr_diffs:
+        pr_num = pr["number"]
+
+        # Completeness check: is there a review for this PR?
+        if pr_num not in parsed:
+            logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num)
+            fallback.append(pr_num)
+            continue
+
+        review = parsed[pr_num]
+
+        # Cross-contamination check: does review mention at least one file from this PR?
+        # Use path segments (min 10 chars) to avoid false substring matches on short names.
+        my_files = pr_files.get(pr_num, set())
+        mentions_own_file = any(f in review for f in my_files if len(f) >= 10)
+
+        if not mentions_own_file and my_files:
+            # Check if it references files from OTHER PRs (cross-contamination signal)
+            other_files = set()
+            for other_pr in pr_diffs:
+                if other_pr["number"] != pr_num:
+                    other_files.update(pr_files.get(other_pr["number"], set()))
+            mentions_other = any(f in review for f in other_files if len(f) >= 10)
+
+            if mentions_other:
+                logger.warning(
+                    "Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback",
+                    pr_num,
+                )
+                fallback.append(pr_num)
+                continue
+            # If it doesn't mention any files at all, could be a generic review — accept it
+            # (some PRs have short diffs where the model doesn't reference filenames)
+
+        valid[pr_num] = review
+
+    return valid, fallback
diff --git a/lib/evaluate.py b/lib/evaluate.py
index 47812b9..ad726c2 100644
--- a/lib/evaluate.py
+++ b/lib/evaluate.py
@@ -21,11 +21,25 @@ LLM transport and prompts extracted to lib/llm.py (Phase 3c).
 import json
 import logging
 import random
-import re
 from datetime import datetime, timezone
 
 from . import config, db
 from .domains import agent_for_domain, detect_domain_from_branch, detect_domain_from_diff
+from .eval_parse import (
+    VALID_ISSUE_TAGS,
+    _classify_issues,
+    _deterministic_tier,
+    _diff_contains_claim_type,
+    _extract_changed_files,
+    _filter_diff,
+    _infer_issues_from_prose,
+    _is_musings_only,
+    _normalize_tag,
+    _parse_batch_response,
+    _parse_issues,
+    _parse_verdict,
+    _validate_batch_fanout,
+)
 from .forgejo import api as forgejo_api
 from .forgejo import get_agent_token, get_pr_diff, repo_path
 from .merge import PIPELINE_OWNED_PREFIXES
@@ -37,289 +51,12 @@ from .validate import load_existing_claims
 logger = logging.getLogger("pipeline.evaluate")
 
 
-# ─── Diff helpers ──────────────────────────────────────────────────────────
-
-
-def _filter_diff(diff: str) -> tuple[str, str]:
-    """Filter diff to only review-relevant files.
-
-    Returns (review_diff, entity_diff).
-    Strips: inbox/, schemas/, skills/, agents/*/musings/
-    """
-    sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE)
-    skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"]
-    core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"}
-
-    claim_sections = []
-    entity_sections = []
-
-    for section in sections:
-        if not section.strip():
-            continue
-        if any(re.match(p, section) for p in skip_patterns):
-            continue
-        entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section)
-        if entity_match and entity_match.group(1) not in core_domains:
-            entity_sections.append(section)
-            continue
-        claim_sections.append(section)
-
-    return "".join(claim_sections), "".join(entity_sections)
-
-
-def _extract_changed_files(diff: str) -> str:
-    """Extract changed file paths from diff."""
-    return "\n".join(
-        line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git")
-    )
-
-
-def _is_musings_only(diff: str) -> bool:
-    """Check if PR only modifies musing files."""
-    has_musings = False
-    has_other = False
-    for line in diff.split("\n"):
-        if line.startswith("diff --git"):
-            if "agents/" in line and "/musings/" in line:
-                has_musings = True
-            else:
-                has_other = True
-    return has_musings and not has_other
-
-
 # ─── NOTE: Tier 0.5 mechanical pre-check moved to validate.py ────────────
 # Tier 0.5 now runs as part of the validate stage (before eval), not inside
 # evaluate_pr(). This prevents wasting eval_attempts on mechanically fixable
 # PRs. Eval trusts that tier0_pass=1 means all mechanical checks passed.
 
 
-# ─── Tier overrides ───────────────────────────────────────────────────────
-
-
-def _diff_contains_claim_type(diff: str) -> bool:
-    """Claim-shape detector: check if any file in diff has type: claim in frontmatter.
-
-    Mechanical check ($0). If YAML declares type: claim, this is a factual claim —
-    not an entity update or formatting fix. Must be classified STANDARD minimum
-    regardless of Haiku triage. Catches factual claims disguised as LIGHT content.
-    (Theseus: converts semantic problem to mechanical check)
-    """
-    for line in diff.split("\n"):
-        if line.startswith("+") and not line.startswith("+++"):
-            stripped = line[1:].strip()
-            if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"):
-                return True
-    return False
-
-
-def _deterministic_tier(diff: str) -> str | None:
-    """Deterministic tier routing — skip Haiku triage for obvious cases.
-
-    Checks diff file patterns before calling the LLM. Returns tier string
-    if deterministic, None if Haiku triage is needed.
-
-    Rules (Leo-calibrated):
-    - All files in entities/ only → LIGHT
-    - All files in inbox/ only (queue, archive, null-result) → LIGHT
-    - Any file in core/ or foundations/ → DEEP (structural KB changes)
-    - Has challenged_by field → DEEP (challenges existing claims)
-    - Modifies existing file (not new) in domains/ → DEEP (enrichment/change)
-    - Otherwise → None (needs Haiku triage)
-
-    NOTE: Cross-domain wiki links are NOT a DEEP signal — most claims link
-    across domains, that's the whole point of the knowledge graph (Leo).
-    """
-    changed_files = []
-    for line in diff.split("\n"):
-        if line.startswith("diff --git a/"):
-            path = line.replace("diff --git a/", "").split(" b/")[0]
-            changed_files.append(path)
-
-    if not changed_files:
-        return None
-
-    # All entities/ only → LIGHT
-    if all(f.startswith("entities/") for f in changed_files):
-        logger.info("Deterministic tier: LIGHT (all files in entities/)")
-        return "LIGHT"
-
-    # All inbox/ only (queue, archive, null-result) → LIGHT
-    if all(f.startswith("inbox/") for f in changed_files):
-        logger.info("Deterministic tier: LIGHT (all files in inbox/)")
-        return "LIGHT"
-
-    # Any file in core/ or foundations/ → DEEP (structural KB changes)
-    if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files):
-        logger.info("Deterministic tier: DEEP (touches core/ or foundations/)")
-        return "DEEP"
-
-    # Check diff content for DEEP signals
-    has_challenged_by = False
-    has_modified_claim = False
-    new_files: set[str] = set()
-
-    lines = diff.split("\n")
-    for i, line in enumerate(lines):
-        # Detect new files
-        if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
-            new_files.add(lines[i + 1][6:])
-        # Check for challenged_by field
-        if line.startswith("+") and not line.startswith("+++"):
-            stripped = line[1:].strip()
-            if stripped.startswith("challenged_by:"):
-                has_challenged_by = True
-
-    if has_challenged_by:
-        logger.info("Deterministic tier: DEEP (has challenged_by field)")
-        return "DEEP"
-
-    # NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments
-    # (appending evidence) are common and should be STANDARD. Let Haiku triage
-    # distinguish enrichments from structural changes.
-
-    return None
-
-
-# ─── Verdict parsing ──────────────────────────────────────────────────────
-
-
-def _parse_verdict(review_text: str, reviewer: str) -> str:
-    """Parse VERDICT tag from review. Returns 'approve' or 'request_changes'."""
-    upper = reviewer.upper()
-    if f"VERDICT:{upper}:APPROVE" in review_text:
-        return "approve"
-    elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text:
-        return "request_changes"
-    else:
-        logger.warning("No parseable verdict from %s — treating as request_changes", reviewer)
-        return "request_changes"
-
-
-# Map model-invented tags to valid tags. Models consistently ignore the valid
-# tag list and invent their own. This normalizes them. (Ganymede, Mar 14)
-_TAG_ALIASES: dict[str, str] = {
-    "schema_violation": "frontmatter_schema",
-    "missing_schema_fields": "frontmatter_schema",
-    "missing_schema": "frontmatter_schema",
-    "schema": "frontmatter_schema",
-    "missing_frontmatter": "frontmatter_schema",
-    "redundancy": "near_duplicate",
-    "duplicate": "near_duplicate",
-    "missing_confidence": "confidence_miscalibration",
-    "confidence_error": "confidence_miscalibration",
-    "vague_claims": "scope_error",
-    "unfalsifiable": "scope_error",
-    "unverified_wiki_links": "broken_wiki_links",
-    "unverified-wiki-links": "broken_wiki_links",
-    "missing_wiki_links": "broken_wiki_links",
-    "invalid_wiki_links": "broken_wiki_links",
-    "wiki_link_errors": "broken_wiki_links",
-    "overclaiming": "title_overclaims",
-    "title_overclaim": "title_overclaims",
-    "date_error": "date_errors",
-    "factual_error": "factual_discrepancy",
-    "factual_inaccuracy": "factual_discrepancy",
-}
-
-VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims",
-                    "confidence_miscalibration", "date_errors", "factual_discrepancy",
-                    "near_duplicate", "scope_error"}
-
-
-def _normalize_tag(tag: str) -> str | None:
-    """Normalize a model-generated tag to a valid tag, or None if unrecognizable."""
-    tag = tag.strip().lower().replace("-", "_")
-    if tag in VALID_ISSUE_TAGS:
-        return tag
-    if tag in _TAG_ALIASES:
-        return _TAG_ALIASES[tag]
-    # Fuzzy: check if any valid tag is a substring or vice versa
-    for valid in VALID_ISSUE_TAGS:
-        if valid in tag or tag in valid:
-            return valid
-    return None
-
-
-def _parse_issues(review_text: str) -> list[str]:
-    """Extract issue tags from review.
-
-    First tries structured <!-- ISSUES: tag1, tag2 --> comment with tag normalization.
-    Falls back to keyword inference from prose.
-    """
-    match = re.search(r"<!-- ISSUES: ([^>]+) -->", review_text)
-    if match:
-        raw_tags = [tag.strip() for tag in match.group(1).split(",") if tag.strip()]
-        normalized = []
-        for tag in raw_tags:
-            norm = _normalize_tag(tag)
-            if norm and norm not in normalized:
-                normalized.append(norm)
-            else:
-                logger.debug("Unrecognized issue tag '%s' — dropped", tag)
-        if normalized:
-            return normalized
-    # Fallback: infer tags from review prose
-    return _infer_issues_from_prose(review_text)
-
-
-# Keyword patterns for inferring issue tags from unstructured review prose.
-# Conservative: only match unambiguous indicators. Order doesn't matter.
-_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = {
-    "frontmatter_schema": [
-        re.compile(r"frontmatter", re.IGNORECASE),
-        re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE),
-        re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE),
-        re.compile(r"required field", re.IGNORECASE),
-        re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE),
-        re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE),
-        re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE),
-    ],
-    "broken_wiki_links": [
-        re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE),
-        re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE),
-        re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE),
-        re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE),
-    ],
-    "factual_discrepancy": [
-        re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE),
-        re.compile(r"misrepresent", re.IGNORECASE),
-    ],
-    "confidence_miscalibration": [
-        re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE),
-        re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE),
-    ],
-    "scope_error": [
-        re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE),
-        re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE),
-        re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE),
-        re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE),
-    ],
-    "title_overclaims": [
-        re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE),
-        re.compile(r"overclaim", re.IGNORECASE),
-    ],
-    "near_duplicate": [
-        re.compile(r"near.?duplicate", re.IGNORECASE),
-        re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE),
-        re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE),
-        re.compile(r"redundan", re.IGNORECASE),
-    ],
-}
-
-
-def _infer_issues_from_prose(review_text: str) -> list[str]:
-    """Infer issue tags from unstructured review text via keyword matching.
-
-    Fallback for reviews that reject without structured <!-- ISSUES: --> tags.
-    Conservative: requires at least one unambiguous keyword match per tag.
-    """
-    inferred = []
-    for tag, patterns in _PROSE_TAG_PATTERNS.items():
-        if any(p.search(review_text) for p in patterns):
-            inferred.append(tag)
-    return inferred
-
-
 async def _post_formal_approvals(pr_number: int, pr_author: str):
     """Submit formal Forgejo reviews from 2 agents (not the PR author)."""
     approvals = 0
@@ -376,7 +113,10 @@ async def _terminate_pr(conn, pr_number: int, reason: str):
         repo_path(f"issues/{pr_number}/comments"),
         {"body": comment_body},
     )
-    await close_pr(conn, pr_number, last_error=reason)
+    closed = await close_pr(conn, pr_number, last_error=reason)
+    if not closed:
+        logger.warning("PR #%d: Forgejo close failed — skipping source requeue, will retry next cycle", pr_number)
+        return
 
     # Tag source for re-extraction with feedback
     cursor = conn.execute(
@@ -402,21 +142,6 @@ async def _terminate_pr(conn, pr_number: int, reason: str):
     logger.info("PR #%d: TERMINATED — %s", pr_number, reason)
 
 
-def _classify_issues(issues: list[str]) -> str:
-    """Classify issue tags as 'mechanical', 'substantive', or 'mixed'."""
-    if not issues:
-        return "unknown"
-    mechanical = set(issues) & config.MECHANICAL_ISSUE_TAGS
-    substantive = set(issues) & config.SUBSTANTIVE_ISSUE_TAGS
-    if substantive and not mechanical:
-        return "substantive"
-    if mechanical and not substantive:
-        return "mechanical"
-    if mechanical and substantive:
-        return "mixed"
-    return "unknown"  # tags not in either set
-
-
 async def _dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_issues: list[str]):
     """Disposition logic for rejected PRs on attempt 2+.
 
@@ -872,126 +597,6 @@ _RATE_LIMIT_BACKOFF_MINUTES = 15
 # ─── Batch domain review ─────────────────────────────────────────────────
 
 
-def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]:
-    """Parse batched domain review into per-PR review sections.
-
-    Returns {pr_number: review_text} for each PR found in the response.
-    Missing PRs are omitted — caller handles fallback.
-    """
-    agent_upper = agent.upper()
-    result: dict[int, str] = {}
-
-    # Split by PR verdict markers: <!-- PR:NNN VERDICT:AGENT:... -->
-    # Each marker terminates the previous PR's section
-    pattern = re.compile(
-        r"<!-- PR:(\d+) VERDICT:" + re.escape(agent_upper) + r":(APPROVE|REQUEST_CHANGES) -->"
-    )
-
-    matches = list(pattern.finditer(response))
-    if not matches:
-        return result
-
-    for i, match in enumerate(matches):
-        pr_num = int(match.group(1))
-        verdict = match.group(2)
-        marker_end = match.end()
-
-        # Find the start of this PR's section by looking for the section header
-        # or the end of the previous verdict
-        section_header = f"=== PR #{pr_num}"
-        header_pos = response.rfind(section_header, 0, match.start())
-
-        if header_pos >= 0:
-            # Extract from header to end of verdict marker
-            section_text = response[header_pos:marker_end].strip()
-        else:
-            # No header found — extract from previous marker end to this marker end
-            prev_end = matches[i - 1].end() if i > 0 else 0
-            section_text = response[prev_end:marker_end].strip()
-
-        # Re-format as individual review comment
-        # Strip the batch section header, keep just the review content
-        # Add batch label for traceability
-        pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers)
-        review_text = (
-            f"*(batch review with PRs {pr_nums_str})*\n\n"
-            f"{section_text}\n"
-        )
-        result[pr_num] = review_text
-
-    return result
-
-
-def _validate_batch_fanout(
-    parsed: dict[int, str],
-    pr_diffs: list[dict],
-    agent: str,
-) -> tuple[dict[int, str], list[int]]:
-    """Validate batch fan-out for completeness and cross-contamination.
-
-    Returns (valid_reviews, fallback_pr_numbers).
-    - valid_reviews: reviews that passed validation
-    - fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated)
-    """
-    valid: dict[int, str] = {}
-    fallback: list[int] = []
-
-    # Build file map: pr_number → set of path segments for matching.
-    # Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames
-    # to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3).
-    pr_files: dict[int, set[str]] = {}
-    for pr in pr_diffs:
-        files = set()
-        for line in pr["diff"].split("\n"):
-            if line.startswith("diff --git a/"):
-                path = line.replace("diff --git a/", "").split(" b/")[0]
-                files.add(path)
-                # Also add the last 2 path segments (e.g., "internet-finance/dao.md")
-                # for models that abbreviate paths
-                parts = path.split("/")
-                if len(parts) >= 2:
-                    files.add("/".join(parts[-2:]))
-        pr_files[pr["number"]] = files
-
-    for pr in pr_diffs:
-        pr_num = pr["number"]
-
-        # Completeness check: is there a review for this PR?
-        if pr_num not in parsed:
-            logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num)
-            fallback.append(pr_num)
-            continue
-
-        review = parsed[pr_num]
-
-        # Cross-contamination check: does review mention at least one file from this PR?
-        # Use path segments (min 10 chars) to avoid false substring matches on short names.
-        my_files = pr_files.get(pr_num, set())
-        mentions_own_file = any(f in review for f in my_files if len(f) >= 10)
-
-        if not mentions_own_file and my_files:
-            # Check if it references files from OTHER PRs (cross-contamination signal)
-            other_files = set()
-            for other_pr in pr_diffs:
-                if other_pr["number"] != pr_num:
-                    other_files.update(pr_files.get(other_pr["number"], set()))
-            mentions_other = any(f in review for f in other_files if len(f) >= 10)
-
-            if mentions_other:
-                logger.warning(
-                    "Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback",
-                    pr_num,
-                )
-                fallback.append(pr_num)
-                continue
-            # If it doesn't mention any files at all, could be a generic review — accept it
-            # (some PRs have short diffs where the model doesn't reference filenames)
-
-        valid[pr_num] = review
-
-    return valid, fallback
-
-
 async def _run_batch_domain_eval(
     conn, batch_prs: list[dict], domain: str, agent: str,
 ) -> tuple[int, int]:
diff --git a/lib/pr_state.py b/lib/pr_state.py
index 10dcf71..b143897 100644
--- a/lib/pr_state.py
+++ b/lib/pr_state.py
@@ -29,15 +29,22 @@ async def close_pr(
     merge_cycled: bool = False,
     inc_merge_failures: bool = False,
     close_on_forgejo: bool = True,
-):
-    """Close a PR in DB and on Forgejo.
+) -> bool:
+    """Close a PR in DB and on Forgejo. Returns True on success, False on Forgejo failure.
 
     Args:
         close_on_forgejo: False only when caller already closed on Forgejo
             (reconciliation, ghost PR cleanup after manual close).
+
+    If Forgejo API fails, the DB update is SKIPPED to prevent ghost PRs
+    (DB says closed, Forgejo says open). The reconciliation loop in
+    merge.py._reconcile_db_state catches any that slip through.
     """
     if close_on_forgejo:
-        await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
+        result = await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
+        if result is None:
+            logger.error("close_pr: Forgejo API failed for PR #%d, skipping DB update", pr_number)
+            return False
 
     parts = ["status = 'closed'"]
     params = []
@@ -54,6 +61,7 @@ async def close_pr(
 
     params.append(pr_number)
     conn.execute(f"UPDATE prs SET {', '.join(parts)} WHERE number = ?", params)
+    return True
 
 
 def approve_pr(
diff --git a/lib/stale_pr.py b/lib/stale_pr.py
index abd2643..97602e5 100644
--- a/lib/stale_pr.py
+++ b/lib/stale_pr.py
@@ -17,6 +17,7 @@ from datetime import datetime, timezone
 
 from . import config, db
 from .forgejo import api, repo_path
+from .pr_state import close_pr
 
 logger = logging.getLogger("pipeline.stale_pr")
 
@@ -48,13 +49,9 @@ async def check_stale_prs(conn) -> tuple[int, int]:
         source_path = pr["source_path"] or "unknown"
 
         try:
-            # Close the PR via Forgejo
-            result = await api(
-                "PATCH",
-                repo_path(f"pulls/{pr_num}"),
-                body={"state": "closed"},
-            )
-            if result is None:
+            closed = await close_pr(conn, pr_num,
+                                    last_error=f"stale: no claims after {STALE_THRESHOLD_MINUTES} min")
+            if not closed:
                 stale_errors += 1
                 logger.warning(
                     "Failed to close stale extraction PR #%d (%s, %s)",
@@ -62,11 +59,6 @@ async def check_stale_prs(conn) -> tuple[int, int]:
                 )
                 continue
 
-            # Update local DB status
-            conn.execute(
-                "UPDATE prs SET status = 'closed' WHERE number = ?",
-                (pr_num,),
-            )
             db.audit(
                 conn,
                 "watchdog",
diff --git a/tests/test_eval_parse.py b/tests/test_eval_parse.py
new file mode 100644
index 0000000..0c37fa3
--- /dev/null
+++ b/tests/test_eval_parse.py
@@ -0,0 +1,315 @@
+"""Tests for lib/eval_parse.py — pure parsing functions extracted from evaluate.py."""
+
+import sys
+import os
+from unittest.mock import MagicMock
+
+import pytest
+
+# Mock heavy dependencies before importing
+sys.modules.setdefault("aiohttp", MagicMock())
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from lib.eval_parse import (
+    VALID_ISSUE_TAGS,
+    _classify_issues,
+    _deterministic_tier,
+    _diff_contains_claim_type,
+    _extract_changed_files,
+    _filter_diff,
+    _infer_issues_from_prose,
+    _is_musings_only,
+    _normalize_tag,
+    _parse_batch_response,
+    _parse_issues,
+    _parse_verdict,
+    _validate_batch_fanout,
+)
+
+
+# ---------------------------------------------------------------------------
+# _filter_diff
+# ---------------------------------------------------------------------------
+
+class TestFilterDiff:
+    def test_strips_inbox_files(self):
+        diff = (
+            "diff --git a/inbox/archive/foo.md b/inbox/archive/foo.md\n"
+            "+some content\n"
+            "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
+            "+real content\n"
+        )
+        review_diff, entity_diff = _filter_diff(diff)
+        assert "inbox" not in review_diff
+        assert "claim.md" in review_diff
+
+    def test_separates_entity_diffs(self):
+        diff = (
+            "diff --git a/entities/people/alice.md b/entities/people/alice.md\n"
+            "+entity content\n"
+            "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
+            "+claim content\n"
+        )
+        review_diff, entity_diff = _filter_diff(diff)
+        assert "alice.md" in entity_diff
+        assert "claim.md" in review_diff
+
+    def test_core_domain_entities_stay_in_review(self):
+        diff = (
+            "diff --git a/entities/living-agents/agent.md b/entities/living-agents/agent.md\n"
+            "+core entity\n"
+        )
+        review_diff, entity_diff = _filter_diff(diff)
+        assert "agent.md" in review_diff
+        assert entity_diff == ""
+
+
+# ---------------------------------------------------------------------------
+# _extract_changed_files
+# ---------------------------------------------------------------------------
+
+class TestExtractChangedFiles:
+    def test_extracts_paths(self):
+        diff = (
+            "diff --git a/domains/foo.md b/domains/foo.md\n"
+            "+content\n"
+            "diff --git a/entities/bar.md b/entities/bar.md\n"
+            "+more\n"
+        )
+        result = _extract_changed_files(diff)
+        assert "domains/foo.md" in result
+        assert "entities/bar.md" in result
+
+    def test_empty_diff(self):
+        assert _extract_changed_files("") == ""
+
+
+# ---------------------------------------------------------------------------
+# _is_musings_only
+# ---------------------------------------------------------------------------
+
+class TestIsMusingsOnly:
+    def test_musings_only(self):
+        diff = "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
+        assert _is_musings_only(diff) is True
+
+    def test_mixed_files(self):
+        diff = (
+            "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
+            "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"
+        )
+        assert _is_musings_only(diff) is False
+
+    def test_no_musings(self):
+        diff = "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"
+        assert _is_musings_only(diff) is False
+
+
+# ---------------------------------------------------------------------------
+# _diff_contains_claim_type
+# ---------------------------------------------------------------------------
+
+class TestDiffContainsClaimType:
+    def test_detects_claim_type(self):
+        diff = "+type: claim\n+confidence: 0.8\n"
+        assert _diff_contains_claim_type(diff) is True
+
+    def test_ignores_non_claim(self):
+        diff = "+type: entity\n"
+        assert _diff_contains_claim_type(diff) is False
+
+    def test_ignores_file_header(self):
+        diff = "+++ b/type: claim\n"
+        assert _diff_contains_claim_type(diff) is False
+
+
+# ---------------------------------------------------------------------------
+# _deterministic_tier
+# ---------------------------------------------------------------------------
+
+class TestDeterministicTier:
+    def test_entities_only_is_light(self):
+        diff = "diff --git a/entities/people/alice.md b/entities/people/alice.md\n+data\n"
+        assert _deterministic_tier(diff) == "LIGHT"
+
+    def test_inbox_only_is_light(self):
+        diff = "diff --git a/inbox/archive/src.md b/inbox/archive/src.md\n+data\n"
+        assert _deterministic_tier(diff) == "LIGHT"
+
+    def test_core_is_deep(self):
+        diff = "diff --git a/core/schema.md b/core/schema.md\n+structural change\n"
+        assert _deterministic_tier(diff) == "DEEP"
+
+    def test_challenged_by_is_deep(self):
+        diff = "diff --git a/domains/x/claim.md b/domains/x/claim.md\n+challenged_by: some-claim\n"
+        assert _deterministic_tier(diff) == "DEEP"
+
+    def test_domain_claim_returns_none(self):
+        diff = "diff --git a/domains/finance/new-claim.md b/domains/finance/new-claim.md\n--- /dev/null\n+++ b/domains/finance/new-claim.md\n+type: claim\n"
+        assert _deterministic_tier(diff) is None
+
+    def test_empty_diff_returns_none(self):
+        assert _deterministic_tier("") is None
+
+
+# ---------------------------------------------------------------------------
+# _parse_verdict
+# ---------------------------------------------------------------------------
+
+class TestParseVerdict:
+    def test_approve(self):
+        assert _parse_verdict("VERDICT:LEO:APPROVE", "LEO") == "approve"
+
+    def test_request_changes(self):
+        assert _parse_verdict("VERDICT:LEO:REQUEST_CHANGES", "LEO") == "request_changes"
+
+    def test_no_verdict_defaults_to_request_changes(self):
+        assert _parse_verdict("some review text", "LEO") == "request_changes"
+
+    def test_case_insensitive_reviewer(self):
+        assert _parse_verdict("VERDICT:LEO:APPROVE", "leo") == "approve"
+
+
+# ---------------------------------------------------------------------------
+# _normalize_tag
+# ---------------------------------------------------------------------------
+
+class TestNormalizeTag:
+    def test_valid_tag_passes_through(self):
+        assert _normalize_tag("broken_wiki_links") == "broken_wiki_links"
+
+    def test_alias_normalizes(self):
+        assert _normalize_tag("schema_violation") == "frontmatter_schema"
+        assert _normalize_tag("duplicate") == "near_duplicate"
+        assert _normalize_tag("factual_error") == "factual_discrepancy"
+
+    def test_hyphen_to_underscore(self):
+        assert _normalize_tag("unverified-wiki-links") == "broken_wiki_links"
+
+    def test_unknown_returns_none(self):
+        assert _normalize_tag("totally_made_up") is None
+
+    def test_fuzzy_substring_match(self):
+        assert _normalize_tag("wiki_links") == "broken_wiki_links"
+
+
+# ---------------------------------------------------------------------------
+# _parse_issues
+# ---------------------------------------------------------------------------
+
+class TestParseIssues:
+    def test_structured_comment(self):
+        text = "Review text. <!-- ISSUES: schema_violation, duplicate -->"
+        result = _parse_issues(text)
+        assert "frontmatter_schema" in result
+        assert "near_duplicate" in result
+
+    def test_fallback_to_prose(self):
+        text = "The frontmatter is missing required fields."
+        result = _parse_issues(text)
+        assert "frontmatter_schema" in result
+
+    def test_empty_structured_falls_back(self):
+        text = "Review. <!-- ISSUES: totally_unknown_tag -->"
+        # All tags unrecognizable, falls through to prose
+        result = _parse_issues(text)
+        # With no prose keywords either, should return empty
+        assert isinstance(result, list)
+
+
+# ---------------------------------------------------------------------------
+# _infer_issues_from_prose
+# ---------------------------------------------------------------------------
+
+class TestInferIssuesFromProse:
+    def test_detects_frontmatter_issues(self):
+        result = _infer_issues_from_prose("The YAML frontmatter is invalid.")
+        assert "frontmatter_schema" in result
+
+    def test_detects_wiki_link_issues(self):
+        result = _infer_issues_from_prose("Found broken wiki links in the claim.")
+        assert "broken_wiki_links" in result
+
+    def test_detects_near_duplicate(self):
+        result = _infer_issues_from_prose("This is a near-duplicate of an existing claim.")
+        assert "near_duplicate" in result
+
+    def test_no_matches(self):
+        result = _infer_issues_from_prose("This claim is well-structured and accurate.")
+        assert result == []
+
+
+# ---------------------------------------------------------------------------
+# _classify_issues
+# ---------------------------------------------------------------------------
+
+class TestClassifyIssues:
+    def test_mechanical_only(self):
+        assert _classify_issues(["frontmatter_schema", "near_duplicate"]) == "mechanical"
+
+    def test_substantive_only(self):
+        assert _classify_issues(["factual_discrepancy"]) == "substantive"
+
+    def test_mixed(self):
+        assert _classify_issues(["frontmatter_schema", "factual_discrepancy"]) == "mixed"
+
+    def test_empty(self):
+        assert _classify_issues([]) == "unknown"
+
+
+# ---------------------------------------------------------------------------
+# _parse_batch_response
+# ---------------------------------------------------------------------------
+
+class TestParseBatchResponse:
+    def test_parses_multiple_prs(self):
+        response = (
+            "=== PR #100\nReview for 100\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
+            "=== PR #200\nReview for 200\n<!-- PR:200 VERDICT:AGENT:REQUEST_CHANGES -->\n"
+        )
+        result = _parse_batch_response(response, [100, 200], "agent")
+        assert 100 in result
+        assert 200 in result
+        assert "APPROVE" in result[100]
+        assert "REQUEST_CHANGES" in result[200]
+
+    def test_missing_pr(self):
+        response = "=== PR #100\nReview\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
+        result = _parse_batch_response(response, [100, 200], "agent")
+        assert 100 in result
+        assert 200 not in result
+
+    def test_empty_response(self):
+        result = _parse_batch_response("no verdicts here", [100], "agent")
+        assert result == {}
+
+
+# ---------------------------------------------------------------------------
+# _validate_batch_fanout
+# ---------------------------------------------------------------------------
+
+class TestValidateBatchFanout:
+    def test_missing_pr_falls_back(self):
+        parsed = {100: "review for 100"}
+        pr_diffs = [
+            {"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
+            {"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
+        ]
+        valid, fallback = _validate_batch_fanout(parsed, pr_diffs, "agent")
+        assert 200 in fallback
+        assert 200 not in valid
+
+    def test_all_present_passes(self):
+        parsed = {
+            100: "review mentioning domains/long-enough-path.md",
+            200: "review mentioning domains/other-long-path.md",
+        }
+        pr_diffs = [
+            {"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
+            {"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
+        ]
+        valid, fallback = _validate_batch_fanout(parsed, pr_diffs, "agent")
+        assert 100 in valid
+        assert 200 in valid
+        assert fallback == []
diff --git a/tests/test_pr_state.py b/tests/test_pr_state.py
index 6c0cb98..735d51b 100644
--- a/tests/test_pr_state.py
+++ b/tests/test_pr_state.py
@@ -135,13 +135,44 @@ class TestClosePr:
         mock_api = AsyncMock(return_value={})
         with patch("lib.pr_state.forgejo_api", mock_api), \
              patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
-            asyncio.run(close_pr(conn, 42))
+            result = asyncio.run(close_pr(conn, 42))
 
+        assert result is True
         row = _get_pr(conn, 42)
         assert row["status"] == "closed"
         # last_error not overwritten when not provided
         assert row["last_error"] == "old error"
 
+    def test_close_returns_false_on_forgejo_failure(self):
+        """Critical: if Forgejo API fails, DB must NOT be updated (ghost PR prevention)."""
+        conn = _make_db()
+        _insert_pr(conn, 42)
+
+        mock_api = AsyncMock(return_value=None)  # forgejo_api returns None on failure
+        with patch("lib.pr_state.forgejo_api", mock_api), \
+             patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
+            result = asyncio.run(close_pr(conn, 42, last_error="should not persist"))
+
+        assert result is False
+        row = _get_pr(conn, 42)
+        assert row["status"] == "open"  # DB not touched
+        assert row["last_error"] is None  # last_error not set
+
+    def test_close_skips_forgejo_check_when_opted_out(self):
+        """close_on_forgejo=False always succeeds (caller already closed Forgejo)."""
+        conn = _make_db()
+        _insert_pr(conn, 42)
+
+        mock_api = AsyncMock(return_value=None)  # would fail if called
+        with patch("lib.pr_state.forgejo_api", mock_api), \
+             patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
+            result = asyncio.run(close_pr(conn, 42, close_on_forgejo=False))
+
+        assert result is True
+        row = _get_pr(conn, 42)
+        assert row["status"] == "closed"
+        mock_api.assert_not_called()
+
 
 # ---------------------------------------------------------------------------
 # approve_pr