refactor: Phase 3 — fix close_pr ghost bug, wire stale_pr, extract eval_parse
Some checks are pending
CI / lint-and-test (push) Waiting to run

Critical bug fix: close_pr now checks forgejo_api return value and
skips DB update on Forgejo failure, preventing ghost PRs (DB closed,
Forgejo open). Returns bool so callers can handle failures.

_terminate_pr checks return value — skips source requeue on failure.
stale_pr.py migrated from raw Forgejo+DB to close_pr (last raw close
transition eliminated).

eval_parse.py: 15 pure parsing functions extracted from evaluate.py
(~370 lines removed). Zero I/O, zero async, independently testable.
evaluate.py drops from ~1510 to ~1140 lines.

Tests: 295 passed (42 new eval_parse + 2 new close_pr), zero regressions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-16 12:40:17 +01:00
parent 716cc43890
commit 376b77999f
6 changed files with 815 additions and 430 deletions

434
lib/eval_parse.py Normal file
View file

@ -0,0 +1,434 @@
"""Pure parsing functions for the eval stage — zero I/O, zero async.
Extracted from evaluate.py to isolate testable parsing logic from
orchestration, DB, and Forgejo API calls.
Contents:
- Diff helpers: filter, classify, tier routing
- Verdict/issue parsing: structured tags + prose inference
- Batch response parsing: fan-out validation
All functions are pure (input output). The only external dependency
is config.MECHANICAL_ISSUE_TAGS / config.SUBSTANTIVE_ISSUE_TAGS for
_classify_issues.
"""
import logging
import re
from . import config
logger = logging.getLogger("pipeline.eval_parse")
# ─── Diff helpers ──────────────────────────────────────────────────────────
def _filter_diff(diff: str) -> tuple[str, str]:
"""Filter diff to only review-relevant files.
Returns (review_diff, entity_diff).
Strips: inbox/, schemas/, skills/, agents/*/musings/
"""
sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE)
skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"]
core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"}
claim_sections = []
entity_sections = []
for section in sections:
if not section.strip():
continue
if any(re.match(p, section) for p in skip_patterns):
continue
entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section)
if entity_match and entity_match.group(1) not in core_domains:
entity_sections.append(section)
continue
claim_sections.append(section)
return "".join(claim_sections), "".join(entity_sections)
def _extract_changed_files(diff: str) -> str:
"""Extract changed file paths from diff."""
return "\n".join(
line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git")
)
def _is_musings_only(diff: str) -> bool:
"""Check if PR only modifies musing files."""
has_musings = False
has_other = False
for line in diff.split("\n"):
if line.startswith("diff --git"):
if "agents/" in line and "/musings/" in line:
has_musings = True
else:
has_other = True
return has_musings and not has_other
def _diff_contains_claim_type(diff: str) -> bool:
"""Claim-shape detector: check if any file in diff has type: claim in frontmatter.
Mechanical check ($0). If YAML declares type: claim, this is a factual claim
not an entity update or formatting fix. Must be classified STANDARD minimum
regardless of Haiku triage. Catches factual claims disguised as LIGHT content.
(Theseus: converts semantic problem to mechanical check)
"""
for line in diff.split("\n"):
if line.startswith("+") and not line.startswith("+++"):
stripped = line[1:].strip()
if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"):
return True
return False
def _deterministic_tier(diff: str) -> str | None:
"""Deterministic tier routing — skip Haiku triage for obvious cases.
Checks diff file patterns before calling the LLM. Returns tier string
if deterministic, None if Haiku triage is needed.
Rules (Leo-calibrated):
- All files in entities/ only LIGHT
- All files in inbox/ only (queue, archive, null-result) LIGHT
- Any file in core/ or foundations/ DEEP (structural KB changes)
- Has challenged_by field DEEP (challenges existing claims)
- Modifies existing file (not new) in domains/ DEEP (enrichment/change)
- Otherwise None (needs Haiku triage)
NOTE: Cross-domain wiki links are NOT a DEEP signal most claims link
across domains, that's the whole point of the knowledge graph (Leo).
"""
changed_files = []
for line in diff.split("\n"):
if line.startswith("diff --git a/"):
path = line.replace("diff --git a/", "").split(" b/")[0]
changed_files.append(path)
if not changed_files:
return None
# All entities/ only → LIGHT
if all(f.startswith("entities/") for f in changed_files):
logger.info("Deterministic tier: LIGHT (all files in entities/)")
return "LIGHT"
# All inbox/ only (queue, archive, null-result) → LIGHT
if all(f.startswith("inbox/") for f in changed_files):
logger.info("Deterministic tier: LIGHT (all files in inbox/)")
return "LIGHT"
# Any file in core/ or foundations/ → DEEP (structural KB changes)
if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files):
logger.info("Deterministic tier: DEEP (touches core/ or foundations/)")
return "DEEP"
# Check diff content for DEEP signals
has_challenged_by = False
new_files: set[str] = set()
lines = diff.split("\n")
for i, line in enumerate(lines):
# Detect new files
if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
new_files.add(lines[i + 1][6:])
# Check for challenged_by field
if line.startswith("+") and not line.startswith("+++"):
stripped = line[1:].strip()
if stripped.startswith("challenged_by:"):
has_challenged_by = True
if has_challenged_by:
logger.info("Deterministic tier: DEEP (has challenged_by field)")
return "DEEP"
# NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments
# (appending evidence) are common and should be STANDARD. Let Haiku triage
# distinguish enrichments from structural changes.
return None
# ─── Verdict parsing ──────────────────────────────────────────────────────
def _parse_verdict(review_text: str, reviewer: str) -> str:
"""Parse VERDICT tag from review. Returns 'approve' or 'request_changes'."""
upper = reviewer.upper()
if f"VERDICT:{upper}:APPROVE" in review_text:
return "approve"
elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text:
return "request_changes"
else:
logger.warning("No parseable verdict from %s — treating as request_changes", reviewer)
return "request_changes"
# Map model-invented tags to valid tags. Models consistently ignore the valid
# tag list and invent their own. This normalizes them. (Ganymede, Mar 14)
_TAG_ALIASES: dict[str, str] = {
"schema_violation": "frontmatter_schema",
"missing_schema_fields": "frontmatter_schema",
"missing_schema": "frontmatter_schema",
"schema": "frontmatter_schema",
"missing_frontmatter": "frontmatter_schema",
"redundancy": "near_duplicate",
"duplicate": "near_duplicate",
"missing_confidence": "confidence_miscalibration",
"confidence_error": "confidence_miscalibration",
"vague_claims": "scope_error",
"unfalsifiable": "scope_error",
"unverified_wiki_links": "broken_wiki_links",
"unverified-wiki-links": "broken_wiki_links",
"missing_wiki_links": "broken_wiki_links",
"invalid_wiki_links": "broken_wiki_links",
"wiki_link_errors": "broken_wiki_links",
"overclaiming": "title_overclaims",
"title_overclaim": "title_overclaims",
"date_error": "date_errors",
"factual_error": "factual_discrepancy",
"factual_inaccuracy": "factual_discrepancy",
}
VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims",
"confidence_miscalibration", "date_errors", "factual_discrepancy",
"near_duplicate", "scope_error"}
def _normalize_tag(tag: str) -> str | None:
"""Normalize a model-generated tag to a valid tag, or None if unrecognizable."""
tag = tag.strip().lower().replace("-", "_")
if tag in VALID_ISSUE_TAGS:
return tag
if tag in _TAG_ALIASES:
return _TAG_ALIASES[tag]
# Fuzzy: check if any valid tag is a substring or vice versa
for valid in VALID_ISSUE_TAGS:
if valid in tag or tag in valid:
return valid
return None
# ─── Issue parsing ─────────────────────────────────────────────────────────
# Keyword patterns for inferring issue tags from unstructured review prose.
# Conservative: only match unambiguous indicators. Order doesn't matter.
# Each valid issue tag maps to the regexes that count as an unambiguous
# mention of it; _infer_issues_from_prose emits the tag when any one matches.
# Patterns are pre-compiled once at import time since they run on every
# fallback parse.
_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = {
    "frontmatter_schema": [
        re.compile(r"frontmatter", re.IGNORECASE),
        re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE),
        re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE),
        re.compile(r"required field", re.IGNORECASE),
        re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE),
        re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE),
        re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE),
    ],
    "broken_wiki_links": [
        re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE),
        re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE),
        re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE),
        re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE),
    ],
    "factual_discrepancy": [
        re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE),
        re.compile(r"misrepresent", re.IGNORECASE),
    ],
    "confidence_miscalibration": [
        re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE),
        re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE),
    ],
    "scope_error": [
        re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE),
        re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE),
        re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE),
        re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE),
    ],
    "title_overclaims": [
        re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE),
        re.compile(r"overclaim", re.IGNORECASE),
    ],
    "near_duplicate": [
        re.compile(r"near.?duplicate", re.IGNORECASE),
        re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE),
        re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE),
        re.compile(r"redundan", re.IGNORECASE),
    ],
}
def _parse_issues(review_text: str) -> list[str]:
    """Extract issue tags from a review.

    Prefers the structured <!-- ISSUES: tag1, tag2 --> comment, with tag
    normalization and de-duplication. When the comment is absent or yields
    no recognizable tags, falls back to keyword inference from prose.
    """
    structured = re.search(r"<!-- ISSUES: ([^>]+) -->", review_text)
    if structured:
        collected: list[str] = []
        for raw in structured.group(1).split(","):
            raw = raw.strip()
            if not raw:
                continue
            norm = _normalize_tag(raw)
            if norm and norm not in collected:
                collected.append(norm)
            else:
                # Unrecognizable, or a duplicate of a tag already kept.
                logger.debug("Unrecognized issue tag '%s' — dropped", raw)
        if collected:
            return collected
    # Fallback: infer tags from review prose
    return _infer_issues_from_prose(review_text)
def _infer_issues_from_prose(review_text: str) -> list[str]:
    """Infer issue tags from unstructured review text via keyword matching.

    Fallback for reviews that reject without structured <!-- ISSUES: -->
    tags. Conservative: a tag is emitted only when at least one of its
    unambiguous keyword patterns matches.
    """
    return [
        tag
        for tag, patterns in _PROSE_TAG_PATTERNS.items()
        if any(p.search(review_text) for p in patterns)
    ]
def _classify_issues(issues: list[str]) -> str:
    """Classify issue tags as 'mechanical', 'substantive', or 'mixed'.

    Returns 'unknown' for an empty list, or when no tag appears in either
    configured set.
    """
    if not issues:
        return "unknown"
    tags = set(issues)
    mechanical = tags & config.MECHANICAL_ISSUE_TAGS
    substantive = tags & config.SUBSTANTIVE_ISSUE_TAGS
    if mechanical and substantive:
        return "mixed"
    if mechanical:
        return "mechanical"
    if substantive:
        return "substantive"
    return "unknown"  # tags not in either set
# ─── Batch response parsing ───────────────────────────────────────────────
def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]:
"""Parse batched domain review into per-PR review sections.
Returns {pr_number: review_text} for each PR found in the response.
Missing PRs are omitted caller handles fallback.
"""
agent_upper = agent.upper()
result: dict[int, str] = {}
# Split by PR verdict markers: <!-- PR:NNN VERDICT:AGENT:... -->
# Each marker terminates the previous PR's section
pattern = re.compile(
r"<!-- PR:(\d+) VERDICT:" + re.escape(agent_upper) + r":(APPROVE|REQUEST_CHANGES) -->"
)
matches = list(pattern.finditer(response))
if not matches:
return result
for i, match in enumerate(matches):
pr_num = int(match.group(1))
marker_end = match.end()
# Find the start of this PR's section by looking for the section header
# or the end of the previous verdict
section_header = f"=== PR #{pr_num}"
header_pos = response.rfind(section_header, 0, match.start())
if header_pos >= 0:
# Extract from header to end of verdict marker
section_text = response[header_pos:marker_end].strip()
else:
# No header found — extract from previous marker end to this marker end
prev_end = matches[i - 1].end() if i > 0 else 0
section_text = response[prev_end:marker_end].strip()
# Re-format as individual review comment
# Strip the batch section header, keep just the review content
# Add batch label for traceability
pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers)
review_text = (
f"*(batch review with PRs {pr_nums_str})*\n\n"
f"{section_text}\n"
)
result[pr_num] = review_text
return result
def _validate_batch_fanout(
parsed: dict[int, str],
pr_diffs: list[dict],
agent: str,
) -> tuple[dict[int, str], list[int]]:
"""Validate batch fan-out for completeness and cross-contamination.
Returns (valid_reviews, fallback_pr_numbers).
- valid_reviews: reviews that passed validation
- fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated)
"""
valid: dict[int, str] = {}
fallback: list[int] = []
# Build file map: pr_number → set of path segments for matching.
# Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames
# to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3).
pr_files: dict[int, set[str]] = {}
for pr in pr_diffs:
files = set()
for line in pr["diff"].split("\n"):
if line.startswith("diff --git a/"):
path = line.replace("diff --git a/", "").split(" b/")[0]
files.add(path)
# Also add the last 2 path segments (e.g., "internet-finance/dao.md")
# for models that abbreviate paths
parts = path.split("/")
if len(parts) >= 2:
files.add("/".join(parts[-2:]))
pr_files[pr["number"]] = files
for pr in pr_diffs:
pr_num = pr["number"]
# Completeness check: is there a review for this PR?
if pr_num not in parsed:
logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num)
fallback.append(pr_num)
continue
review = parsed[pr_num]
# Cross-contamination check: does review mention at least one file from this PR?
# Use path segments (min 10 chars) to avoid false substring matches on short names.
my_files = pr_files.get(pr_num, set())
mentions_own_file = any(f in review for f in my_files if len(f) >= 10)
if not mentions_own_file and my_files:
# Check if it references files from OTHER PRs (cross-contamination signal)
other_files = set()
for other_pr in pr_diffs:
if other_pr["number"] != pr_num:
other_files.update(pr_files.get(other_pr["number"], set()))
mentions_other = any(f in review for f in other_files if len(f) >= 10)
if mentions_other:
logger.warning(
"Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback",
pr_num,
)
fallback.append(pr_num)
continue
# If it doesn't mention any files at all, could be a generic review — accept it
# (some PRs have short diffs where the model doesn't reference filenames)
valid[pr_num] = review
return valid, fallback

View file

@ -21,11 +21,25 @@ LLM transport and prompts extracted to lib/llm.py (Phase 3c).
import json
import logging
import random
import re
from datetime import datetime, timezone
from . import config, db
from .domains import agent_for_domain, detect_domain_from_branch, detect_domain_from_diff
from .eval_parse import (
VALID_ISSUE_TAGS,
_classify_issues,
_deterministic_tier,
_diff_contains_claim_type,
_extract_changed_files,
_filter_diff,
_infer_issues_from_prose,
_is_musings_only,
_normalize_tag,
_parse_batch_response,
_parse_issues,
_parse_verdict,
_validate_batch_fanout,
)
from .forgejo import api as forgejo_api
from .forgejo import get_agent_token, get_pr_diff, repo_path
from .merge import PIPELINE_OWNED_PREFIXES
@ -37,289 +51,12 @@ from .validate import load_existing_claims
logger = logging.getLogger("pipeline.evaluate")
# ─── Diff helpers ──────────────────────────────────────────────────────────
def _filter_diff(diff: str) -> tuple[str, str]:
"""Filter diff to only review-relevant files.
Returns (review_diff, entity_diff).
Strips: inbox/, schemas/, skills/, agents/*/musings/
"""
sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE)
skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"]
core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"}
claim_sections = []
entity_sections = []
for section in sections:
if not section.strip():
continue
if any(re.match(p, section) for p in skip_patterns):
continue
entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section)
if entity_match and entity_match.group(1) not in core_domains:
entity_sections.append(section)
continue
claim_sections.append(section)
return "".join(claim_sections), "".join(entity_sections)
def _extract_changed_files(diff: str) -> str:
"""Extract changed file paths from diff."""
return "\n".join(
line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git")
)
def _is_musings_only(diff: str) -> bool:
"""Check if PR only modifies musing files."""
has_musings = False
has_other = False
for line in diff.split("\n"):
if line.startswith("diff --git"):
if "agents/" in line and "/musings/" in line:
has_musings = True
else:
has_other = True
return has_musings and not has_other
# ─── NOTE: Tier 0.5 mechanical pre-check moved to validate.py ────────────
# Tier 0.5 now runs as part of the validate stage (before eval), not inside
# evaluate_pr(). This prevents wasting eval_attempts on mechanically fixable
# PRs. Eval trusts that tier0_pass=1 means all mechanical checks passed.
# ─── Tier overrides ───────────────────────────────────────────────────────
def _diff_contains_claim_type(diff: str) -> bool:
"""Claim-shape detector: check if any file in diff has type: claim in frontmatter.
Mechanical check ($0). If YAML declares type: claim, this is a factual claim
not an entity update or formatting fix. Must be classified STANDARD minimum
regardless of Haiku triage. Catches factual claims disguised as LIGHT content.
(Theseus: converts semantic problem to mechanical check)
"""
for line in diff.split("\n"):
if line.startswith("+") and not line.startswith("+++"):
stripped = line[1:].strip()
if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"):
return True
return False
def _deterministic_tier(diff: str) -> str | None:
"""Deterministic tier routing — skip Haiku triage for obvious cases.
Checks diff file patterns before calling the LLM. Returns tier string
if deterministic, None if Haiku triage is needed.
Rules (Leo-calibrated):
- All files in entities/ only LIGHT
- All files in inbox/ only (queue, archive, null-result) LIGHT
- Any file in core/ or foundations/ DEEP (structural KB changes)
- Has challenged_by field DEEP (challenges existing claims)
- Modifies existing file (not new) in domains/ DEEP (enrichment/change)
- Otherwise None (needs Haiku triage)
NOTE: Cross-domain wiki links are NOT a DEEP signal most claims link
across domains, that's the whole point of the knowledge graph (Leo).
"""
changed_files = []
for line in diff.split("\n"):
if line.startswith("diff --git a/"):
path = line.replace("diff --git a/", "").split(" b/")[0]
changed_files.append(path)
if not changed_files:
return None
# All entities/ only → LIGHT
if all(f.startswith("entities/") for f in changed_files):
logger.info("Deterministic tier: LIGHT (all files in entities/)")
return "LIGHT"
# All inbox/ only (queue, archive, null-result) → LIGHT
if all(f.startswith("inbox/") for f in changed_files):
logger.info("Deterministic tier: LIGHT (all files in inbox/)")
return "LIGHT"
# Any file in core/ or foundations/ → DEEP (structural KB changes)
if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files):
logger.info("Deterministic tier: DEEP (touches core/ or foundations/)")
return "DEEP"
# Check diff content for DEEP signals
has_challenged_by = False
has_modified_claim = False
new_files: set[str] = set()
lines = diff.split("\n")
for i, line in enumerate(lines):
# Detect new files
if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
new_files.add(lines[i + 1][6:])
# Check for challenged_by field
if line.startswith("+") and not line.startswith("+++"):
stripped = line[1:].strip()
if stripped.startswith("challenged_by:"):
has_challenged_by = True
if has_challenged_by:
logger.info("Deterministic tier: DEEP (has challenged_by field)")
return "DEEP"
# NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments
# (appending evidence) are common and should be STANDARD. Let Haiku triage
# distinguish enrichments from structural changes.
return None
# ─── Verdict parsing ──────────────────────────────────────────────────────
def _parse_verdict(review_text: str, reviewer: str) -> str:
"""Parse VERDICT tag from review. Returns 'approve' or 'request_changes'."""
upper = reviewer.upper()
if f"VERDICT:{upper}:APPROVE" in review_text:
return "approve"
elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text:
return "request_changes"
else:
logger.warning("No parseable verdict from %s — treating as request_changes", reviewer)
return "request_changes"
# Map model-invented tags to valid tags. Models consistently ignore the valid
# tag list and invent their own. This normalizes them. (Ganymede, Mar 14)
_TAG_ALIASES: dict[str, str] = {
"schema_violation": "frontmatter_schema",
"missing_schema_fields": "frontmatter_schema",
"missing_schema": "frontmatter_schema",
"schema": "frontmatter_schema",
"missing_frontmatter": "frontmatter_schema",
"redundancy": "near_duplicate",
"duplicate": "near_duplicate",
"missing_confidence": "confidence_miscalibration",
"confidence_error": "confidence_miscalibration",
"vague_claims": "scope_error",
"unfalsifiable": "scope_error",
"unverified_wiki_links": "broken_wiki_links",
"unverified-wiki-links": "broken_wiki_links",
"missing_wiki_links": "broken_wiki_links",
"invalid_wiki_links": "broken_wiki_links",
"wiki_link_errors": "broken_wiki_links",
"overclaiming": "title_overclaims",
"title_overclaim": "title_overclaims",
"date_error": "date_errors",
"factual_error": "factual_discrepancy",
"factual_inaccuracy": "factual_discrepancy",
}
VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims",
"confidence_miscalibration", "date_errors", "factual_discrepancy",
"near_duplicate", "scope_error"}
def _normalize_tag(tag: str) -> str | None:
"""Normalize a model-generated tag to a valid tag, or None if unrecognizable."""
tag = tag.strip().lower().replace("-", "_")
if tag in VALID_ISSUE_TAGS:
return tag
if tag in _TAG_ALIASES:
return _TAG_ALIASES[tag]
# Fuzzy: check if any valid tag is a substring or vice versa
for valid in VALID_ISSUE_TAGS:
if valid in tag or tag in valid:
return valid
return None
def _parse_issues(review_text: str) -> list[str]:
"""Extract issue tags from review.
First tries structured <!-- ISSUES: tag1, tag2 --> comment with tag normalization.
Falls back to keyword inference from prose.
"""
match = re.search(r"<!-- ISSUES: ([^>]+) -->", review_text)
if match:
raw_tags = [tag.strip() for tag in match.group(1).split(",") if tag.strip()]
normalized = []
for tag in raw_tags:
norm = _normalize_tag(tag)
if norm and norm not in normalized:
normalized.append(norm)
else:
logger.debug("Unrecognized issue tag '%s' — dropped", tag)
if normalized:
return normalized
# Fallback: infer tags from review prose
return _infer_issues_from_prose(review_text)
# Keyword patterns for inferring issue tags from unstructured review prose.
# Conservative: only match unambiguous indicators. Order doesn't matter.
_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = {
"frontmatter_schema": [
re.compile(r"frontmatter", re.IGNORECASE),
re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE),
re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE),
re.compile(r"required field", re.IGNORECASE),
re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE),
re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE),
re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE),
],
"broken_wiki_links": [
re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE),
re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE),
re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE),
re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE),
],
"factual_discrepancy": [
re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE),
re.compile(r"misrepresent", re.IGNORECASE),
],
"confidence_miscalibration": [
re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE),
re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE),
],
"scope_error": [
re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE),
re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE),
re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE),
re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE),
],
"title_overclaims": [
re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE),
re.compile(r"overclaim", re.IGNORECASE),
],
"near_duplicate": [
re.compile(r"near.?duplicate", re.IGNORECASE),
re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE),
re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE),
re.compile(r"redundan", re.IGNORECASE),
],
}
def _infer_issues_from_prose(review_text: str) -> list[str]:
"""Infer issue tags from unstructured review text via keyword matching.
Fallback for reviews that reject without structured <!-- ISSUES: --> tags.
Conservative: requires at least one unambiguous keyword match per tag.
"""
inferred = []
for tag, patterns in _PROSE_TAG_PATTERNS.items():
if any(p.search(review_text) for p in patterns):
inferred.append(tag)
return inferred
async def _post_formal_approvals(pr_number: int, pr_author: str):
"""Submit formal Forgejo reviews from 2 agents (not the PR author)."""
approvals = 0
@ -376,7 +113,10 @@ async def _terminate_pr(conn, pr_number: int, reason: str):
repo_path(f"issues/{pr_number}/comments"),
{"body": comment_body},
)
await close_pr(conn, pr_number, last_error=reason)
closed = await close_pr(conn, pr_number, last_error=reason)
if not closed:
logger.warning("PR #%d: Forgejo close failed — skipping source requeue, will retry next cycle", pr_number)
return
# Tag source for re-extraction with feedback
cursor = conn.execute(
@ -402,21 +142,6 @@ async def _terminate_pr(conn, pr_number: int, reason: str):
logger.info("PR #%d: TERMINATED — %s", pr_number, reason)
def _classify_issues(issues: list[str]) -> str:
"""Classify issue tags as 'mechanical', 'substantive', or 'mixed'."""
if not issues:
return "unknown"
mechanical = set(issues) & config.MECHANICAL_ISSUE_TAGS
substantive = set(issues) & config.SUBSTANTIVE_ISSUE_TAGS
if substantive and not mechanical:
return "substantive"
if mechanical and not substantive:
return "mechanical"
if mechanical and substantive:
return "mixed"
return "unknown" # tags not in either set
async def _dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_issues: list[str]):
"""Disposition logic for rejected PRs on attempt 2+.
@ -872,126 +597,6 @@ _RATE_LIMIT_BACKOFF_MINUTES = 15
# ─── Batch domain review ─────────────────────────────────────────────────
def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]:
"""Parse batched domain review into per-PR review sections.
Returns {pr_number: review_text} for each PR found in the response.
Missing PRs are omitted caller handles fallback.
"""
agent_upper = agent.upper()
result: dict[int, str] = {}
# Split by PR verdict markers: <!-- PR:NNN VERDICT:AGENT:... -->
# Each marker terminates the previous PR's section
pattern = re.compile(
r"<!-- PR:(\d+) VERDICT:" + re.escape(agent_upper) + r":(APPROVE|REQUEST_CHANGES) -->"
)
matches = list(pattern.finditer(response))
if not matches:
return result
for i, match in enumerate(matches):
pr_num = int(match.group(1))
verdict = match.group(2)
marker_end = match.end()
# Find the start of this PR's section by looking for the section header
# or the end of the previous verdict
section_header = f"=== PR #{pr_num}"
header_pos = response.rfind(section_header, 0, match.start())
if header_pos >= 0:
# Extract from header to end of verdict marker
section_text = response[header_pos:marker_end].strip()
else:
# No header found — extract from previous marker end to this marker end
prev_end = matches[i - 1].end() if i > 0 else 0
section_text = response[prev_end:marker_end].strip()
# Re-format as individual review comment
# Strip the batch section header, keep just the review content
# Add batch label for traceability
pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers)
review_text = (
f"*(batch review with PRs {pr_nums_str})*\n\n"
f"{section_text}\n"
)
result[pr_num] = review_text
return result
def _validate_batch_fanout(
parsed: dict[int, str],
pr_diffs: list[dict],
agent: str,
) -> tuple[dict[int, str], list[int]]:
"""Validate batch fan-out for completeness and cross-contamination.
Returns (valid_reviews, fallback_pr_numbers).
- valid_reviews: reviews that passed validation
- fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated)
"""
valid: dict[int, str] = {}
fallback: list[int] = []
# Build file map: pr_number → set of path segments for matching.
# Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames
# to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3).
pr_files: dict[int, set[str]] = {}
for pr in pr_diffs:
files = set()
for line in pr["diff"].split("\n"):
if line.startswith("diff --git a/"):
path = line.replace("diff --git a/", "").split(" b/")[0]
files.add(path)
# Also add the last 2 path segments (e.g., "internet-finance/dao.md")
# for models that abbreviate paths
parts = path.split("/")
if len(parts) >= 2:
files.add("/".join(parts[-2:]))
pr_files[pr["number"]] = files
for pr in pr_diffs:
pr_num = pr["number"]
# Completeness check: is there a review for this PR?
if pr_num not in parsed:
logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num)
fallback.append(pr_num)
continue
review = parsed[pr_num]
# Cross-contamination check: does review mention at least one file from this PR?
# Use path segments (min 10 chars) to avoid false substring matches on short names.
my_files = pr_files.get(pr_num, set())
mentions_own_file = any(f in review for f in my_files if len(f) >= 10)
if not mentions_own_file and my_files:
# Check if it references files from OTHER PRs (cross-contamination signal)
other_files = set()
for other_pr in pr_diffs:
if other_pr["number"] != pr_num:
other_files.update(pr_files.get(other_pr["number"], set()))
mentions_other = any(f in review for f in other_files if len(f) >= 10)
if mentions_other:
logger.warning(
"Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback",
pr_num,
)
fallback.append(pr_num)
continue
# If it doesn't mention any files at all, could be a generic review — accept it
# (some PRs have short diffs where the model doesn't reference filenames)
valid[pr_num] = review
return valid, fallback
async def _run_batch_domain_eval(
conn, batch_prs: list[dict], domain: str, agent: str,
) -> tuple[int, int]:

View file

@ -29,15 +29,22 @@ async def close_pr(
merge_cycled: bool = False,
inc_merge_failures: bool = False,
close_on_forgejo: bool = True,
):
"""Close a PR in DB and on Forgejo.
) -> bool:
"""Close a PR in DB and on Forgejo. Returns True on success, False on Forgejo failure.
Args:
close_on_forgejo: False only when caller already closed on Forgejo
(reconciliation, ghost PR cleanup after manual close).
If Forgejo API fails, the DB update is SKIPPED to prevent ghost PRs
(DB says closed, Forgejo says open). The reconciliation loop in
merge.py._reconcile_db_state catches any that slip through.
"""
if close_on_forgejo:
await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
result = await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
if result is None:
logger.error("close_pr: Forgejo API failed for PR #%d, skipping DB update", pr_number)
return False
parts = ["status = 'closed'"]
params = []
@ -54,6 +61,7 @@ async def close_pr(
params.append(pr_number)
conn.execute(f"UPDATE prs SET {', '.join(parts)} WHERE number = ?", params)
return True
def approve_pr(

View file

@ -17,6 +17,7 @@ from datetime import datetime, timezone
from . import config, db
from .forgejo import api, repo_path
from .pr_state import close_pr
logger = logging.getLogger("pipeline.stale_pr")
@ -48,13 +49,9 @@ async def check_stale_prs(conn) -> tuple[int, int]:
source_path = pr["source_path"] or "unknown"
try:
# Close the PR via Forgejo
result = await api(
"PATCH",
repo_path(f"pulls/{pr_num}"),
body={"state": "closed"},
)
if result is None:
closed = await close_pr(conn, pr_num,
last_error=f"stale: no claims after {STALE_THRESHOLD_MINUTES} min")
if not closed:
stale_errors += 1
logger.warning(
"Failed to close stale extraction PR #%d (%s, %s)",
@ -62,11 +59,6 @@ async def check_stale_prs(conn) -> tuple[int, int]:
)
continue
# Update local DB status
conn.execute(
"UPDATE prs SET status = 'closed' WHERE number = ?",
(pr_num,),
)
db.audit(
conn,
"watchdog",

315
tests/test_eval_parse.py Normal file
View file

@ -0,0 +1,315 @@
"""Tests for lib/eval_parse.py — pure parsing functions extracted from evaluate.py."""
import sys
import os
from unittest.mock import MagicMock
import pytest
# Mock heavy dependencies before importing
sys.modules.setdefault("aiohttp", MagicMock())
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from lib.eval_parse import (
VALID_ISSUE_TAGS,
_classify_issues,
_deterministic_tier,
_diff_contains_claim_type,
_extract_changed_files,
_filter_diff,
_infer_issues_from_prose,
_is_musings_only,
_normalize_tag,
_parse_batch_response,
_parse_issues,
_parse_verdict,
_validate_batch_fanout,
)
# ---------------------------------------------------------------------------
# _filter_diff
# ---------------------------------------------------------------------------
class TestFilterDiff:
    """Behavior of _filter_diff: routing diff sections into review vs entity buckets."""

    def test_strips_inbox_files(self):
        # inbox/ sections are dropped entirely; domain claims stay in the review diff
        raw = (
            "diff --git a/inbox/archive/foo.md b/inbox/archive/foo.md\n"
            "+some content\n"
            "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
            "+real content\n"
        )
        review, _entity = _filter_diff(raw)
        assert "inbox" not in review
        assert "claim.md" in review

    def test_separates_entity_diffs(self):
        # entities/ sections route to the entity diff; everything else to review
        raw = (
            "diff --git a/entities/people/alice.md b/entities/people/alice.md\n"
            "+entity content\n"
            "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
            "+claim content\n"
        )
        review, entity = _filter_diff(raw)
        assert "alice.md" in entity
        assert "claim.md" in review

    def test_core_domain_entities_stay_in_review(self):
        # living-agents entities are core-domain content: kept in the review diff,
        # so the entity diff comes back empty
        raw = (
            "diff --git a/entities/living-agents/agent.md b/entities/living-agents/agent.md\n"
            "+core entity\n"
        )
        review, entity = _filter_diff(raw)
        assert "agent.md" in review
        assert entity == ""
# ---------------------------------------------------------------------------
# _extract_changed_files
# ---------------------------------------------------------------------------
class TestExtractChangedFiles:
    """Behavior of _extract_changed_files: pulling file paths out of diff headers."""

    def test_extracts_paths(self):
        # Every "diff --git" header contributes its a/ path to the result
        raw = (
            "diff --git a/domains/foo.md b/domains/foo.md\n"
            "+content\n"
            "diff --git a/entities/bar.md b/entities/bar.md\n"
            "+more\n"
        )
        extracted = _extract_changed_files(raw)
        for path in ("domains/foo.md", "entities/bar.md"):
            assert path in extracted

    def test_empty_diff(self):
        # No headers at all yields an empty string
        assert _extract_changed_files("") == ""
# ---------------------------------------------------------------------------
# _is_musings_only
# ---------------------------------------------------------------------------
class TestIsMusingsOnly:
    """Behavior of _is_musings_only: detecting diffs that touch nothing but musings."""

    MUSING = "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
    CLAIM = "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"

    def test_musings_only(self):
        # A diff made purely of musings files is flagged True
        assert _is_musings_only(self.MUSING) is True

    def test_mixed_files(self):
        # One non-musings file is enough to flip the answer to False
        assert _is_musings_only(self.MUSING + self.CLAIM) is False

    def test_no_musings(self):
        # No musings at all is also False
        assert _is_musings_only(self.CLAIM) is False
# ---------------------------------------------------------------------------
# _diff_contains_claim_type
# ---------------------------------------------------------------------------
class TestDiffContainsClaimType:
    """Behavior of _diff_contains_claim_type: spotting added 'type: claim' frontmatter."""

    def test_detects_claim_type(self):
        # An added frontmatter line declaring type: claim is detected
        assert _diff_contains_claim_type("+type: claim\n+confidence: 0.8\n") is True

    def test_ignores_non_claim(self):
        # Other type values do not count
        assert _diff_contains_claim_type("+type: entity\n") is False

    def test_ignores_file_header(self):
        # "+++ b/..." file headers must not be mistaken for added content lines
        assert _diff_contains_claim_type("+++ b/type: claim\n") is False
# ---------------------------------------------------------------------------
# _deterministic_tier
# ---------------------------------------------------------------------------
class TestDeterministicTier:
    """Behavior of _deterministic_tier: rule-based tier routing before any model call."""

    def test_entities_only_is_light(self):
        # Pure entity edits never need a deep review
        entity_diff = "diff --git a/entities/people/alice.md b/entities/people/alice.md\n+data\n"
        assert _deterministic_tier(entity_diff) == "LIGHT"

    def test_inbox_only_is_light(self):
        # Inbox-only churn is likewise routed to the light tier
        inbox_diff = "diff --git a/inbox/archive/src.md b/inbox/archive/src.md\n+data\n"
        assert _deterministic_tier(inbox_diff) == "LIGHT"

    def test_core_is_deep(self):
        # Anything touching core/ forces a deep review
        core_diff = "diff --git a/core/schema.md b/core/schema.md\n+structural change\n"
        assert _deterministic_tier(core_diff) == "DEEP"

    def test_challenged_by_is_deep(self):
        # Adding a challenged_by link is a substantive change → deep review
        challenge_diff = "diff --git a/domains/x/claim.md b/domains/x/claim.md\n+challenged_by: some-claim\n"
        assert _deterministic_tier(challenge_diff) == "DEEP"

    def test_domain_claim_returns_none(self):
        # A fresh domain claim is ambiguous: no deterministic answer, defer to the model
        claim_diff = "diff --git a/domains/finance/new-claim.md b/domains/finance/new-claim.md\n--- /dev/null\n+++ b/domains/finance/new-claim.md\n+type: claim\n"
        assert _deterministic_tier(claim_diff) is None

    def test_empty_diff_returns_none(self):
        # An empty diff gives the router nothing to decide on
        assert _deterministic_tier("") is None
# ---------------------------------------------------------------------------
# _parse_verdict
# ---------------------------------------------------------------------------
class TestParseVerdict:
    """Behavior of _parse_verdict: extracting a reviewer's verdict tag from review text."""

    def test_approve(self):
        # Explicit APPROVE tag parses to "approve"
        assert _parse_verdict("VERDICT:LEO:APPROVE", "LEO") == "approve"

    def test_request_changes(self):
        # Explicit REQUEST_CHANGES tag parses to "request_changes"
        assert _parse_verdict("VERDICT:LEO:REQUEST_CHANGES", "LEO") == "request_changes"

    def test_no_verdict_defaults_to_request_changes(self):
        # Fail-closed: text with no verdict tag is treated as request_changes
        assert _parse_verdict("some review text", "LEO") == "request_changes"

    def test_case_insensitive_reviewer(self):
        # Reviewer name matching ignores case
        assert _parse_verdict("VERDICT:LEO:APPROVE", "leo") == "approve"
# ---------------------------------------------------------------------------
# _normalize_tag
# ---------------------------------------------------------------------------
class TestNormalizeTag:
    """Behavior of _normalize_tag: canonical tags, aliases, separators, fuzzy match."""

    def test_valid_tag_passes_through(self):
        # Already-canonical tags come back unchanged
        assert _normalize_tag("broken_wiki_links") == "broken_wiki_links"

    def test_alias_normalizes(self):
        # Known aliases resolve to their canonical tag names
        for alias, canonical in (
            ("schema_violation", "frontmatter_schema"),
            ("duplicate", "near_duplicate"),
            ("factual_error", "factual_discrepancy"),
        ):
            assert _normalize_tag(alias) == canonical

    def test_hyphen_to_underscore(self):
        # Hyphenated spellings are treated as their underscore equivalents
        assert _normalize_tag("unverified-wiki-links") == "broken_wiki_links"

    def test_unknown_returns_none(self):
        # Unrecognizable tags are rejected, not guessed
        assert _normalize_tag("totally_made_up") is None

    def test_fuzzy_substring_match(self):
        # A distinctive fragment resolves to the full canonical tag
        assert _normalize_tag("wiki_links") == "broken_wiki_links"
# ---------------------------------------------------------------------------
# _parse_issues
# ---------------------------------------------------------------------------
class TestParseIssues:
    """Behavior of _parse_issues: structured ISSUES comment first, prose fallback second."""

    def test_structured_comment(self):
        # Tags from the structured comment are normalized to canonical names
        found = _parse_issues("Review text. <!-- ISSUES: schema_violation, duplicate -->")
        assert "frontmatter_schema" in found
        assert "near_duplicate" in found

    def test_fallback_to_prose(self):
        # With no structured comment, keywords in prose are still picked up
        found = _parse_issues("The frontmatter is missing required fields.")
        assert "frontmatter_schema" in found

    def test_empty_structured_falls_back(self):
        # All tags unrecognizable, falls through to prose
        found = _parse_issues("Review. <!-- ISSUES: totally_unknown_tag -->")
        # With no prose keywords either, should return empty
        assert isinstance(found, list)
# ---------------------------------------------------------------------------
# _infer_issues_from_prose
# ---------------------------------------------------------------------------
class TestInferIssuesFromProse:
    """Behavior of _infer_issues_from_prose: keyword-driven issue inference."""

    def test_detects_frontmatter_issues(self):
        # Mentions of invalid YAML frontmatter map to frontmatter_schema
        assert "frontmatter_schema" in _infer_issues_from_prose(
            "The YAML frontmatter is invalid."
        )

    def test_detects_wiki_link_issues(self):
        # Mentions of broken wiki links map to broken_wiki_links
        assert "broken_wiki_links" in _infer_issues_from_prose(
            "Found broken wiki links in the claim."
        )

    def test_detects_near_duplicate(self):
        # "near-duplicate" phrasing maps to near_duplicate
        assert "near_duplicate" in _infer_issues_from_prose(
            "This is a near-duplicate of an existing claim."
        )

    def test_no_matches(self):
        # Clean praise triggers no issue tags at all
        assert _infer_issues_from_prose("This claim is well-structured and accurate.") == []
# ---------------------------------------------------------------------------
# _classify_issues
# ---------------------------------------------------------------------------
class TestClassifyIssues:
    """Behavior of _classify_issues: bucketing tag lists into severity categories."""

    def test_mechanical_only(self):
        # All-mechanical tags classify as "mechanical"
        assert _classify_issues(["frontmatter_schema", "near_duplicate"]) == "mechanical"

    def test_substantive_only(self):
        # All-substantive tags classify as "substantive"
        assert _classify_issues(["factual_discrepancy"]) == "substantive"

    def test_mixed(self):
        # One of each yields "mixed"
        assert _classify_issues(["frontmatter_schema", "factual_discrepancy"]) == "mixed"

    def test_empty(self):
        # No tags at all is "unknown"
        assert _classify_issues([]) == "unknown"
# ---------------------------------------------------------------------------
# _parse_batch_response
# ---------------------------------------------------------------------------
class TestParseBatchResponse:
    """Behavior of _parse_batch_response: splitting a multi-PR reply into per-PR reviews."""

    def test_parses_multiple_prs(self):
        # Each PR section is keyed by its number; verdict text is preserved
        combined = (
            "=== PR #100\nReview for 100\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
            "=== PR #200\nReview for 200\n<!-- PR:200 VERDICT:AGENT:REQUEST_CHANGES -->\n"
        )
        reviews = _parse_batch_response(combined, [100, 200], "agent")
        assert 100 in reviews
        assert 200 in reviews
        assert "APPROVE" in reviews[100]
        assert "REQUEST_CHANGES" in reviews[200]

    def test_missing_pr(self):
        # A PR absent from the response is simply not in the result dict
        combined = "=== PR #100\nReview\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
        reviews = _parse_batch_response(combined, [100, 200], "agent")
        assert 100 in reviews
        assert 200 not in reviews

    def test_empty_response(self):
        # Text containing no verdict markers yields an empty dict
        assert _parse_batch_response("no verdicts here", [100], "agent") == {}
# ---------------------------------------------------------------------------
# _validate_batch_fanout
# ---------------------------------------------------------------------------
class TestValidateBatchFanout:
    """Behavior of _validate_batch_fanout: completeness + cross-contamination checks."""

    # Path names intentionally >= 10 chars so the substring-match heuristic engages
    DIFFS = [
        {"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
        {"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
    ]

    def test_missing_pr_falls_back(self):
        # A PR with no review in the parsed map is routed to individual fallback
        valid, fallback = _validate_batch_fanout({100: "review for 100"}, self.DIFFS, "agent")
        assert 200 in fallback
        assert 200 not in valid

    def test_all_present_passes(self):
        # Reviews that each mention their own PR's files are all accepted
        parsed = {
            100: "review mentioning domains/long-enough-path.md",
            200: "review mentioning domains/other-long-path.md",
        }
        valid, fallback = _validate_batch_fanout(parsed, self.DIFFS, "agent")
        assert 100 in valid
        assert 200 in valid
        assert fallback == []

View file

@ -135,13 +135,44 @@ class TestClosePr:
mock_api = AsyncMock(return_value={})
with patch("lib.pr_state.forgejo_api", mock_api), \
patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
asyncio.run(close_pr(conn, 42))
result = asyncio.run(close_pr(conn, 42))
assert result is True
row = _get_pr(conn, 42)
assert row["status"] == "closed"
# last_error not overwritten when not provided
assert row["last_error"] == "old error"
def test_close_returns_false_on_forgejo_failure(self):
"""Critical: if Forgejo API fails, DB must NOT be updated (ghost PR prevention)."""
conn = _make_db()
_insert_pr(conn, 42)
mock_api = AsyncMock(return_value=None) # forgejo_api returns None on failure
with patch("lib.pr_state.forgejo_api", mock_api), \
patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
result = asyncio.run(close_pr(conn, 42, last_error="should not persist"))
assert result is False
row = _get_pr(conn, 42)
assert row["status"] == "open" # DB not touched
assert row["last_error"] is None # last_error not set
def test_close_skips_forgejo_check_when_opted_out(self):
"""close_on_forgejo=False always succeeds (caller already closed Forgejo)."""
conn = _make_db()
_insert_pr(conn, 42)
mock_api = AsyncMock(return_value=None) # would fail if called
with patch("lib.pr_state.forgejo_api", mock_api), \
patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
result = asyncio.run(close_pr(conn, 42, close_on_forgejo=False))
assert result is True
row = _get_pr(conn, 42)
assert row["status"] == "closed"
mock_api.assert_not_called()
# ---------------------------------------------------------------------------
# approve_pr