refactor: Phase 3 — fix close_pr ghost bug, wire stale_pr, extract eval_parse
Some checks are pending
CI / lint-and-test (push) Waiting to run
Critical bug fix: close_pr now checks forgejo_api return value and skips DB update on Forgejo failure, preventing ghost PRs (DB closed, Forgejo open). Returns bool so callers can handle failures. _terminate_pr checks return value — skips source requeue on failure. stale_pr.py migrated from raw Forgejo+DB to close_pr (last raw close transition eliminated). eval_parse.py: 15 pure parsing functions extracted from evaluate.py (~370 lines removed). Zero I/O, zero async, independently testable. evaluate.py drops from ~1510 to ~1140 lines. Tests: 295 passed (42 new eval_parse + 2 new close_pr), zero regressions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
716cc43890
commit
376b77999f
6 changed files with 815 additions and 430 deletions
434
lib/eval_parse.py
Normal file
434
lib/eval_parse.py
Normal file
|
|
@ -0,0 +1,434 @@
|
|||
"""Pure parsing functions for the eval stage — zero I/O, zero async.
|
||||
|
||||
Extracted from evaluate.py to isolate testable parsing logic from
|
||||
orchestration, DB, and Forgejo API calls.
|
||||
|
||||
Contents:
|
||||
- Diff helpers: filter, classify, tier routing
|
||||
- Verdict/issue parsing: structured tags + prose inference
|
||||
- Batch response parsing: fan-out validation
|
||||
|
||||
All functions are pure (input → output). The only external dependency
|
||||
is config.MECHANICAL_ISSUE_TAGS / config.SUBSTANTIVE_ISSUE_TAGS for
|
||||
_classify_issues.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from . import config
|
||||
|
||||
logger = logging.getLogger("pipeline.eval_parse")
|
||||
|
||||
|
||||
# ─── Diff helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _filter_diff(diff: str) -> tuple[str, str]:
|
||||
"""Filter diff to only review-relevant files.
|
||||
|
||||
Returns (review_diff, entity_diff).
|
||||
Strips: inbox/, schemas/, skills/, agents/*/musings/
|
||||
"""
|
||||
sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE)
|
||||
skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"]
|
||||
core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"}
|
||||
|
||||
claim_sections = []
|
||||
entity_sections = []
|
||||
|
||||
for section in sections:
|
||||
if not section.strip():
|
||||
continue
|
||||
if any(re.match(p, section) for p in skip_patterns):
|
||||
continue
|
||||
entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section)
|
||||
if entity_match and entity_match.group(1) not in core_domains:
|
||||
entity_sections.append(section)
|
||||
continue
|
||||
claim_sections.append(section)
|
||||
|
||||
return "".join(claim_sections), "".join(entity_sections)
|
||||
|
||||
|
||||
def _extract_changed_files(diff: str) -> str:
|
||||
"""Extract changed file paths from diff."""
|
||||
return "\n".join(
|
||||
line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git")
|
||||
)
|
||||
|
||||
|
||||
def _is_musings_only(diff: str) -> bool:
|
||||
"""Check if PR only modifies musing files."""
|
||||
has_musings = False
|
||||
has_other = False
|
||||
for line in diff.split("\n"):
|
||||
if line.startswith("diff --git"):
|
||||
if "agents/" in line and "/musings/" in line:
|
||||
has_musings = True
|
||||
else:
|
||||
has_other = True
|
||||
return has_musings and not has_other
|
||||
|
||||
|
||||
def _diff_contains_claim_type(diff: str) -> bool:
|
||||
"""Claim-shape detector: check if any file in diff has type: claim in frontmatter.
|
||||
|
||||
Mechanical check ($0). If YAML declares type: claim, this is a factual claim —
|
||||
not an entity update or formatting fix. Must be classified STANDARD minimum
|
||||
regardless of Haiku triage. Catches factual claims disguised as LIGHT content.
|
||||
(Theseus: converts semantic problem to mechanical check)
|
||||
"""
|
||||
for line in diff.split("\n"):
|
||||
if line.startswith("+") and not line.startswith("+++"):
|
||||
stripped = line[1:].strip()
|
||||
if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _deterministic_tier(diff: str) -> str | None:
|
||||
"""Deterministic tier routing — skip Haiku triage for obvious cases.
|
||||
|
||||
Checks diff file patterns before calling the LLM. Returns tier string
|
||||
if deterministic, None if Haiku triage is needed.
|
||||
|
||||
Rules (Leo-calibrated):
|
||||
- All files in entities/ only → LIGHT
|
||||
- All files in inbox/ only (queue, archive, null-result) → LIGHT
|
||||
- Any file in core/ or foundations/ → DEEP (structural KB changes)
|
||||
- Has challenged_by field → DEEP (challenges existing claims)
|
||||
- Modifies existing file (not new) in domains/ → DEEP (enrichment/change)
|
||||
- Otherwise → None (needs Haiku triage)
|
||||
|
||||
NOTE: Cross-domain wiki links are NOT a DEEP signal — most claims link
|
||||
across domains, that's the whole point of the knowledge graph (Leo).
|
||||
"""
|
||||
changed_files = []
|
||||
for line in diff.split("\n"):
|
||||
if line.startswith("diff --git a/"):
|
||||
path = line.replace("diff --git a/", "").split(" b/")[0]
|
||||
changed_files.append(path)
|
||||
|
||||
if not changed_files:
|
||||
return None
|
||||
|
||||
# All entities/ only → LIGHT
|
||||
if all(f.startswith("entities/") for f in changed_files):
|
||||
logger.info("Deterministic tier: LIGHT (all files in entities/)")
|
||||
return "LIGHT"
|
||||
|
||||
# All inbox/ only (queue, archive, null-result) → LIGHT
|
||||
if all(f.startswith("inbox/") for f in changed_files):
|
||||
logger.info("Deterministic tier: LIGHT (all files in inbox/)")
|
||||
return "LIGHT"
|
||||
|
||||
# Any file in core/ or foundations/ → DEEP (structural KB changes)
|
||||
if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files):
|
||||
logger.info("Deterministic tier: DEEP (touches core/ or foundations/)")
|
||||
return "DEEP"
|
||||
|
||||
# Check diff content for DEEP signals
|
||||
has_challenged_by = False
|
||||
new_files: set[str] = set()
|
||||
|
||||
lines = diff.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
# Detect new files
|
||||
if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
|
||||
new_files.add(lines[i + 1][6:])
|
||||
# Check for challenged_by field
|
||||
if line.startswith("+") and not line.startswith("+++"):
|
||||
stripped = line[1:].strip()
|
||||
if stripped.startswith("challenged_by:"):
|
||||
has_challenged_by = True
|
||||
|
||||
if has_challenged_by:
|
||||
logger.info("Deterministic tier: DEEP (has challenged_by field)")
|
||||
return "DEEP"
|
||||
|
||||
# NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments
|
||||
# (appending evidence) are common and should be STANDARD. Let Haiku triage
|
||||
# distinguish enrichments from structural changes.
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ─── Verdict parsing ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_verdict(review_text: str, reviewer: str) -> str:
|
||||
"""Parse VERDICT tag from review. Returns 'approve' or 'request_changes'."""
|
||||
upper = reviewer.upper()
|
||||
if f"VERDICT:{upper}:APPROVE" in review_text:
|
||||
return "approve"
|
||||
elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text:
|
||||
return "request_changes"
|
||||
else:
|
||||
logger.warning("No parseable verdict from %s — treating as request_changes", reviewer)
|
||||
return "request_changes"
|
||||
|
||||
|
||||
# Map model-invented tags to valid tags. Models consistently ignore the valid
|
||||
# tag list and invent their own. This normalizes them. (Ganymede, Mar 14)
|
||||
_TAG_ALIASES: dict[str, str] = {
|
||||
"schema_violation": "frontmatter_schema",
|
||||
"missing_schema_fields": "frontmatter_schema",
|
||||
"missing_schema": "frontmatter_schema",
|
||||
"schema": "frontmatter_schema",
|
||||
"missing_frontmatter": "frontmatter_schema",
|
||||
"redundancy": "near_duplicate",
|
||||
"duplicate": "near_duplicate",
|
||||
"missing_confidence": "confidence_miscalibration",
|
||||
"confidence_error": "confidence_miscalibration",
|
||||
"vague_claims": "scope_error",
|
||||
"unfalsifiable": "scope_error",
|
||||
"unverified_wiki_links": "broken_wiki_links",
|
||||
"unverified-wiki-links": "broken_wiki_links",
|
||||
"missing_wiki_links": "broken_wiki_links",
|
||||
"invalid_wiki_links": "broken_wiki_links",
|
||||
"wiki_link_errors": "broken_wiki_links",
|
||||
"overclaiming": "title_overclaims",
|
||||
"title_overclaim": "title_overclaims",
|
||||
"date_error": "date_errors",
|
||||
"factual_error": "factual_discrepancy",
|
||||
"factual_inaccuracy": "factual_discrepancy",
|
||||
}
|
||||
|
||||
VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims",
|
||||
"confidence_miscalibration", "date_errors", "factual_discrepancy",
|
||||
"near_duplicate", "scope_error"}
|
||||
|
||||
|
||||
def _normalize_tag(tag: str) -> str | None:
|
||||
"""Normalize a model-generated tag to a valid tag, or None if unrecognizable."""
|
||||
tag = tag.strip().lower().replace("-", "_")
|
||||
if tag in VALID_ISSUE_TAGS:
|
||||
return tag
|
||||
if tag in _TAG_ALIASES:
|
||||
return _TAG_ALIASES[tag]
|
||||
# Fuzzy: check if any valid tag is a substring or vice versa
|
||||
for valid in VALID_ISSUE_TAGS:
|
||||
if valid in tag or tag in valid:
|
||||
return valid
|
||||
return None
|
||||
|
||||
|
||||
# ─── Issue parsing ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Keyword patterns for inferring issue tags from unstructured review prose.
|
||||
# Conservative: only match unambiguous indicators. Order doesn't matter.
|
||||
_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = {
|
||||
"frontmatter_schema": [
|
||||
re.compile(r"frontmatter", re.IGNORECASE),
|
||||
re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE),
|
||||
re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE),
|
||||
re.compile(r"required field", re.IGNORECASE),
|
||||
re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE),
|
||||
re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE),
|
||||
re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE),
|
||||
],
|
||||
"broken_wiki_links": [
|
||||
re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE),
|
||||
re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE),
|
||||
re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE),
|
||||
re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE),
|
||||
],
|
||||
"factual_discrepancy": [
|
||||
re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE),
|
||||
re.compile(r"misrepresent", re.IGNORECASE),
|
||||
],
|
||||
"confidence_miscalibration": [
|
||||
re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE),
|
||||
re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE),
|
||||
],
|
||||
"scope_error": [
|
||||
re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE),
|
||||
re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE),
|
||||
re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE),
|
||||
re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE),
|
||||
],
|
||||
"title_overclaims": [
|
||||
re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE),
|
||||
re.compile(r"overclaim", re.IGNORECASE),
|
||||
],
|
||||
"near_duplicate": [
|
||||
re.compile(r"near.?duplicate", re.IGNORECASE),
|
||||
re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE),
|
||||
re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE),
|
||||
re.compile(r"redundan", re.IGNORECASE),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _parse_issues(review_text: str) -> list[str]:
    """Extract issue tags from review.

    First tries structured <!-- ISSUES: tag1, tag2 --> comment with tag normalization.
    Falls back to keyword inference from prose.
    """
    structured = re.search(r"<!-- ISSUES: ([^>]+) -->", review_text)
    if structured:
        seen: list[str] = []
        for raw in (t.strip() for t in structured.group(1).split(",")):
            if not raw:
                continue
            norm = _normalize_tag(raw)
            if norm and norm not in seen:
                seen.append(norm)
            else:
                # Unrecognized tag, or a duplicate of one already collected.
                logger.debug("Unrecognized issue tag '%s' — dropped", raw)
        if seen:
            return seen
    # Fallback: infer tags from review prose
    return _infer_issues_from_prose(review_text)
|
||||
|
||||
|
||||
def _infer_issues_from_prose(review_text: str) -> list[str]:
    """Infer issue tags from unstructured review text via keyword matching.

    Fallback for reviews that reject without structured <!-- ISSUES: --> tags.
    Conservative: requires at least one unambiguous keyword match per tag.
    """
    return [
        tag
        for tag, patterns in _PROSE_TAG_PATTERNS.items()
        if any(pattern.search(review_text) for pattern in patterns)
    ]
|
||||
|
||||
|
||||
def _classify_issues(issues: list[str]) -> str:
    """Classify issue tags as 'mechanical', 'substantive', or 'mixed'."""
    if not issues:
        return "unknown"
    tags = set(issues)
    has_mechanical = bool(tags & config.MECHANICAL_ISSUE_TAGS)
    has_substantive = bool(tags & config.SUBSTANTIVE_ISSUE_TAGS)
    if has_mechanical and has_substantive:
        return "mixed"
    if has_substantive:
        return "substantive"
    if has_mechanical:
        return "mechanical"
    return "unknown"  # tags not in either set
|
||||
|
||||
|
||||
# ─── Batch response parsing ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]:
|
||||
"""Parse batched domain review into per-PR review sections.
|
||||
|
||||
Returns {pr_number: review_text} for each PR found in the response.
|
||||
Missing PRs are omitted — caller handles fallback.
|
||||
"""
|
||||
agent_upper = agent.upper()
|
||||
result: dict[int, str] = {}
|
||||
|
||||
# Split by PR verdict markers: <!-- PR:NNN VERDICT:AGENT:... -->
|
||||
# Each marker terminates the previous PR's section
|
||||
pattern = re.compile(
|
||||
r"<!-- PR:(\d+) VERDICT:" + re.escape(agent_upper) + r":(APPROVE|REQUEST_CHANGES) -->"
|
||||
)
|
||||
|
||||
matches = list(pattern.finditer(response))
|
||||
if not matches:
|
||||
return result
|
||||
|
||||
for i, match in enumerate(matches):
|
||||
pr_num = int(match.group(1))
|
||||
marker_end = match.end()
|
||||
|
||||
# Find the start of this PR's section by looking for the section header
|
||||
# or the end of the previous verdict
|
||||
section_header = f"=== PR #{pr_num}"
|
||||
header_pos = response.rfind(section_header, 0, match.start())
|
||||
|
||||
if header_pos >= 0:
|
||||
# Extract from header to end of verdict marker
|
||||
section_text = response[header_pos:marker_end].strip()
|
||||
else:
|
||||
# No header found — extract from previous marker end to this marker end
|
||||
prev_end = matches[i - 1].end() if i > 0 else 0
|
||||
section_text = response[prev_end:marker_end].strip()
|
||||
|
||||
# Re-format as individual review comment
|
||||
# Strip the batch section header, keep just the review content
|
||||
# Add batch label for traceability
|
||||
pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers)
|
||||
review_text = (
|
||||
f"*(batch review with PRs {pr_nums_str})*\n\n"
|
||||
f"{section_text}\n"
|
||||
)
|
||||
result[pr_num] = review_text
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _validate_batch_fanout(
|
||||
parsed: dict[int, str],
|
||||
pr_diffs: list[dict],
|
||||
agent: str,
|
||||
) -> tuple[dict[int, str], list[int]]:
|
||||
"""Validate batch fan-out for completeness and cross-contamination.
|
||||
|
||||
Returns (valid_reviews, fallback_pr_numbers).
|
||||
- valid_reviews: reviews that passed validation
|
||||
- fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated)
|
||||
"""
|
||||
valid: dict[int, str] = {}
|
||||
fallback: list[int] = []
|
||||
|
||||
# Build file map: pr_number → set of path segments for matching.
|
||||
# Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames
|
||||
# to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3).
|
||||
pr_files: dict[int, set[str]] = {}
|
||||
for pr in pr_diffs:
|
||||
files = set()
|
||||
for line in pr["diff"].split("\n"):
|
||||
if line.startswith("diff --git a/"):
|
||||
path = line.replace("diff --git a/", "").split(" b/")[0]
|
||||
files.add(path)
|
||||
# Also add the last 2 path segments (e.g., "internet-finance/dao.md")
|
||||
# for models that abbreviate paths
|
||||
parts = path.split("/")
|
||||
if len(parts) >= 2:
|
||||
files.add("/".join(parts[-2:]))
|
||||
pr_files[pr["number"]] = files
|
||||
|
||||
for pr in pr_diffs:
|
||||
pr_num = pr["number"]
|
||||
|
||||
# Completeness check: is there a review for this PR?
|
||||
if pr_num not in parsed:
|
||||
logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num)
|
||||
fallback.append(pr_num)
|
||||
continue
|
||||
|
||||
review = parsed[pr_num]
|
||||
|
||||
# Cross-contamination check: does review mention at least one file from this PR?
|
||||
# Use path segments (min 10 chars) to avoid false substring matches on short names.
|
||||
my_files = pr_files.get(pr_num, set())
|
||||
mentions_own_file = any(f in review for f in my_files if len(f) >= 10)
|
||||
|
||||
if not mentions_own_file and my_files:
|
||||
# Check if it references files from OTHER PRs (cross-contamination signal)
|
||||
other_files = set()
|
||||
for other_pr in pr_diffs:
|
||||
if other_pr["number"] != pr_num:
|
||||
other_files.update(pr_files.get(other_pr["number"], set()))
|
||||
mentions_other = any(f in review for f in other_files if len(f) >= 10)
|
||||
|
||||
if mentions_other:
|
||||
logger.warning(
|
||||
"Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback",
|
||||
pr_num,
|
||||
)
|
||||
fallback.append(pr_num)
|
||||
continue
|
||||
# If it doesn't mention any files at all, could be a generic review — accept it
|
||||
# (some PRs have short diffs where the model doesn't reference filenames)
|
||||
|
||||
valid[pr_num] = review
|
||||
|
||||
return valid, fallback
|
||||
433
lib/evaluate.py
433
lib/evaluate.py
|
|
@ -21,11 +21,25 @@ LLM transport and prompts extracted to lib/llm.py (Phase 3c).
|
|||
import json
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from . import config, db
|
||||
from .domains import agent_for_domain, detect_domain_from_branch, detect_domain_from_diff
|
||||
from .eval_parse import (
|
||||
VALID_ISSUE_TAGS,
|
||||
_classify_issues,
|
||||
_deterministic_tier,
|
||||
_diff_contains_claim_type,
|
||||
_extract_changed_files,
|
||||
_filter_diff,
|
||||
_infer_issues_from_prose,
|
||||
_is_musings_only,
|
||||
_normalize_tag,
|
||||
_parse_batch_response,
|
||||
_parse_issues,
|
||||
_parse_verdict,
|
||||
_validate_batch_fanout,
|
||||
)
|
||||
from .forgejo import api as forgejo_api
|
||||
from .forgejo import get_agent_token, get_pr_diff, repo_path
|
||||
from .merge import PIPELINE_OWNED_PREFIXES
|
||||
|
|
@ -37,289 +51,12 @@ from .validate import load_existing_claims
|
|||
logger = logging.getLogger("pipeline.evaluate")
|
||||
|
||||
|
||||
# ─── Diff helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _filter_diff(diff: str) -> tuple[str, str]:
|
||||
"""Filter diff to only review-relevant files.
|
||||
|
||||
Returns (review_diff, entity_diff).
|
||||
Strips: inbox/, schemas/, skills/, agents/*/musings/
|
||||
"""
|
||||
sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE)
|
||||
skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"]
|
||||
core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"}
|
||||
|
||||
claim_sections = []
|
||||
entity_sections = []
|
||||
|
||||
for section in sections:
|
||||
if not section.strip():
|
||||
continue
|
||||
if any(re.match(p, section) for p in skip_patterns):
|
||||
continue
|
||||
entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section)
|
||||
if entity_match and entity_match.group(1) not in core_domains:
|
||||
entity_sections.append(section)
|
||||
continue
|
||||
claim_sections.append(section)
|
||||
|
||||
return "".join(claim_sections), "".join(entity_sections)
|
||||
|
||||
|
||||
def _extract_changed_files(diff: str) -> str:
|
||||
"""Extract changed file paths from diff."""
|
||||
return "\n".join(
|
||||
line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git")
|
||||
)
|
||||
|
||||
|
||||
def _is_musings_only(diff: str) -> bool:
|
||||
"""Check if PR only modifies musing files."""
|
||||
has_musings = False
|
||||
has_other = False
|
||||
for line in diff.split("\n"):
|
||||
if line.startswith("diff --git"):
|
||||
if "agents/" in line and "/musings/" in line:
|
||||
has_musings = True
|
||||
else:
|
||||
has_other = True
|
||||
return has_musings and not has_other
|
||||
|
||||
|
||||
# ─── NOTE: Tier 0.5 mechanical pre-check moved to validate.py ────────────
|
||||
# Tier 0.5 now runs as part of the validate stage (before eval), not inside
|
||||
# evaluate_pr(). This prevents wasting eval_attempts on mechanically fixable
|
||||
# PRs. Eval trusts that tier0_pass=1 means all mechanical checks passed.
|
||||
|
||||
|
||||
# ─── Tier overrides ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _diff_contains_claim_type(diff: str) -> bool:
|
||||
"""Claim-shape detector: check if any file in diff has type: claim in frontmatter.
|
||||
|
||||
Mechanical check ($0). If YAML declares type: claim, this is a factual claim —
|
||||
not an entity update or formatting fix. Must be classified STANDARD minimum
|
||||
regardless of Haiku triage. Catches factual claims disguised as LIGHT content.
|
||||
(Theseus: converts semantic problem to mechanical check)
|
||||
"""
|
||||
for line in diff.split("\n"):
|
||||
if line.startswith("+") and not line.startswith("+++"):
|
||||
stripped = line[1:].strip()
|
||||
if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _deterministic_tier(diff: str) -> str | None:
|
||||
"""Deterministic tier routing — skip Haiku triage for obvious cases.
|
||||
|
||||
Checks diff file patterns before calling the LLM. Returns tier string
|
||||
if deterministic, None if Haiku triage is needed.
|
||||
|
||||
Rules (Leo-calibrated):
|
||||
- All files in entities/ only → LIGHT
|
||||
- All files in inbox/ only (queue, archive, null-result) → LIGHT
|
||||
- Any file in core/ or foundations/ → DEEP (structural KB changes)
|
||||
- Has challenged_by field → DEEP (challenges existing claims)
|
||||
- Modifies existing file (not new) in domains/ → DEEP (enrichment/change)
|
||||
- Otherwise → None (needs Haiku triage)
|
||||
|
||||
NOTE: Cross-domain wiki links are NOT a DEEP signal — most claims link
|
||||
across domains, that's the whole point of the knowledge graph (Leo).
|
||||
"""
|
||||
changed_files = []
|
||||
for line in diff.split("\n"):
|
||||
if line.startswith("diff --git a/"):
|
||||
path = line.replace("diff --git a/", "").split(" b/")[0]
|
||||
changed_files.append(path)
|
||||
|
||||
if not changed_files:
|
||||
return None
|
||||
|
||||
# All entities/ only → LIGHT
|
||||
if all(f.startswith("entities/") for f in changed_files):
|
||||
logger.info("Deterministic tier: LIGHT (all files in entities/)")
|
||||
return "LIGHT"
|
||||
|
||||
# All inbox/ only (queue, archive, null-result) → LIGHT
|
||||
if all(f.startswith("inbox/") for f in changed_files):
|
||||
logger.info("Deterministic tier: LIGHT (all files in inbox/)")
|
||||
return "LIGHT"
|
||||
|
||||
# Any file in core/ or foundations/ → DEEP (structural KB changes)
|
||||
if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files):
|
||||
logger.info("Deterministic tier: DEEP (touches core/ or foundations/)")
|
||||
return "DEEP"
|
||||
|
||||
# Check diff content for DEEP signals
|
||||
has_challenged_by = False
|
||||
has_modified_claim = False
|
||||
new_files: set[str] = set()
|
||||
|
||||
lines = diff.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
# Detect new files
|
||||
if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
|
||||
new_files.add(lines[i + 1][6:])
|
||||
# Check for challenged_by field
|
||||
if line.startswith("+") and not line.startswith("+++"):
|
||||
stripped = line[1:].strip()
|
||||
if stripped.startswith("challenged_by:"):
|
||||
has_challenged_by = True
|
||||
|
||||
if has_challenged_by:
|
||||
logger.info("Deterministic tier: DEEP (has challenged_by field)")
|
||||
return "DEEP"
|
||||
|
||||
# NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments
|
||||
# (appending evidence) are common and should be STANDARD. Let Haiku triage
|
||||
# distinguish enrichments from structural changes.
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ─── Verdict parsing ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_verdict(review_text: str, reviewer: str) -> str:
|
||||
"""Parse VERDICT tag from review. Returns 'approve' or 'request_changes'."""
|
||||
upper = reviewer.upper()
|
||||
if f"VERDICT:{upper}:APPROVE" in review_text:
|
||||
return "approve"
|
||||
elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text:
|
||||
return "request_changes"
|
||||
else:
|
||||
logger.warning("No parseable verdict from %s — treating as request_changes", reviewer)
|
||||
return "request_changes"
|
||||
|
||||
|
||||
# Map model-invented tags to valid tags. Models consistently ignore the valid
|
||||
# tag list and invent their own. This normalizes them. (Ganymede, Mar 14)
|
||||
_TAG_ALIASES: dict[str, str] = {
|
||||
"schema_violation": "frontmatter_schema",
|
||||
"missing_schema_fields": "frontmatter_schema",
|
||||
"missing_schema": "frontmatter_schema",
|
||||
"schema": "frontmatter_schema",
|
||||
"missing_frontmatter": "frontmatter_schema",
|
||||
"redundancy": "near_duplicate",
|
||||
"duplicate": "near_duplicate",
|
||||
"missing_confidence": "confidence_miscalibration",
|
||||
"confidence_error": "confidence_miscalibration",
|
||||
"vague_claims": "scope_error",
|
||||
"unfalsifiable": "scope_error",
|
||||
"unverified_wiki_links": "broken_wiki_links",
|
||||
"unverified-wiki-links": "broken_wiki_links",
|
||||
"missing_wiki_links": "broken_wiki_links",
|
||||
"invalid_wiki_links": "broken_wiki_links",
|
||||
"wiki_link_errors": "broken_wiki_links",
|
||||
"overclaiming": "title_overclaims",
|
||||
"title_overclaim": "title_overclaims",
|
||||
"date_error": "date_errors",
|
||||
"factual_error": "factual_discrepancy",
|
||||
"factual_inaccuracy": "factual_discrepancy",
|
||||
}
|
||||
|
||||
VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims",
|
||||
"confidence_miscalibration", "date_errors", "factual_discrepancy",
|
||||
"near_duplicate", "scope_error"}
|
||||
|
||||
|
||||
def _normalize_tag(tag: str) -> str | None:
|
||||
"""Normalize a model-generated tag to a valid tag, or None if unrecognizable."""
|
||||
tag = tag.strip().lower().replace("-", "_")
|
||||
if tag in VALID_ISSUE_TAGS:
|
||||
return tag
|
||||
if tag in _TAG_ALIASES:
|
||||
return _TAG_ALIASES[tag]
|
||||
# Fuzzy: check if any valid tag is a substring or vice versa
|
||||
for valid in VALID_ISSUE_TAGS:
|
||||
if valid in tag or tag in valid:
|
||||
return valid
|
||||
return None
|
||||
|
||||
|
||||
def _parse_issues(review_text: str) -> list[str]:
    """Extract issue tags from review.

    First tries structured <!-- ISSUES: tag1, tag2 --> comment with tag normalization.
    Falls back to keyword inference from prose.
    """
    structured = re.search(r"<!-- ISSUES: ([^>]+) -->", review_text)
    if structured:
        seen: list[str] = []
        for raw in (t.strip() for t in structured.group(1).split(",")):
            if not raw:
                continue
            norm = _normalize_tag(raw)
            if norm and norm not in seen:
                seen.append(norm)
            else:
                # Unrecognized tag, or a duplicate of one already collected.
                logger.debug("Unrecognized issue tag '%s' — dropped", raw)
        if seen:
            return seen
    # Fallback: infer tags from review prose
    return _infer_issues_from_prose(review_text)
|
||||
|
||||
|
||||
# Keyword patterns for inferring issue tags from unstructured review prose.
|
||||
# Conservative: only match unambiguous indicators. Order doesn't matter.
|
||||
_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = {
|
||||
"frontmatter_schema": [
|
||||
re.compile(r"frontmatter", re.IGNORECASE),
|
||||
re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE),
|
||||
re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE),
|
||||
re.compile(r"required field", re.IGNORECASE),
|
||||
re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE),
|
||||
re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE),
|
||||
re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE),
|
||||
],
|
||||
"broken_wiki_links": [
|
||||
re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE),
|
||||
re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE),
|
||||
re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE),
|
||||
re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE),
|
||||
],
|
||||
"factual_discrepancy": [
|
||||
re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE),
|
||||
re.compile(r"misrepresent", re.IGNORECASE),
|
||||
],
|
||||
"confidence_miscalibration": [
|
||||
re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE),
|
||||
re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE),
|
||||
],
|
||||
"scope_error": [
|
||||
re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE),
|
||||
re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE),
|
||||
re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE),
|
||||
re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE),
|
||||
],
|
||||
"title_overclaims": [
|
||||
re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE),
|
||||
re.compile(r"overclaim", re.IGNORECASE),
|
||||
],
|
||||
"near_duplicate": [
|
||||
re.compile(r"near.?duplicate", re.IGNORECASE),
|
||||
re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE),
|
||||
re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE),
|
||||
re.compile(r"redundan", re.IGNORECASE),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _infer_issues_from_prose(review_text: str) -> list[str]:
|
||||
"""Infer issue tags from unstructured review text via keyword matching.
|
||||
|
||||
Fallback for reviews that reject without structured <!-- ISSUES: --> tags.
|
||||
Conservative: requires at least one unambiguous keyword match per tag.
|
||||
"""
|
||||
inferred = []
|
||||
for tag, patterns in _PROSE_TAG_PATTERNS.items():
|
||||
if any(p.search(review_text) for p in patterns):
|
||||
inferred.append(tag)
|
||||
return inferred
|
||||
|
||||
|
||||
async def _post_formal_approvals(pr_number: int, pr_author: str):
|
||||
"""Submit formal Forgejo reviews from 2 agents (not the PR author)."""
|
||||
approvals = 0
|
||||
|
|
@ -376,7 +113,10 @@ async def _terminate_pr(conn, pr_number: int, reason: str):
|
|||
repo_path(f"issues/{pr_number}/comments"),
|
||||
{"body": comment_body},
|
||||
)
|
||||
await close_pr(conn, pr_number, last_error=reason)
|
||||
closed = await close_pr(conn, pr_number, last_error=reason)
|
||||
if not closed:
|
||||
logger.warning("PR #%d: Forgejo close failed — skipping source requeue, will retry next cycle", pr_number)
|
||||
return
|
||||
|
||||
# Tag source for re-extraction with feedback
|
||||
cursor = conn.execute(
|
||||
|
|
@ -402,21 +142,6 @@ async def _terminate_pr(conn, pr_number: int, reason: str):
|
|||
logger.info("PR #%d: TERMINATED — %s", pr_number, reason)
|
||||
|
||||
|
||||
def _classify_issues(issues: list[str]) -> str:
|
||||
"""Classify issue tags as 'mechanical', 'substantive', or 'mixed'."""
|
||||
if not issues:
|
||||
return "unknown"
|
||||
mechanical = set(issues) & config.MECHANICAL_ISSUE_TAGS
|
||||
substantive = set(issues) & config.SUBSTANTIVE_ISSUE_TAGS
|
||||
if substantive and not mechanical:
|
||||
return "substantive"
|
||||
if mechanical and not substantive:
|
||||
return "mechanical"
|
||||
if mechanical and substantive:
|
||||
return "mixed"
|
||||
return "unknown" # tags not in either set
|
||||
|
||||
|
||||
async def _dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_issues: list[str]):
|
||||
"""Disposition logic for rejected PRs on attempt 2+.
|
||||
|
||||
|
|
@ -872,126 +597,6 @@ _RATE_LIMIT_BACKOFF_MINUTES = 15
|
|||
# ─── Batch domain review ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]:
|
||||
"""Parse batched domain review into per-PR review sections.
|
||||
|
||||
Returns {pr_number: review_text} for each PR found in the response.
|
||||
Missing PRs are omitted — caller handles fallback.
|
||||
"""
|
||||
agent_upper = agent.upper()
|
||||
result: dict[int, str] = {}
|
||||
|
||||
# Split by PR verdict markers: <!-- PR:NNN VERDICT:AGENT:... -->
|
||||
# Each marker terminates the previous PR's section
|
||||
pattern = re.compile(
|
||||
r"<!-- PR:(\d+) VERDICT:" + re.escape(agent_upper) + r":(APPROVE|REQUEST_CHANGES) -->"
|
||||
)
|
||||
|
||||
matches = list(pattern.finditer(response))
|
||||
if not matches:
|
||||
return result
|
||||
|
||||
for i, match in enumerate(matches):
|
||||
pr_num = int(match.group(1))
|
||||
verdict = match.group(2)
|
||||
marker_end = match.end()
|
||||
|
||||
# Find the start of this PR's section by looking for the section header
|
||||
# or the end of the previous verdict
|
||||
section_header = f"=== PR #{pr_num}"
|
||||
header_pos = response.rfind(section_header, 0, match.start())
|
||||
|
||||
if header_pos >= 0:
|
||||
# Extract from header to end of verdict marker
|
||||
section_text = response[header_pos:marker_end].strip()
|
||||
else:
|
||||
# No header found — extract from previous marker end to this marker end
|
||||
prev_end = matches[i - 1].end() if i > 0 else 0
|
||||
section_text = response[prev_end:marker_end].strip()
|
||||
|
||||
# Re-format as individual review comment
|
||||
# Strip the batch section header, keep just the review content
|
||||
# Add batch label for traceability
|
||||
pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers)
|
||||
review_text = (
|
||||
f"*(batch review with PRs {pr_nums_str})*\n\n"
|
||||
f"{section_text}\n"
|
||||
)
|
||||
result[pr_num] = review_text
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _validate_batch_fanout(
|
||||
parsed: dict[int, str],
|
||||
pr_diffs: list[dict],
|
||||
agent: str,
|
||||
) -> tuple[dict[int, str], list[int]]:
|
||||
"""Validate batch fan-out for completeness and cross-contamination.
|
||||
|
||||
Returns (valid_reviews, fallback_pr_numbers).
|
||||
- valid_reviews: reviews that passed validation
|
||||
- fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated)
|
||||
"""
|
||||
valid: dict[int, str] = {}
|
||||
fallback: list[int] = []
|
||||
|
||||
# Build file map: pr_number → set of path segments for matching.
|
||||
# Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames
|
||||
# to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3).
|
||||
pr_files: dict[int, set[str]] = {}
|
||||
for pr in pr_diffs:
|
||||
files = set()
|
||||
for line in pr["diff"].split("\n"):
|
||||
if line.startswith("diff --git a/"):
|
||||
path = line.replace("diff --git a/", "").split(" b/")[0]
|
||||
files.add(path)
|
||||
# Also add the last 2 path segments (e.g., "internet-finance/dao.md")
|
||||
# for models that abbreviate paths
|
||||
parts = path.split("/")
|
||||
if len(parts) >= 2:
|
||||
files.add("/".join(parts[-2:]))
|
||||
pr_files[pr["number"]] = files
|
||||
|
||||
for pr in pr_diffs:
|
||||
pr_num = pr["number"]
|
||||
|
||||
# Completeness check: is there a review for this PR?
|
||||
if pr_num not in parsed:
|
||||
logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num)
|
||||
fallback.append(pr_num)
|
||||
continue
|
||||
|
||||
review = parsed[pr_num]
|
||||
|
||||
# Cross-contamination check: does review mention at least one file from this PR?
|
||||
# Use path segments (min 10 chars) to avoid false substring matches on short names.
|
||||
my_files = pr_files.get(pr_num, set())
|
||||
mentions_own_file = any(f in review for f in my_files if len(f) >= 10)
|
||||
|
||||
if not mentions_own_file and my_files:
|
||||
# Check if it references files from OTHER PRs (cross-contamination signal)
|
||||
other_files = set()
|
||||
for other_pr in pr_diffs:
|
||||
if other_pr["number"] != pr_num:
|
||||
other_files.update(pr_files.get(other_pr["number"], set()))
|
||||
mentions_other = any(f in review for f in other_files if len(f) >= 10)
|
||||
|
||||
if mentions_other:
|
||||
logger.warning(
|
||||
"Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback",
|
||||
pr_num,
|
||||
)
|
||||
fallback.append(pr_num)
|
||||
continue
|
||||
# If it doesn't mention any files at all, could be a generic review — accept it
|
||||
# (some PRs have short diffs where the model doesn't reference filenames)
|
||||
|
||||
valid[pr_num] = review
|
||||
|
||||
return valid, fallback
|
||||
|
||||
|
||||
async def _run_batch_domain_eval(
|
||||
conn, batch_prs: list[dict], domain: str, agent: str,
|
||||
) -> tuple[int, int]:
|
||||
|
|
|
|||
|
|
@ -29,15 +29,22 @@ async def close_pr(
|
|||
merge_cycled: bool = False,
|
||||
inc_merge_failures: bool = False,
|
||||
close_on_forgejo: bool = True,
|
||||
):
|
||||
"""Close a PR in DB and on Forgejo.
|
||||
) -> bool:
|
||||
"""Close a PR in DB and on Forgejo. Returns True on success, False on Forgejo failure.
|
||||
|
||||
Args:
|
||||
close_on_forgejo: False only when caller already closed on Forgejo
|
||||
(reconciliation, ghost PR cleanup after manual close).
|
||||
|
||||
If Forgejo API fails, the DB update is SKIPPED to prevent ghost PRs
|
||||
(DB says closed, Forgejo says open). The reconciliation loop in
|
||||
merge.py._reconcile_db_state catches any that slip through.
|
||||
"""
|
||||
if close_on_forgejo:
|
||||
await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
|
||||
result = await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
|
||||
if result is None:
|
||||
logger.error("close_pr: Forgejo API failed for PR #%d, skipping DB update", pr_number)
|
||||
return False
|
||||
|
||||
parts = ["status = 'closed'"]
|
||||
params = []
|
||||
|
|
@ -54,6 +61,7 @@ async def close_pr(
|
|||
|
||||
params.append(pr_number)
|
||||
conn.execute(f"UPDATE prs SET {', '.join(parts)} WHERE number = ?", params)
|
||||
return True
|
||||
|
||||
|
||||
def approve_pr(
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from datetime import datetime, timezone
|
|||
|
||||
from . import config, db
|
||||
from .forgejo import api, repo_path
|
||||
from .pr_state import close_pr
|
||||
|
||||
logger = logging.getLogger("pipeline.stale_pr")
|
||||
|
||||
|
|
@ -48,13 +49,9 @@ async def check_stale_prs(conn) -> tuple[int, int]:
|
|||
source_path = pr["source_path"] or "unknown"
|
||||
|
||||
try:
|
||||
# Close the PR via Forgejo
|
||||
result = await api(
|
||||
"PATCH",
|
||||
repo_path(f"pulls/{pr_num}"),
|
||||
body={"state": "closed"},
|
||||
)
|
||||
if result is None:
|
||||
closed = await close_pr(conn, pr_num,
|
||||
last_error=f"stale: no claims after {STALE_THRESHOLD_MINUTES} min")
|
||||
if not closed:
|
||||
stale_errors += 1
|
||||
logger.warning(
|
||||
"Failed to close stale extraction PR #%d (%s, %s)",
|
||||
|
|
@ -62,11 +59,6 @@ async def check_stale_prs(conn) -> tuple[int, int]:
|
|||
)
|
||||
continue
|
||||
|
||||
# Update local DB status
|
||||
conn.execute(
|
||||
"UPDATE prs SET status = 'closed' WHERE number = ?",
|
||||
(pr_num,),
|
||||
)
|
||||
db.audit(
|
||||
conn,
|
||||
"watchdog",
|
||||
|
|
|
|||
315
tests/test_eval_parse.py
Normal file
315
tests/test_eval_parse.py
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
"""Tests for lib/eval_parse.py — pure parsing functions extracted from evaluate.py."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
# Mock heavy dependencies before importing
|
||||
sys.modules.setdefault("aiohttp", MagicMock())
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from lib.eval_parse import (
|
||||
VALID_ISSUE_TAGS,
|
||||
_classify_issues,
|
||||
_deterministic_tier,
|
||||
_diff_contains_claim_type,
|
||||
_extract_changed_files,
|
||||
_filter_diff,
|
||||
_infer_issues_from_prose,
|
||||
_is_musings_only,
|
||||
_normalize_tag,
|
||||
_parse_batch_response,
|
||||
_parse_issues,
|
||||
_parse_verdict,
|
||||
_validate_batch_fanout,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _filter_diff
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFilterDiff:
|
||||
def test_strips_inbox_files(self):
|
||||
diff = (
|
||||
"diff --git a/inbox/archive/foo.md b/inbox/archive/foo.md\n"
|
||||
"+some content\n"
|
||||
"diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
|
||||
"+real content\n"
|
||||
)
|
||||
review_diff, entity_diff = _filter_diff(diff)
|
||||
assert "inbox" not in review_diff
|
||||
assert "claim.md" in review_diff
|
||||
|
||||
def test_separates_entity_diffs(self):
|
||||
diff = (
|
||||
"diff --git a/entities/people/alice.md b/entities/people/alice.md\n"
|
||||
"+entity content\n"
|
||||
"diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n"
|
||||
"+claim content\n"
|
||||
)
|
||||
review_diff, entity_diff = _filter_diff(diff)
|
||||
assert "alice.md" in entity_diff
|
||||
assert "claim.md" in review_diff
|
||||
|
||||
def test_core_domain_entities_stay_in_review(self):
|
||||
diff = (
|
||||
"diff --git a/entities/living-agents/agent.md b/entities/living-agents/agent.md\n"
|
||||
"+core entity\n"
|
||||
)
|
||||
review_diff, entity_diff = _filter_diff(diff)
|
||||
assert "agent.md" in review_diff
|
||||
assert entity_diff == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _extract_changed_files
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestExtractChangedFiles:
|
||||
def test_extracts_paths(self):
|
||||
diff = (
|
||||
"diff --git a/domains/foo.md b/domains/foo.md\n"
|
||||
"+content\n"
|
||||
"diff --git a/entities/bar.md b/entities/bar.md\n"
|
||||
"+more\n"
|
||||
)
|
||||
result = _extract_changed_files(diff)
|
||||
assert "domains/foo.md" in result
|
||||
assert "entities/bar.md" in result
|
||||
|
||||
def test_empty_diff(self):
|
||||
assert _extract_changed_files("") == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_musings_only
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIsMusingsOnly:
|
||||
def test_musings_only(self):
|
||||
diff = "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
|
||||
assert _is_musings_only(diff) is True
|
||||
|
||||
def test_mixed_files(self):
|
||||
diff = (
|
||||
"diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n"
|
||||
"diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"
|
||||
)
|
||||
assert _is_musings_only(diff) is False
|
||||
|
||||
def test_no_musings(self):
|
||||
diff = "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n"
|
||||
assert _is_musings_only(diff) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _diff_contains_claim_type
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDiffContainsClaimType:
|
||||
def test_detects_claim_type(self):
|
||||
diff = "+type: claim\n+confidence: 0.8\n"
|
||||
assert _diff_contains_claim_type(diff) is True
|
||||
|
||||
def test_ignores_non_claim(self):
|
||||
diff = "+type: entity\n"
|
||||
assert _diff_contains_claim_type(diff) is False
|
||||
|
||||
def test_ignores_file_header(self):
|
||||
diff = "+++ b/type: claim\n"
|
||||
assert _diff_contains_claim_type(diff) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _deterministic_tier
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDeterministicTier:
|
||||
def test_entities_only_is_light(self):
|
||||
diff = "diff --git a/entities/people/alice.md b/entities/people/alice.md\n+data\n"
|
||||
assert _deterministic_tier(diff) == "LIGHT"
|
||||
|
||||
def test_inbox_only_is_light(self):
|
||||
diff = "diff --git a/inbox/archive/src.md b/inbox/archive/src.md\n+data\n"
|
||||
assert _deterministic_tier(diff) == "LIGHT"
|
||||
|
||||
def test_core_is_deep(self):
|
||||
diff = "diff --git a/core/schema.md b/core/schema.md\n+structural change\n"
|
||||
assert _deterministic_tier(diff) == "DEEP"
|
||||
|
||||
def test_challenged_by_is_deep(self):
|
||||
diff = "diff --git a/domains/x/claim.md b/domains/x/claim.md\n+challenged_by: some-claim\n"
|
||||
assert _deterministic_tier(diff) == "DEEP"
|
||||
|
||||
def test_domain_claim_returns_none(self):
|
||||
diff = "diff --git a/domains/finance/new-claim.md b/domains/finance/new-claim.md\n--- /dev/null\n+++ b/domains/finance/new-claim.md\n+type: claim\n"
|
||||
assert _deterministic_tier(diff) is None
|
||||
|
||||
def test_empty_diff_returns_none(self):
|
||||
assert _deterministic_tier("") is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _parse_verdict
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestParseVerdict:
|
||||
def test_approve(self):
|
||||
assert _parse_verdict("VERDICT:LEO:APPROVE", "LEO") == "approve"
|
||||
|
||||
def test_request_changes(self):
|
||||
assert _parse_verdict("VERDICT:LEO:REQUEST_CHANGES", "LEO") == "request_changes"
|
||||
|
||||
def test_no_verdict_defaults_to_request_changes(self):
|
||||
assert _parse_verdict("some review text", "LEO") == "request_changes"
|
||||
|
||||
def test_case_insensitive_reviewer(self):
|
||||
assert _parse_verdict("VERDICT:LEO:APPROVE", "leo") == "approve"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _normalize_tag
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNormalizeTag:
|
||||
def test_valid_tag_passes_through(self):
|
||||
assert _normalize_tag("broken_wiki_links") == "broken_wiki_links"
|
||||
|
||||
def test_alias_normalizes(self):
|
||||
assert _normalize_tag("schema_violation") == "frontmatter_schema"
|
||||
assert _normalize_tag("duplicate") == "near_duplicate"
|
||||
assert _normalize_tag("factual_error") == "factual_discrepancy"
|
||||
|
||||
def test_hyphen_to_underscore(self):
|
||||
assert _normalize_tag("unverified-wiki-links") == "broken_wiki_links"
|
||||
|
||||
def test_unknown_returns_none(self):
|
||||
assert _normalize_tag("totally_made_up") is None
|
||||
|
||||
def test_fuzzy_substring_match(self):
|
||||
assert _normalize_tag("wiki_links") == "broken_wiki_links"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _parse_issues
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestParseIssues:
|
||||
def test_structured_comment(self):
|
||||
text = "Review text. <!-- ISSUES: schema_violation, duplicate -->"
|
||||
result = _parse_issues(text)
|
||||
assert "frontmatter_schema" in result
|
||||
assert "near_duplicate" in result
|
||||
|
||||
def test_fallback_to_prose(self):
|
||||
text = "The frontmatter is missing required fields."
|
||||
result = _parse_issues(text)
|
||||
assert "frontmatter_schema" in result
|
||||
|
||||
def test_empty_structured_falls_back(self):
|
||||
text = "Review. <!-- ISSUES: totally_unknown_tag -->"
|
||||
# All tags unrecognizable, falls through to prose
|
||||
result = _parse_issues(text)
|
||||
# With no prose keywords either, should return empty
|
||||
assert isinstance(result, list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _infer_issues_from_prose
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInferIssuesFromProse:
|
||||
def test_detects_frontmatter_issues(self):
|
||||
result = _infer_issues_from_prose("The YAML frontmatter is invalid.")
|
||||
assert "frontmatter_schema" in result
|
||||
|
||||
def test_detects_wiki_link_issues(self):
|
||||
result = _infer_issues_from_prose("Found broken wiki links in the claim.")
|
||||
assert "broken_wiki_links" in result
|
||||
|
||||
def test_detects_near_duplicate(self):
|
||||
result = _infer_issues_from_prose("This is a near-duplicate of an existing claim.")
|
||||
assert "near_duplicate" in result
|
||||
|
||||
def test_no_matches(self):
|
||||
result = _infer_issues_from_prose("This claim is well-structured and accurate.")
|
||||
assert result == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _classify_issues
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClassifyIssues:
|
||||
def test_mechanical_only(self):
|
||||
assert _classify_issues(["frontmatter_schema", "near_duplicate"]) == "mechanical"
|
||||
|
||||
def test_substantive_only(self):
|
||||
assert _classify_issues(["factual_discrepancy"]) == "substantive"
|
||||
|
||||
def test_mixed(self):
|
||||
assert _classify_issues(["frontmatter_schema", "factual_discrepancy"]) == "mixed"
|
||||
|
||||
def test_empty(self):
|
||||
assert _classify_issues([]) == "unknown"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _parse_batch_response
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestParseBatchResponse:
|
||||
def test_parses_multiple_prs(self):
|
||||
response = (
|
||||
"=== PR #100\nReview for 100\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
|
||||
"=== PR #200\nReview for 200\n<!-- PR:200 VERDICT:AGENT:REQUEST_CHANGES -->\n"
|
||||
)
|
||||
result = _parse_batch_response(response, [100, 200], "agent")
|
||||
assert 100 in result
|
||||
assert 200 in result
|
||||
assert "APPROVE" in result[100]
|
||||
assert "REQUEST_CHANGES" in result[200]
|
||||
|
||||
def test_missing_pr(self):
|
||||
response = "=== PR #100\nReview\n<!-- PR:100 VERDICT:AGENT:APPROVE -->\n"
|
||||
result = _parse_batch_response(response, [100, 200], "agent")
|
||||
assert 100 in result
|
||||
assert 200 not in result
|
||||
|
||||
def test_empty_response(self):
|
||||
result = _parse_batch_response("no verdicts here", [100], "agent")
|
||||
assert result == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _validate_batch_fanout
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestValidateBatchFanout:
|
||||
def test_missing_pr_falls_back(self):
|
||||
parsed = {100: "review for 100"}
|
||||
pr_diffs = [
|
||||
{"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
|
||||
{"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
|
||||
]
|
||||
valid, fallback = _validate_batch_fanout(parsed, pr_diffs, "agent")
|
||||
assert 200 in fallback
|
||||
assert 200 not in valid
|
||||
|
||||
def test_all_present_passes(self):
|
||||
parsed = {
|
||||
100: "review mentioning domains/long-enough-path.md",
|
||||
200: "review mentioning domains/other-long-path.md",
|
||||
}
|
||||
pr_diffs = [
|
||||
{"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"},
|
||||
{"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"},
|
||||
]
|
||||
valid, fallback = _validate_batch_fanout(parsed, pr_diffs, "agent")
|
||||
assert 100 in valid
|
||||
assert 200 in valid
|
||||
assert fallback == []
|
||||
|
|
@ -135,13 +135,44 @@ class TestClosePr:
|
|||
mock_api = AsyncMock(return_value={})
|
||||
with patch("lib.pr_state.forgejo_api", mock_api), \
|
||||
patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
|
||||
asyncio.run(close_pr(conn, 42))
|
||||
result = asyncio.run(close_pr(conn, 42))
|
||||
|
||||
assert result is True
|
||||
row = _get_pr(conn, 42)
|
||||
assert row["status"] == "closed"
|
||||
# last_error not overwritten when not provided
|
||||
assert row["last_error"] == "old error"
|
||||
|
||||
def test_close_returns_false_on_forgejo_failure(self):
|
||||
"""Critical: if Forgejo API fails, DB must NOT be updated (ghost PR prevention)."""
|
||||
conn = _make_db()
|
||||
_insert_pr(conn, 42)
|
||||
|
||||
mock_api = AsyncMock(return_value=None) # forgejo_api returns None on failure
|
||||
with patch("lib.pr_state.forgejo_api", mock_api), \
|
||||
patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
|
||||
result = asyncio.run(close_pr(conn, 42, last_error="should not persist"))
|
||||
|
||||
assert result is False
|
||||
row = _get_pr(conn, 42)
|
||||
assert row["status"] == "open" # DB not touched
|
||||
assert row["last_error"] is None # last_error not set
|
||||
|
||||
def test_close_skips_forgejo_check_when_opted_out(self):
|
||||
"""close_on_forgejo=False always succeeds (caller already closed Forgejo)."""
|
||||
conn = _make_db()
|
||||
_insert_pr(conn, 42)
|
||||
|
||||
mock_api = AsyncMock(return_value=None) # would fail if called
|
||||
with patch("lib.pr_state.forgejo_api", mock_api), \
|
||||
patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"):
|
||||
result = asyncio.run(close_pr(conn, 42, close_on_forgejo=False))
|
||||
|
||||
assert result is True
|
||||
row = _get_pr(conn, 42)
|
||||
assert row["status"] == "closed"
|
||||
mock_api.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# approve_pr
|
||||
|
|
|
|||
Loading…
Reference in a new issue