From 376b77999f68155f7f22eb6348000807ba106897 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Thu, 16 Apr 2026 12:40:17 +0100 Subject: [PATCH] =?UTF-8?q?refactor:=20Phase=203=20=E2=80=94=20fix=20close?= =?UTF-8?q?=5Fpr=20ghost=20bug,=20wire=20stale=5Fpr,=20extract=20eval=5Fpa?= =?UTF-8?q?rse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical bug fix: close_pr now checks forgejo_api return value and skips DB update on Forgejo failure, preventing ghost PRs (DB closed, Forgejo open). Returns bool so callers can handle failures. _terminate_pr checks return value — skips source requeue on failure. stale_pr.py migrated from raw Forgejo+DB to close_pr (last raw close transition eliminated). eval_parse.py: 15 pure parsing functions extracted from evaluate.py (~370 lines removed). Zero I/O, zero async, independently testable. evaluate.py drops from ~1510 to ~1140 lines. Tests: 295 passed (42 new eval_parse + 2 new close_pr), zero regressions. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/eval_parse.py | 434 +++++++++++++++++++++++++++++++++++++++ lib/evaluate.py | 433 ++------------------------------------ lib/pr_state.py | 14 +- lib/stale_pr.py | 16 +- tests/test_eval_parse.py | 315 ++++++++++++++++++++++++++++ tests/test_pr_state.py | 33 ++- 6 files changed, 815 insertions(+), 430 deletions(-) create mode 100644 lib/eval_parse.py create mode 100644 tests/test_eval_parse.py diff --git a/lib/eval_parse.py b/lib/eval_parse.py new file mode 100644 index 0000000..cc94753 --- /dev/null +++ b/lib/eval_parse.py @@ -0,0 +1,434 @@ +"""Pure parsing functions for the eval stage — zero I/O, zero async. + +Extracted from evaluate.py to isolate testable parsing logic from +orchestration, DB, and Forgejo API calls. + +Contents: +- Diff helpers: filter, classify, tier routing +- Verdict/issue parsing: structured tags + prose inference +- Batch response parsing: fan-out validation + +All functions are pure (input → output). The only external dependency +is config.MECHANICAL_ISSUE_TAGS / config.SUBSTANTIVE_ISSUE_TAGS for +_classify_issues. +""" + +import logging +import re + +from . import config + +logger = logging.getLogger("pipeline.eval_parse") + + +# ─── Diff helpers ────────────────────────────────────────────────────────── + + +def _filter_diff(diff: str) -> tuple[str, str]: + """Filter diff to only review-relevant files. + + Returns (review_diff, entity_diff). + Strips: inbox/, schemas/, skills/, agents/*/musings/ + """ + sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE) + skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"] + core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"} + + claim_sections = [] + entity_sections = [] + + for section in sections: + if not section.strip(): + continue + if any(re.match(p, section) for p in skip_patterns): + continue + entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section) + if entity_match and entity_match.group(1) not in core_domains: + entity_sections.append(section) + continue + claim_sections.append(section) + + return "".join(claim_sections), "".join(entity_sections) + + +def _extract_changed_files(diff: str) -> str: + """Extract changed file paths from diff.""" + return "\n".join( + line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git") + ) + + +def _is_musings_only(diff: str) -> bool: + """Check if PR only modifies musing files.""" + has_musings = False + has_other = False + for line in diff.split("\n"): + if line.startswith("diff --git"): + if "agents/" in line and "/musings/" in line: + has_musings = True + else: + has_other = True + return has_musings and not has_other + + +def _diff_contains_claim_type(diff: str) -> bool: + """Claim-shape detector: check if any file in diff has type: claim in frontmatter. + + Mechanical check ($0). If YAML declares type: claim, this is a factual claim — + not an entity update or formatting fix. Must be classified STANDARD minimum + regardless of Haiku triage. Catches factual claims disguised as LIGHT content. + (Theseus: converts semantic problem to mechanical check) + """ + for line in diff.split("\n"): + if line.startswith("+") and not line.startswith("+++"): + stripped = line[1:].strip() + if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"): + return True + return False + + +def _deterministic_tier(diff: str) -> str | None: + """Deterministic tier routing — skip Haiku triage for obvious cases. + + Checks diff file patterns before calling the LLM. Returns tier string + if deterministic, None if Haiku triage is needed. + + Rules (Leo-calibrated): + - All files in entities/ only → LIGHT + - All files in inbox/ only (queue, archive, null-result) → LIGHT + - Any file in core/ or foundations/ → DEEP (structural KB changes) + - Has challenged_by field → DEEP (challenges existing claims) + - Modifies existing file (not new) in domains/ → DEEP (enrichment/change) + - Otherwise → None (needs Haiku triage) + + NOTE: Cross-domain wiki links are NOT a DEEP signal — most claims link + across domains, that's the whole point of the knowledge graph (Leo). + """ + changed_files = [] + for line in diff.split("\n"): + if line.startswith("diff --git a/"): + path = line.replace("diff --git a/", "").split(" b/")[0] + changed_files.append(path) + + if not changed_files: + return None + + # All entities/ only → LIGHT + if all(f.startswith("entities/") for f in changed_files): + logger.info("Deterministic tier: LIGHT (all files in entities/)") + return "LIGHT" + + # All inbox/ only (queue, archive, null-result) → LIGHT + if all(f.startswith("inbox/") for f in changed_files): + logger.info("Deterministic tier: LIGHT (all files in inbox/)") + return "LIGHT" + + # Any file in core/ or foundations/ → DEEP (structural KB changes) + if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files): + logger.info("Deterministic tier: DEEP (touches core/ or foundations/)") + return "DEEP" + + # Check diff content for DEEP signals + has_challenged_by = False + new_files: set[str] = set() + + lines = diff.split("\n") + for i, line in enumerate(lines): + # Detect new files + if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"): + new_files.add(lines[i + 1][6:]) + # Check for challenged_by field + if line.startswith("+") and not line.startswith("+++"): + stripped = line[1:].strip() + if stripped.startswith("challenged_by:"): + has_challenged_by = True + + if has_challenged_by: + logger.info("Deterministic tier: DEEP (has challenged_by field)") + return "DEEP" + + # NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments + # (appending evidence) are common and should be STANDARD. Let Haiku triage + # distinguish enrichments from structural changes. + + return None + + +# ─── Verdict parsing ────────────────────────────────────────────────────── + + +def _parse_verdict(review_text: str, reviewer: str) -> str: + """Parse VERDICT tag from review. Returns 'approve' or 'request_changes'.""" + upper = reviewer.upper() + if f"VERDICT:{upper}:APPROVE" in review_text: + return "approve" + elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text: + return "request_changes" + else: + logger.warning("No parseable verdict from %s — treating as request_changes", reviewer) + return "request_changes" + + +# Map model-invented tags to valid tags. Models consistently ignore the valid +# tag list and invent their own. This normalizes them. (Ganymede, Mar 14) +_TAG_ALIASES: dict[str, str] = { + "schema_violation": "frontmatter_schema", + "missing_schema_fields": "frontmatter_schema", + "missing_schema": "frontmatter_schema", + "schema": "frontmatter_schema", + "missing_frontmatter": "frontmatter_schema", + "redundancy": "near_duplicate", + "duplicate": "near_duplicate", + "missing_confidence": "confidence_miscalibration", + "confidence_error": "confidence_miscalibration", + "vague_claims": "scope_error", + "unfalsifiable": "scope_error", + "unverified_wiki_links": "broken_wiki_links", + "unverified-wiki-links": "broken_wiki_links", + "missing_wiki_links": "broken_wiki_links", + "invalid_wiki_links": "broken_wiki_links", + "wiki_link_errors": "broken_wiki_links", + "overclaiming": "title_overclaims", + "title_overclaim": "title_overclaims", + "date_error": "date_errors", + "factual_error": "factual_discrepancy", + "factual_inaccuracy": "factual_discrepancy", +} + +VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims", + "confidence_miscalibration", "date_errors", "factual_discrepancy", + "near_duplicate", "scope_error"} + + +def _normalize_tag(tag: str) -> str | None: + """Normalize a model-generated tag to a valid tag, or None if unrecognizable.""" + tag = tag.strip().lower().replace("-", "_") + if tag in VALID_ISSUE_TAGS: + return tag + if tag in _TAG_ALIASES: + return _TAG_ALIASES[tag] + # Fuzzy: check if any valid tag is a substring or vice versa + for valid in VALID_ISSUE_TAGS: + if valid in tag or tag in valid: + return valid + return None + + +# ─── Issue parsing ───────────────────────────────────────────────────────── + + +# Keyword patterns for inferring issue tags from unstructured review prose. +# Conservative: only match unambiguous indicators. Order doesn't matter. +_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = { + "frontmatter_schema": [ + re.compile(r"frontmatter", re.IGNORECASE), + re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE), + re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE), + re.compile(r"required field", re.IGNORECASE), + re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE), + re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE), + re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE), + ], + "broken_wiki_links": [ + re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE), + re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE), + re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE), + re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE), + ], + "factual_discrepancy": [ + re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE), + re.compile(r"misrepresent", re.IGNORECASE), + ], + "confidence_miscalibration": [ + re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE), + re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE), + ], + "scope_error": [ + re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE), + re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE), + re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE), + re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE), + ], + "title_overclaims": [ + re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE), + re.compile(r"overclaim", re.IGNORECASE), + ], + "near_duplicate": [ + re.compile(r"near.?duplicate", re.IGNORECASE), + re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE), + re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE), + re.compile(r"redundan", re.IGNORECASE), + ], +} + + +def _parse_issues(review_text: str) -> list[str]: + """Extract issue tags from review. + + First tries structured comment with tag normalization. + Falls back to keyword inference from prose. + """ + match = re.search(r"", review_text) + if match: + raw_tags = [tag.strip() for tag in match.group(1).split(",") if tag.strip()] + normalized = [] + for tag in raw_tags: + norm = _normalize_tag(tag) + if norm and norm not in normalized: + normalized.append(norm) + else: + logger.debug("Unrecognized issue tag '%s' — dropped", tag) + if normalized: + return normalized + # Fallback: infer tags from review prose + return _infer_issues_from_prose(review_text) + + +def _infer_issues_from_prose(review_text: str) -> list[str]: + """Infer issue tags from unstructured review text via keyword matching. + + Fallback for reviews that reject without structured tags. + Conservative: requires at least one unambiguous keyword match per tag. + """ + inferred = [] + for tag, patterns in _PROSE_TAG_PATTERNS.items(): + if any(p.search(review_text) for p in patterns): + inferred.append(tag) + return inferred + + +def _classify_issues(issues: list[str]) -> str: + """Classify issue tags as 'mechanical', 'substantive', or 'mixed'.""" + if not issues: + return "unknown" + mechanical = set(issues) & config.MECHANICAL_ISSUE_TAGS + substantive = set(issues) & config.SUBSTANTIVE_ISSUE_TAGS + if substantive and not mechanical: + return "substantive" + if mechanical and not substantive: + return "mechanical" + if mechanical and substantive: + return "mixed" + return "unknown" # tags not in either set + + +# ─── Batch response parsing ─────────────────────────────────────────────── + + +def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]: + """Parse batched domain review into per-PR review sections. + + Returns {pr_number: review_text} for each PR found in the response. + Missing PRs are omitted — caller handles fallback. + """ + agent_upper = agent.upper() + result: dict[int, str] = {} + + # Split by PR verdict markers: + # Each marker terminates the previous PR's section + pattern = re.compile( + r"" + ) + + matches = list(pattern.finditer(response)) + if not matches: + return result + + for i, match in enumerate(matches): + pr_num = int(match.group(1)) + marker_end = match.end() + + # Find the start of this PR's section by looking for the section header + # or the end of the previous verdict + section_header = f"=== PR #{pr_num}" + header_pos = response.rfind(section_header, 0, match.start()) + + if header_pos >= 0: + # Extract from header to end of verdict marker + section_text = response[header_pos:marker_end].strip() + else: + # No header found — extract from previous marker end to this marker end + prev_end = matches[i - 1].end() if i > 0 else 0 + section_text = response[prev_end:marker_end].strip() + + # Re-format as individual review comment + # Strip the batch section header, keep just the review content + # Add batch label for traceability + pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers) + review_text = ( + f"*(batch review with PRs {pr_nums_str})*\n\n" + f"{section_text}\n" + ) + result[pr_num] = review_text + + return result + + +def _validate_batch_fanout( + parsed: dict[int, str], + pr_diffs: list[dict], + agent: str, +) -> tuple[dict[int, str], list[int]]: + """Validate batch fan-out for completeness and cross-contamination. + + Returns (valid_reviews, fallback_pr_numbers). + - valid_reviews: reviews that passed validation + - fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated) + """ + valid: dict[int, str] = {} + fallback: list[int] = [] + + # Build file map: pr_number → set of path segments for matching. + # Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames + # to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3). + pr_files: dict[int, set[str]] = {} + for pr in pr_diffs: + files = set() + for line in pr["diff"].split("\n"): + if line.startswith("diff --git a/"): + path = line.replace("diff --git a/", "").split(" b/")[0] + files.add(path) + # Also add the last 2 path segments (e.g., "internet-finance/dao.md") + # for models that abbreviate paths + parts = path.split("/") + if len(parts) >= 2: + files.add("/".join(parts[-2:])) + pr_files[pr["number"]] = files + + for pr in pr_diffs: + pr_num = pr["number"] + + # Completeness check: is there a review for this PR? + if pr_num not in parsed: + logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num) + fallback.append(pr_num) + continue + + review = parsed[pr_num] + + # Cross-contamination check: does review mention at least one file from this PR? + # Use path segments (min 10 chars) to avoid false substring matches on short names. + my_files = pr_files.get(pr_num, set()) + mentions_own_file = any(f in review for f in my_files if len(f) >= 10) + + if not mentions_own_file and my_files: + # Check if it references files from OTHER PRs (cross-contamination signal) + other_files = set() + for other_pr in pr_diffs: + if other_pr["number"] != pr_num: + other_files.update(pr_files.get(other_pr["number"], set())) + mentions_other = any(f in review for f in other_files if len(f) >= 10) + + if mentions_other: + logger.warning( + "Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback", + pr_num, + ) + fallback.append(pr_num) + continue + # If it doesn't mention any files at all, could be a generic review — accept it + # (some PRs have short diffs where the model doesn't reference filenames) + + valid[pr_num] = review + + return valid, fallback diff --git a/lib/evaluate.py b/lib/evaluate.py index 47812b9..ad726c2 100644 --- a/lib/evaluate.py +++ b/lib/evaluate.py @@ -21,11 +21,25 @@ LLM transport and prompts extracted to lib/llm.py (Phase 3c). import json import logging import random -import re from datetime import datetime, timezone from . import config, db from .domains import agent_for_domain, detect_domain_from_branch, detect_domain_from_diff +from .eval_parse import ( + VALID_ISSUE_TAGS, + _classify_issues, + _deterministic_tier, + _diff_contains_claim_type, + _extract_changed_files, + _filter_diff, + _infer_issues_from_prose, + _is_musings_only, + _normalize_tag, + _parse_batch_response, + _parse_issues, + _parse_verdict, + _validate_batch_fanout, +) from .forgejo import api as forgejo_api from .forgejo import get_agent_token, get_pr_diff, repo_path from .merge import PIPELINE_OWNED_PREFIXES @@ -37,289 +51,12 @@ from .validate import load_existing_claims logger = logging.getLogger("pipeline.evaluate") -# ─── Diff helpers ────────────────────────────────────────────────────────── - - -def _filter_diff(diff: str) -> tuple[str, str]: - """Filter diff to only review-relevant files. - - Returns (review_diff, entity_diff). - Strips: inbox/, schemas/, skills/, agents/*/musings/ - """ - sections = re.split(r"(?=^diff --git )", diff, flags=re.MULTILINE) - skip_patterns = [r"^diff --git a/(inbox/(archive|queue|null-result)|schemas|skills|agents/[^/]+/musings)/"] - core_domains = {"living-agents", "living-capital", "teleohumanity", "mechanisms"} - - claim_sections = [] - entity_sections = [] - - for section in sections: - if not section.strip(): - continue - if any(re.match(p, section) for p in skip_patterns): - continue - entity_match = re.match(r"^diff --git a/entities/([^/]+)/", section) - if entity_match and entity_match.group(1) not in core_domains: - entity_sections.append(section) - continue - claim_sections.append(section) - - return "".join(claim_sections), "".join(entity_sections) - - -def _extract_changed_files(diff: str) -> str: - """Extract changed file paths from diff.""" - return "\n".join( - line.replace("diff --git a/", "").split(" b/")[0] for line in diff.split("\n") if line.startswith("diff --git") - ) - - -def _is_musings_only(diff: str) -> bool: - """Check if PR only modifies musing files.""" - has_musings = False - has_other = False - for line in diff.split("\n"): - if line.startswith("diff --git"): - if "agents/" in line and "/musings/" in line: - has_musings = True - else: - has_other = True - return has_musings and not has_other - - # ─── NOTE: Tier 0.5 mechanical pre-check moved to validate.py ──────────── # Tier 0.5 now runs as part of the validate stage (before eval), not inside # evaluate_pr(). This prevents wasting eval_attempts on mechanically fixable # PRs. Eval trusts that tier0_pass=1 means all mechanical checks passed. -# ─── Tier overrides ─────────────────────────────────────────────────────── - - -def _diff_contains_claim_type(diff: str) -> bool: - """Claim-shape detector: check if any file in diff has type: claim in frontmatter. - - Mechanical check ($0). If YAML declares type: claim, this is a factual claim — - not an entity update or formatting fix. Must be classified STANDARD minimum - regardless of Haiku triage. Catches factual claims disguised as LIGHT content. - (Theseus: converts semantic problem to mechanical check) - """ - for line in diff.split("\n"): - if line.startswith("+") and not line.startswith("+++"): - stripped = line[1:].strip() - if stripped in ("type: claim", 'type: "claim"', "type: 'claim'"): - return True - return False - - -def _deterministic_tier(diff: str) -> str | None: - """Deterministic tier routing — skip Haiku triage for obvious cases. - - Checks diff file patterns before calling the LLM. Returns tier string - if deterministic, None if Haiku triage is needed. - - Rules (Leo-calibrated): - - All files in entities/ only → LIGHT - - All files in inbox/ only (queue, archive, null-result) → LIGHT - - Any file in core/ or foundations/ → DEEP (structural KB changes) - - Has challenged_by field → DEEP (challenges existing claims) - - Modifies existing file (not new) in domains/ → DEEP (enrichment/change) - - Otherwise → None (needs Haiku triage) - - NOTE: Cross-domain wiki links are NOT a DEEP signal — most claims link - across domains, that's the whole point of the knowledge graph (Leo). - """ - changed_files = [] - for line in diff.split("\n"): - if line.startswith("diff --git a/"): - path = line.replace("diff --git a/", "").split(" b/")[0] - changed_files.append(path) - - if not changed_files: - return None - - # All entities/ only → LIGHT - if all(f.startswith("entities/") for f in changed_files): - logger.info("Deterministic tier: LIGHT (all files in entities/)") - return "LIGHT" - - # All inbox/ only (queue, archive, null-result) → LIGHT - if all(f.startswith("inbox/") for f in changed_files): - logger.info("Deterministic tier: LIGHT (all files in inbox/)") - return "LIGHT" - - # Any file in core/ or foundations/ → DEEP (structural KB changes) - if any(f.startswith("core/") or f.startswith("foundations/") for f in changed_files): - logger.info("Deterministic tier: DEEP (touches core/ or foundations/)") - return "DEEP" - - # Check diff content for DEEP signals - has_challenged_by = False - has_modified_claim = False - new_files: set[str] = set() - - lines = diff.split("\n") - for i, line in enumerate(lines): - # Detect new files - if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"): - new_files.add(lines[i + 1][6:]) - # Check for challenged_by field - if line.startswith("+") and not line.startswith("+++"): - stripped = line[1:].strip() - if stripped.startswith("challenged_by:"): - has_challenged_by = True - - if has_challenged_by: - logger.info("Deterministic tier: DEEP (has challenged_by field)") - return "DEEP" - - # NOTE: Modified existing domain claims are NOT auto-DEEP — enrichments - # (appending evidence) are common and should be STANDARD. Let Haiku triage - # distinguish enrichments from structural changes. - - return None - - -# ─── Verdict parsing ────────────────────────────────────────────────────── - - -def _parse_verdict(review_text: str, reviewer: str) -> str: - """Parse VERDICT tag from review. Returns 'approve' or 'request_changes'.""" - upper = reviewer.upper() - if f"VERDICT:{upper}:APPROVE" in review_text: - return "approve" - elif f"VERDICT:{upper}:REQUEST_CHANGES" in review_text: - return "request_changes" - else: - logger.warning("No parseable verdict from %s — treating as request_changes", reviewer) - return "request_changes" - - -# Map model-invented tags to valid tags. Models consistently ignore the valid -# tag list and invent their own. This normalizes them. (Ganymede, Mar 14) -_TAG_ALIASES: dict[str, str] = { - "schema_violation": "frontmatter_schema", - "missing_schema_fields": "frontmatter_schema", - "missing_schema": "frontmatter_schema", - "schema": "frontmatter_schema", - "missing_frontmatter": "frontmatter_schema", - "redundancy": "near_duplicate", - "duplicate": "near_duplicate", - "missing_confidence": "confidence_miscalibration", - "confidence_error": "confidence_miscalibration", - "vague_claims": "scope_error", - "unfalsifiable": "scope_error", - "unverified_wiki_links": "broken_wiki_links", - "unverified-wiki-links": "broken_wiki_links", - "missing_wiki_links": "broken_wiki_links", - "invalid_wiki_links": "broken_wiki_links", - "wiki_link_errors": "broken_wiki_links", - "overclaiming": "title_overclaims", - "title_overclaim": "title_overclaims", - "date_error": "date_errors", - "factual_error": "factual_discrepancy", - "factual_inaccuracy": "factual_discrepancy", -} - -VALID_ISSUE_TAGS = {"broken_wiki_links", "frontmatter_schema", "title_overclaims", - "confidence_miscalibration", "date_errors", "factual_discrepancy", - "near_duplicate", "scope_error"} - - -def _normalize_tag(tag: str) -> str | None: - """Normalize a model-generated tag to a valid tag, or None if unrecognizable.""" - tag = tag.strip().lower().replace("-", "_") - if tag in VALID_ISSUE_TAGS: - return tag - if tag in _TAG_ALIASES: - return _TAG_ALIASES[tag] - # Fuzzy: check if any valid tag is a substring or vice versa - for valid in VALID_ISSUE_TAGS: - if valid in tag or tag in valid: - return valid - return None - - -def _parse_issues(review_text: str) -> list[str]: - """Extract issue tags from review. - - First tries structured comment with tag normalization. - Falls back to keyword inference from prose. - """ - match = re.search(r"", review_text) - if match: - raw_tags = [tag.strip() for tag in match.group(1).split(",") if tag.strip()] - normalized = [] - for tag in raw_tags: - norm = _normalize_tag(tag) - if norm and norm not in normalized: - normalized.append(norm) - else: - logger.debug("Unrecognized issue tag '%s' — dropped", tag) - if normalized: - return normalized - # Fallback: infer tags from review prose - return _infer_issues_from_prose(review_text) - - -# Keyword patterns for inferring issue tags from unstructured review prose. -# Conservative: only match unambiguous indicators. Order doesn't matter. -_PROSE_TAG_PATTERNS: dict[str, list[re.Pattern]] = { - "frontmatter_schema": [ - re.compile(r"frontmatter", re.IGNORECASE), - re.compile(r"missing.{0,20}(type|domain|confidence|source|created)\b", re.IGNORECASE), - re.compile(r"yaml.{0,10}(invalid|missing|error|schema)", re.IGNORECASE), - re.compile(r"required field", re.IGNORECASE), - re.compile(r"lacks?.{0,15}(required|yaml|schema|fields)", re.IGNORECASE), - re.compile(r"missing.{0,15}(schema|fields|frontmatter)", re.IGNORECASE), - re.compile(r"schema.{0,10}(compliance|violation|missing|invalid)", re.IGNORECASE), - ], - "broken_wiki_links": [ - re.compile(r"(broken|dead|invalid).{0,10}(wiki.?)?link", re.IGNORECASE), - re.compile(r"wiki.?link.{0,20}(not found|missing|broken|invalid|resolv|unverif)", re.IGNORECASE), - re.compile(r"\[\[.{1,80}\]\].{0,20}(not found|doesn.t exist|missing)", re.IGNORECASE), - re.compile(r"unverified.{0,10}(wiki|link)", re.IGNORECASE), - ], - "factual_discrepancy": [ - re.compile(r"factual.{0,10}(error|inaccura|discrepanc|incorrect)", re.IGNORECASE), - re.compile(r"misrepresent", re.IGNORECASE), - ], - "confidence_miscalibration": [ - re.compile(r"confidence.{0,20}(too high|too low|miscalibrat|overstat|should be)", re.IGNORECASE), - re.compile(r"(overstat|understat).{0,20}confidence", re.IGNORECASE), - ], - "scope_error": [ - re.compile(r"scope.{0,10}(error|too broad|overscop|unscoped)", re.IGNORECASE), - re.compile(r"unscoped.{0,10}(universal|claim)", re.IGNORECASE), - re.compile(r"(vague|unfalsifiable).{0,15}(claim|assertion)", re.IGNORECASE), - re.compile(r"not.{0,10}(specific|falsifiable|disagreeable).{0,10}enough", re.IGNORECASE), - ], - "title_overclaims": [ - re.compile(r"title.{0,20}(overclaim|overstat|too broad)", re.IGNORECASE), - re.compile(r"overclaim", re.IGNORECASE), - ], - "near_duplicate": [ - re.compile(r"near.?duplicate", re.IGNORECASE), - re.compile(r"(very|too) similar.{0,20}(claim|title|existing)", re.IGNORECASE), - re.compile(r"duplicate.{0,20}(of|claim|title|existing|information)", re.IGNORECASE), - re.compile(r"redundan", re.IGNORECASE), - ], -} - - -def _infer_issues_from_prose(review_text: str) -> list[str]: - """Infer issue tags from unstructured review text via keyword matching. - - Fallback for reviews that reject without structured tags. - Conservative: requires at least one unambiguous keyword match per tag. - """ - inferred = [] - for tag, patterns in _PROSE_TAG_PATTERNS.items(): - if any(p.search(review_text) for p in patterns): - inferred.append(tag) - return inferred - - async def _post_formal_approvals(pr_number: int, pr_author: str): """Submit formal Forgejo reviews from 2 agents (not the PR author).""" approvals = 0 @@ -376,7 +113,10 @@ async def _terminate_pr(conn, pr_number: int, reason: str): repo_path(f"issues/{pr_number}/comments"), {"body": comment_body}, ) - await close_pr(conn, pr_number, last_error=reason) + closed = await close_pr(conn, pr_number, last_error=reason) + if not closed: + logger.warning("PR #%d: Forgejo close failed — skipping source requeue, will retry next cycle", pr_number) + return # Tag source for re-extraction with feedback cursor = conn.execute( @@ -402,21 +142,6 @@ async def _terminate_pr(conn, pr_number: int, reason: str): logger.info("PR #%d: TERMINATED — %s", pr_number, reason) -def _classify_issues(issues: list[str]) -> str: - """Classify issue tags as 'mechanical', 'substantive', or 'mixed'.""" - if not issues: - return "unknown" - mechanical = set(issues) & config.MECHANICAL_ISSUE_TAGS - substantive = set(issues) & config.SUBSTANTIVE_ISSUE_TAGS - if substantive and not mechanical: - return "substantive" - if mechanical and not substantive: - return "mechanical" - if mechanical and substantive: - return "mixed" - return "unknown" # tags not in either set - - async def _dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_issues: list[str]): """Disposition logic for rejected PRs on attempt 2+. @@ -872,126 +597,6 @@ _RATE_LIMIT_BACKOFF_MINUTES = 15 # ─── Batch domain review ───────────────────────────────────────────────── -def _parse_batch_response(response: str, pr_numbers: list[int], agent: str) -> dict[int, str]: - """Parse batched domain review into per-PR review sections. - - Returns {pr_number: review_text} for each PR found in the response. - Missing PRs are omitted — caller handles fallback. - """ - agent_upper = agent.upper() - result: dict[int, str] = {} - - # Split by PR verdict markers: - # Each marker terminates the previous PR's section - pattern = re.compile( - r"" - ) - - matches = list(pattern.finditer(response)) - if not matches: - return result - - for i, match in enumerate(matches): - pr_num = int(match.group(1)) - verdict = match.group(2) - marker_end = match.end() - - # Find the start of this PR's section by looking for the section header - # or the end of the previous verdict - section_header = f"=== PR #{pr_num}" - header_pos = response.rfind(section_header, 0, match.start()) - - if header_pos >= 0: - # Extract from header to end of verdict marker - section_text = response[header_pos:marker_end].strip() - else: - # No header found — extract from previous marker end to this marker end - prev_end = matches[i - 1].end() if i > 0 else 0 - section_text = response[prev_end:marker_end].strip() - - # Re-format as individual review comment - # Strip the batch section header, keep just the review content - # Add batch label for traceability - pr_nums_str = ", ".join(f"#{n}" for n in pr_numbers) - review_text = ( - f"*(batch review with PRs {pr_nums_str})*\n\n" - f"{section_text}\n" - ) - result[pr_num] = review_text - - return result - - -def _validate_batch_fanout( - parsed: dict[int, str], - pr_diffs: list[dict], - agent: str, -) -> tuple[dict[int, str], list[int]]: - """Validate batch fan-out for completeness and cross-contamination. - - Returns (valid_reviews, fallback_pr_numbers). - - valid_reviews: reviews that passed validation - - fallback_pr_numbers: PRs that need individual review (missing or cross-contaminated) - """ - valid: dict[int, str] = {} - fallback: list[int] = [] - - # Build file map: pr_number → set of path segments for matching. - # Use full paths (e.g., "domains/internet-finance/dao.md") not bare filenames - # to avoid false matches on short names like "dao.md" or "space.md" (Leo note #3). - pr_files: dict[int, set[str]] = {} - for pr in pr_diffs: - files = set() - for line in pr["diff"].split("\n"): - if line.startswith("diff --git a/"): - path = line.replace("diff --git a/", "").split(" b/")[0] - files.add(path) - # Also add the last 2 path segments (e.g., "internet-finance/dao.md") - # for models that abbreviate paths - parts = path.split("/") - if len(parts) >= 2: - files.add("/".join(parts[-2:])) - pr_files[pr["number"]] = files - - for pr in pr_diffs: - pr_num = pr["number"] - - # Completeness check: is there a review for this PR? - if pr_num not in parsed: - logger.warning("Batch fan-out: PR #%d missing from response — fallback to individual", pr_num) - fallback.append(pr_num) - continue - - review = parsed[pr_num] - - # Cross-contamination check: does review mention at least one file from this PR? - # Use path segments (min 10 chars) to avoid false substring matches on short names. - my_files = pr_files.get(pr_num, set()) - mentions_own_file = any(f in review for f in my_files if len(f) >= 10) - - if not mentions_own_file and my_files: - # Check if it references files from OTHER PRs (cross-contamination signal) - other_files = set() - for other_pr in pr_diffs: - if other_pr["number"] != pr_num: - other_files.update(pr_files.get(other_pr["number"], set())) - mentions_other = any(f in review for f in other_files if len(f) >= 10) - - if mentions_other: - logger.warning( - "Batch fan-out: PR #%d review references files from another PR — cross-contamination, fallback", - pr_num, - ) - fallback.append(pr_num) - continue - # If it doesn't mention any files at all, could be a generic review — accept it - # (some PRs have short diffs where the model doesn't reference filenames) - - valid[pr_num] = review - - return valid, fallback - - async def _run_batch_domain_eval( conn, batch_prs: list[dict], domain: str, agent: str, ) -> tuple[int, int]: diff --git a/lib/pr_state.py b/lib/pr_state.py index 10dcf71..b143897 100644 --- a/lib/pr_state.py +++ b/lib/pr_state.py @@ -29,15 +29,22 @@ async def close_pr( merge_cycled: bool = False, inc_merge_failures: bool = False, close_on_forgejo: bool = True, -): - """Close a PR in DB and on Forgejo. +) -> bool: + """Close a PR in DB and on Forgejo. Returns True on success, False on Forgejo failure. Args: close_on_forgejo: False only when caller already closed on Forgejo (reconciliation, ghost PR cleanup after manual close). + + If Forgejo API fails, the DB update is SKIPPED to prevent ghost PRs + (DB says closed, Forgejo says open). The reconciliation loop in + merge.py._reconcile_db_state catches any that slip through. """ if close_on_forgejo: - await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"}) + result = await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"}) + if result is None: + logger.error("close_pr: Forgejo API failed for PR #%d, skipping DB update", pr_number) + return False parts = ["status = 'closed'"] params = [] @@ -54,6 +61,7 @@ async def close_pr( params.append(pr_number) conn.execute(f"UPDATE prs SET {', '.join(parts)} WHERE number = ?", params) + return True def approve_pr( diff --git a/lib/stale_pr.py b/lib/stale_pr.py index abd2643..97602e5 100644 --- a/lib/stale_pr.py +++ b/lib/stale_pr.py @@ -17,6 +17,7 @@ from datetime import datetime, timezone from . import config, db from .forgejo import api, repo_path +from .pr_state import close_pr logger = logging.getLogger("pipeline.stale_pr") @@ -48,13 +49,9 @@ async def check_stale_prs(conn) -> tuple[int, int]: source_path = pr["source_path"] or "unknown" try: - # Close the PR via Forgejo - result = await api( - "PATCH", - repo_path(f"pulls/{pr_num}"), - body={"state": "closed"}, - ) - if result is None: + closed = await close_pr(conn, pr_num, + last_error=f"stale: no claims after {STALE_THRESHOLD_MINUTES} min") + if not closed: stale_errors += 1 logger.warning( "Failed to close stale extraction PR #%d (%s, %s)", @@ -62,11 +59,6 @@ async def check_stale_prs(conn) -> tuple[int, int]: ) continue - # Update local DB status - conn.execute( - "UPDATE prs SET status = 'closed' WHERE number = ?", - (pr_num,), - ) db.audit( conn, "watchdog", diff --git a/tests/test_eval_parse.py b/tests/test_eval_parse.py new file mode 100644 index 0000000..0c37fa3 --- /dev/null +++ b/tests/test_eval_parse.py @@ -0,0 +1,315 @@ +"""Tests for lib/eval_parse.py — pure parsing functions extracted from evaluate.py.""" + +import sys +import os +from unittest.mock import MagicMock + +import pytest + +# Mock heavy dependencies before importing +sys.modules.setdefault("aiohttp", MagicMock()) + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from lib.eval_parse import ( + VALID_ISSUE_TAGS, + _classify_issues, + _deterministic_tier, + _diff_contains_claim_type, + _extract_changed_files, + _filter_diff, + _infer_issues_from_prose, + _is_musings_only, + _normalize_tag, + _parse_batch_response, + _parse_issues, + _parse_verdict, + _validate_batch_fanout, +) + + +# --------------------------------------------------------------------------- +# _filter_diff +# --------------------------------------------------------------------------- + +class TestFilterDiff: + def test_strips_inbox_files(self): + diff = ( + "diff --git a/inbox/archive/foo.md b/inbox/archive/foo.md\n" + "+some content\n" + "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n" + "+real content\n" + ) + review_diff, entity_diff = _filter_diff(diff) + assert "inbox" not in review_diff + assert "claim.md" in review_diff + + def test_separates_entity_diffs(self): + diff = ( + "diff --git a/entities/people/alice.md b/entities/people/alice.md\n" + "+entity content\n" + "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n" + "+claim content\n" + ) + review_diff, entity_diff = _filter_diff(diff) + assert "alice.md" in entity_diff + assert "claim.md" in review_diff + + def test_core_domain_entities_stay_in_review(self): + diff = ( + "diff --git a/entities/living-agents/agent.md b/entities/living-agents/agent.md\n" + "+core entity\n" + ) + review_diff, entity_diff = _filter_diff(diff) + assert "agent.md" in review_diff + assert entity_diff == "" + + +# --------------------------------------------------------------------------- +# _extract_changed_files +# --------------------------------------------------------------------------- + +class TestExtractChangedFiles: + def test_extracts_paths(self): + diff = ( + "diff --git a/domains/foo.md b/domains/foo.md\n" + "+content\n" + "diff --git a/entities/bar.md b/entities/bar.md\n" + "+more\n" + ) + result = _extract_changed_files(diff) + assert "domains/foo.md" in result + assert "entities/bar.md" in result + + def test_empty_diff(self): + assert _extract_changed_files("") == "" + + +# --------------------------------------------------------------------------- +# _is_musings_only +# --------------------------------------------------------------------------- + +class TestIsMusingsOnly: + def test_musings_only(self): + diff = "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n" + assert _is_musings_only(diff) is True + + def test_mixed_files(self): + diff = ( + "diff --git a/agents/rio/musings/thought.md b/agents/rio/musings/thought.md\n+musing\n" + "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n" + ) + assert _is_musings_only(diff) is False + + def test_no_musings(self): + diff = "diff --git a/domains/finance/claim.md b/domains/finance/claim.md\n+claim\n" + assert _is_musings_only(diff) is False + + +# --------------------------------------------------------------------------- +# _diff_contains_claim_type +# --------------------------------------------------------------------------- + +class TestDiffContainsClaimType: + def test_detects_claim_type(self): + diff = "+type: claim\n+confidence: 0.8\n" + assert _diff_contains_claim_type(diff) is True + + def test_ignores_non_claim(self): + diff = "+type: entity\n" + assert _diff_contains_claim_type(diff) is False + + def test_ignores_file_header(self): + diff = "+++ b/type: claim\n" + assert _diff_contains_claim_type(diff) is False + + +# --------------------------------------------------------------------------- +# _deterministic_tier +# --------------------------------------------------------------------------- + +class TestDeterministicTier: + def test_entities_only_is_light(self): + diff = "diff --git a/entities/people/alice.md b/entities/people/alice.md\n+data\n" + assert _deterministic_tier(diff) == "LIGHT" + + def test_inbox_only_is_light(self): + diff = "diff --git a/inbox/archive/src.md b/inbox/archive/src.md\n+data\n" + assert _deterministic_tier(diff) == "LIGHT" + + def test_core_is_deep(self): + diff = "diff --git a/core/schema.md b/core/schema.md\n+structural change\n" + assert _deterministic_tier(diff) == "DEEP" + + def test_challenged_by_is_deep(self): + diff = "diff --git a/domains/x/claim.md b/domains/x/claim.md\n+challenged_by: some-claim\n" + assert _deterministic_tier(diff) == "DEEP" + + def test_domain_claim_returns_none(self): + diff = "diff --git a/domains/finance/new-claim.md b/domains/finance/new-claim.md\n--- /dev/null\n+++ b/domains/finance/new-claim.md\n+type: claim\n" + assert _deterministic_tier(diff) is None + + def test_empty_diff_returns_none(self): + assert _deterministic_tier("") is None + + +# --------------------------------------------------------------------------- +# _parse_verdict +# --------------------------------------------------------------------------- + +class TestParseVerdict: + def test_approve(self): + assert _parse_verdict("VERDICT:LEO:APPROVE", "LEO") == "approve" + + def test_request_changes(self): + assert _parse_verdict("VERDICT:LEO:REQUEST_CHANGES", "LEO") == "request_changes" + + def test_no_verdict_defaults_to_request_changes(self): + assert _parse_verdict("some review text", "LEO") == "request_changes" + + def test_case_insensitive_reviewer(self): + assert _parse_verdict("VERDICT:LEO:APPROVE", "leo") == "approve" + + +# --------------------------------------------------------------------------- +# _normalize_tag +# --------------------------------------------------------------------------- + +class TestNormalizeTag: + def test_valid_tag_passes_through(self): + assert _normalize_tag("broken_wiki_links") == "broken_wiki_links" + + def test_alias_normalizes(self): + assert _normalize_tag("schema_violation") == "frontmatter_schema" + assert _normalize_tag("duplicate") == "near_duplicate" + assert _normalize_tag("factual_error") == "factual_discrepancy" + + def test_hyphen_to_underscore(self): + assert _normalize_tag("unverified-wiki-links") == "broken_wiki_links" + + def test_unknown_returns_none(self): + assert _normalize_tag("totally_made_up") is None + + def test_fuzzy_substring_match(self): + assert _normalize_tag("wiki_links") == "broken_wiki_links" + + +# --------------------------------------------------------------------------- +# _parse_issues +# --------------------------------------------------------------------------- + +class TestParseIssues: + def test_structured_comment(self): + text = "Review text. " + result = _parse_issues(text) + assert "frontmatter_schema" in result + assert "near_duplicate" in result + + def test_fallback_to_prose(self): + text = "The frontmatter is missing required fields." + result = _parse_issues(text) + assert "frontmatter_schema" in result + + def test_empty_structured_falls_back(self): + text = "Review. " + # All tags unrecognizable, falls through to prose + result = _parse_issues(text) + # With no prose keywords either, should return empty + assert isinstance(result, list) + + +# --------------------------------------------------------------------------- +# _infer_issues_from_prose +# --------------------------------------------------------------------------- + +class TestInferIssuesFromProse: + def test_detects_frontmatter_issues(self): + result = _infer_issues_from_prose("The YAML frontmatter is invalid.") + assert "frontmatter_schema" in result + + def test_detects_wiki_link_issues(self): + result = _infer_issues_from_prose("Found broken wiki links in the claim.") + assert "broken_wiki_links" in result + + def test_detects_near_duplicate(self): + result = _infer_issues_from_prose("This is a near-duplicate of an existing claim.") + assert "near_duplicate" in result + + def test_no_matches(self): + result = _infer_issues_from_prose("This claim is well-structured and accurate.") + assert result == [] + + +# --------------------------------------------------------------------------- +# _classify_issues +# --------------------------------------------------------------------------- + +class TestClassifyIssues: + def test_mechanical_only(self): + assert _classify_issues(["frontmatter_schema", "near_duplicate"]) == "mechanical" + + def test_substantive_only(self): + assert _classify_issues(["factual_discrepancy"]) == "substantive" + + def test_mixed(self): + assert _classify_issues(["frontmatter_schema", "factual_discrepancy"]) == "mixed" + + def test_empty(self): + assert _classify_issues([]) == "unknown" + + +# --------------------------------------------------------------------------- +# _parse_batch_response +# --------------------------------------------------------------------------- + +class TestParseBatchResponse: + def test_parses_multiple_prs(self): + response = ( + "=== PR #100\nReview for 100\n\n" + "=== PR #200\nReview for 200\n\n" + ) + result = _parse_batch_response(response, [100, 200], "agent") + assert 100 in result + assert 200 in result + assert "APPROVE" in result[100] + assert "REQUEST_CHANGES" in result[200] + + def test_missing_pr(self): + response = "=== PR #100\nReview\n\n" + result = _parse_batch_response(response, [100, 200], "agent") + assert 100 in result + assert 200 not in result + + def test_empty_response(self): + result = _parse_batch_response("no verdicts here", [100], "agent") + assert result == {} + + +# --------------------------------------------------------------------------- +# _validate_batch_fanout +# --------------------------------------------------------------------------- + +class TestValidateBatchFanout: + def test_missing_pr_falls_back(self): + parsed = {100: "review for 100"} + pr_diffs = [ + {"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"}, + {"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"}, + ] + valid, fallback = _validate_batch_fanout(parsed, pr_diffs, "agent") + assert 200 in fallback + assert 200 not in valid + + def test_all_present_passes(self): + parsed = { + 100: "review mentioning domains/long-enough-path.md", + 200: "review mentioning domains/other-long-path.md", + } + pr_diffs = [ + {"number": 100, "diff": "diff --git a/domains/long-enough-path.md b/domains/long-enough-path.md\n"}, + {"number": 200, "diff": "diff --git a/domains/other-long-path.md b/domains/other-long-path.md\n"}, + ] + valid, fallback = _validate_batch_fanout(parsed, pr_diffs, "agent") + assert 100 in valid + assert 200 in valid + assert fallback == [] diff --git a/tests/test_pr_state.py b/tests/test_pr_state.py index 6c0cb98..735d51b 100644 --- a/tests/test_pr_state.py +++ b/tests/test_pr_state.py @@ -135,13 +135,44 @@ class TestClosePr: mock_api = AsyncMock(return_value={}) with patch("lib.pr_state.forgejo_api", mock_api), \ patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"): - asyncio.run(close_pr(conn, 42)) + result = asyncio.run(close_pr(conn, 42)) + assert result is True row = _get_pr(conn, 42) assert row["status"] == "closed" # last_error not overwritten when not provided assert row["last_error"] == "old error" + def test_close_returns_false_on_forgejo_failure(self): + """Critical: if Forgejo API fails, DB must NOT be updated (ghost PR prevention).""" + conn = _make_db() + _insert_pr(conn, 42) + + mock_api = AsyncMock(return_value=None) # forgejo_api returns None on failure + with patch("lib.pr_state.forgejo_api", mock_api), \ + patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"): + result = asyncio.run(close_pr(conn, 42, last_error="should not persist")) + + assert result is False + row = _get_pr(conn, 42) + assert row["status"] == "open" # DB not touched + assert row["last_error"] is None # last_error not set + + def test_close_skips_forgejo_check_when_opted_out(self): + """close_on_forgejo=False always succeeds (caller already closed Forgejo).""" + conn = _make_db() + _insert_pr(conn, 42) + + mock_api = AsyncMock(return_value=None) # would fail if called + with patch("lib.pr_state.forgejo_api", mock_api), \ + patch("lib.pr_state.repo_path", lambda s: f"/repos/test/{s}"): + result = asyncio.run(close_pr(conn, 42, close_on_forgejo=False)) + + assert result is True + row = _get_pr(conn, 42) + assert row["status"] == "closed" + mock_api.assert_not_called() + # --------------------------------------------------------------------------- # approve_pr