"""Substantive fixer — acts on reviewer feedback for non-mechanical issues.

When Leo or a domain agent requests changes with substantive issues
(confidence_miscalibration, title_overclaims, scope_error, near_duplicate),
this module reads the claim + reviewer comment + original source material,
sends to an LLM, pushes the fix, and resets eval.

Issue routing:
  FIXABLE (confidence, title, scope) → LLM edits the claim
  CONVERTIBLE (near_duplicate) → flag for Leo to pick target, then convert
  UNFIXABLE (factual_discrepancy) → close PR, re-extract with feedback
  DROPPABLE (low-value, reviewer explicitly closed) → close PR

Design reviewed by Ganymede (architecture), Rhea (ops), Leo (quality).
Epimetheus owns this module. Leo reviews changes.
"""

import asyncio
import json
import logging
import os
import re
from pathlib import Path

from . import config, db
from .forgejo import api as forgejo_api, get_agent_token, get_pr_diff, repo_path
from .llm import openrouter_call

logger = logging.getLogger("pipeline.substantive_fixer")

# Issue type routing
FIXABLE_TAGS = {"confidence_miscalibration", "title_overclaims", "scope_error", "frontmatter_schema"}
CONVERTIBLE_TAGS = {"near_duplicate"}
UNFIXABLE_TAGS = {"factual_discrepancy"}

# Max substantive fix attempts per PR (Rhea: prevent infinite loops)
MAX_SUBSTANTIVE_FIXES = 2

# Model for fixes — Gemini Flash: cheap ($0.001/fix), different family from Sonnet reviewer
FIX_MODEL = config.MODEL_GEMINI_FLASH

# One-line summary of each issue tag, injected into the "Issues to Fix" prompt section.
# Tags without an entry contribute nothing to that section.
_ISSUE_DESCRIPTIONS = {
    "confidence_miscalibration": "CONFIDENCE: Reviewer says the confidence level doesn't match the evidence.",
    "title_overclaims": "TITLE: Reviewer says the title asserts more than the evidence supports.",
    "scope_error": "SCOPE: Reviewer says the claim needs explicit scope qualification.",
    "near_duplicate": "DUPLICATE: Reviewer says this substantially duplicates an existing claim.",
    # frontmatter_schema is routed as FIXABLE, so it must be described to the model too.
    "frontmatter_schema": (
        "FRONTMATTER: Reviewer says the frontmatter has a schema problem; "
        "apply the correction described in the feedback."
    ),
}


# ─── Fix prompt ────────────────────────────────────────────────────────────


def _build_fix_prompt(
    claim_content: str,
    review_comment: str,
    issue_tags: list[str],
    source_content: str | None,
    domain_index: str | None = None,
) -> str:
    """Build the targeted fix prompt.

    Includes claim + reviewer feedback + source material.
    Does NOT re-extract — makes targeted edits based on specific feedback.

    Args:
        claim_content: Current text of the claim file.
        review_comment: Concatenated reviewer feedback.
        issue_tags: Issue tags attached to the PR; known tags are rendered
            as one description line each.
        source_content: Original source text, or None. Only the first 8000
            characters are included.
        domain_index: Listing of existing claims in the domain, or None.
            Only included (first 4000 characters) when "near_duplicate" is
            among ``issue_tags``.

    Returns:
        The full prompt string.
    """
    source_section = ""
    if source_content:
        # Truncate source to keep prompt manageable
        source_section = f"""
## Original Source Material
{source_content[:8000]}
"""

    index_section = ""
    if domain_index and "near_duplicate" in issue_tags:
        index_section = f"""
## Existing Claims in Domain (for near-duplicate resolution)
{domain_index[:4000]}
"""

    issue_descriptions = [_ISSUE_DESCRIPTIONS[tag] for tag in issue_tags if tag in _ISSUE_DESCRIPTIONS]

    return f"""You are fixing a knowledge base claim based on reviewer feedback. Make targeted edits — do NOT rewrite from scratch.

## The Claim (current version)
{claim_content}

## Reviewer Feedback
{review_comment}

## Issues to Fix
{chr(10).join(issue_descriptions)}

{source_section}
{index_section}

## Rules

1. **Implement the reviewer's explicit instructions.** If the reviewer says "change confidence to experimental," do that. If the reviewer says "confidence seems high" without a specific target, set it to one level below current.
2. **For title_overclaims:** Scope the title down to match evidence. Add qualifiers. Keep the mechanism but bound the claim.
3. **For scope_error:** Add explicit scope (structural/functional/causal/correlational) to the title. Add scoping language to the body.
4. **For near_duplicate:** Do NOT fix. Instead, identify the top 3 most similar existing claims from the domain index and output them in your response. The reviewer will pick the target.
5. **Preserve the claim's core argument.** You're adjusting precision, not changing what the claim says.
6. **Keep all frontmatter fields.** Do not remove or rename fields. Only modify the values the reviewer flagged.

## Output

For FIXABLE issues (confidence, title, scope):
Return the complete fixed claim file content (full markdown with frontmatter).

For near_duplicate:
Return JSON:
```json
{{"action": "flag_duplicate", "candidates": ["existing-claim-1.md", "existing-claim-2.md", "existing-claim-3.md"], "reasoning": "Why each candidate matches"}}
```
"""


# ─── Git helpers ───────────────────────────────────────────────────────────


async def _git(*args, cwd: str | None = None, timeout: int = 60) -> tuple[int, str]:
    """Run ``git <args>`` as a subprocess and collect its output.

    Args:
        *args: Arguments passed to git (first one is the subcommand).
        cwd: Working directory; defaults to ``config.REPO_DIR``.
        timeout: Seconds to wait before killing the process.

    Returns:
        ``(returncode, output)`` where output is stripped stdout followed by
        stripped stderr (newline-separated) when stderr is non-empty. On
        timeout the process is killed and ``(-1, "git <subcommand> timed out")``
        is returned.
    """
    proc = await asyncio.create_subprocess_exec(
        "git",
        *args,
        cwd=cwd or str(config.REPO_DIR),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()
        return -1, f"git {args[0]} timed out"
    # errors="replace": git may emit bytes that are not valid UTF-8 (e.g. odd
    # filenames); output is only used for logging/diagnostics, so never raise here.
    output = (stdout or b"").decode(errors="replace").strip()
    if stderr:
        output += "\n" + stderr.decode(errors="replace").strip()
    return proc.returncode, output


# ─── Source and review retrieval ───────────────────────────────────────────


def _read_source_content(source_path: str) -> str | None:
    """Read source archive from main worktree.

    Returns the file text, or None when ``source_path`` is empty or the file
    cannot be read (missing, unreadable, a directory, or not valid UTF-8).
    """
    if not source_path:
        return None
    full_path = config.MAIN_WORKTREE / source_path
    try:
        return full_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Source material is optional context for the prompt — degrade to "no source".
        return None


async def _get_review_comments(pr_number: int) -> str:
    """Get all review comments for a PR, concatenated.

    Pages through the PR's issue comments 50 at a time and keeps only bodies
    containing "VERDICT:" or "REJECTION:", skipping tier0 validation and
    pipeline ack comments. Kept bodies are joined with a ``---`` separator;
    returns an empty string when nothing matches.
    """
    page_size = 50
    kept: list[str] = []
    page = 1
    while True:
        result = await forgejo_api(
            "GET",
            repo_path(f"issues/{pr_number}/comments?limit=50&page={page}"),
        )
        if not result:
            break
        for comment in result:
            body = comment.get("body", "")
            # Skip tier0 validation comments and pipeline ack comments
            if "TIER0-VALIDATION" in body or "queued for evaluation" in body:
                continue
            if "VERDICT:" in body or "REJECTION:" in body:
                kept.append(body)
        if len(result) < page_size:
            # Short page means this was the last one.
            break
        page += 1
    return "\n\n---\n\n".join(kept)


async def _get_claim_files_from_pr(pr_number: int) -> dict[str, str]:
    """Get claim file contents from a PR's diff.

    Returns a mapping as produced by ``extract_claim_files_from_diff``
    (used by callers as filepath → content), or an empty dict when the PR
    has no diff.
    """
    diff = await get_pr_diff(pr_number)
    if not diff:
        return {}

    # Imported at call time, as in the rest of this module's helper imports.
    from .validate import extract_claim_files_from_diff

    return extract_claim_files_from_diff(diff)


def _get_domain_index(domain: str) -> str | None:
    """Get domain-filtered KB index for near-duplicate resolution.

    Prefers a pre-built index at ``/tmp/kb-indexes/<domain>.txt``. Otherwise
    falls back to listing up to 150 ``*.md`` files (excluding names starting
    with ``_``) from the domain directory in the main worktree.

    Returns None when ``domain`` is empty, the domain directory does not
    exist, or it contains no claim files.
    """
    if not domain:
        # A PR row without a domain would otherwise crash on `Path / None`.
        return None

    index_file = f"/tmp/kb-indexes/{domain}.txt"
    if os.path.exists(index_file):
        return Path(index_file).read_text()

    # Fallback: list domain claim files
    domain_dir = config.MAIN_WORKTREE / "domains" / domain
    if not domain_dir.is_dir():
        return None
    lines = [
        f"- {f.name}: {f.stem.replace('-', ' ')}"
        for f in sorted(domain_dir.glob("*.md"))
        if not f.name.startswith("_")
    ]
    return "\n".join(lines[:150]) if lines else None


# ─── Issue classification ──────────────────────────────────────────────────


def _classify_substantive(issues: list[str]) -> str:
    """Classify issue list as fixable/convertible/unfixable/droppable.

    Precedence: any unfixable tag wins; a convertible tag counts only when
    no fixable tag is present; otherwise any fixable tag makes it fixable;
    everything else is droppable.
    """
    issue_set = set(issues)
    if issue_set & UNFIXABLE_TAGS:
        return "unfixable"
    if issue_set & CONVERTIBLE_TAGS and not (issue_set & FIXABLE_TAGS):
        return "convertible"
    if issue_set & FIXABLE_TAGS:
        return "fixable"
    return "droppable"


# ─── Fix execution ────────────────────────────────────────────────────────


async def _fix_pr(conn, pr_number: int) -> dict:
    """Attempt a substantive fix on a single PR. Returns result dict.

    The PR row is claimed by flipping ``status`` from 'open' to 'fixing';
    every exit path below sets the status back to 'open' or to 'closed'.

    Returns:
        A dict always containing ``"pr"``. Successful outcomes carry an
        ``"action"`` key (closed_reextract, closed_droppable, auto_enriched,
        flagged_duplicate, flagged_duplicate_by_llm, fixed); non-actions
        carry ``"skipped": True`` and a ``"reason"``.
    """
    # Atomic claim
    cursor = conn.execute(
        "UPDATE prs SET status = 'fixing', last_attempt = datetime('now') WHERE number = ? AND status = 'open'",
        (pr_number,),
    )
    if cursor.rowcount == 0:
        return {"pr": pr_number, "skipped": True, "reason": "not_open"}

    # Increment fix attempts
    conn.execute(
        "UPDATE prs SET fix_attempts = COALESCE(fix_attempts, 0) + 1 WHERE number = ?",
        (pr_number,),
    )

    row = conn.execute(
        "SELECT branch, source_path, domain, eval_issues, fix_attempts FROM prs WHERE number = ?",
        (pr_number,),
    ).fetchone()

    branch = row["branch"]
    source_path = row["source_path"]
    domain = row["domain"]
    fix_attempts = row["fix_attempts"] or 0

    # Parse issue tags
    try:
        issues = json.loads(row["eval_issues"] or "[]")
    except (json.JSONDecodeError, TypeError):
        issues = []

    # Check fix budget (fix_attempts already includes this attempt)
    if fix_attempts > MAX_SUBSTANTIVE_FIXES:
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "fix_budget_exhausted"}

    # Classify
    classification = _classify_substantive(issues)

    if classification == "unfixable":
        # Close and re-extract
        logger.info("PR #%d: unfixable (%s) — closing, source re-queued", pr_number, issues)
        await _close_and_reextract(conn, pr_number, issues)
        return {"pr": pr_number, "action": "closed_reextract", "issues": issues}

    if classification == "droppable":
        logger.info("PR #%d: droppable (%s) — closing", pr_number, issues)
        # NOTE(review): this only closes the PR in the DB; unlike the unfixable
        # path no Forgejo PATCH is sent here — confirm that is intended.
        conn.execute(
            "UPDATE prs SET status = 'closed', last_error = ? WHERE number = ?",
            (f"droppable: {issues}", pr_number),
        )
        return {"pr": pr_number, "action": "closed_droppable", "issues": issues}

    # Refresh main worktree for source read (Ganymede: ensure freshness)
    await _git("fetch", "origin", "main", cwd=str(config.MAIN_WORKTREE))
    await _git("reset", "--hard", "origin/main", cwd=str(config.MAIN_WORKTREE))

    # Gather context
    review_text = await _get_review_comments(pr_number)
    claim_files = await _get_claim_files_from_pr(pr_number)
    source_content = _read_source_content(source_path)
    domain_index = _get_domain_index(domain) if "near_duplicate" in issues else None

    if not claim_files:
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "no_claim_files"}

    if not review_text:
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "no_review_comments"}

    if classification == "convertible":
        # Near-duplicate: auto-convert to enrichment if high-confidence match (>= 0.90).
        # Below threshold: flag for Leo. (Leo approved: "evidence loss > wrong target risk")
        result = await _auto_convert_near_duplicate(
            conn,
            pr_number,
            claim_files,
            domain,
        )
        if result.get("converted"):
            conn.execute(
                "UPDATE prs SET status = 'closed', last_error = ? WHERE number = ?",
                (f"auto-enriched: {result['target_claim']} (sim={result['similarity']:.2f})", pr_number),
            )
            await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"})
            await forgejo_api("POST", repo_path(f"issues/{pr_number}/comments"), {
                "body": (
                    f"**Auto-converted:** Evidence from this PR enriched "
                    f"`{result['target_claim']}` (similarity: {result['similarity']:.2f}).\n\n"
                    f"Leo: review if wrong target. Enrichment labeled "
                    f"`### Auto-enrichment (near-duplicate conversion)` in the target file."
                ),
            })
            db.audit(conn, "substantive_fixer", "auto_enrichment", json.dumps({
                "pr": pr_number,
                "target_claim": result["target_claim"],
                "similarity": round(result["similarity"], 3),
                "domain": domain,
            }))
            logger.info("PR #%d: auto-enriched on %s (sim=%.2f)",
                        pr_number, result["target_claim"], result["similarity"])
            return {"pr": pr_number, "action": "auto_enriched", "target": result["target_claim"]}
        else:
            # Below 0.90 threshold — flag for Leo
            logger.info("PR #%d: near_duplicate, best match %.2f < 0.90 — flagging Leo",
                        pr_number, result.get("best_similarity", 0))
            await _flag_for_leo_review(conn, pr_number, claim_files, review_text, domain_index)
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "action": "flagged_duplicate", "issues": issues}

    # FIXABLE: send to LLM
    # Each claim file is sent exactly once; the returned content is kept and
    # written to the worktree below, so what we judged "fixed" is what gets pushed.
    fixed_files: dict[str, str] = {}
    for filepath, content in claim_files.items():
        prompt = _build_fix_prompt(content, review_text, issues, source_content, domain_index)
        result, _usage = await openrouter_call(FIX_MODEL, prompt, timeout_sec=120, max_tokens=4096)

        if not result:
            logger.warning("PR #%d: fix LLM call failed for %s", pr_number, filepath)
            continue

        # Check if result is a duplicate flag (JSON) or fixed content (markdown)
        if result.strip().startswith("{"):
            try:
                parsed = json.loads(result)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict) and parsed.get("action") == "flag_duplicate":
                await _flag_for_leo_review(conn, pr_number, claim_files, review_text, domain_index)
                conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
                return {"pr": pr_number, "action": "flagged_duplicate_by_llm"}
            # Output that looks like JSON is not a claim file — never write it.
            logger.warning("PR #%d: fix LLM returned non-claim output for %s", pr_number, filepath)
            continue

        fixed_files[filepath] = result
        logger.info("PR #%d: fixed %s for %s", pr_number, filepath, issues)

    if not fixed_files:
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "no_fixes_applied"}

    # Push fix and reset for re-eval
    # Create worktree, apply fix, commit, push
    worktree_path = str(config.BASE_DIR / "workspaces" / f"subfix-{pr_number}")

    await _git("fetch", "origin", branch, timeout=30)
    rc, out = await _git("worktree", "add", "--detach", worktree_path, f"origin/{branch}")
    if rc != 0:
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "worktree_failed"}

    try:
        rc, out = await _git("checkout", "-B", branch, f"origin/{branch}", cwd=worktree_path)
        if rc != 0:
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "checkout_failed"}

        # Write fixed files
        for filepath, fixed_content in fixed_files.items():
            full_path = Path(worktree_path) / filepath
            full_path.parent.mkdir(parents=True, exist_ok=True)
            full_path.write_text(fixed_content, encoding="utf-8")

        # Commit and push
        rc, _ = await _git("add", "-A", cwd=worktree_path)
        commit_msg = f"substantive-fix: address reviewer feedback ({', '.join(issues)})"
        rc, _ = await _git("commit", "-m", commit_msg, cwd=worktree_path)
        if rc != 0:
            # git commit exits non-zero when the LLM output matched the existing files.
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "nothing_to_commit"}

        # Reset eval state BEFORE push (same pattern as fixer.py)
        conn.execute(
            """UPDATE prs SET
                status = 'open',
                eval_attempts = 0,
                eval_issues = '[]',
                tier0_pass = NULL,
                domain_verdict = 'pending',
                leo_verdict = 'pending',
                last_error = NULL
            WHERE number = ?""",
            (pr_number,),
        )

        rc, out = await _git("push", "origin", branch, cwd=worktree_path, timeout=30)
        if rc != 0:
            logger.error("PR #%d: push failed: %s", pr_number, out)
            return {"pr": pr_number, "skipped": True, "reason": "push_failed"}

        db.audit(
            conn,
            "substantive_fixer",
            "fixed",
            json.dumps({"pr": pr_number, "issues": issues, "attempt": fix_attempts}),
        )
        logger.info("PR #%d: substantive fix pushed, reset for re-eval", pr_number)
        return {"pr": pr_number, "action": "fixed", "issues": issues}

    finally:
        # Always drop the scratch worktree, whatever path we exit through.
        await _git("worktree", "remove", "--force", worktree_path)


async def _auto_convert_near_duplicate(
    conn,
    pr_number: int,
    claim_files: dict,
    domain: str,
) -> dict:
    """Auto-convert a near-duplicate claim into an enrichment on the best-match existing claim.

    Only the first claim file of the PR is considered. Its filename stem is
    compared (``difflib.SequenceMatcher`` ratio) against the stems of the
    existing ``*.md`` files in the domain directory of the main worktree.

    Returns {"converted": True, "target_claim": "...", "similarity": 0.95} on success.
    Returns {"converted": False, "best_similarity": 0.80} when no match >= 0.90;
    a ``"reason"`` key is added when there is no evidence to preserve or
    queueing the enrichment failed.
    Threshold 0.90 (Leo: conservative, lower later based on false-positive rate).
    """
    from difflib import SequenceMatcher

    SIMILARITY_THRESHOLD = 0.90
    main_wt = str(config.MAIN_WORKTREE)

    # Get the duplicate claim's title and body
    first_filepath = next(iter(claim_files.keys()), "")
    first_content = next(iter(claim_files.values()), "")
    dup_title = Path(first_filepath).stem.replace("-", " ").lower()

    # Extract the body (evidence) from the duplicate — this is what we preserve
    from .post_extract import parse_frontmatter

    fm, body = parse_frontmatter(first_content)
    if not body:
        body = first_content  # Fallback: use full content

    # Strip the H1 and Relevant Notes sections — keep just the argument.
    # lstrip() first: the pattern is anchored at the start of the string, so
    # blank lines between the frontmatter and the H1 would otherwise defeat it.
    evidence = re.sub(r"^# .+\n*", "", body.lstrip(), count=1).strip()
    evidence = re.split(r"\n---\n", evidence)[0].strip()

    if not evidence or len(evidence) < 20:
        return {"converted": False, "best_similarity": 0, "reason": "no_evidence_to_preserve"}

    # Find best-match existing claim in the domain
    domain_dir = Path(main_wt) / "domains" / (domain or "")
    best_match = None
    best_similarity = 0.0

    if domain_dir.is_dir():
        for f in domain_dir.glob("*.md"):
            if f.name.startswith("_"):
                continue
            existing_title = f.stem.replace("-", " ").lower()
            sim = SequenceMatcher(None, dup_title, existing_title).ratio()
            if sim > best_similarity:
                best_similarity = sim
                best_match = f

    if best_similarity < SIMILARITY_THRESHOLD or best_match is None:
        return {"converted": False, "best_similarity": best_similarity}

    # Queue the enrichment — entity_batch handles the actual write to main.
    # Single writer pattern prevents race conditions. (Ganymede)
    from .entity_queue import queue_enrichment

    try:
        queue_enrichment(
            target_claim=best_match.name,
            evidence=evidence,
            pr_number=pr_number,
            original_title=dup_title,
            similarity=best_similarity,
            domain=domain or "",
        )
    except Exception as e:
        logger.error("PR #%d: failed to queue enrichment: %s", pr_number, e)
        return {"converted": False, "best_similarity": best_similarity, "reason": f"queue_failed: {e}"}

    return {
        "converted": True,
        "target_claim": best_match.name,
        "similarity": best_similarity,
    }


async def _close_and_reextract(conn, pr_number: int, issues: list[str]):
    """Close PR and mark source for re-extraction with feedback.

    Closes the pull request on Forgejo, marks the ``prs`` row closed with an
    "unfixable: ..." error, sets the PR's source row to 'needs_reextraction'
    with the issues as JSON feedback, and writes an audit entry.
    """
    await forgejo_api(
        "PATCH",
        repo_path(f"pulls/{pr_number}"),
        {"state": "closed"},
    )
    conn.execute(
        "UPDATE prs SET status = 'closed', last_error = ? WHERE number = ?",
        (f"unfixable: {', '.join(issues)}", pr_number),
    )
    conn.execute(
        """UPDATE sources SET status = 'needs_reextraction', feedback = ?,
           updated_at = datetime('now')
           WHERE path = (SELECT source_path FROM prs WHERE number = ?)""",
        (json.dumps({"issues": issues, "pr": pr_number}), pr_number),
    )
    db.audit(conn, "substantive_fixer", "closed_reextract", json.dumps({"pr": pr_number, "issues": issues}))


async def _flag_for_leo_review(
    conn,
    pr_number: int,
    claim_files: dict,
    review_text: str,
    domain_index: str | None,
):
    """Flag a near-duplicate PR for Leo to pick the enrichment target.

    When a domain index is available, asks the LLM (near_duplicate prompt,
    first claim file only) for candidate matches; the raw LLM reply is
    embedded in a PR comment. Always posts the comment and writes an audit
    entry. Does not change the PR's status — callers do that.
    """
    # Get first claim content for matching
    first_claim = next(iter(claim_files.values()), "")

    # Use LLM to identify candidate matches
    if domain_index:
        prompt = _build_fix_prompt(first_claim, review_text, ["near_duplicate"], None, domain_index)
        result, _usage = await openrouter_call(FIX_MODEL, prompt, timeout_sec=60, max_tokens=1024)
        candidates_text = result or "Could not identify candidates."
    else:
        candidates_text = "No domain index available."

    comment = (
        f"**Substantive fixer: near-duplicate detected**\n\n"
        f"This PR's claims may duplicate existing KB content. "
        f"Leo: please pick the enrichment target or close if not worth converting.\n\n"
        f"**Candidate matches:**\n{candidates_text}\n\n"
        f"_Reply with the target claim filename to convert, or close the PR._"
    )
    await forgejo_api(
        "POST",
        repo_path(f"issues/{pr_number}/comments"),
        {"body": comment},
    )
    db.audit(conn, "substantive_fixer", "flagged_duplicate", json.dumps({"pr": pr_number}))


# ─── Stage entry point ─────────────────────────────────────────────────────


async def substantive_fix_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Run one substantive fix cycle. Called by the fixer stage after mechanical fixes.

    Finds PRs with substantive issue tags that haven't exceeded fix budget.
    Processes up to 3 per cycle (Rhea: 180s interval, don't overwhelm eval).

    Args:
        conn: Database connection with rows addressable by column name.
        max_workers: Accepted for interface compatibility; not used here.

    Returns:
        ``(fixed, errors)`` — the number of PRs whose attempt produced an
        action, and the number whose attempt raised.
    """
    rows = conn.execute(
        """SELECT number, eval_issues FROM prs
           WHERE status = 'open'
             AND tier0_pass = 1
             AND (domain_verdict = 'request_changes' OR leo_verdict = 'request_changes')
             AND COALESCE(fix_attempts, 0) < ?
             AND (last_attempt IS NULL OR last_attempt < datetime('now', '-3 minutes'))
           ORDER BY created_at ASC
           LIMIT 3""",
        (MAX_SUBSTANTIVE_FIXES + config.MAX_FIX_ATTEMPTS,),  # Total budget: mechanical + substantive
    ).fetchall()

    if not rows:
        return 0, 0

    # Filter to only PRs with substantive issues (not just mechanical)
    substantive_tags = FIXABLE_TAGS | CONVERTIBLE_TAGS | UNFIXABLE_TAGS
    substantive_rows = []
    for row in rows:
        try:
            issues = json.loads(row["eval_issues"] or "[]")
        except (json.JSONDecodeError, TypeError):
            continue
        if set(issues) & substantive_tags:
            substantive_rows.append(row)

    if not substantive_rows:
        return 0, 0

    fixed = 0
    errors = 0

    for row in substantive_rows:
        try:
            result = await _fix_pr(conn, row["number"])
            if result.get("action"):
                fixed += 1
            elif result.get("skipped"):
                logger.debug("PR #%d: substantive fix skipped: %s", row["number"], result.get("reason"))
        except Exception:
            logger.exception("PR #%d: substantive fix failed", row["number"])
            errors += 1
            # Release the claim only if the row is still held as 'fixing'; a PR the
            # failed attempt already marked 'closed' must not be reopened.
            conn.execute(
                "UPDATE prs SET status = 'open' WHERE number = ? AND status = 'fixing'",
                (row["number"],),
            )

    if fixed or errors:
        logger.info("Substantive fix cycle: %d fixed, %d errors", fixed, errors)

    return fixed, errors