teleo-codex/ops/pipeline-v2/lib/fixer.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

295 lines
12 KiB
Python

"""Auto-fixer stage — mechanical fixes for known issue types.
Currently fixes:
- broken_wiki_links: strips [[ ]] brackets from links that don't resolve
Runs as a pipeline stage on FIX_INTERVAL. Only fixes mechanical issues
that don't require content understanding. Does NOT fix frontmatter_schema,
near_duplicate, or any substantive issues.
Key design decisions (Ganymede):
- Only fix files in the PR diff (not the whole worktree/repo)
- Add intra-PR file stems to valid set (avoids stripping cross-references
between new claims in the same PR)
- Atomic claim via status='fixing' (same pattern as eval's 'reviewing')
- fix_attempts cap prevents infinite fix loops
- Reset eval_attempts + tier0_pass on successful fix for re-evaluation
"""
import asyncio
import json
import logging
from pathlib import Path
from . import config, db
from .validate import WIKI_LINK_RE, load_existing_claims
logger = logging.getLogger("pipeline.fixer")
# ─── Git helper (async subprocess, same pattern as merge.py) ─────────────
async def _git(*args, cwd: str | None = None, timeout: int = 60) -> tuple[int, str]:
    """Run a git command asynchronously.

    Parameters:
        *args: arguments passed to the ``git`` binary (e.g. "fetch", "origin").
        cwd: working directory for the command; defaults to
            ``config.REPO_DIR`` when None.
        timeout: seconds to wait for the process before killing it.

    Returns:
        (returncode, combined stdout+stderr text). On timeout the
        returncode is -1 and the text is a descriptive message.
    """
    proc = await asyncio.create_subprocess_exec(
        "git",
        *args,
        cwd=cwd or str(config.REPO_DIR),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # communicate() was cancelled — kill the child and reap it so no
        # zombie process is left behind.
        proc.kill()
        await proc.wait()
        return -1, f"git {args[0]} timed out after {timeout}s"
    # Join only the non-empty streams. (The previous version unconditionally
    # prepended "\n" before stderr, yielding a spurious leading newline
    # whenever stdout was empty but stderr was not.)
    parts = [s.decode().strip() for s in (stdout or b"", stderr or b"") if s]
    return proc.returncode, "\n".join(p for p in parts if p)
# ─── Wiki link fixer ─────────────────────────────────────────────────────
async def _fix_wiki_links_in_pr(conn, pr_number: int) -> dict:
    """Fix broken wiki links in a single PR by stripping brackets.

    Only processes files in the PR diff (not the whole repo).
    Adds intra-PR file stems to the valid set so cross-references
    between new claims in the same PR are preserved.

    Parameters:
        conn: database connection; rows are accessed by column name
            (presumably sqlite3 with row_factory=sqlite3.Row — TODO confirm).
        pr_number: pull-request number to process.

    Returns:
        dict — always contains "pr"; on success also {"fixed": True,
        "links_fixed": N}; otherwise {"skipped": True, "reason": <str>}.
    """
    # Atomic claim — prevent concurrent fixers and evaluators. Only the
    # worker whose UPDATE actually matched a row (rowcount == 1) proceeds;
    # everyone else sees rowcount == 0 and backs off.
    cursor = conn.execute(
        "UPDATE prs SET status = 'fixing', last_attempt = datetime('now') WHERE number = ? AND status = 'open'",
        (pr_number,),
    )
    if cursor.rowcount == 0:
        return {"pr": pr_number, "skipped": True, "reason": "not_open"}
    # Increment fix_attempts up-front so even a failed attempt burns budget
    # — this is what makes the fix_attempts cap terminate fix loops.
    conn.execute(
        "UPDATE prs SET fix_attempts = COALESCE(fix_attempts, 0) + 1 WHERE number = ?",
        (pr_number,),
    )
    # Get PR branch from DB first, fall back to Forgejo API
    row = conn.execute("SELECT branch FROM prs WHERE number = ?", (pr_number,)).fetchone()
    branch = row["branch"] if row and row["branch"] else None
    if not branch:
        # Function-scope import keeps the forgejo dependency lazy.
        from .forgejo import api as forgejo_api
        from .forgejo import repo_path
        pr_info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
        if pr_info:
            branch = pr_info.get("head", {}).get("ref")
    if not branch:
        # No branch in DB or via API — release the claim and skip.
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "no_branch"}
    # Fetch latest refs
    await _git("fetch", "origin", branch, timeout=30)
    # Create worktree (detached so the real branch isn't locked elsewhere)
    worktree_path = str(config.BASE_DIR / "workspaces" / f"fix-{pr_number}")
    rc, out = await _git("worktree", "add", "--detach", worktree_path, f"origin/{branch}")
    if rc != 0:
        logger.error("PR #%d: worktree creation failed: %s", pr_number, out)
        conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
        return {"pr": pr_number, "skipped": True, "reason": "worktree_failed"}
    try:
        # Checkout the actual branch (so we can push)
        rc, out = await _git("checkout", "-B", branch, f"origin/{branch}", cwd=worktree_path)
        if rc != 0:
            logger.error("PR #%d: checkout failed: %s", pr_number, out)
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "checkout_failed"}
        # Get files changed in PR (only fix these, not the whole repo)
        rc, out = await _git("diff", "--name-only", "origin/main...HEAD", cwd=worktree_path)
        if rc != 0:
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "diff_failed"}
        pr_files = [f for f in out.split("\n") if f.strip() and f.endswith(".md")]
        if not pr_files:
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "no_md_files"}
        # Load existing claims from main + add intra-PR stems
        # (avoids stripping cross-references between new claims in same PR)
        existing_claims = load_existing_claims()
        for f in pr_files:
            existing_claims.add(Path(f).stem)
        # Fix broken links in each PR file
        total_fixed = 0
        for filepath in pr_files:
            full_path = Path(worktree_path) / filepath
            if not full_path.is_file():
                # File was deleted or renamed in the PR — nothing to fix.
                continue
            content = full_path.read_text(encoding="utf-8")
            file_fixes = 0

            def replace_broken_link(match):
                # re.sub callback: unwrap links whose target isn't a known
                # claim; counts each strip through the closure variable.
                nonlocal file_fixes
                link_text = match.group(1)
                if link_text.strip() not in existing_claims:
                    file_fixes += 1
                    return link_text  # Strip brackets, keep text
                return match.group(0)  # Keep valid link

            new_content = WIKI_LINK_RE.sub(replace_broken_link, content)
            if new_content != content:
                full_path.write_text(new_content, encoding="utf-8")
                total_fixed += file_fixes
        if total_fixed == 0:
            # No broken links found — issue might be something else
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "no_broken_links"}
        # Commit and push
        rc, out = await _git("add", *pr_files, cwd=worktree_path)
        if rc != 0:
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "git_add_failed"}
        commit_msg = (
            f"auto-fix: strip {total_fixed} broken wiki links\n\n"
            f"Pipeline auto-fixer: removed [[ ]] brackets from links\n"
            f"that don't resolve to existing claims in the knowledge base."
        )
        rc, out = await _git("commit", "-m", commit_msg, cwd=worktree_path)
        if rc != 0:
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (pr_number,))
            return {"pr": pr_number, "skipped": True, "reason": "commit_failed"}
        # Reset eval state BEFORE push — if daemon crashes between push and
        # reset, the PR would be permanently stuck at max eval_attempts.
        # Reset-first: worst case is one wasted eval cycle on old content.
        conn.execute(
            """UPDATE prs SET
            status = 'open',
            eval_attempts = 0,
            eval_issues = '[]',
            tier0_pass = NULL,
            domain_verdict = 'pending',
            leo_verdict = 'pending',
            last_error = NULL
            WHERE number = ?""",
            (pr_number,),
        )
        rc, out = await _git("push", "origin", branch, cwd=worktree_path, timeout=30)
        if rc != 0:
            logger.error("PR #%d: push failed: %s", pr_number, out)
            # Eval state already reset — PR will re-evaluate old content,
            # find same issues, and fixer will retry next cycle. No harm.
            return {"pr": pr_number, "skipped": True, "reason": "push_failed"}
        db.audit(
            conn,
            "fixer",
            "wiki_links_fixed",
            json.dumps({"pr": pr_number, "links_fixed": total_fixed}),
        )
        logger.info("PR #%d: fixed %d broken wiki links, reset for re-evaluation", pr_number, total_fixed)
        return {"pr": pr_number, "fixed": True, "links_fixed": total_fixed}
    finally:
        # Always cleanup worktree
        await _git("worktree", "remove", "--force", worktree_path)
# ─── Stage entry point ───────────────────────────────────────────────────
async def fix_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Run one fix cycle. Returns (fixed, errors).

    Selects PRs flagged with broken_wiki_links (post-tier0) whose fix
    budget is not exhausted and attempts a mechanical fix on each, capped
    per cycle so the fixer doesn't crowd out evaluation. Also garbage-
    collects zombie PRs whose fix budget ran out long ago: those are
    closed on Forgejo (comment + close + branch delete) and in the DB.
    """
    # --- Garbage collection -------------------------------------------------
    # PRs that were evaluated, rejected, and exhausted their fix budget sit
    # in 'open' forever unless someone closes them. Close them on Forgejo as
    # well as in the DB: a DB-only close previously left branches alive and
    # desynchronized Forgejo/DB state (blocked Gate 2 in batch-extract).
    exhausted = conn.execute(
        """SELECT number, branch FROM prs
        WHERE status = 'open'
        AND fix_attempts >= ?
        AND (domain_verdict = 'request_changes' OR leo_verdict = 'request_changes')""",
        (config.MAX_FIX_ATTEMPTS + 2,),
    ).fetchall()
    if exhausted:
        from .forgejo import api as _gc_forgejo, repo_path as _gc_repo_path
        for rec in exhausted:
            number, head = rec["number"], rec["branch"]
            try:
                await _gc_forgejo("POST", _gc_repo_path(f"issues/{number}/comments"),
                                  {"body": "Auto-closed: fix budget exhausted. Source will be re-extracted."})
                await _gc_forgejo("PATCH", _gc_repo_path(f"pulls/{number}"), {"state": "closed"})
                if head:
                    await _gc_forgejo("DELETE", _gc_repo_path(f"branches/{head}"))
            except Exception as exc:
                logger.warning("GC: failed to close PR #%d on Forgejo: %s", number, exc)
            # Close in DB regardless — the Forgejo side is best-effort.
            conn.execute(
                "UPDATE prs SET status = 'closed', last_error = 'fix budget exhausted — auto-closed' WHERE number = ?",
                (number,),
            )
        logger.info("GC: closed %d exhausted PRs (DB + Forgejo + branch cleanup)", len(exhausted))
    # --- Candidate selection ------------------------------------------------
    limit = min(max_workers or config.MAX_FIX_PER_CYCLE, config.MAX_FIX_PER_CYCLE)
    # Only PRs that passed tier0 but picked up broken_wiki_links from eval.
    # PRs with tier0_pass=0 whose only issue is wiki links are excluded:
    # wiki links are warnings, not gates, and fixing them there creates an
    # endless fixer→validate→fixer loop.
    candidates = conn.execute(
        """SELECT number FROM prs
        WHERE status = 'open'
        AND tier0_pass = 1
        AND eval_issues LIKE '%broken_wiki_links%'
        AND COALESCE(fix_attempts, 0) < ?
        AND (last_attempt IS NULL OR last_attempt < datetime('now', '-5 minutes'))
        ORDER BY created_at ASC
        LIMIT ?""",
        (config.MAX_FIX_ATTEMPTS, limit),
    ).fetchall()
    if not candidates:
        return 0, 0
    # --- Fix loop -----------------------------------------------------------
    n_fixed = 0
    n_errors = 0
    for rec in candidates:
        number = rec["number"]
        try:
            outcome = await _fix_wiki_links_in_pr(conn, number)
        except Exception:
            logger.exception("Failed to fix PR #%d", number)
            n_errors += 1
            # Release the 'fixing' claim so the PR isn't stranded.
            conn.execute("UPDATE prs SET status = 'open' WHERE number = ?", (number,))
        else:
            if outcome.get("fixed"):
                n_fixed += 1
            elif outcome.get("skipped"):
                logger.debug("PR #%d fix skipped: %s", number, outcome.get("reason"))
    if n_fixed or n_errors:
        logger.info("Fix cycle: %d fixed, %d errors", n_fixed, n_errors)
    return n_fixed, n_errors