From e14b5f2f0533962d11325fdc0dd71dd993d87fd4 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Thu, 7 May 2026 18:18:32 -0400 Subject: [PATCH] =?UTF-8?q?fix(reaper):=20apply=20Ganymede=20review=20?= =?UTF-8?q?=E2=80=94=20dual-PATCH=20drift,=20breaker=20isolation,=20env=20?= =?UTF-8?q?config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Followup to f97dd15. Four fixes from review: MUST-FIX #1 — Forgejo double-PATCH drift reaper closes PR via forgejo_api PATCH at line 689, then close_pr() at line 700 issued a second PATCH (default close_on_forgejo=True). On transient failure of the second PATCH, close_pr returns False without updating the DB → status='open' even though Forgejo is closed. Pass close_on_forgejo=False so DB close is unconditional after the explicit Forgejo PATCH succeeds. MUST-FIX #2 — reaper exception trips fix breaker Unhandled exception in verdict_deadlock_reaper_cycle propagated to stage_loop, recording fix-stage failures. After 5 reaper failures the fix breaker would open and block mechanical+substantive for 15 min. Wrap reaper call in try/except in fix_cycle (same exception-isolation pattern as ingest_cycle's extract_cycle wrapper). Defense-in-depth must never block primary paths. WARNING #1 — throttle SQL full-scan audit_log only has idx_audit_stage. Filtering on event alone caused full-table scans every 60s. Added stage='reaper' so the planner uses the existing index — reaper writes audit rows under stage='reaper' already so the filter is correct. WARNING #2 — REAPER_DRY_RUN as code constant Flipping dry-run → live required edit + commit + push + deploy + restart. Moved REAPER_DRY_RUN, REAPER_DEADLOCK_AGE_HOURS, REAPER_INTERVAL_SECONDS, REAPER_MAX_PER_RUN to lib/config.py with os.environ.get() overrides. Operator now flips via systemctl edit teleo-pipeline.service (Environment=REAPER_DRY_RUN=false) + restart. Defaults remain safe: dry-run, 24h age, hourly throttle, 50/run cap. NIT — dry-run counter naming Renamed local `closed` counter in dry-run path to `would_close` so the heartbeat audit ("X closed, Y would-close") and journal log are unambiguous. Function still returns closed + would_close so callers see total work done. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/config.py | 8 ++++++ lib/substantive_fixer.py | 55 ++++++++++++++++++++++++---------------- teleo-pipeline.py | 13 +++++++++- 3 files changed, 53 insertions(+), 23 deletions(-) diff --git a/lib/config.py b/lib/config.py index fb09412..b408fd4 100644 --- a/lib/config.py +++ b/lib/config.py @@ -211,6 +211,14 @@ HEALTH_CHECK_INTERVAL = 60 # --- Extraction gates --- EXTRACTION_COOLDOWN_HOURS = 4 # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter. +# --- Verdict-deadlock reaper --- +# Defaults safe (dry-run, 24h age, hourly throttle). Operator flips REAPER_DRY_RUN +# to "false" via systemctl edit teleo-pipeline → restart, no code change required. +REAPER_DRY_RUN = os.environ.get("REAPER_DRY_RUN", "true").lower() == "true" +REAPER_DEADLOCK_AGE_HOURS = int(os.environ.get("REAPER_DEADLOCK_AGE_HOURS", "24")) +REAPER_INTERVAL_SECONDS = int(os.environ.get("REAPER_INTERVAL_SECONDS", "3600")) +REAPER_MAX_PER_RUN = int(os.environ.get("REAPER_MAX_PER_RUN", "50")) + # --- Retrieval (Telegram bot) --- RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities diff --git a/lib/substantive_fixer.py b/lib/substantive_fixer.py index 04f0f5d..eabe93e 100644 --- a/lib/substantive_fixer.py +++ b/lib/substantive_fixer.py @@ -586,14 +586,14 @@ async def substantive_fix_cycle(conn, max_workers=None) -> tuple[int, int]: # structured eval_issues. fixer can't classify → silent skip → forever. # # Both shapes need a clearance path. Reaper closes them after a 24h cooldown -# with audit_log breadcrumbs for forensics. First deploy runs in DRY_RUN -# mode (audit "would_close" events only — no Forgejo writes, no DB closes). -# Operator reviews dry-run output, flips REAPER_DRY_RUN to False, redeploys. - -REAPER_DEADLOCK_AGE_HOURS = 24 -REAPER_INTERVAL_SECONDS = 3600 # at most once per hour -REAPER_MAX_PER_RUN = 50 # safety cap so a single cycle can't close everything -REAPER_DRY_RUN = True # FIRST-DEPLOY DEFAULT — flip to False after audit verification +# with audit_log breadcrumbs for forensics. First deploy runs in dry-run mode +# (audit "would_close" events only — no Forgejo writes, no DB closes). +# +# Reaper config (REAPER_DRY_RUN, REAPER_DEADLOCK_AGE_HOURS, REAPER_INTERVAL_SECONDS, +# REAPER_MAX_PER_RUN) lives in lib/config.py with env-var overrides — operator +# flips dry-run to live via `systemctl edit teleo-pipeline.service` +# (Environment=REAPER_DRY_RUN=false) + restart. No code change, no commit, no +# redeploy required. async def verdict_deadlock_reaper_cycle(conn) -> int: @@ -604,14 +604,15 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: """ # Throttle: skip if last reaper run was within REAPER_INTERVAL_SECONDS. # Uses audit_log as the rate-limit ledger so no schema/state needed. + # stage='reaper' filter so the planner uses idx_audit_stage (avoids full scan). last_run = conn.execute( "SELECT MAX(timestamp) FROM audit_log " - "WHERE event = 'verdict_deadlock_reaper_run'" + "WHERE stage = 'reaper' AND event = 'verdict_deadlock_reaper_run'" ).fetchone()[0] if last_run: cur = conn.execute( "SELECT (julianday('now') - julianday(?)) * 86400 < ?", - (last_run, REAPER_INTERVAL_SECONDS), + (last_run, config.REAPER_INTERVAL_SECONDS), ).fetchone()[0] if cur: return 0 @@ -631,10 +632,10 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: ) ORDER BY last_attempt ASC LIMIT ?""", - (f"-{REAPER_DEADLOCK_AGE_HOURS}", REAPER_MAX_PER_RUN), + (f"-{config.REAPER_DEADLOCK_AGE_HOURS}", config.REAPER_MAX_PER_RUN), ).fetchall() - mode = "dryrun" if REAPER_DRY_RUN else "live" + mode = "dryrun" if config.REAPER_DRY_RUN else "live" if not rows: # Heartbeat anyway so throttle ticks even when nothing to reap. @@ -645,10 +646,11 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: logger.info( "Verdict-deadlock reaper [%s]: %d candidate(s) in deadlock >%dh", - mode, len(rows), REAPER_DEADLOCK_AGE_HOURS, + mode, len(rows), config.REAPER_DEADLOCK_AGE_HOURS, ) closed = 0 + would_close = 0 errors = 0 for row in rows: pr = row["number"] @@ -662,7 +664,7 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: "fix_attempts": row["fix_attempts"], } - if REAPER_DRY_RUN: + if config.REAPER_DRY_RUN: # Audit only — do NOT touch DB row or Forgejo state. db.audit(conn, "reaper", "verdict_deadlock_would_close", json.dumps(reason_detail)) @@ -670,13 +672,13 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: "Reaper [dryrun]: would close PR #%d (leo=%s domain=%s issues=%s)", pr, row["leo_verdict"], row["domain_verdict"], row["eval_issues"], ) - closed += 1 + would_close += 1 continue try: comment_body = ( "Closed by verdict-deadlock reaper.\n\n" - f"This PR sat for >{REAPER_DEADLOCK_AGE_HOURS}h with conflicting " + f"This PR sat for >{config.REAPER_DEADLOCK_AGE_HOURS}h with conflicting " f"verdicts (leo={row['leo_verdict']}, domain={row['domain_verdict']}) " f"that the substantive fixer couldn't auto-resolve.\n\n" f"Eval issues: `{row['eval_issues']}`\n" @@ -697,12 +699,16 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: ) errors += 1 continue + # Forgejo already closed at the PATCH above — pass close_on_forgejo=False + # so close_pr() doesn't issue a redundant PATCH (which on transient + # failure returns False and skips the DB close → status drift). await close_pr( conn, pr, last_error=( f"verdict_deadlock_reaper: leo={row['leo_verdict']} " - f"domain={row['domain_verdict']} age>{REAPER_DEADLOCK_AGE_HOURS}h" + f"domain={row['domain_verdict']} age>{config.REAPER_DEADLOCK_AGE_HOURS}h" ), + close_on_forgejo=False, ) db.audit(conn, "reaper", "verdict_deadlock_closed", json.dumps(reason_detail)) @@ -712,11 +718,16 @@ async def verdict_deadlock_reaper_cycle(conn) -> int: errors += 1 db.audit(conn, "reaper", "verdict_deadlock_reaper_run", json.dumps({ - "candidates": len(rows), "closed": closed, "errors": errors, "mode": mode, + "candidates": len(rows), "closed": closed, "would_close": would_close, + "errors": errors, "mode": mode, })) if errors: - logger.warning("Verdict-deadlock reaper [%s]: %d closed, %d errors", - mode, closed, errors) + logger.warning( + "Verdict-deadlock reaper [%s]: %d closed, %d would-close, %d errors", + mode, closed, would_close, errors, + ) + elif config.REAPER_DRY_RUN: + logger.info("Verdict-deadlock reaper [dryrun]: %d would-close", would_close) else: - logger.info("Verdict-deadlock reaper [%s]: %d closed", mode, closed) - return closed + logger.info("Verdict-deadlock reaper [live]: %d closed", closed) + return closed + would_close diff --git a/teleo-pipeline.py b/teleo-pipeline.py index 58e0bf5..9b78edf 100644 --- a/teleo-pipeline.py +++ b/teleo-pipeline.py @@ -102,7 +102,18 @@ async def fix_cycle(conn, max_workers=None): """ m_fixed, m_errors = await mechanical_fix_cycle(conn, max_workers=max_workers) s_fixed, s_errors = await substantive_fix_cycle(conn, max_workers=max_workers) - r_closed = await verdict_deadlock_reaper_cycle(conn) + # Defense-in-depth: reaper exception must never block primary fix paths. + # Same exception-isolation pattern as ingest_cycle's extract_cycle wrapper — + # propagating would trip the fix breaker and lock out mechanical+substantive + # for 15 min after 5 reaper failures. + try: + r_closed = await verdict_deadlock_reaper_cycle(conn) + except Exception: + import logging + logging.getLogger("pipeline").exception( + "Reaper cycle failed (non-fatal)" + ) + r_closed = 0 return m_fixed + s_fixed + r_closed, m_errors + s_errors