Compare commits

..

No commits in common. "4b2b59b184ec837eda31e5c54eb2459e7a119784" and "517e9884ccabbdb04a41f4472bf677e8bee28099" have entirely different histories.

3 changed files with 3 additions and 194 deletions

View file

@ -211,14 +211,6 @@ HEALTH_CHECK_INTERVAL = 60
# --- Extraction gates ---
EXTRACTION_COOLDOWN_HOURS = 4 # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter.
# --- Verdict-deadlock reaper ---
# Defaults safe (dry-run, 24h age, hourly throttle). Operator flips REAPER_DRY_RUN
# to "false" via systemctl edit teleo-pipeline → restart, no code change required.
REAPER_DRY_RUN = os.environ.get("REAPER_DRY_RUN", "true").lower() == "true"
REAPER_DEADLOCK_AGE_HOURS = int(os.environ.get("REAPER_DEADLOCK_AGE_HOURS", "24"))
REAPER_INTERVAL_SECONDS = int(os.environ.get("REAPER_INTERVAL_SECONDS", "3600"))
REAPER_MAX_PER_RUN = int(os.environ.get("REAPER_MAX_PER_RUN", "50"))
# --- Retrieval (Telegram bot) ---
RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source
RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities

View file

@ -596,170 +596,3 @@ async def substantive_fix_cycle(conn, max_workers=None) -> tuple[int, int]:
logger.info("Substantive fix cycle: %d fixed, %d errors", fixed, errors)
return fixed, errors
# ─── Verdict-deadlock reaper ──────────────────────────────────────────────
#
# Defense-in-depth for PRs that substantive_fixer can't make progress on.
# Targets two stuck-verdict shapes empirically observed in production:
#
# 1. leo:request_changes + domain:approve
# Leo asked for substantive fix; fixer either failed silently
# (no_claim_files / no_review_comments / etc.) or the issue tag isn't
# in FIXABLE | CONVERTIBLE | UNFIXABLE. PR sits forever.
#
# 2. leo:skipped + domain:request_changes
# Eval bypassed Leo (eval_attempts >= MAX). Domain rejected with no
# structured eval_issues. fixer can't classify → silent skip → forever.
#
# Both shapes need a clearance path. Reaper closes them after a 24h cooldown
# with audit_log breadcrumbs for forensics. First deploy runs in dry-run mode
# (audit "would_close" events only — no Forgejo writes, no DB closes).
#
# Reaper config (REAPER_DRY_RUN, REAPER_DEADLOCK_AGE_HOURS, REAPER_INTERVAL_SECONDS,
# REAPER_MAX_PER_RUN) lives in lib/config.py with env-var overrides — operator
# flips dry-run to live via `systemctl edit teleo-pipeline.service`
# (Environment=REAPER_DRY_RUN=false) + restart. No code change, no commit, no
# redeploy required.
async def verdict_deadlock_reaper_cycle(conn) -> int:
"""Reap PRs stuck in conflicting-verdict deadlock for >24h.
Returns count of PRs closed (or "would-close" in dry-run mode).
Throttled to once per REAPER_INTERVAL_SECONDS via sentinel audit event.
"""
# Throttle: skip if last reaper run was within REAPER_INTERVAL_SECONDS.
# Uses audit_log as the rate-limit ledger so no schema/state needed.
# stage='reaper' filter so the planner uses idx_audit_stage (avoids full scan).
last_run = conn.execute(
"SELECT MAX(timestamp) FROM audit_log "
"WHERE stage = 'reaper' AND event = 'verdict_deadlock_reaper_run'"
).fetchone()[0]
if last_run:
cur = conn.execute(
"SELECT (julianday('now') - julianday(?)) * 86400 < ?",
(last_run, config.REAPER_INTERVAL_SECONDS),
).fetchone()[0]
if cur:
return 0
# Two stuck-verdict shapes: leo:rc+domain:approve, leo:skipped+domain:rc.
# Branch allowlist scopes the reaper to disposable pipeline-managed branches.
# extract/, reweave/, fix/ are content the pipeline created and can recreate;
# agent branches (theseus/, vida/, epimetheus/, etc.) are WIP feature work
# and must not be reaped — owners review their own PRs on their own cadence.
rows = conn.execute(
"""SELECT number, branch, eval_issues, leo_verdict, domain_verdict,
last_attempt, fix_attempts
FROM prs
WHERE status = 'open'
AND tier0_pass = 1
AND last_attempt IS NOT NULL
AND last_attempt < datetime('now', ? || ' hours')
AND (branch LIKE 'extract/%' OR branch LIKE 'reweave/%' OR branch LIKE 'fix/%')
AND (
(leo_verdict = 'request_changes' AND domain_verdict = 'approve')
OR (leo_verdict = 'skipped' AND domain_verdict = 'request_changes')
)
ORDER BY last_attempt ASC
LIMIT ?""",
(f"-{config.REAPER_DEADLOCK_AGE_HOURS}", config.REAPER_MAX_PER_RUN),
).fetchall()
mode = "dryrun" if config.REAPER_DRY_RUN else "live"
if not rows:
# Heartbeat anyway so throttle ticks even when nothing to reap.
db.audit(conn, "reaper", "verdict_deadlock_reaper_run", json.dumps({
"candidates": 0, "closed": 0, "mode": mode,
}))
return 0
logger.info(
"Verdict-deadlock reaper [%s]: %d candidate(s) in deadlock >%dh",
mode, len(rows), config.REAPER_DEADLOCK_AGE_HOURS,
)
closed = 0
would_close = 0
errors = 0
for row in rows:
pr = row["number"]
reason_detail = {
"pr": pr,
"branch": row["branch"],
"leo_verdict": row["leo_verdict"],
"domain_verdict": row["domain_verdict"],
"eval_issues": row["eval_issues"],
"last_attempt": row["last_attempt"],
"fix_attempts": row["fix_attempts"],
}
if config.REAPER_DRY_RUN:
# Audit only — do NOT touch DB row or Forgejo state.
db.audit(conn, "reaper", "verdict_deadlock_would_close",
json.dumps(reason_detail))
logger.info(
"Reaper [dryrun]: would close PR #%d (leo=%s domain=%s issues=%s)",
pr, row["leo_verdict"], row["domain_verdict"], row["eval_issues"],
)
would_close += 1
continue
try:
comment_body = (
"Closed by verdict-deadlock reaper.\n\n"
f"This PR sat for >{config.REAPER_DEADLOCK_AGE_HOURS}h with conflicting "
f"verdicts (leo={row['leo_verdict']}, domain={row['domain_verdict']}) "
f"that the substantive fixer couldn't auto-resolve.\n\n"
f"Eval issues: `{row['eval_issues']}`\n"
f"Last attempt: {row['last_attempt']}\n\n"
"_Automated message from the LivingIP pipeline._"
)
await forgejo_api(
"POST", repo_path(f"issues/{pr}/comments"), {"body": comment_body},
)
patch_result = await forgejo_api(
"PATCH", repo_path(f"pulls/{pr}"), {"state": "closed"},
token=get_agent_token("leo"),
)
if patch_result is None:
logger.warning(
"Reaper: PR #%d Forgejo close failed — skipping DB close to "
"avoid drift", pr,
)
errors += 1
continue
# Forgejo already closed at the PATCH above — pass close_on_forgejo=False
# so close_pr() doesn't issue a redundant PATCH (which on transient
# failure returns False and skips the DB close → status drift).
await close_pr(
conn, pr,
last_error=(
f"verdict_deadlock_reaper: leo={row['leo_verdict']} "
f"domain={row['domain_verdict']} age>{config.REAPER_DEADLOCK_AGE_HOURS}h"
),
close_on_forgejo=False,
)
db.audit(conn, "reaper", "verdict_deadlock_closed",
json.dumps(reason_detail))
closed += 1
except Exception:
logger.exception("Reaper: PR #%d close failed", pr)
errors += 1
db.audit(conn, "reaper", "verdict_deadlock_reaper_run", json.dumps({
"candidates": len(rows), "closed": closed, "would_close": would_close,
"errors": errors, "mode": mode,
}))
if errors:
logger.warning(
"Verdict-deadlock reaper [%s]: %d closed, %d would-close, %d errors",
mode, closed, would_close, errors,
)
elif config.REAPER_DRY_RUN:
logger.info("Verdict-deadlock reaper [dryrun]: %d would-close", would_close)
else:
logger.info("Verdict-deadlock reaper [live]: %d closed", closed)
return closed + would_close

View file

@ -20,7 +20,7 @@ from lib import log as logmod
from lib.breaker import CircuitBreaker
from lib.evaluate import evaluate_cycle
from lib.fixer import fix_cycle as mechanical_fix_cycle
from lib.substantive_fixer import substantive_fix_cycle, verdict_deadlock_reaper_cycle
from lib.substantive_fixer import substantive_fix_cycle
from lib.health import start_health_server, stop_health_server
from lib.llm import kill_active_subprocesses
from lib.merge import merge_cycle
@ -91,30 +91,14 @@ async def ingest_cycle(conn, max_workers=None):
async def fix_cycle(conn, max_workers=None):
"""Combined fix stage: mechanical fixes first, then substantive fixes,
finally the verdict-deadlock reaper.
"""Combined fix stage: mechanical fixes first, then substantive fixes.
Mechanical (fixer.py): wiki link bracket stripping, $0
Substantive (substantive_fixer.py): confidence/title/scope fixes via LLM, $0.001
Reaper (substantive_fixer.verdict_deadlock_reaper_cycle): defense-in-depth
for stuck-verdict PRs that the substantive fixer can't progress on.
Hourly throttle, dry-run by default. Cost $0.
"""
m_fixed, m_errors = await mechanical_fix_cycle(conn, max_workers=max_workers)
s_fixed, s_errors = await substantive_fix_cycle(conn, max_workers=max_workers)
# Defense-in-depth: reaper exception must never block primary fix paths.
# Same exception-isolation pattern as ingest_cycle's extract_cycle wrapper —
# propagating would trip the fix breaker and lock out mechanical+substantive
# for 15 min after 5 reaper failures.
try:
r_closed = await verdict_deadlock_reaper_cycle(conn)
except Exception:
import logging
logging.getLogger("pipeline").exception(
"Reaper cycle failed (non-fatal)"
)
r_closed = 0
return m_fixed + s_fixed + r_closed, m_errors + s_errors
return m_fixed + s_fixed, m_errors + s_errors
async def snapshot_cycle(conn, max_workers=None):