commit pending pipeline changes: watchdog tier0 recovery, stale_pr cleanup, deploy.sh improvements

- watchdog.py: tier0 auto-recovery (3 retries, 1h cooldown, audit trail) — pending Ganymede review
- stale_pr.py: new module, closes extraction PRs open >30 min with zero claims
- deploy.sh: selective service-restart detection (rsync dry-run diff), post-restart smoke tests, and log tailing
- validate.py, extract.py, cascade.py, db.py: minor fixes
- backfill-descriptions.py: utility script
- review_queue.py: minor fix

Note: watchdog + stale_pr not yet deployed to VPS (reverted after missing import crash)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-13 10:14:54 +02:00
parent bf3af00d5d
commit e27f6a7b91
9 changed files with 453 additions and 18 deletions

View file

@ -93,7 +93,115 @@ echo "Deploy complete."
if $RESTART; then
echo ""
echo "=== Restarting services ==="
# NOTE(review): the next two lines look like the pre-change "restart everything"
# path retained by the diff render; the selective-restart logic below supersedes
# them — confirm against the applied deploy.sh on the VPS.
ssh "$VPS_HOST" "sudo systemctl restart teleo-pipeline teleo-diagnostics"
echo "Services restarted."
echo "=== Detecting services to restart ==="
# Determine which services need restart based on what was deployed.
# rsync touched these paths → these services:
# pipeline-v2/lib/, pipeline-v2/*.py → teleo-pipeline
# diagnostics/ → teleo-diagnostics
# agent-state/, research-session.sh → no restart (not daemons)
RESTART_SVCS=""
# Check VPS for recent file changes from this deploy
# Compare local files against VPS to see what actually changed
PIPELINE_CHANGED=false
DIAG_CHANGED=false
# Pipeline: lib/ or top-level scripts
# rsync -n (dry-run) + -v lists files that WOULD transfer; grepping for '.py'
# in that listing detects pending python changes without copying anything.
if ! rsync -avzn --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*' \
"$REPO_ROOT/ops/pipeline-v2/lib/" "$VPS_HOST:$VPS_PIPELINE/lib/" 2>/dev/null | grep -q '\.py$'; then
true # no python changes
else
PIPELINE_CHANGED=true
fi
for f in teleo-pipeline.py reweave.py; do
if [ -f "$REPO_ROOT/ops/pipeline-v2/$f" ]; then
# NOTE(review): grep -q "$f" is a regex match, so the '.' in the filename
# matches any character — grep -qF would be stricter; harmless here.
if rsync -avzn "$REPO_ROOT/ops/pipeline-v2/$f" "$VPS_HOST:$VPS_PIPELINE/$f" 2>/dev/null | grep -q "$f"; then
PIPELINE_CHANGED=true
fi
fi
done
# Diagnostics
if rsync -avzn --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*' \
"$REPO_ROOT/ops/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/" 2>/dev/null | grep -q '\.py$'; then
DIAG_CHANGED=true
fi
if $PIPELINE_CHANGED; then
RESTART_SVCS="$RESTART_SVCS teleo-pipeline"
echo " teleo-pipeline: files changed, will restart"
else
echo " teleo-pipeline: no changes, skipping"
fi
if $DIAG_CHANGED; then
RESTART_SVCS="$RESTART_SVCS teleo-diagnostics"
echo " teleo-diagnostics: files changed, will restart"
else
echo " teleo-diagnostics: no changes, skipping"
fi
if [ -z "$RESTART_SVCS" ]; then
echo ""
echo "No service files changed. Skipping restart."
else
echo ""
echo "=== Restarting:$RESTART_SVCS ==="
# Unquoted $RESTART_SVCS is intentional: word-splitting passes each service
# as a separate systemctl argument.
ssh "$VPS_HOST" "sudo systemctl restart $RESTART_SVCS"
echo "Services restarted. Waiting 5s for startup..."
sleep 5
echo ""
echo "=== Smoke test ==="
SMOKE_FAIL=0
# Check systemd unit status for restarted services
for svc in $RESTART_SVCS; do
if ssh "$VPS_HOST" "systemctl is-active --quiet $svc"; then
echo " $svc: active"
else
echo " $svc: FAILED"
# Best-effort log dump for diagnosis; never fail the deploy on journalctl.
ssh "$VPS_HOST" "journalctl -u $svc -n 10 --no-pager" || true
SMOKE_FAIL=1
fi
done
# Hit health endpoints for restarted services
if echo "$RESTART_SVCS" | grep -q "teleo-pipeline"; then
if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8080/health > /dev/null"; then
echo " pipeline health (8080): OK"
else
echo " pipeline health (8080): FAILED"
SMOKE_FAIL=1
fi
fi
if echo "$RESTART_SVCS" | grep -q "teleo-diagnostics"; then
if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8081/ops > /dev/null"; then
echo " diagnostics (8081): OK"
else
echo " diagnostics (8081): FAILED"
SMOKE_FAIL=1
fi
fi
# Tail logs for quick visual check
echo ""
echo "=== Recent logs (10s) ==="
# Build one "-u svc" pair per restarted service for a combined journal tail.
JOURNAL_UNITS=""
for svc in $RESTART_SVCS; do
JOURNAL_UNITS="$JOURNAL_UNITS -u $svc"
done
ssh "$VPS_HOST" "journalctl $JOURNAL_UNITS --since '-10s' --no-pager -n 20" || true
# A failed smoke test aborts the deploy script with a nonzero exit code.
if [ "$SMOKE_FAIL" -gt 0 ]; then
echo ""
echo "WARNING: Smoke test detected failures. Check logs above."
exit 1
fi
echo ""
echo "Smoke test passed."
fi
fi

View file

@ -140,7 +140,7 @@ async def fetch_review_queue(
if forgejo_token:
headers["Authorization"] = f"token {forgejo_token}"
connector = aiohttp.TCPConnector(ssl=False)
connector = aiohttp.TCPConnector() # Default SSL verification — Forgejo token must not be exposed to MITM
async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
# Fetch open PRs
url = f"{FORGEJO_BASE}/repos/{REPO}/pulls?state=open&limit=50&sort=oldest"

View file

@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""One-time backfill: populate prs.description with claim titles from merged files.
For PRs that have description=NULL or empty, reads the claim files on main
(for merged PRs) or on the branch (for open PRs) and extracts H1 titles.
Usage: python3 backfill-descriptions.py [--dry-run]
Requires: run from the teleo-codex git worktree (main branch).
"""
import re
import sqlite3
import subprocess
import sys
from pathlib import Path
DB_PATH = Path("/opt/teleo-eval/pipeline/pipeline.db")  # pipeline SQLite DB on the VPS
MAIN_WORKTREE = Path("/opt/teleo-eval/teleo-codex")  # git worktree checked out on main
CLAIM_DIRS = ("domains/", "core/", "foundations/")  # only files under these dirs count as claims
dry_run = "--dry-run" in sys.argv  # report what would change without writing to the DB
def get_pr_claim_titles(pr_number: int, branch: str, status: str) -> list[str]:
    """Extract H1 claim titles from a PR's changed files.

    Runs ``git diff --name-only origin/main...origin/<branch>`` in the main
    worktree, keeps only markdown files under CLAIM_DIRS, then reads each
    file's content via ``git show`` and collects its first H1 heading.

    Args:
        pr_number: PR number (used only for error reporting).
        branch: PR branch name on the remote.
        status: "merged" or anything else (treated as open).

    Returns:
        List of H1 titles; empty when the diff fails (e.g. deleted branch)
        or any error occurs.
    """
    titles: list[str] = []
    try:
        # Merged and open PRs use the same three-dot diff against main — the
        # original duplicated this call in both branches. For merged PRs the
        # branch may already be deleted; the diff then fails and we return
        # no titles rather than guessing.
        result = subprocess.run(
            ["git", "diff", "--name-only", f"origin/main...origin/{branch}"],
            capture_output=True, text=True, timeout=10,
            cwd=str(MAIN_WORKTREE),
        )
        if result.returncode != 0:
            return titles
        changed_files = [
            f.strip() for f in result.stdout.strip().split("\n")
            if f.strip() and any(f.strip().startswith(d) for d in CLAIM_DIRS) and f.strip().endswith(".md")
        ]
        for fpath in changed_files:
            # Merged claims live on main; open ones only exist on the branch.
            ref = "origin/main" if status == "merged" else f"origin/{branch}"
            show = subprocess.run(
                ["git", "show", f"{ref}:{fpath}"],
                capture_output=True, text=True, timeout=5,
                cwd=str(MAIN_WORKTREE),
            )
            if show.returncode == 0:
                for line in show.stdout.split("\n"):
                    if line.startswith("# ") and len(line) > 3:
                        titles.append(line[2:].strip())
                        break  # first H1 per file only
    except Exception as e:  # TimeoutExpired is an Exception subclass; one clause suffices
        print(f" PR #{pr_number}: error — {e}")
    return titles
def main():
    """Backfill prs.description for every PR row that lacks one."""
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    # Newest PRs first, matching dashboard ordering.
    rows = conn.execute(
        "SELECT number, branch, status FROM prs WHERE description IS NULL OR description = '' ORDER BY number DESC"
    ).fetchall()
    print(f"Found {len(rows)} PRs with empty description")
    n_updated = 0
    n_skipped = 0
    for row in rows:
        number = row["number"]
        branch = row["branch"]
        status = row["status"]
        # Rows without a branch can't be diffed — skip immediately.
        if not branch:
            n_skipped += 1
            continue
        claim_titles = get_pr_claim_titles(number, branch, status)
        if not claim_titles:
            n_skipped += 1
            continue
        description = " | ".join(claim_titles)
        if dry_run:
            print(f" PR #{number} ({status}): would set → {description[:100]}...")
        else:
            conn.execute(
                "UPDATE prs SET description = ? WHERE number = ?",
                (description, number),
            )
        n_updated += 1
        # Commit in batches of 50 so a crash mid-run keeps earlier progress.
        if n_updated % 50 == 0:
            conn.commit()
            print(f" ...{n_updated} updated so far")
    if not dry_run:
        conn.commit()
    conn.close()
    print(f"\nDone. Updated: {n_updated}, Skipped: {n_skipped}, Total: {len(rows)}")
    if dry_run:
        print("(dry run — no changes written)")


if __name__ == "__main__":
    main()

View file

@ -9,7 +9,7 @@ the same atomic-write pattern as lib-state.sh.
"""
import asyncio
import hashlib
import secrets
import json
import logging
import os
@ -116,8 +116,8 @@ def _write_inbox_message(agent: str, subject: str, body: str) -> bool:
return False
ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
file_hash = hashlib.md5(f"{agent}-{subject}-{body[:200]}".encode()).hexdigest()[:8]
filename = f"cascade-{ts}-{subject[:60]}-{file_hash}.md"
nonce = secrets.token_hex(3)
filename = f"cascade-{ts}-{nonce}-{subject[:60]}.md"
final_path = inbox_dir / filename
try:

View file

@ -479,6 +479,9 @@ def migrate(conn: sqlite3.Connection):
logger.info("Migration v11: added auto_merge column to prs table")
# v12-v16 ran manually on VPS before code was version-controlled.
# Their changes are consolidated into v17+ migrations below.
if current < 17:
# Add prompt/pipeline version tracking per PR
for col, default in [

View file

@ -376,6 +376,7 @@ async def _extract_one_source(
filename = c.get("filename", "")
if not filename:
continue
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
if not filename.endswith(".md"):
filename += ".md"
content = _build_claim_content(c, agent_lower)
@ -387,6 +388,7 @@ async def _extract_one_source(
filename = e.get("filename", "")
if not filename:
continue
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
if not filename.endswith(".md"):
filename += ".md"
action = e.get("action", "create")

View file

@ -0,0 +1,94 @@
"""Stale extraction PR cleanup — closes extraction PRs that produce no claims.
When an extraction PR sits open >30 min with claims_count=0, it indicates:
- Extraction failed (model couldn't extract anything useful)
- Batch job stalled (no claims written)
- Source material is empty/junk
Auto-closing prevents zombie PRs from blocking the pipeline.
Logs each close for root cause analysis (model failures, bad sources, etc.).
Epimetheus owns this module.
"""
import json
import logging
from datetime import datetime, timezone
from . import config, db
from .forgejo import api, repo_path
logger = logging.getLogger("pipeline.stale_pr")
# An extraction PR open longer than this with zero claims is considered stale.
STALE_THRESHOLD_MINUTES = 30
async def check_stale_prs(conn) -> tuple[int, int]:
    """Close extraction PRs that sat open past the stale threshold with no claims.

    Returns:
        (stale_closed, stale_errors) — count of PRs closed and of close
        attempts that failed.
    """
    closed = 0
    errors = 0
    # Candidates: open extraction PRs past the age threshold whose source
    # produced zero claims (LEFT JOIN so missing source rows count as zero).
    candidates = conn.execute(
        """SELECT p.number, p.branch, p.source_path, p.created_at
        FROM prs p
        LEFT JOIN sources s ON p.source_path = s.path
        WHERE p.status = 'open'
        AND p.commit_type = 'extract'
        AND datetime(p.created_at) < datetime('now', '-' || ? || ' minutes')
        AND COALESCE(s.claims_count, 0) = 0""",
        (STALE_THRESHOLD_MINUTES,),
    ).fetchall()
    for row in candidates:
        number = row["number"]
        src = row["source_path"] or "unknown"
        try:
            # Ask Forgejo to close the PR; None signals an API failure.
            outcome = await api(
                "PATCH",
                repo_path(f"pulls/{number}"),
                body={"state": "closed"},
            )
            if outcome is None:
                errors += 1
                logger.warning(
                    "Failed to close stale extraction PR #%d (%s, %s)",
                    number, src, row["branch"],
                )
                continue
            # Mirror the remote state locally so the PR drops out of queries.
            conn.execute(
                "UPDATE prs SET status = 'closed' WHERE number = ?",
                (number,),
            )
            # NOTE(review): "open_minutes" records the threshold, not the PR's
            # actual age — confirm downstream consumers of the audit log expect that.
            db.audit(
                conn,
                "watchdog",
                "stale_pr_closed",
                json.dumps({
                    "pr": number,
                    "branch": row["branch"],
                    "source": src,
                    "open_minutes": STALE_THRESHOLD_MINUTES,
                }),
            )
            closed += 1
            logger.info(
                "WATCHDOG: closed stale extraction PR #%d (no claims after %d min): %s",
                number, STALE_THRESHOLD_MINUTES, src,
            )
        except Exception as exc:
            # One bad PR must not abort the sweep; count it and keep going.
            errors += 1
            logger.warning(
                "Stale PR close exception for #%d: %s",
                number, exc,
            )
    return closed, errors

View file

@ -620,6 +620,27 @@ async def validate_pr(conn, pr_number: int) -> dict:
# Extract claim files (domains/, core/, foundations/)
claim_files = extract_claim_files_from_diff(diff)
# ── Backfill description (claim titles) if missing ──
# discover_external_prs creates rows without description. Extract H1 titles
# from the diff so the dashboard shows what the PR actually contains.
existing_desc = conn.execute(
"SELECT description FROM prs WHERE number = ?", (pr_number,)
).fetchone()
if existing_desc and not (existing_desc["description"] or "").strip() and claim_files:
titles = []
for _fp, content in claim_files.items():
for line in content.split("\n"):
if line.startswith("# ") and len(line) > 3:
titles.append(line[2:].strip())
break
if titles:
desc = " | ".join(titles)
conn.execute(
"UPDATE prs SET description = ? WHERE number = ? AND (description IS NULL OR description = '')",
(desc, pr_number),
)
logger.info("PR #%d: backfilled description with %d claim titles", pr_number, len(titles))
# ── Tier 0: per-claim validation ──
# Only validates NEW files (not modified). Modified files have partial content
# from diffs (only + lines) — frontmatter parsing fails on partial content,

View file

@ -19,6 +19,7 @@ import logging
from datetime import datetime, timezone
from . import config, db
from .stale_pr import check_stale_prs
logger = logging.getLogger("pipeline.watchdog")
@ -103,17 +104,94 @@ async def watchdog_check(conn) -> dict:
"action": "GC should auto-close these — check fixer.py GC logic",
})
# 5. Tier0 blockage: many PRs with tier0_pass=0 (potential validation bug)
# 5. Tier0 blockage: auto-reset stuck PRs with retry cap
MAX_TIER0_RESETS = 3
TIER0_RESET_COOLDOWN_S = 3600
tier0_blocked = conn.execute(
"SELECT COUNT(*) as n FROM prs WHERE status = 'open' AND tier0_pass = 0"
).fetchone()["n"]
if tier0_blocked >= 5:
"SELECT number, branch FROM prs WHERE status = 'open' AND tier0_pass = 0"
).fetchall()
if tier0_blocked:
reset_count = 0
permanent_count = 0
for pr in tier0_blocked:
row = conn.execute(
"""SELECT COUNT(*) as n, MAX(timestamp) as last_ts FROM audit_log
WHERE stage = 'watchdog' AND event = 'tier0_reset'
AND json_extract(detail, '$.pr') = ?""",
(pr["number"],),
).fetchone()
prior_resets = row["n"]
if prior_resets >= MAX_TIER0_RESETS:
permanent_count += 1
continue
last_reset = row["last_ts"]
if last_reset:
try:
last_ts = datetime.fromisoformat(last_reset).replace(tzinfo=timezone.utc)
age = (datetime.now(timezone.utc) - last_ts).total_seconds()
if age < TIER0_RESET_COOLDOWN_S:
continue
except (ValueError, TypeError):
pass
conn.execute(
"UPDATE prs SET tier0_pass = NULL WHERE number = ?",
(pr["number"],),
)
db.audit(
conn, "watchdog", "tier0_reset",
json.dumps({
"pr": pr["number"],
"branch": pr["branch"],
"attempt": prior_resets + 1,
"max": MAX_TIER0_RESETS,
}),
)
reset_count += 1
logger.info(
"WATCHDOG: auto-reset tier0 for PR #%d (attempt %d/%d)",
pr["number"], prior_resets + 1, MAX_TIER0_RESETS,
)
if reset_count:
issues.append({
"type": "tier0_blockage",
"severity": "warning",
"detail": f"{tier0_blocked} PRs blocked at tier0_pass=0",
"action": "Check validate.py — may be the modified-file or wiki-link bug recurring",
"type": "tier0_reset",
"severity": "info",
"detail": f"Auto-reset {reset_count} PRs stuck at tier0_pass=0 for re-validation",
"action": "Monitor — if same PRs fail again, check validate.py",
})
if permanent_count:
issues.append({
"type": "tier0_permanent_failure",
"severity": "warning",
"detail": f"{permanent_count} PRs exhausted {MAX_TIER0_RESETS} tier0 retries — manual intervention needed",
"action": "Inspect PR content or close stale PRs",
})
# 6. Stale extraction PRs: open >30 min with no claim files
try:
stale_closed, stale_errors = await check_stale_prs(conn)
if stale_closed > 0:
issues.append({
"type": "stale_prs_closed",
"severity": "info",
"detail": f"Auto-closed {stale_closed} stale extraction PRs (no claims after 30 min)",
"action": "Check batch-extract logs for extraction failures",
})
if stale_errors > 0:
issues.append({
"type": "stale_pr_close_failed",
"severity": "warning",
"detail": f"Failed to close {stale_errors} stale PRs",
"action": "Check Forgejo API connectivity",
})
except Exception as e:
logger.warning("Stale PR check failed: %s", e)
# Log issues
healthy = len(issues) == 0
@ -124,7 +202,7 @@ async def watchdog_check(conn) -> dict:
else:
logger.info("WATCHDOG: %s%s", issue["type"], issue["detail"])
return {"healthy": healthy, "issues": issues, "checks_run": 5}
return {"healthy": healthy, "issues": issues, "checks_run": 6}
async def watchdog_cycle(conn, max_workers=None) -> tuple[int, int]: