teleo-codex/ops/pipeline-v2/backfill-descriptions.py
m3taversal e27f6a7b91 commit pending pipeline changes: watchdog tier0 recovery, stale_pr cleanup, deploy.sh improvements
- watchdog.py: tier0 auto-recovery (3 retries, 1h cooldown, audit trail) — pending Ganymede review
- stale_pr.py: new module, closes extraction PRs open >30 min with zero claims
- deploy.sh: expanded with new deployment features
- validate.py, extract.py, cascade.py, db.py: minor fixes
- backfill-descriptions.py: utility script
- review_queue.py: minor fix

Note: watchdog + stale_pr not yet deployed to VPS (reverted after missing import crash)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 10:14:54 +02:00

129 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""One-time backfill: populate prs.description with claim titles from merged files.
For PRs that have description=NULL or empty, reads the claim files on main
(for merged PRs) or on the branch (for open PRs) and extracts H1 titles.
Usage: python3 backfill-descriptions.py [--dry-run]
Requires: run from the teleo-codex git worktree (main branch).
"""
import re
import sqlite3
import subprocess
import sys
from pathlib import Path
# SQLite database that main() opens; its `prs` table is what gets backfilled.
DB_PATH = Path("/opt/teleo-eval/pipeline/pipeline.db")
# Git worktree used as the working directory for every git command below.
MAIN_WORKTREE = Path("/opt/teleo-eval/teleo-codex")
# Path prefixes a changed file must start with to be considered a claim file.
CLAIM_DIRS = ("domains/", "core/", "foundations/")
# True when --dry-run appears anywhere on the command line; main() then
# prints the descriptions it would write instead of updating the database.
dry_run = "--dry-run" in sys.argv
def _first_h1_title(markdown: str) -> str:
    """Return the text of the first non-blank ``# `` heading in *markdown*, or ""."""
    for line in markdown.split("\n"):
        if line.startswith("# ") and len(line) > 3:
            title = line[2:].strip()
            # A heading made only of whitespace carries no title; keep looking.
            if title:
                return title
    return ""


def get_pr_claim_titles(pr_number: int, branch: str, status: str) -> list[str]:
    """Extract H1 claim titles from a PR's changed claim files.

    Lists the files changed between ``origin/main`` and ``origin/<branch>``,
    keeps the ``.md`` files under one of ``CLAIM_DIRS``, and reads each one's
    first H1 heading. File content is read from ``origin/main`` when *status*
    is ``"merged"`` and from the PR branch otherwise.

    Args:
        pr_number: PR number, used only in the error message.
        branch: Branch name as known to the ``origin`` remote.
        status: PR status; only ``"merged"`` is treated specially.

    Returns:
        The titles found, in diff order. Empty when the diff fails (for
        example the branch no longer exists) or no claim file has an H1.
        If a git call errors part-way through, the titles collected so far
        are returned.
    """
    titles = []
    # Merged PRs are read from main; anything else from the PR branch itself.
    ref = "origin/main" if status == "merged" else f"origin/{branch}"
    try:
        # -z emits NUL-terminated, unquoted paths. Without it git C-quotes any
        # path containing non-ASCII or special characters, and such a path
        # would never match the CLAIM_DIRS prefixes below.
        # NOTE(review): the three-dot form diffs merge-base..branch, so a
        # branch that is already an ancestor of main (true merge or
        # fast-forward) yields an empty diff here. Confirm the pipeline's
        # merge strategy keeps merged branches divergent from main.
        diff = subprocess.run(
            ["git", "diff", "--name-only", "-z", f"origin/main...origin/{branch}"],
            capture_output=True, encoding="utf-8", errors="replace", timeout=10,
            cwd=str(MAIN_WORKTREE),
        )
        if diff.returncode != 0:
            # Typically a deleted branch: the diff cannot be reconstructed.
            return titles

        claim_prefixes = tuple(CLAIM_DIRS)
        changed_files = [
            path for path in diff.stdout.split("\0")
            if path.startswith(claim_prefixes) and path.endswith(".md")
        ]

        for fpath in changed_files:
            show = subprocess.run(
                ["git", "show", f"{ref}:{fpath}"],
                capture_output=True, encoding="utf-8", errors="replace", timeout=5,
                cwd=str(MAIN_WORKTREE),
            )
            # A non-zero exit means the file is absent at `ref`; skip it.
            if show.returncode != 0:
                continue
            title = _first_h1_title(show.stdout)
            if title:
                titles.append(title)
    except Exception as e:
        # Best-effort per PR: report the problem (timeouts included) and let
        # the caller carry on with whatever was collected.
        print(f" PR #{pr_number}: error — {e}")
    return titles
def main():
    """Backfill ``prs.description`` for every PR whose description is empty.

    Selects PRs with a NULL or empty description (highest number first),
    derives a description by joining the PR's claim titles with `` | ``, and
    writes it back. PRs without a branch or without any titles are counted
    as skipped. When the module-level ``dry_run`` flag is set, the
    descriptions are only printed and nothing is written.
    """
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.row_factory = sqlite3.Row
        # Find PRs with empty description
        rows = conn.execute(
            "SELECT number, branch, status FROM prs WHERE description IS NULL OR description = '' ORDER BY number DESC"
        ).fetchall()
        print(f"Found {len(rows)} PRs with empty description")

        updated = 0
        skipped = 0
        for row in rows:
            pr_num = row["number"]
            branch = row["branch"]
            status = row["status"]

            # No branch means nothing to diff against, so no titles.
            titles = get_pr_claim_titles(pr_num, branch, status) if branch else []
            if not titles:
                skipped += 1
                continue

            desc = " | ".join(titles)
            if dry_run:
                print(f" PR #{pr_num} ({status}): would set → {desc[:100]}...")
            else:
                conn.execute(
                    "UPDATE prs SET description = ? WHERE number = ?",
                    (desc, pr_num),
                )
            updated += 1

            # Commit in batches of 50 so an interruption loses little work.
            if updated % 50 == 0:
                if not dry_run:
                    conn.commit()
                print(f" ...{updated} updated so far")

        if not dry_run:
            conn.commit()
    finally:
        # Release the database handle even when a query or update fails.
        conn.close()

    print(f"\nDone. Updated: {updated}, Skipped: {skipped}, Total: {len(rows)}")
    if dry_run:
        print("(dry run — no changes written)")
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()