#!/usr/bin/env python3
"""One-time backfill: populate prs.description with claim titles from merged files.

For PRs that have description=NULL or empty, reads the claim files on main
(for merged PRs) or on the branch (for open PRs) and extracts H1 titles.

Usage: python3 backfill-descriptions.py [--dry-run]

Requires: run from the teleo-codex git worktree (main branch).
"""

import re
import sqlite3
import subprocess
import sys
from pathlib import Path
from typing import Optional

DB_PATH = Path("/opt/teleo-eval/pipeline/pipeline.db")
MAIN_WORKTREE = Path("/opt/teleo-eval/teleo-codex")
CLAIM_DIRS = ("domains/", "core/", "foundations/")

dry_run = "--dry-run" in sys.argv


def select_claim_paths(paths) -> list[str]:
    """Return the paths that are markdown files under one of CLAIM_DIRS.

    Empty entries (e.g. the trailing element left by splitting NUL-terminated
    git output) are dropped. Input order is preserved.
    """
    return [
        p for p in paths
        if p and p.startswith(CLAIM_DIRS) and p.endswith(".md")
    ]


def extract_h1_title(text: str) -> Optional[str]:
    """Return the text of the first non-empty markdown H1 in *text*, or None.

    An H1 is a line starting with "# ". Headings that are blank after
    stripping whitespace are skipped so they never produce an empty title.
    """
    for line in text.split("\n"):
        if line.startswith("# "):
            title = line[2:].strip()  # strip() also removes a trailing "\r"
            if title:
                return title
    return None


def get_pr_claim_titles(pr_number: int, branch: str, status: str) -> list[str]:
    """Extract H1 claim titles from a PR's changed files.

    Args:
        pr_number: PR number, used only in the error message.
        branch: PR branch name; resolved as ``origin/<branch>``.
        status: PR status. ``"merged"`` reads file contents from
            ``origin/main``; any other value reads them from the branch.

    Returns:
        One title per changed claim file that has an H1, in diff order.
        Empty if the diff fails (e.g. the branch was deleted). If a git
        invocation raises, the error is printed and the titles collected
        so far are returned.
    """
    titles: list[str] = []
    try:
        # The same diff serves merged and open PRs. -z makes git emit raw,
        # NUL-terminated paths; without it, paths containing non-ASCII or
        # special characters are C-quoted and would fail the prefix filter.
        # NOTE(review): a three-dot diff is empty once the branch tip is an
        # ancestor of main (true merge / fast-forward); this only finds files
        # for merged PRs if merges are squashed or rebased — confirm.
        diff = subprocess.run(
            ["git", "diff", "--name-only", "-z", f"origin/main...origin/{branch}"],
            capture_output=True,
            text=True,
            timeout=10,
            cwd=str(MAIN_WORKTREE),
        )
        if diff.returncode != 0:
            # Branch may be deleted; the changed-file list cannot be rebuilt.
            return titles

        # Read from main for merged PRs, from the branch for open ones.
        ref = "origin/main" if status == "merged" else f"origin/{branch}"
        for fpath in select_claim_paths(diff.stdout.split("\0")):
            show = subprocess.run(
                ["git", "show", f"{ref}:{fpath}"],
                capture_output=True,
                text=True,
                timeout=5,
                cwd=str(MAIN_WORKTREE),
            )
            if show.returncode != 0:
                continue
            title = extract_h1_title(show.stdout)
            if title is not None:
                titles.append(title)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Best effort per PR: timeouts, a missing git binary or worktree, and
        # undecodable output (UnicodeDecodeError is a ValueError) are reported
        # and the PR is left for the caller to count as skipped.
        print(f" PR #{pr_number}: error — {e}")

    return titles


def main():
    """Backfill prs.description for every PR whose description is empty.

    Honours the module-level ``dry_run`` flag: in a dry run the would-be
    descriptions are printed and nothing is written to the database.
    """
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.row_factory = sqlite3.Row

        # Find PRs with empty description
        rows = conn.execute(
            "SELECT number, branch, status FROM prs WHERE description IS NULL OR description = '' ORDER BY number DESC"
        ).fetchall()

        print(f"Found {len(rows)} PRs with empty description")

        updated = 0
        skipped = 0

        for row in rows:
            pr_num = row["number"]
            branch = row["branch"]
            status = row["status"]

            if not branch:
                skipped += 1
                continue

            titles = get_pr_claim_titles(pr_num, branch, status)
            if not titles:
                skipped += 1
                continue

            desc = " | ".join(titles)
            if dry_run:
                print(f" PR #{pr_num} ({status}): would set → {desc[:100]}...")
            else:
                conn.execute(
                    "UPDATE prs SET description = ? WHERE number = ?",
                    (desc, pr_num),
                )
            updated += 1
            # Commit in batches so an interruption loses at most 50 updates.
            if updated % 50 == 0:
                conn.commit()
                print(f" ...{updated} updated so far")

        if not dry_run:
            conn.commit()
    finally:
        # Close the connection even if a query or git call raised.
        conn.close()

    print(f"\nDone. Updated: {updated}, Skipped: {skipped}, Total: {len(rows)}")
    if dry_run:
        print("(dry run — no changes written)")


if __name__ == "__main__":
    main()