#!/usr/bin/env python3
"""One-time backfill: populate submitted_by on prs table from source archive files.

Matches PRs to sources via branch name slug → source filename.
Reads proposed_by and intake_tier from source frontmatter.

Run: python3 backfill_submitted_by.py
"""

import os
import re
import sqlite3
from pathlib import Path
from typing import Optional

DB_PATH = os.environ.get("DB_PATH", "/opt/teleo-eval/pipeline/pipeline.db")
ARCHIVE_DIR = Path(
    os.environ.get("ARCHIVE_DIR", "/opt/teleo-eval/workspaces/main/inbox/archive")
)

# Branch prefix → agent name used when a research-task source has no proposed_by.
AGENT_BY_BRANCH_PREFIX = {
    "extract": "pipeline",
    "ingestion": "pipeline",
    "rio": "rio",
    "theseus": "theseus",
    "vida": "vida",
    "clay": "clay",
    "astra": "astra",
    "leo": "leo",
    "reweave": "pipeline",
}

# Branch prefixes handled by the no-source fallback in fallback_contributor().
PIPELINE_BRANCH_PREFIXES = ("extract/", "ingestion/")
AGENT_BRANCH_PREFIXES = ("rio/", "theseus/", "vida/", "clay/", "astra/", "leo/")

# Trailing 4-character lowercase hex suffix on branch names (e.g. -3e68, -a6af).
_TRAILING_HASH_RE = re.compile(r"-[0-9a-f]{4}$")


def parse_frontmatter(path: Path) -> dict:
    """Parse YAML-like frontmatter from a markdown file.

    Only flat ``key: value`` lines are understood. Surrounding quotes are
    stripped from values, and empty values or the literal ``null`` (any case)
    become ``None``. Lines without a colon are ignored.

    Returns an empty dict when the file does not start with ``---`` or has no
    closing ``---`` delimiter line.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}
    # The closing delimiter must start a line; a bare search for "---" would
    # stop early on a value that merely contains dashes (e.g. "a --- b").
    end = text.find("\n---", 3)
    if end == -1:
        return {}

    fm = {}
    for line in text[3:end].strip().split("\n"):
        line = line.strip()
        if not line or ":" not in line:
            continue
        key, _, val = line.partition(":")
        val = val.strip().strip('"').strip("'")
        fm[key.strip()] = None if val == "" or val.lower() == "null" else val
    return fm


def slug_from_branch(branch: str) -> str:
    """Extract source slug from branch name like 'extract/2026-04-06-slug-hash'.

    Drops everything up to the first ``/`` (if any) and a trailing
    4-character hex suffix (if any).
    """
    if "/" in branch:
        branch = branch.split("/", 1)[1]
    return _TRAILING_HASH_RE.sub("", branch)


def match_source(slug: str, source_index: dict) -> Optional[tuple]:
    """Find the archived source that corresponds to *slug*.

    An exact stem match with non-empty frontmatter wins. Otherwise the first
    indexed stem that contains the slug, or is contained in it, is used.

    Returns ``(stem, frontmatter)`` for the matched file, or ``None`` when
    nothing matches. The returned frontmatter may be empty.
    """
    if not slug:
        # "" is a substring of every stem, so an empty slug would otherwise
        # "match" whichever file happens to be indexed first.
        return None
    exact = source_index.get(slug)
    if exact:
        return slug, exact
    for stem, fm in source_index.items():
        if slug in stem or stem in slug:
            return stem, fm
    return None


def derive_contributor(fm: dict, branch: str) -> Optional[str]:
    """Work out the submitted_by value from a source's frontmatter.

    ``proposed_by`` takes precedence. Otherwise ``intake_tier`` decides:
    ``research-task`` attributes the PR to the agent implied by the branch
    prefix, ``directed`` yields a placeholder. Returns ``None`` when the
    frontmatter gives no usable attribution.
    """
    proposed_by = fm.get("proposed_by")
    if proposed_by:
        return proposed_by.strip().strip('"').strip("'")

    intake_tier = fm.get("intake_tier")
    if intake_tier == "research-task":
        prefix = branch.split("/", 1)[0] if "/" in branch else "unknown"
        agent = AGENT_BY_BRANCH_PREFIX.get(prefix, prefix)
        return f"{agent} (self-directed)"
    if intake_tier == "directed":
        return "directed (unknown)"
    return None


def fallback_contributor(branch: str) -> Optional[str]:
    """Attribute a PR from its branch prefix alone, when no source matched.

    Returns ``None`` for branch prefixes that are not recognised.
    """
    if branch.startswith(PIPELINE_BRANCH_PREFIXES):
        return "pipeline (self-directed)"
    if branch.startswith(AGENT_BRANCH_PREFIXES):
        return f"{branch.split('/', 1)[0]} (self-directed)"
    if branch.startswith("reweave/"):
        return "pipeline (reweave)"
    return None


def main():
    """Backfill prs.submitted_by (and source_path) for PRs that lack it.

    All updates are committed together at the end; if anything fails before
    the commit, nothing is committed and the connection is still closed.
    """
    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        # Build source index: filename stem → frontmatter
        source_index = {}
        if ARCHIVE_DIR.exists():
            for f in ARCHIVE_DIR.glob("*.md"):
                source_index[f.stem] = parse_frontmatter(f)
        print(f"Indexed {len(source_index)} source files from {ARCHIVE_DIR}")

        # Get all PRs without submitted_by
        prs = conn.execute(
            "SELECT number, branch FROM prs WHERE submitted_by IS NULL AND branch IS NOT NULL"
        ).fetchall()
        print(f"Found {len(prs)} PRs without submitted_by")

        updated = 0
        for pr in prs:
            branch = pr["branch"]
            match = match_source(slug_from_branch(branch), source_index)
            stem, fm = match if match else (None, None)

            if fm:
                # NOTE(review): a matched source with no usable attribution is
                # left untouched rather than falling back to the branch
                # prefix — kept as in the original; confirm this is intended.
                contributor = derive_contributor(fm, branch)
                if contributor:
                    # Record the file that was actually matched; with a
                    # partial match the slug alone does not name a real file.
                    conn.execute(
                        "UPDATE prs SET submitted_by = ?, source_path = ? WHERE number = ?",
                        (contributor, f"inbox/archive/{stem}.md", pr["number"]),
                    )
                    updated += 1
            else:
                contributor = fallback_contributor(branch)
                if contributor:
                    conn.execute(
                        "UPDATE prs SET submitted_by = ? WHERE number = ?",
                        (contributor, pr["number"]),
                    )
                    updated += 1

        conn.commit()
    finally:
        conn.close()

    print(f"Updated {updated}/{len(prs)} PRs with submitted_by")


if __name__ == "__main__":
    main()