- Migration v19: submitted_by column on prs + sources tables
- extract.py: propagates proposed_by from source frontmatter → PR record
- merge.py: sets submitted_by from Forgejo author for human PRs
- dashboard_prs.py: redesigned with Contributor column, improved claim visibility in expanded rows, cost estimates, evaluator chain display
- dashboard_routes.py: submitted_by + source_path in pr-lifecycle API
- backfill_submitted_by.py: one-time backfill (1525/1777 PRs matched)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
138 lines
4.8 KiB
Python
138 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""One-time backfill: populate submitted_by on prs table from source archive files.
|
|
|
|
Matches PRs to sources via branch name slug → source filename.
|
|
Reads proposed_by and intake_tier from source frontmatter.
|
|
|
|
Run: python3 backfill_submitted_by.py
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
# SQLite database file to update; the DB_PATH environment variable overrides the default.
DB_PATH = os.getenv("DB_PATH", "/opt/teleo-eval/pipeline/pipeline.db")

# Directory scanned for source markdown files; the ARCHIVE_DIR environment
# variable overrides the default.
ARCHIVE_DIR = Path(
    os.getenv("ARCHIVE_DIR", "/opt/teleo-eval/workspaces/main/inbox/archive")
)
|
|
|
|
|
|
def parse_frontmatter(path: Path) -> dict:
    """Parse YAML-like frontmatter from a markdown file.

    Only flat ``key: value`` lines are understood; this is not a YAML parser.
    Lines without a colon are skipped. Keys and values are whitespace-stripped,
    surrounding single/double quotes are removed from values, and empty values
    or the literal ``null`` (any case) become ``None``.

    Args:
        path: Markdown file to read (decoded as UTF-8, undecodable bytes
            replaced).

    Returns:
        Mapping of key to string value (or ``None``). Empty when the file does
        not start with ``---`` or has no closing ``---``.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}

    # The closing delimiter must start a line. Searching for a bare "---"
    # would stop inside a value such as "title: a---b" and silently drop every
    # key after it (including proposed_by / intake_tier).
    end = text.find("\n---", 3)
    if end == -1:
        # Fall back to the first "---" anywhere so files whose closing
        # delimiter is not on its own line are still parsed as before.
        end = text.find("---", 3)
    if end == -1:
        return {}

    fm = {}
    for raw_line in text[3:end].strip().split("\n"):
        line = raw_line.strip()
        if not line or ":" not in line:
            continue
        # Split on the first colon only so values may themselves contain
        # colons (e.g. URLs).
        key, _, val = line.partition(":")
        val = val.strip().strip('"').strip("'")
        if val == "" or val.lower() == "null":
            val = None
        fm[key.strip()] = val
    return fm
|
|
|
|
|
|
def slug_from_branch(branch: str) -> str:
    """Return the source slug embedded in a branch name.

    Everything up to and including the first ``/`` is dropped (when present),
    then a trailing ``-`` followed by exactly four lowercase hex characters is
    removed, e.g. ``'extract/2026-04-06-slug-3e68'`` -> ``'2026-04-06-slug'``.
    """
    _prefix, slash, remainder = branch.partition("/")
    name = remainder if slash else branch
    # Drop the short hash suffix appended to branch names (e.g. -3e68, -a6af).
    hash_suffix = re.compile(r"-[0-9a-f]{4}$")
    return hash_suffix.sub("", name)
|
|
|
|
|
|
def main():
    """Backfill ``prs.submitted_by`` for PRs that have a branch but no submitter.

    Steps:
      1. Index every ``*.md`` file in ``ARCHIVE_DIR`` by filename stem.
      2. Select PRs where ``submitted_by IS NULL AND branch IS NOT NULL``.
      3. For each PR, match the branch slug to a source file. If the source
         has frontmatter, derive the contributor from it and also record
         ``source_path``; otherwise fall back to the branch prefix.

    All updates are committed together at the end; if anything raises, the
    connection is closed without committing, so no partial backfill is kept.
    """
    conn = sqlite3.connect(DB_PATH, timeout=30)
    try:
        conn.row_factory = sqlite3.Row

        # Build source index: filename stem → frontmatter. Sorted so that the
        # substring fallback in _find_source picks the same file on every run
        # (glob order is otherwise arbitrary).
        source_index = {}
        if ARCHIVE_DIR.exists():
            for source_file in sorted(ARCHIVE_DIR.glob("*.md")):
                source_index[source_file.stem] = parse_frontmatter(source_file)
        print(f"Indexed {len(source_index)} source files from {ARCHIVE_DIR}")

        # Get all PRs without submitted_by
        prs = conn.execute(
            "SELECT number, branch FROM prs WHERE submitted_by IS NULL AND branch IS NOT NULL"
        ).fetchall()
        print(f"Found {len(prs)} PRs without submitted_by")

        updated = 0
        for pr in prs:
            branch = pr["branch"]
            stem, fm = _find_source(slug_from_branch(branch), source_index)

            if fm:
                contributor = _contributor_from_frontmatter(branch, fm)
                if contributor:
                    # Record the stem of the file that actually matched; for a
                    # substring match the branch slug is not a real filename.
                    conn.execute(
                        "UPDATE prs SET submitted_by = ?, source_path = ? WHERE number = ?",
                        (contributor, f"inbox/archive/{stem}.md", pr["number"]),
                    )
                    updated += 1
                # NOTE(review): a matched source with neither proposed_by nor a
                # recognised intake_tier leaves the PR untouched (no
                # branch-prefix fallback), as before — confirm this is intended.
            else:
                contributor = _contributor_from_branch(branch)
                if contributor:
                    conn.execute(
                        "UPDATE prs SET submitted_by = ? WHERE number = ?",
                        (contributor, pr["number"]),
                    )
                    updated += 1

        conn.commit()
    finally:
        # Always release the database handle, even when a query fails.
        conn.close()
    print(f"Updated {updated}/{len(prs)} PRs with submitted_by")


# Branch prefix → agent name used for research-task sources.
_AGENT_BY_PREFIX = {
    "extract": "pipeline",
    "ingestion": "pipeline",
    "rio": "rio",
    "theseus": "theseus",
    "vida": "vida",
    "clay": "clay",
    "astra": "astra",
    "leo": "leo",
    "reweave": "pipeline",
}

# Branch prefixes that name the agent directly.
_AGENT_BRANCH_PREFIXES = ("rio/", "theseus/", "vida/", "clay/", "astra/", "leo/")


def _find_source(slug, source_index):
    """Return ``(stem, frontmatter)`` for the source matching *slug*, else ``(None, None)``."""
    # An exact filename match always wins, even if its frontmatter is empty.
    if slug in source_index:
        return slug, source_index[slug]
    # Partial matching: the slug may be a substring of the filename or vice
    # versa. Skipped for an empty slug, which would "match" every file.
    if slug:
        for stem, fm in source_index.items():
            if slug in stem or stem in slug:
                return stem, fm
    return None, None


def _contributor_from_frontmatter(branch, fm):
    """Derive the contributor label from source frontmatter, or ``None`` if undetermined."""
    proposed_by = fm.get("proposed_by")
    if proposed_by:
        return proposed_by.strip().strip('"').strip("'") or None

    intake_tier = fm.get("intake_tier")
    if intake_tier == "research-task":
        # Derive agent from branch prefix; unknown prefixes are used verbatim.
        prefix = branch.split("/", 1)[0] if "/" in branch else "unknown"
        agent = _AGENT_BY_PREFIX.get(prefix, prefix)
        return f"{agent} (self-directed)"
    if intake_tier == "directed":
        return "directed (unknown)"
    return None


def _contributor_from_branch(branch):
    """Derive the contributor label from the branch prefix alone, or ``None`` if unrecognised."""
    if branch.startswith(("extract/", "ingestion/")):
        return "pipeline (self-directed)"
    if branch.startswith(_AGENT_BRANCH_PREFIXES):
        agent = branch.split("/", 1)[0]
        return f"{agent} (self-directed)"
    if branch.startswith("reweave/"):
        return "pipeline (reweave)"
    return None
|
|
|
|
|
|
# Run the backfill only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|