teleo-codex/ops/diagnostics/backfill_submitted_by.py
m3taversal efe23f931a ship: fix evaluator column + correct contributor attribution
- Add domain_agent and domain_model to pr-lifecycle API response (data was
  queried but dropped before serialization — evaluator column showed blank)
- Show model name tag next to evaluator (Gemini Flash, GPT-4o, etc.)
- Re-attribute 1201 "pipeline (self-directed)" PRs to @m3taversal — these
  were Cory-directed, not autonomous overnight research
- Re-attribute 252 NULL PRs to @m3taversal
- Fix extract.py defaults: new PRs without proposed_by default to @m3taversal
- Fix backfill script defaults: extract/ branches → @m3taversal, not
  "pipeline (self-directed)"
- Only agent-named branches (rio/, theseus/, etc.) from research-session.sh
  remain as "(self-directed)"

Pentagon-Agent: Ship <B8D06D3F-1589-4777-B2E7-B2460D51C81F>
2026-04-07 14:56:03 +00:00

140 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""One-time backfill: populate submitted_by on prs table from source archive files.
Matches PRs to sources via branch name slug → source filename.
Reads proposed_by and intake_tier from source frontmatter.
Run: python3 backfill_submitted_by.py
"""
import os
import re
import sqlite3
from pathlib import Path
# SQLite database containing the `prs` table; override with the DB_PATH env var.
DB_PATH = os.getenv("DB_PATH", "/opt/teleo-eval/pipeline/pipeline.db")

# Directory of archived source markdown files; override with ARCHIVE_DIR.
ARCHIVE_DIR = Path(
    os.getenv("ARCHIVE_DIR", "/opt/teleo-eval/workspaces/main/inbox/archive")
)
def parse_frontmatter(path: Path) -> dict:
    """Parse YAML-like frontmatter from a markdown file.

    Only flat ``key: value`` lines are understood; this is not a YAML parser.
    Lines without a colon are skipped, surrounding quotes are removed from
    values, and empty or ``null`` values are stored as ``None``.

    Args:
        path: Markdown file to read (decoded as UTF-8, undecodable bytes
            replaced).

    Returns:
        Mapping of frontmatter keys to string values (or ``None``). Empty when
        the file does not start with ``---`` or has no closing ``---`` line.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}

    fm = {}
    for raw_line in text[3:].split("\n"):
        line = raw_line.strip()
        # The closing fence must be a line of its own. Searching for the first
        # "---" anywhere would end the frontmatter early whenever a value
        # contains that sequence (e.g. a title like "A --- B").
        if line == "---":
            return fm
        if not line or ":" not in line:
            continue
        # Split on the first colon only so values such as URLs stay intact.
        key, _, val = line.partition(":")
        val = val.strip().strip('"').strip("'")
        if val.lower() == "null" or val == "":
            val = None
        fm[key.strip()] = val

    # No closing fence was found: treat the file as having no frontmatter.
    return {}
def slug_from_branch(branch: str) -> str:
    """Return the source slug embedded in a branch name.

    For a branch like 'extract/2026-04-06-slug-hash' the text up to and
    including the first '/' is dropped, then a trailing 4-character lowercase
    hex suffix (e.g. '-3e68', '-a6af') is removed if present.
    """
    _, slash, remainder = branch.partition("/")
    name = remainder if slash else branch
    return re.sub(r"-[0-9a-f]{4}$", "", name)
# Branch prefixes of agent-named research branches. PRs on these branches with
# no usable source file are attributed to the agent itself.
_AGENT_BRANCH_PREFIXES = ("rio/", "theseus/", "vida/", "clay/", "astra/", "leo/")

# Branch prefix -> agent label for sources whose intake_tier is "research-task".
_RESEARCH_AGENT_BY_PREFIX = {
    "extract": "pipeline", "ingestion": "pipeline",
    "rio": "rio", "theseus": "theseus", "vida": "vida",
    "clay": "clay", "astra": "astra", "leo": "leo",
    "reweave": "pipeline",
}


def _build_source_index(archive_dir):
    """Map each archived source's filename stem to its parsed frontmatter."""
    if not archive_dir.exists():
        return {}
    return {f.stem: parse_frontmatter(f) for f in archive_dir.glob("*.md")}


def _match_source(slug, source_index):
    """Return ``(stem, frontmatter)`` for the source matching *slug*.

    An exact stem match with non-empty frontmatter wins. Otherwise fall back
    to substring matching in either direction and pick the stem whose length
    is closest to the slug (ties broken alphabetically), so the result does
    not depend on filesystem listing order. Returns ``(None, None)`` when
    nothing matches.
    """
    fm = source_index.get(slug)
    if fm:
        return slug, fm
    # An empty slug is a substring of every stem; never let it match anything.
    if not slug:
        return None, None
    candidates = [stem for stem in source_index if slug in stem or stem in slug]
    if not candidates:
        return None, None
    best = min(candidates, key=lambda stem: (abs(len(stem) - len(slug)), stem))
    return best, source_index[best]


def _contributor_from_source(branch, fm):
    """Derive the submitted_by value from source frontmatter and the branch."""
    proposed_by = fm.get("proposed_by")
    if proposed_by:
        return proposed_by.strip().strip('"').strip("'")
    if fm.get("intake_tier") == "research-task":
        # Derive agent from branch prefix
        prefix = branch.split("/", 1)[0] if "/" in branch else "unknown"
        agent = _RESEARCH_AGENT_BY_PREFIX.get(prefix, prefix)
        return f"{agent} (self-directed)"
    # intake_tier "directed", or a source with no proposed_by at all: it was
    # Cory's submission.
    return "@m3taversal"


def _contributor_from_branch(branch):
    """Derive the submitted_by value from the branch name alone."""
    # Agent-named branches from overnight research sessions
    if branch.startswith(_AGENT_BRANCH_PREFIXES):
        return f"{branch.split('/', 1)[0]} (self-directed)"
    if branch.startswith("reweave/"):
        return "pipeline (reweave)"
    # Everything else (extract/, ingestion/, unknown) -> Cory directed it
    return "@m3taversal"


def main():
    """Backfill ``submitted_by`` (and ``source_path``) on the ``prs`` table.

    Every PR with a NULL ``submitted_by`` and a non-NULL branch is matched to
    an archived source file via its branch slug. When a source with
    frontmatter is found, the contributor comes from that frontmatter and
    ``source_path`` is set to the matched file; otherwise the contributor is
    derived from the branch prefix. All updates are committed together at the
    end; if anything raises, the connection is closed without committing.
    """
    # Build source index: filename stem -> frontmatter
    source_index = _build_source_index(ARCHIVE_DIR)
    print(f"Indexed {len(source_index)} source files from {ARCHIVE_DIR}")

    conn = sqlite3.connect(DB_PATH, timeout=30)
    try:
        conn.row_factory = sqlite3.Row

        # Get all PRs without submitted_by
        prs = conn.execute(
            "SELECT number, branch FROM prs WHERE submitted_by IS NULL AND branch IS NOT NULL"
        ).fetchall()
        print(f"Found {len(prs)} PRs without submitted_by")

        updated = 0
        for pr in prs:
            branch = pr["branch"]
            stem, fm = _match_source(slug_from_branch(branch), source_index)
            if fm:
                contributor = _contributor_from_source(branch, fm)
                if not contributor:
                    # proposed_by was only whitespace/quotes; leave the row as is.
                    continue
                # Record the file that actually matched. On a partial match the
                # slug is not the filename, so a slug-based path would not exist.
                conn.execute(
                    "UPDATE prs SET submitted_by = ?, source_path = ? WHERE number = ?",
                    (contributor, f"inbox/archive/{stem}.md", pr["number"]),
                )
            else:
                conn.execute(
                    "UPDATE prs SET submitted_by = ? WHERE number = ?",
                    (_contributor_from_branch(branch), pr["number"]),
                )
            updated += 1

        conn.commit()
    finally:
        # Always release the database handle, including on failure.
        conn.close()

    print(f"Updated {updated}/{len(prs)} PRs with submitted_by")
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()