#!/usr/bin/env python3
"""Backfill: re-attribute research-session-derived PRs from m3taversal to agent.

Problem: research-session.sh used to write source frontmatter without
`proposed_by` / `intake_tier`, so extract.py's contributor-classification
fallback set `prs.submitted_by = '@m3taversal'`, which propagated into
`contribution_events` as a `handle='m3taversal', role='author'` row per
research-derived claim. Result: agent research credited to the human.

Forward fix is a frontmatter-template patch to research-session.sh. This
script corrects historical records.

Identification: Research-session source archives are committed to teleo-codex
with a message matching `^<agent>: research session YYYY-MM-DD —`. The diff
for that commit lists the `inbox/queue/*.md` files the agent created. Any PR
whose `source_path` matches one of those filenames is research-derived.

Touch list (per matched PR):
  1. UPDATE prs SET submitted_by = '<agent> (self-directed)'
  2. INSERT OR IGNORE INTO contribution_events with handle=<agent>,
     kind='agent', role='author', weight=0.30, original timestamp/domain/channel
  3. DELETE FROM contribution_events WHERE handle='m3taversal'
     AND role='author' AND pr_number=?

Defaults to a dry run. Pass --apply to commit changes.

Usage:
    python3 backfill-research-session-attribution.py --days 30           # dry-run (default)
    python3 backfill-research-session-attribution.py --apply --days 30
"""
import argparse
import logging
import os
import re
import sqlite3
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("backfill-research-attr")

DEFAULT_REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
DEFAULT_DB = Path(os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db"))
KNOWN_AGENTS = frozenset({"rio", "leo", "theseus", "vida", "clay", "astra"})
COMMIT_HEADER_RE = re.compile(r"^([a-z]+):\s+research session\s+\d{4}-\d{2}-\d{2}\s+—")
AUTHOR_WEIGHT = 0.30


def git(repo: Path, *args: str) -> str:
    """Run a git command in repo, return stdout. Raises on non-zero exit."""
    result = subprocess.run(
        ["git", "-C", str(repo), *args],
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout
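
# Illustrative check of the commit-subject match. The subject line below is a
# made-up example (not taken from real teleo-codex history); it only shows the
# shape COMMIT_HEADER_RE expects:
#
#   >>> COMMIT_HEADER_RE.match("rio: research session 2025-01-07 — example topic").group(1)
#   'rio'
#
# Subjects whose leading handle is not in KNOWN_AGENTS still match the regex,
# but are skipped by the discovery loop below.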
""" log = git(repo, "log", f"--since={days} days ago", "--pretty=%H|%s", "--no-merges") file_to_agent: dict[str, str] = {} commits_seen = 0 commits_matched = 0 for line in log.splitlines(): if not line or "|" not in line: continue commits_seen += 1 sha, _, subject = line.partition("|") m = COMMIT_HEADER_RE.match(subject) if not m: continue agent = m.group(1) if agent not in KNOWN_AGENTS: logger.debug("skipping commit %s — unknown agent %r", sha[:8], agent) continue commits_matched += 1 # List files added in this commit (inbox/queue/*.md only) try: added = git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r", "--diff-filter=A", sha) except subprocess.CalledProcessError: logger.warning("diff-tree failed for %s", sha[:8]) continue for f in added.splitlines(): if f.startswith("inbox/queue/") and f.endswith(".md"): basename = Path(f).name if basename in file_to_agent and file_to_agent[basename] != agent: logger.warning( "filename collision: %s — was %s, now %s (keeping first)", basename, file_to_agent[basename], agent, ) continue file_to_agent.setdefault(basename, agent) logger.info( "scanned %d commits, %d research-session matches, %d unique source files", commits_seen, commits_matched, len(file_to_agent), ) return file_to_agent def find_misattributed_prs(conn: sqlite3.Connection, file_to_agent: dict[str, str], days: int): """Return list of (pr_number, current_submitted_by, source_path, agent, domain, channel, merged_at). Only includes PRs: - with source_path basename in our research-session map - currently attributed to '@m3taversal' - merged within the last N days (cap on temporal scope) """ rows = conn.execute( """SELECT number, submitted_by, source_path, domain, source_channel, merged_at FROM prs WHERE submitted_by = '@m3taversal' AND source_path IS NOT NULL AND status = 'merged' AND merged_at > datetime('now', ?)""", (f"-{days} days",), ).fetchall() matches = [] for row in rows: basename = Path(row["source_path"]).name agent = file_to_agent.get(basename) if agent: matches.append({ "pr": row["number"], "current_submitted_by": row["submitted_by"], "source_path": row["source_path"], "basename": basename, "agent": agent, "domain": row["domain"], "channel": row["source_channel"], "merged_at": row["merged_at"], }) return matches def existing_event_count(conn: sqlite3.Connection, pr: int, handle: str, role: str) -> int: """Return count of contribution_events rows matching (handle, role, pr_number, claim_path IS NULL).""" return conn.execute( """SELECT COUNT(*) FROM contribution_events WHERE handle = ? AND role = ? AND pr_number = ? AND claim_path IS NULL""", (handle, role, pr), ).fetchone()[0] def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool) -> dict: """Apply the backfill. Returns counters.""" counters = defaultdict(int) if not dry_run: conn.execute("BEGIN") try: for m in matches: pr = m["pr"] agent = m["agent"] # Pre-checks for accurate dry-run reporting old_event_exists = existing_event_count(conn, pr, "m3taversal", "author") > 0 new_event_exists = existing_event_count(conn, pr, agent, "author") > 0 if dry_run: logger.info( "would update pr=%d submitted_by '%s' → '%s (self-directed)' " "[m3ta_event=%s, agent_event=%s]", pr, m["current_submitted_by"], agent, old_event_exists, new_event_exists, ) counters["prs"] += 1 if old_event_exists: counters["events_to_delete"] += 1 if not new_event_exists: counters["events_to_insert"] += 1 continue # 1. UPDATE prs.submitted_by conn.execute( "UPDATE prs SET submitted_by = ? 
WHERE number = ?", (f"{agent} (self-directed)", pr), ) counters["prs"] += 1 # 2. INSERT new agent author event (idempotent via UNIQUE index) cur = conn.execute( """INSERT OR IGNORE INTO contribution_events (handle, kind, role, weight, pr_number, claim_path, domain, channel, timestamp) VALUES (?, 'agent', 'author', ?, ?, NULL, ?, ?, COALESCE(?, datetime('now')))""", (agent, AUTHOR_WEIGHT, pr, m["domain"], m["channel"], m["merged_at"]), ) if cur.rowcount > 0: counters["events_inserted"] += 1 # 3. DELETE old m3taversal author event cur = conn.execute( """DELETE FROM contribution_events WHERE handle = 'm3taversal' AND role = 'author' AND pr_number = ? AND claim_path IS NULL""", (pr,), ) if cur.rowcount > 0: counters["events_deleted"] += 1 if not dry_run: conn.execute("COMMIT") except Exception: if not dry_run: conn.execute("ROLLBACK") raise return dict(counters) def main(): parser = argparse.ArgumentParser() parser.add_argument("--repo", type=Path, default=DEFAULT_REPO) parser.add_argument("--db", type=Path, default=DEFAULT_DB) parser.add_argument("--days", type=int, default=30) parser.add_argument("--apply", action="store_true", help="commit changes (default: dry-run)") parser.add_argument("--limit", type=int, default=0, help="cap PR updates (0 = no cap; useful for testing on a small slice)") args = parser.parse_args() dry_run = not args.apply logger.info("repo=%s db=%s days=%d mode=%s", args.repo, args.db, args.days, "DRY-RUN" if dry_run else "APPLY") if not args.repo.exists(): logger.error("repo not found: %s", args.repo) sys.exit(1) if not args.db.exists(): logger.error("db not found: %s", args.db) sys.exit(1) file_to_agent = discover_research_session_archives(args.repo, args.days) if not file_to_agent: logger.warning("no research-session source files found in last %d days", args.days) sys.exit(0) # Per-agent breakdown by_agent = defaultdict(int) for agent in file_to_agent.values(): by_agent[agent] += 1 for agent, count in sorted(by_agent.items()): logger.info(" research-session sources by %s: %d", agent, count) conn = sqlite3.connect(args.db) conn.row_factory = sqlite3.Row matches = find_misattributed_prs(conn, file_to_agent, args.days) logger.info("misattributed PRs found: %d", len(matches)) if args.limit and len(matches) > args.limit: logger.info("--limit=%d — truncating from %d", args.limit, len(matches)) matches = matches[:args.limit] if not matches: logger.info("nothing to do") return # Per-agent breakdown of misattribution miss_by_agent = defaultdict(int) for m in matches: miss_by_agent[m["agent"]] += 1 logger.info("misattributed PR breakdown:") for agent, count in sorted(miss_by_agent.items()): logger.info(" %s: %d", agent, count) counters = apply_backfill(conn, matches, dry_run) logger.info("RESULT (%s): %s", "DRY-RUN" if dry_run else "APPLIED", counters) if __name__ == "__main__": main()