feat: CI backfill script — reclassifies 614 PRs, attributes sourcer to m3taversal

484 knowledge PRs, 130 pipeline PRs (excluded from CI). m3taversal credited as sourcer for all knowledge PRs. Principal roll-up: 540 claims, CI 75.4. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-26 15:02:27 +00:00 · 2026-03-26 15:02:27 +00:00 · ae1cce730c
commit ae1cce730c
parent 4b5c5841ce
1 changed files with 196 additions and 0 deletions
--- a/backfill-ci.py
+++ b/backfill-ci.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""Backfill CI contributor attribution from git history.
+
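+Walks all merged PRs recorded in the pipeline DB, reclassifies each as
+knowledge/pipeline, and re-derives contributor counts with corrected logic.
+Only sourcer, extractor and claims_merged counts are rebuilt; challenger,
+synthesizer and reviewer counts are reset to zero and not re-derived.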
+
+Initial claims (sourced by m3taversal, extracted by agents) get
+sourcer credit to m3taversal.
+
+Usage:
+    python3 backfill-ci.py [--dry-run]
+
+Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
+"""
+
+import argparse
+import json
+import re
+import sqlite3
+import subprocess
+from pathlib import Path
+
+DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
+REPO_DIR = "/opt/teleo-eval/workspaces/main"
+
+# Static principal map
+PRINCIPAL_MAP = {
+    "rio": "m3taversal",
+    "leo": "m3taversal",
+    "clay": "m3taversal",
+    "theseus": "m3taversal",
+    "vida": "m3taversal",
+    "astra": "m3taversal",
+}
+
+KNOWLEDGE_PREFIXES = ("domains/", "core/", "foundations/", "decisions/")
+PIPELINE_PREFIXES = ("inbox/", "entities/", "agents/")
+
+
+def classify_pr(conn, pr_number):
+    """Classify a merged PR as knowledge or pipeline from its DB record."""
+    row = conn.execute("SELECT branch FROM prs WHERE number=?", (pr_number,)).fetchone()
+    if not row or not row[0]:
+        return "pipeline"  # No branch info = infrastructure
+
+    branch = row[0]
+
+    # Pipeline branches are obvious
+    if branch.startswith("pipeline/") or branch.startswith("entity-batch/"):
+        return "pipeline"
+
+    # Try to get diff from git
+    try:
+        result = subprocess.run(
+            ["git", "diff", "--name-only", f"origin/main...origin/{branch}"],
+            cwd=REPO_DIR, capture_output=True, text=True, timeout=10,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            files = result.stdout.strip().split("\n")
+            if any(f.startswith(KNOWLEDGE_PREFIXES) for f in files):
+                return "knowledge"
+            return "pipeline"
+    except Exception:
+        pass
+
+    # Fallback: check branch name patterns
+    if any(branch.startswith(p) for p in ("extract/", "rio/", "leo/", "clay/", "theseus/", "vida/", "astra/")):
+        return "knowledge"  # Agent extraction branches are usually knowledge
+
+    return "pipeline"
+
+
+def get_pr_agent(conn, pr_number):
+    """Get the agent name for a PR from DB or branch name."""
+    row = conn.execute("SELECT agent, branch FROM prs WHERE number=?", (pr_number,)).fetchone()
+    if row and row[0]:
+        return row[0].lower()
+    if row and row[1]:
+        branch = row[1]
+        # Extract agent from branch prefix
+        for agent in ("rio", "leo", "clay", "theseus", "vida", "astra", "epimetheus", "ganymede", "argus"):
+            if branch.startswith(f"{agent}/"):
+                return agent
+        if branch.startswith("extract/"):
+            return "epimetheus"  # Pipeline extraction
+    return None
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    conn = sqlite3.connect(DB_PATH)
+    conn.row_factory = sqlite3.Row
+
+    # Step 1: Reset all role counts
+    if not args.dry_run:
+        conn.execute("""UPDATE contributors SET
+            sourcer_count=0, extractor_count=0, challenger_count=0,
+            synthesizer_count=0, reviewer_count=0, claims_merged=0""")
+        print("Reset all contributor counts to zero")
+
+    # Step 2: Walk all merged PRs
+    merged_prs = conn.execute(
+        "SELECT number, branch, agent, origin FROM prs WHERE status='merged' ORDER BY number"
+    ).fetchall()
+    print(f"Processing {len(merged_prs)} merged PRs")
+
+    knowledge_count = 0
+    pipeline_count = 0
+    attributed = {}  # handle → {role → count}
+
+    for pr in merged_prs:
+        pr_num = pr["number"]
+        commit_type = classify_pr(conn, pr_num)
+
+        if commit_type == "pipeline":
+            pipeline_count += 1
+            if not args.dry_run:
+                conn.execute("UPDATE prs SET commit_type='pipeline' WHERE number=?", (pr_num,))
+            continue
+
+        knowledge_count += 1
+        if not args.dry_run:
+            conn.execute("UPDATE prs SET commit_type='knowledge' WHERE number=?", (pr_num,))
+
+        agent = get_pr_agent(conn, pr_num)
+
+        # Credit the extracting agent
+        if agent:
+            attributed.setdefault(agent, {"extractor": 0, "sourcer": 0, "claims": 0})
+            attributed[agent]["extractor"] += 1
+            attributed[agent]["claims"] += 1
+
+        # Credit m3taversal as sourcer for all knowledge PRs
+        # (he directed the work, provided sources, seeded the KB)
+        attributed.setdefault("m3taversal", {"extractor": 0, "sourcer": 0, "claims": 0})
+        attributed["m3taversal"]["sourcer"] += 1
+        attributed["m3taversal"]["claims"] += 1
+
+    print(f"\nClassified: {knowledge_count} knowledge, {pipeline_count} pipeline")
+
+    # Step 3: Update contributor table
+    print("\n=== Attribution results ===")
+    for handle, counts in sorted(attributed.items(), key=lambda x: x[1]["claims"], reverse=True):
+        principal = PRINCIPAL_MAP.get(handle)
+        p = f" -> {principal}" if principal else ""
+        print(f"  {handle}{p}: sourcer={counts['sourcer']}, extractor={counts['extractor']}, claims={counts['claims']}")
+
+        if not args.dry_run:
+            # Upsert
+            existing = conn.execute("SELECT handle FROM contributors WHERE handle=?", (handle,)).fetchone()
+            if existing:
+                conn.execute("""UPDATE contributors SET
+                    sourcer_count=?, extractor_count=?, claims_merged=?,
+                    principal=?
+                    WHERE handle=?""",
+                    (counts["sourcer"], counts["extractor"], counts["claims"],
+                     principal, handle))
+            else:
+                conn.execute("""INSERT INTO contributors
+                    (handle, sourcer_count, extractor_count, claims_merged, principal,
+                     first_contribution, last_contribution, tier)
+                    VALUES (?, ?, ?, ?, ?, date('now'), date('now'), 'contributor')""",
+                    (handle, counts["sourcer"], counts["extractor"], counts["claims"], principal))
+
+    if not args.dry_run:
+        conn.commit()
+        print("\nBackfill committed to DB")
+
+        # Verify
+        weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35, "synthesizer": 0.25, "reviewer": 0.20}
+        print("\n=== Post-backfill CI ===")
+        for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count,
+            challenger_count, synthesizer_count, reviewer_count, claims_merged
+            FROM contributors ORDER BY claims_merged DESC LIMIT 10""").fetchall():
+            ci = sum((r[f"{role}_count"] or 0) * w for role, w in weights.items())
+            p = f" -> {r['principal']}" if r['principal'] else ""
+            print(f"  {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, ext={r['extractor_count']}, CI={round(ci, 2)}")
+
+        # Principal roll-up
+        print("\n=== Principal roll-up ===")
+        rows = conn.execute("""SELECT
+            COALESCE(principal, handle) as who,
+            SUM(sourcer_count) as src, SUM(extractor_count) as ext,
+            SUM(challenger_count) as chl, SUM(synthesizer_count) as syn,
+            SUM(reviewer_count) as rev, SUM(claims_merged) as claims
+            FROM contributors GROUP BY who ORDER BY claims DESC""").fetchall()
+        for r in rows:
+            ci = r["src"]*0.15 + r["ext"]*0.05 + r["chl"]*0.35 + r["syn"]*0.25 + r["rev"]*0.20
+            print(f"  {r['who']}: claims={r['claims']}, CI={round(ci, 2)}")
+
+
+# Run the backfill only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()