#!/usr/bin/env python3
# ONE-SHOT BACKFILL — do not cron. Idempotent but resets all counts. (Ganymede)
"""Backfill CI contributor attribution from git history.

Walks all merged PRs, reclassifies as knowledge/pipeline, re-derives
contributor counts with corrected logic. Initial claims (sourced by
m3taversal, extracted by agents) get sourcer credit to m3taversal.

Usage: python3 backfill-ci.py [--dry-run]

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""

import argparse
import json
import re
import sqlite3
import subprocess
from pathlib import Path

DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
REPO_DIR = "/opt/teleo-eval/workspaces/main"

# Static principal map: agent handle -> principal who directs that agent.
PRINCIPAL_MAP = {
    "rio": "m3taversal",
    "leo": "m3taversal",
    "clay": "m3taversal",
    "theseus": "m3taversal",
    "vida": "m3taversal",
    "astra": "m3taversal",
}

# Top-level repo paths that mark a PR as knowledge (vs. pipeline plumbing).
KNOWLEDGE_PREFIXES = ("domains/", "core/", "foundations/", "decisions/")
PIPELINE_PREFIXES = ("inbox/", "entities/", "agents/")

# Role weights for the Contribution Index (CI). Single source of truth so the
# per-contributor verify loop and the principal roll-up cannot drift apart.
WEIGHTS = {
    "sourcer": 0.15,
    "extractor": 0.05,
    "challenger": 0.35,
    "synthesizer": 0.25,
    "reviewer": 0.20,
}


def classify_pr(conn, pr_number):
    """Classify a merged PR as 'knowledge' or 'pipeline' from its DB record.

    Order of evidence: branch-name prefix, then the actual git diff against
    origin/main, then a name-pattern fallback when git is unavailable.
    """
    row = conn.execute("SELECT branch FROM prs WHERE number=?", (pr_number,)).fetchone()
    if not row or not row[0]:
        return "pipeline"  # No branch info = infrastructure
    branch = row[0]

    # Pipeline branches are obvious
    if branch.startswith("pipeline/") or branch.startswith("entity-batch/"):
        return "pipeline"

    # Try to get diff from git; any failure (missing repo, timeout, unknown
    # ref) falls through to the branch-name heuristic below.
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", f"origin/main...origin/{branch}"],
            cwd=REPO_DIR,
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0 and result.stdout.strip():
            files = result.stdout.strip().split("\n")
            if any(f.startswith(KNOWLEDGE_PREFIXES) for f in files):
                return "knowledge"
            return "pipeline"
    except Exception:
        pass

    # Fallback: agent extraction branches are usually knowledge.
    if any(branch.startswith(p) for p in (
            "extract/", "rio/", "leo/", "clay/", "theseus/", "vida/", "astra/")):
        return "knowledge"
    return "pipeline"


def get_pr_agent(conn, pr_number):
    """Return the lower-cased agent name for a PR from DB or branch name.

    Returns None when neither the agent column nor the branch prefix
    identifies an agent.
    """
    row = conn.execute(
        "SELECT agent, branch FROM prs WHERE number=?", (pr_number,)
    ).fetchone()
    if row and row[0]:
        return row[0].lower()
    if row and row[1]:
        branch = row[1]
        # Extract agent from branch prefix
        for agent in ("rio", "leo", "clay", "theseus", "vida", "astra",
                      "epimetheus", "ganymede", "argus"):
            if branch.startswith(f"{agent}/"):
                return agent
        if branch.startswith("extract/"):
            return "epimetheus"  # Pipeline extraction
    return None


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        # Step 1: Reset all role counts so the backfill is idempotent.
        if not args.dry_run:
            conn.execute("""UPDATE contributors SET sourcer_count=0, extractor_count=0,
                challenger_count=0, synthesizer_count=0, reviewer_count=0, claims_merged=0""")
            print("Reset all contributor counts to zero")

        # Step 2: Walk all merged PRs
        merged_prs = conn.execute(
            "SELECT number, branch, agent, origin FROM prs WHERE status='merged' ORDER BY number"
        ).fetchall()
        print(f"Processing {len(merged_prs)} merged PRs")

        knowledge_count = 0
        pipeline_count = 0
        attributed = {}  # handle -> {role -> count}

        for pr in merged_prs:
            pr_num = pr["number"]
            commit_type = classify_pr(conn, pr_num)
            if commit_type == "pipeline":
                pipeline_count += 1
                if not args.dry_run:
                    conn.execute("UPDATE prs SET commit_type='pipeline' WHERE number=?", (pr_num,))
                continue

            knowledge_count += 1
            if not args.dry_run:
                conn.execute("UPDATE prs SET commit_type='knowledge' WHERE number=?", (pr_num,))

            agent = get_pr_agent(conn, pr_num)
            # Credit the extracting agent
            if agent:
                attributed.setdefault(agent, {"extractor": 0, "sourcer": 0, "claims": 0})
                attributed[agent]["extractor"] += 1
                attributed[agent]["claims"] += 1

            # Credit m3taversal as sourcer for all knowledge PRs
            # (he directed the work, provided sources, seeded the KB)
            attributed.setdefault("m3taversal", {"extractor": 0, "sourcer": 0, "claims": 0})
            attributed["m3taversal"]["sourcer"] += 1
            attributed["m3taversal"]["claims"] += 1

        print(f"\nClassified: {knowledge_count} knowledge, {pipeline_count} pipeline")

        # Step 3: Update contributor table
        print("\n=== Attribution results ===")
        for handle, counts in sorted(attributed.items(), key=lambda x: x[1]["claims"], reverse=True):
            principal = PRINCIPAL_MAP.get(handle)
            p = f" -> {principal}" if principal else ""
            print(f"  {handle}{p}: sourcer={counts['sourcer']}, extractor={counts['extractor']}, claims={counts['claims']}")
            if not args.dry_run:
                # Upsert
                existing = conn.execute("SELECT handle FROM contributors WHERE handle=?", (handle,)).fetchone()
                if existing:
                    conn.execute("""UPDATE contributors SET sourcer_count=?, extractor_count=?,
                        claims_merged=?, principal=? WHERE handle=?""",
                        (counts["sourcer"], counts["extractor"], counts["claims"], principal, handle))
                else:
                    conn.execute("""INSERT INTO contributors (handle, sourcer_count, extractor_count,
                        claims_merged, principal, first_contribution, last_contribution, tier)
                        VALUES (?, ?, ?, ?, ?, date('now'), date('now'), 'contributor')""",
                        (handle, counts["sourcer"], counts["extractor"], counts["claims"], principal))

        if not args.dry_run:
            conn.commit()
            print("\nBackfill committed to DB")

            # Verify: recompute CI for the top contributors from the DB.
            print("\n=== Post-backfill CI ===")
            for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count,
                    challenger_count, synthesizer_count, reviewer_count, claims_merged
                    FROM contributors ORDER BY claims_merged DESC LIMIT 10""").fetchall():
                ci = sum((r[f"{role}_count"] or 0) * w for role, w in WEIGHTS.items())
                p = f" -> {r['principal']}" if r['principal'] else ""
                print(f"  {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, ext={r['extractor_count']}, CI={round(ci, 2)}")

            # Principal roll-up. SUM() returns NULL (None) when a column is
            # NULL for a whole group, so guard every term with `or 0`.
            print("\n=== Principal roll-up ===")
            rows = conn.execute("""SELECT COALESCE(principal, handle) as who,
                SUM(sourcer_count) as src, SUM(extractor_count) as ext,
                SUM(challenger_count) as chl, SUM(synthesizer_count) as syn,
                SUM(reviewer_count) as rev, SUM(claims_merged) as claims
                FROM contributors GROUP BY who ORDER BY claims DESC""").fetchall()
            for r in rows:
                ci = ((r["src"] or 0) * WEIGHTS["sourcer"]
                      + (r["ext"] or 0) * WEIGHTS["extractor"]
                      + (r["chl"] or 0) * WEIGHTS["challenger"]
                      + (r["syn"] or 0) * WEIGHTS["synthesizer"]
                      + (r["rev"] or 0) * WEIGHTS["reviewer"])
                print(f"  {r['who']}: claims={r['claims'] or 0}, CI={round(ci, 2)}")
    finally:
        conn.close()  # was leaked before (dry-run never committed or closed)


if __name__ == "__main__":
    main()