#!/usr/bin/env python3
"""Backfill CI contributor attribution from git history.

Walks all merged PRs, reclassifies each as knowledge/pipeline, and
re-derives sourcer/extractor contributor counts with corrected logic.

Initial claims (sourced by m3taversal, extracted by agents) get
sourcer credit to m3taversal.

Usage:
    python3 backfill-ci.py [--dry-run]

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""

import argparse
import sqlite3
import subprocess
from contextlib import closing

DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
REPO_DIR = "/opt/teleo-eval/workspaces/main"

# Agents whose branches mark knowledge-extraction work.
EXTRACTION_AGENTS = ("rio", "leo", "clay", "theseus", "vida", "astra")
# All agent handles we may see in the DB or in branch prefixes.
ALL_AGENTS = EXTRACTION_AGENTS + ("epimetheus", "ganymede", "argus")

# Static principal map: every extraction agent rolls up to m3taversal.
PRINCIPAL_MAP = {agent: "m3taversal" for agent in EXTRACTION_AGENTS}

KNOWLEDGE_PREFIXES = ("domains/", "core/", "foundations/", "decisions/")
PIPELINE_PREFIXES = ("inbox/", "entities/", "agents/")

# Single source of truth for CI role weights.  (Previously duplicated:
# a dict in the verify report and hard-coded floats in the roll-up.)
CI_WEIGHTS = {
    "sourcer": 0.15,
    "extractor": 0.05,
    "challenger": 0.35,
    "synthesizer": 0.25,
    "reviewer": 0.20,
}


def classify_pr(conn, pr_number):
    """Classify a merged PR as ``"knowledge"`` or ``"pipeline"``.

    Prefers the actual file diff (any touched file under a knowledge
    prefix makes the PR knowledge).  Falls back to branch-name
    heuristics when git or the branch is unavailable.
    """
    row = conn.execute("SELECT branch FROM prs WHERE number=?", (pr_number,)).fetchone()
    if not row or not row[0]:
        return "pipeline"  # No branch info = infrastructure

    branch = row[0]

    # Pipeline branches are obvious from their prefix alone.
    if branch.startswith(("pipeline/", "entity-batch/")):
        return "pipeline"

    # Best source of truth: the files the branch actually touched.
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", f"origin/main...origin/{branch}"],
            cwd=REPO_DIR, capture_output=True, text=True, timeout=10,
        )
        if result.returncode == 0 and result.stdout.strip():
            files = result.stdout.strip().split("\n")
            if any(f.startswith(KNOWLEDGE_PREFIXES) for f in files):
                return "knowledge"
            return "pipeline"
    except (OSError, subprocess.SubprocessError):
        # Deliberate best-effort: missing repo, missing git binary, or
        # timeout — fall through to the branch-name heuristic below.
        pass

    # Fallback: agent extraction branches are usually knowledge.
    knowledge_branch_prefixes = ("extract/",) + tuple(f"{a}/" for a in EXTRACTION_AGENTS)
    if branch.startswith(knowledge_branch_prefixes):
        return "knowledge"

    return "pipeline"


def get_pr_agent(conn, pr_number):
    """Return the lowercased agent handle for a PR, or ``None`` if unknown.

    Prefers the DB's explicit ``agent`` column; otherwise derives the
    agent from the branch-name prefix.
    """
    row = conn.execute("SELECT agent, branch FROM prs WHERE number=?", (pr_number,)).fetchone()
    if not row:
        return None
    if row[0]:
        return row[0].lower()
    branch = row[1]
    if branch:
        for agent in ALL_AGENTS:
            if branch.startswith(f"{agent}/"):
                return agent
        if branch.startswith("extract/"):
            return "epimetheus"  # Pipeline extraction
    return None


def _reset_recomputed_counts(conn):
    """Zero only the columns this backfill re-derives.

    The backfill recomputes sourcer/extractor/claims only; zeroing
    challenger/synthesizer/reviewer (as the first version did) would
    silently destroy history that nothing below restores.
    """
    conn.execute("""UPDATE contributors SET
        sourcer_count=0, extractor_count=0, claims_merged=0""")


def _attribute_prs(conn, merged_prs, dry_run):
    """Classify each merged PR and accumulate per-handle role credit.

    Returns ``(knowledge_count, pipeline_count, attributed)`` where
    ``attributed`` maps handle -> {"sourcer", "extractor", "claims"}.
    Writes ``prs.commit_type`` unless *dry_run*.
    """
    knowledge_count = 0
    pipeline_count = 0
    attributed = {}

    def credit(handle, role):
        counts = attributed.setdefault(handle, {"extractor": 0, "sourcer": 0, "claims": 0})
        counts[role] += 1
        counts["claims"] += 1

    for pr in merged_prs:
        pr_num = pr["number"]
        commit_type = classify_pr(conn, pr_num)
        if not dry_run:
            conn.execute("UPDATE prs SET commit_type=? WHERE number=?", (commit_type, pr_num))

        if commit_type == "pipeline":
            pipeline_count += 1
            continue

        knowledge_count += 1

        # Credit the extracting agent.
        agent = get_pr_agent(conn, pr_num)
        if agent:
            credit(agent, "extractor")

        # Credit m3taversal as sourcer for all knowledge PRs
        # (he directed the work, provided sources, seeded the KB).
        # NOTE(review): combined with PRINCIPAL_MAP, every knowledge PR
        # contributes TWO claims to m3taversal in the principal roll-up
        # (the agent's claim rolls up, plus this sourcer claim) —
        # confirm this double-counting is intended.
        credit("m3taversal", "sourcer")

    return knowledge_count, pipeline_count, attributed


def _upsert_contributor(conn, handle, counts, principal):
    """Insert or update one contributor row with the recomputed counts."""
    existing = conn.execute(
        "SELECT handle FROM contributors WHERE handle=?", (handle,)
    ).fetchone()
    if existing:
        conn.execute(
            """UPDATE contributors SET
                   sourcer_count=?, extractor_count=?, claims_merged=?,
                   principal=?
               WHERE handle=?""",
            (counts["sourcer"], counts["extractor"], counts["claims"],
             principal, handle))
    else:
        conn.execute(
            """INSERT INTO contributors
                   (handle, sourcer_count, extractor_count, claims_merged, principal,
                    first_contribution, last_contribution, tier)
               VALUES (?, ?, ?, ?, ?, date('now'), date('now'), 'contributor')""",
            (handle, counts["sourcer"], counts["extractor"], counts["claims"], principal))


def _report_contributors(conn, dry_run):
    """Print per-contributor CI for the top 10 rows by claims merged."""
    # Honest label: in dry-run mode the DB was never touched.
    header = "=== Current CI (dry-run: DB unchanged) ===" if dry_run else "=== Post-backfill CI ==="
    print(f"\n{header}")
    rows = conn.execute("""SELECT handle, principal, sourcer_count, extractor_count,
                                  challenger_count, synthesizer_count, reviewer_count, claims_merged
                           FROM contributors ORDER BY claims_merged DESC LIMIT 10""").fetchall()
    for r in rows:
        ci = sum((r[f"{role}_count"] or 0) * w for role, w in CI_WEIGHTS.items())
        p = f" -> {r['principal']}" if r['principal'] else ""
        print(f"  {r['handle']}{p}: claims={r['claims_merged']}, "
              f"src={r['sourcer_count']}, ext={r['extractor_count']}, CI={round(ci, 2)}")


def _report_principals(conn):
    """Print claims and CI summed per principal (handle when unmapped)."""
    print("\n=== Principal roll-up ===")
    rows = conn.execute("""SELECT
        COALESCE(principal, handle) as who,
        SUM(sourcer_count) as sourcer, SUM(extractor_count) as extractor,
        SUM(challenger_count) as challenger, SUM(synthesizer_count) as synthesizer,
        SUM(reviewer_count) as reviewer, SUM(claims_merged) as claims
        FROM contributors GROUP BY who ORDER BY claims DESC""").fetchall()
    for r in rows:
        # `or 0` guards NULL sums (NULL columns / empty groups).
        ci = sum((r[role] or 0) * w for role, w in CI_WEIGHTS.items())
        print(f"  {r['who']}: claims={r['claims']}, CI={round(ci, 2)}")


def main():
    parser = argparse.ArgumentParser(description="Backfill CI contributor attribution")
    parser.add_argument("--dry-run", action="store_true",
                        help="report what would change without writing to the DB")
    args = parser.parse_args()

    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.row_factory = sqlite3.Row

        # Step 1: Reset the counts we are about to re-derive.
        if not args.dry_run:
            _reset_recomputed_counts(conn)
            print("Reset recomputed contributor counts to zero")

        # Step 2: Walk all merged PRs.
        merged_prs = conn.execute(
            "SELECT number, branch, agent, origin FROM prs WHERE status='merged' ORDER BY number"
        ).fetchall()
        print(f"Processing {len(merged_prs)} merged PRs")

        knowledge_count, pipeline_count, attributed = _attribute_prs(
            conn, merged_prs, args.dry_run)
        print(f"\nClassified: {knowledge_count} knowledge, {pipeline_count} pipeline")

        # Step 3: Update the contributor table.
        print("\n=== Attribution results ===")
        ranked = sorted(attributed.items(), key=lambda kv: kv[1]["claims"], reverse=True)
        for handle, counts in ranked:
            principal = PRINCIPAL_MAP.get(handle)
            p = f" -> {principal}" if principal else ""
            print(f"  {handle}{p}: sourcer={counts['sourcer']}, "
                  f"extractor={counts['extractor']}, claims={counts['claims']}")
            if not args.dry_run:
                _upsert_contributor(conn, handle, counts, principal)

        if not args.dry_run:
            conn.commit()
            print("\nBackfill committed to DB")

        # Verify.
        _report_contributors(conn, args.dry_run)
        _report_principals(conn)


if __name__ == "__main__":
    main()