feat: CI backfill script — reclassifies 614 PRs, attributes sourcer to m3taversal
484 knowledge PRs, 130 pipeline PRs (excluded from CI). m3taversal credited as sourcer for all knowledge PRs. Principal roll-up: 540 claims, CI 75.4. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
4b5c5841ce
commit
ae1cce730c
1 changed files with 196 additions and 0 deletions
196
backfill-ci.py
Normal file
196
backfill-ci.py
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Backfill CI contributor attribution from git history.
|
||||
|
||||
Walks all merged PRs, reclassifies as knowledge/pipeline,
|
||||
re-derives contributor counts with corrected logic.
|
||||
|
||||
Initial claims (sourced by m3taversal, extracted by agents) get
|
||||
sourcer credit to m3taversal.
|
||||
|
||||
Usage:
|
||||
python3 backfill-ci.py [--dry-run]
|
||||
|
||||
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# SQLite database opened by main(), and the git checkout used as the working
# directory for the `git diff` call in classify_pr().
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
REPO_DIR = "/opt/teleo-eval/workspaces/main"

# Static principal map: agent handle -> handle of the principal it rolls up to.
PRINCIPAL_MAP = dict.fromkeys(
    ("rio", "leo", "clay", "theseus", "vida", "astra"),
    "m3taversal",
)

# Path prefixes of changed files. A PR touching any KNOWLEDGE_PREFIXES path is
# classified as knowledge. PIPELINE_PREFIXES is not referenced by the code
# visible in this file.
KNOWLEDGE_PREFIXES = ("domains/", "core/", "foundations/", "decisions/")
PIPELINE_PREFIXES = ("inbox/", "entities/", "agents/")
|
||||
|
||||
|
||||
def classify_pr(conn, pr_number):
    """Classify a merged PR as knowledge or pipeline from its DB record.

    Resolution order:
      1. No row, or an empty branch, is treated as pipeline.
      2. Branches under ``pipeline/`` or ``entity-batch/`` are pipeline.
      3. If ``git diff`` against ``origin/main`` lists changed files, the PR
         is knowledge when any file sits under ``KNOWLEDGE_PREFIXES``,
         otherwise pipeline.
      4. If git yields nothing usable, fall back to the branch-name prefix.

    Args:
        conn: sqlite3 connection with a ``prs`` table that has ``number``
            and ``branch`` columns.
        pr_number: Value matched against ``prs.number``.

    Returns:
        ``"knowledge"`` or ``"pipeline"``.
    """
    row = conn.execute("SELECT branch FROM prs WHERE number=?", (pr_number,)).fetchone()
    if not row or not row[0]:
        return "pipeline"  # No branch info = infrastructure

    branch = row[0]

    # Pipeline branches are obvious
    if branch.startswith(("pipeline/", "entity-batch/")):
        return "pipeline"

    # Try to get the changed-file list from git. This is best effort: a
    # missing checkout, a missing git binary or a timeout falls through to
    # the branch-name heuristic. Only those expected failures are caught, so
    # programming errors are not silently hidden.
    #
    # -z makes git emit NUL-terminated, unquoted paths. Without it, paths
    # with unusual characters are C-quoted (leading '"'), and the prefix
    # test below would never match them.
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", "-z", f"origin/main...origin/{branch}"],
            cwd=REPO_DIR, capture_output=True, timeout=10,
            # Decode explicitly and leniently: only the path prefix matters,
            # so an undecodable byte must not abort the classification.
            encoding="utf-8", errors="replace",
        )
    except (subprocess.SubprocessError, OSError):
        result = None

    if result is not None and result.returncode == 0:
        files = [name for name in result.stdout.split("\0") if name]
        if files:
            if any(name.startswith(KNOWLEDGE_PREFIXES) for name in files):
                return "knowledge"
            return "pipeline"

    # Fallback: check branch name patterns
    if branch.startswith(("extract/", "rio/", "leo/", "clay/", "theseus/", "vida/", "astra/")):
        return "knowledge"  # Agent extraction branches are usually knowledge

    return "pipeline"
|
||||
|
||||
|
||||
def get_pr_agent(conn, pr_number):
    """Resolve the agent handle for a PR, preferring the DB over the branch.

    Returns the lower-cased ``prs.agent`` value when it is set. Otherwise
    the agent is inferred from the first path segment of ``prs.branch``:
    a known agent name maps to itself and ``extract`` maps to
    ``"epimetheus"``. Returns ``None`` when the PR is unknown or neither
    column yields a name.
    """
    record = conn.execute(
        "SELECT agent, branch FROM prs WHERE number=?", (pr_number,)
    ).fetchone()
    if not record:
        return None

    stored_agent, branch_name = record[0], record[1]
    if stored_agent:
        return stored_agent.lower()
    if not branch_name:
        return None

    # Extract agent from branch prefix; a branch without "/" has no prefix.
    prefix, slash, _rest = branch_name.partition("/")
    if not slash:
        return None

    known_agents = (
        "rio", "leo", "clay", "theseus", "vida", "astra",
        "epimetheus", "ganymede", "argus",
    )
    if prefix in known_agents:
        return prefix
    if prefix == "extract":
        return "epimetheus"  # Pipeline extraction
    return None
|
||||
|
||||
|
||||
# Weight of each contributor role in the CI score printed by the reports.
_CI_WEIGHTS = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35, "synthesizer": 0.25, "reviewer": 0.20}


def _ci_score(row, column_for_role):
    """Return the weighted CI score for one result row; NULL counts count as 0."""
    return sum(
        (row[column_for_role[role]] or 0) * weight
        for role, weight in _CI_WEIGHTS.items()
    )


def _collect_attribution(conn, dry_run):
    """Classify every merged PR and tally role credits per handle.

    Unless ``dry_run`` is set, each PR's ``commit_type`` column is updated.
    Returns a dict of handle -> {"extractor", "sourcer", "claims"} counts.
    """
    merged_prs = conn.execute(
        "SELECT number, branch, agent, origin FROM prs WHERE status='merged' ORDER BY number"
    ).fetchall()
    print(f"Processing {len(merged_prs)} merged PRs")

    knowledge_count = 0
    pipeline_count = 0
    attributed = {}  # handle → {role → count}

    def credit(handle, role):
        # Every role credit also counts as one merged claim for that handle.
        counts = attributed.setdefault(handle, {"extractor": 0, "sourcer": 0, "claims": 0})
        counts[role] += 1
        counts["claims"] += 1

    for pr in merged_prs:
        pr_num = pr["number"]
        is_pipeline = classify_pr(conn, pr_num) == "pipeline"

        if not dry_run:
            conn.execute(
                "UPDATE prs SET commit_type=? WHERE number=?",
                ("pipeline" if is_pipeline else "knowledge", pr_num),
            )

        if is_pipeline:
            pipeline_count += 1
            continue  # Pipeline PRs earn no contributor credit

        knowledge_count += 1

        # Credit the extracting agent
        agent = get_pr_agent(conn, pr_num)
        if agent:
            credit(agent, "extractor")

        # Credit m3taversal as sourcer for all knowledge PRs
        # (he directed the work, provided sources, seeded the KB)
        credit("m3taversal", "sourcer")

    print(f"\nClassified: {knowledge_count} knowledge, {pipeline_count} pipeline")
    return attributed


def _store_attribution(conn, attributed, dry_run):
    """Print the tallies and, unless ``dry_run``, upsert them into ``contributors``."""
    print("\n=== Attribution results ===")
    for handle, counts in sorted(attributed.items(), key=lambda x: x[1]["claims"], reverse=True):
        principal = PRINCIPAL_MAP.get(handle)
        p = f" -> {principal}" if principal else ""
        print(f"  {handle}{p}: sourcer={counts['sourcer']}, extractor={counts['extractor']}, claims={counts['claims']}")

        if dry_run:
            continue

        # Upsert
        existing = conn.execute("SELECT handle FROM contributors WHERE handle=?", (handle,)).fetchone()
        if existing:
            conn.execute("""UPDATE contributors SET
                sourcer_count=?, extractor_count=?, claims_merged=?,
                principal=?
                WHERE handle=?""",
                (counts["sourcer"], counts["extractor"], counts["claims"],
                 principal, handle))
        else:
            conn.execute("""INSERT INTO contributors
                (handle, sourcer_count, extractor_count, claims_merged, principal,
                 first_contribution, last_contribution, tier)
                VALUES (?, ?, ?, ?, ?, date('now'), date('now'), 'contributor')""",
                (handle, counts["sourcer"], counts["extractor"], counts["claims"], principal))


def _print_ci_report(conn):
    """Print per-handle (top 10) and per-principal CI figures from ``contributors``."""
    print("\n=== Post-backfill CI ===")
    per_handle_columns = {role: f"{role}_count" for role in _CI_WEIGHTS}
    for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count,
            challenger_count, synthesizer_count, reviewer_count, claims_merged
            FROM contributors ORDER BY claims_merged DESC LIMIT 10""").fetchall():
        ci = _ci_score(r, per_handle_columns)
        p = f" -> {r['principal']}" if r['principal'] else ""
        print(f"  {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, ext={r['extractor_count']}, CI={round(ci, 2)}")

    # Principal roll-up
    print("\n=== Principal roll-up ===")
    rows = conn.execute("""SELECT
            COALESCE(principal, handle) as who,
            SUM(sourcer_count) as src, SUM(extractor_count) as ext,
            SUM(challenger_count) as chl, SUM(synthesizer_count) as syn,
            SUM(reviewer_count) as rev, SUM(claims_merged) as claims
            FROM contributors GROUP BY who ORDER BY claims DESC""").fetchall()
    # SUM() is NULL when every input in the group is NULL (for example a row
    # inserted without challenger/synthesizer/reviewer counts), so the score
    # helper treats NULL as 0, matching the per-handle report above.
    rollup_columns = {
        "sourcer": "src", "extractor": "ext", "challenger": "chl",
        "synthesizer": "syn", "reviewer": "rev",
    }
    for r in rows:
        ci = _ci_score(r, rollup_columns)
        print(f"  {r['who']}: claims={r['claims']}, CI={round(ci, 2)}")


def main():
    """Run the backfill against ``DB_PATH``.

    Command line: ``--dry-run`` classifies and reports without writing.
    Without it, all contributor role counts are reset, every merged PR gets
    its ``commit_type`` rewritten, contributor rows are upserted, and the
    transaction is committed. The CI reports are printed in both modes; in a
    dry run they reflect the database as it currently is.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        # Step 1: Reset all role counts
        # NOTE(review): challenger/synthesizer/reviewer counts are zeroed
        # here, but the later steps only write sourcer/extractor/claims
        # back. Confirm that discarding the other three is intended.
        if not args.dry_run:
            conn.execute("""UPDATE contributors SET
                sourcer_count=0, extractor_count=0, challenger_count=0,
                synthesizer_count=0, reviewer_count=0, claims_merged=0""")
            print("Reset all contributor counts to zero")

        # Step 2: Walk all merged PRs
        attributed = _collect_attribution(conn, args.dry_run)

        # Step 3: Update contributor table
        _store_attribution(conn, attributed, args.dry_run)

        if not args.dry_run:
            conn.commit()
            print("\nBackfill committed to DB")

        # Verify
        _print_ci_report(conn)
    finally:
        # Always release the database handle. If an error struck before
        # commit(), closing discards the uncommitted changes.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue