The contributor attribution always recorded "extractor" regardless of the PR's refined commit_type. Added COMMIT_TYPE_TO_ROLE mapping and applied it in all three attribution paths (Pentagon-Agent trailer, git author fallback, PR agent fallback). Backfill script resets and re-derives role counts from prs.commit_type. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
113 lines
3.9 KiB
Python
113 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Backfill contributor role counts from prs.commit_type.
|
|
|
|
Resets all role counts to 0, then re-derives them from the prs table's
|
|
commit_type column using the COMMIT_TYPE_TO_ROLE mapping. This corrects
|
|
the bug where all contributors were recorded as 'extractor' regardless
|
|
of their actual commit_type.
|
|
|
|
Usage:
|
|
python3 ops/backfill-contributor-roles.py [--dry-run]
|
|
"""
|
|
|
|
import argparse
|
|
import sqlite3
|
|
import sys
|
|
import os
|
|
|
|
# Make the repo root importable so `lib.contributor` resolves when this
# script is run directly out of the ops/ directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from lib.contributor import COMMIT_TYPE_TO_ROLE, commit_type_to_role

# Default pipeline database location; override via the PIPELINE_DB env var
# or the --db command-line flag.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
|
|
|
|
|
def backfill(db_path: str, dry_run: bool = False):
    """Re-derive per-contributor role counts from prs.commit_type.

    Resets the extractor/challenger/synthesizer/sourcer counts to zero,
    tallies one role per merged PR (skipping the 'external' and 'pipeline'
    pseudo-agents), writes the new counts back, then prints a summary.

    Args:
        db_path: Path to the pipeline SQLite database.
        dry_run: When True, print what would change but write nothing.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        # Get all merged PRs with commit_type and agent.
        prs = conn.execute(
            """
            SELECT number, commit_type, agent, branch
            FROM prs
            WHERE status = 'merged' AND agent IS NOT NULL
            ORDER BY number
            """
        ).fetchall()

        print(f"Processing {len(prs)} merged PRs...")

        if not dry_run:
            _reset_role_counts(conn)
            print("Reset all role counts to 0")

        role_counts = _tally_roles(prs)
        _apply_counts(conn, role_counts, dry_run)

        if not dry_run:
            conn.commit()
            print("\nBackfill committed.")
        else:
            print("\n[DRY RUN] No changes made.")

        # Summary reflects committed DB state, so it is skipped on dry runs.
        print("\nRole distribution across all contributors:")
        if not dry_run:
            _print_summary(conn)
    finally:
        # Bug fix: the original leaked the connection if any query raised;
        # always close it.
        conn.close()


def _reset_role_counts(conn: sqlite3.Connection) -> None:
    """Zero the four commit_type-derived role counts for every contributor.

    NOTE(review): reviewer_count is deliberately left untouched even though
    the tally dict carries one — presumably it is derived from review
    activity rather than prs.commit_type. Confirm before changing.
    """
    conn.execute(
        """
        UPDATE contributors SET
            extractor_count = 0,
            challenger_count = 0,
            synthesizer_count = 0,
            sourcer_count = 0
        """
    )


def _tally_roles(prs) -> dict[str, dict[str, int]]:
    """Count one role per PR, keyed by lowercased agent handle."""
    role_counts: dict[str, dict[str, int]] = {}
    for pr in prs:
        agent = pr["agent"].lower() if pr["agent"] else None
        # Skip PRs attributed to non-contributor actors.
        if not agent or agent in ("external", "pipeline"):
            continue

        # A missing commit_type defaults to 'extract', matching the live
        # attribution paths this script backfills.
        role = commit_type_to_role(pr["commit_type"] or "extract")

        counts = role_counts.setdefault(agent, {
            "extractor_count": 0, "challenger_count": 0,
            "synthesizer_count": 0, "sourcer_count": 0,
            "reviewer_count": 0,
        })
        role_col = f"{role}_count"
        if role_col in counts:  # silently ignore any unexpected role
            counts[role_col] += 1
    return role_counts


def _apply_counts(conn, role_counts, dry_run: bool) -> None:
    """Print each agent's tallied counts and (unless dry_run) persist them."""
    for handle, counts in sorted(role_counts.items()):
        non_zero = {k: v for k, v in counts.items() if v > 0}
        print(f" {handle}: {non_zero or '(no knowledge PRs)'}")
        if not dry_run and non_zero:
            # Column names come from the fixed dict above so the f-string is
            # safe; values are bound as parameters (original interpolated
            # them directly into the SQL).
            set_clauses = ", ".join(f"{k} = ?" for k in non_zero)
            conn.execute(
                f"UPDATE contributors SET {set_clauses}, updated_at = datetime('now') WHERE handle = ?",
                (*non_zero.values(), handle),
            )


def _print_summary(conn) -> None:
    """Print non-zero role counts per contributor, busiest first."""
    rows = conn.execute(
        """
        SELECT handle, extractor_count, challenger_count, synthesizer_count,
               sourcer_count, reviewer_count
        FROM contributors
        ORDER BY (extractor_count + challenger_count + synthesizer_count) DESC
        """
    ).fetchall()
    labels = [
        ("extractor_count", "extract"),
        ("challenger_count", "challenge"),
        ("synthesizer_count", "synthesize"),
        ("sourcer_count", "source"),
        ("reviewer_count", "review"),
    ]
    for r in rows:
        parts = [f"{label}:{r[col]}" for col, label in labels if r[col]]
        if parts:
            print(f" {r['handle']}: {', '.join(parts)}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Improvement: argparse gains a description and per-flag help text so
    # `--help` actually documents the tool; behavior is otherwise unchanged.
    parser = argparse.ArgumentParser(
        description="Backfill contributor role counts from prs.commit_type."
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="report what would change without writing to the database",
    )
    parser.add_argument(
        "--db",
        default=DB_PATH,
        help="path to the pipeline SQLite database (default: %(default)s)",
    )
    args = parser.parse_args()
    backfill(args.db, args.dry_run)
|