fix: wire commit_type into contributor role assignment

The contributor attribution always recorded "extractor" regardless of the PR's refined commit_type. Added COMMIT_TYPE_TO_ROLE mapping and applied it in all three attribution paths (Pentagon-Agent trailer, git author fallback, PR agent fallback). Backfill script resets and re-derives role counts from prs.commit_type. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 10:27:36 +01:00 · 2026-04-21 10:27:36 +01:00 · c29049924e
commit c29049924e
parent f463f49b46
2 changed files with 135 additions and 3 deletions
--- a/lib/contributor.py
+++ b/lib/contributor.py
@ -38,6 +38,22 @@ def is_knowledge_pr(diff: str) -> bool:
    return False


+COMMIT_TYPE_TO_ROLE = {
+    "challenge": "challenger",
+    "enrich": "synthesizer",
+    "extract": "extractor",
+    "research": "synthesizer",
+    "entity": "extractor",
+    "reweave": "synthesizer",
+    "fix": "extractor",
+}
+
+
+def commit_type_to_role(commit_type: str) -> str:
+    """Map a refined commit_type to a contributor role."""
+    return COMMIT_TYPE_TO_ROLE.get(commit_type, "extractor")
+
+
 def refine_commit_type(diff: str, branch_commit_type: str) -> str:
    """Refine commit_type from diff content when branch prefix is ambiguous.

@ -126,8 +142,9 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
        for match in re.finditer(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", log_output):
            agent_name = match.group(1).lower()
            agent_uuid = match.group(2)
+            role = commit_type_to_role(refined_type)
            upsert_contributor(
-                conn, agent_name, agent_uuid, "extractor", today,
+                conn, agent_name, agent_uuid, role, today,
            )
            agents_found.add(agent_name)

@ -167,13 +184,15 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
            for author_line in author_output.strip().split("\n"):
                author_name = author_line.strip().lower()
                if author_name and author_name not in _BOT_AUTHORS:
-                    upsert_contributor(conn, author_name, None, "extractor", today)
+                    role = commit_type_to_role(refined_type)
+                    upsert_contributor(conn, author_name, None, role, today)
                    agents_found.add(author_name)

        if not agents_found:
            row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone()
            if row and row["agent"] and row["agent"] != "external":
-                upsert_contributor(conn, row["agent"].lower(), None, "extractor", today)
+                role = commit_type_to_role(refined_type)
+                upsert_contributor(conn, row["agent"].lower(), None, role, today)


 def upsert_contributor(
--- a/ops/backfill-contributor-roles.py
+++ b/ops/backfill-contributor-roles.py
@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Backfill contributor role counts from prs.commit_type.
+
+Resets all role counts to 0, then re-derives them from the prs table's
+commit_type column using the COMMIT_TYPE_TO_ROLE mapping. This corrects
+the bug where all contributors were recorded as 'extractor' regardless
+of their actual commit_type.
+
+Usage:
+    python3 ops/backfill-contributor-roles.py [--dry-run]
+"""
+
+import argparse
+import sqlite3
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from lib.contributor import COMMIT_TYPE_TO_ROLE, commit_type_to_role
+
+DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
+
+
+def backfill(db_path: str, dry_run: bool = False):
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+
+    # Get all merged PRs with commit_type and agent
+    prs = conn.execute("""
+        SELECT number, commit_type, agent, branch
+        FROM prs
+        WHERE status = 'merged' AND agent IS NOT NULL
+        ORDER BY number
+    """).fetchall()
+
+    print(f"Processing {len(prs)} merged PRs...")
+
+    # Reset all role counts
+    if not dry_run:
+        conn.execute("""
+            UPDATE contributors SET
+                extractor_count = 0,
+                challenger_count = 0,
+                synthesizer_count = 0,
+                sourcer_count = 0
+        """)
+        print("Reset all role counts to 0")
+
+    # Tally roles from commit_type
+    role_counts: dict[str, dict[str, int]] = {}
+    for pr in prs:
+        agent = pr["agent"].lower() if pr["agent"] else None
+        if not agent or agent in ("external", "pipeline"):
+            continue
+
+        commit_type = pr["commit_type"] or "extract"
+        role = commit_type_to_role(commit_type)
+
+        if agent not in role_counts:
+            role_counts[agent] = {
+                "extractor_count": 0, "challenger_count": 0,
+                "synthesizer_count": 0, "sourcer_count": 0,
+                "reviewer_count": 0,
+            }
+        role_col = f"{role}_count"
+        if role_col in role_counts[agent]:
+            role_counts[agent][role_col] += 1
+
+    # Apply tallied counts
+    for handle, counts in sorted(role_counts.items()):
+        non_zero = {k: v for k, v in counts.items() if v > 0}
+        print(f"  {handle}: {non_zero or '(no knowledge PRs)'}")
+        if not dry_run and non_zero:
+            set_clauses = ", ".join(f"{k} = {v}" for k, v in non_zero.items())
+            conn.execute(
+                f"UPDATE contributors SET {set_clauses}, updated_at = datetime('now') WHERE handle = ?",
+                (handle,),
+            )
+
+    if not dry_run:
+        conn.commit()
+        print("\nBackfill committed.")
+    else:
+        print("\n[DRY RUN] No changes made.")
+
+    # Print summary
+    print("\nRole distribution across all contributors:")
+    if not dry_run:
+        rows = conn.execute("""
+            SELECT handle, extractor_count, challenger_count, synthesizer_count,
+                   sourcer_count, reviewer_count
+            FROM contributors
+            ORDER BY (extractor_count + challenger_count + synthesizer_count) DESC
+        """).fetchall()
+        for r in rows:
+            parts = []
+            if r["extractor_count"]: parts.append(f"extract:{r['extractor_count']}")
+            if r["challenger_count"]: parts.append(f"challenge:{r['challenger_count']}")
+            if r["synthesizer_count"]: parts.append(f"synthesize:{r['synthesizer_count']}")
+            if r["sourcer_count"]: parts.append(f"source:{r['sourcer_count']}")
+            if r["reviewer_count"]: parts.append(f"review:{r['reviewer_count']}")
+            if parts:
+                print(f"  {r['handle']}: {', '.join(parts)}")
+
+    conn.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--db", default=DB_PATH)
+    args = parser.parse_args()
+    backfill(args.db, args.dry_run)