diff --git a/lib/contributor.py b/lib/contributor.py index fa86163..713dab4 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -38,6 +38,22 @@ def is_knowledge_pr(diff: str) -> bool: return False +COMMIT_TYPE_TO_ROLE = { + "challenge": "challenger", + "enrich": "synthesizer", + "extract": "extractor", + "research": "synthesizer", + "entity": "extractor", + "reweave": "synthesizer", + "fix": "extractor", +} + + +def commit_type_to_role(commit_type: str) -> str: + """Map a refined commit_type to a contributor role.""" + return COMMIT_TYPE_TO_ROLE.get(commit_type, "extractor") + + def refine_commit_type(diff: str, branch_commit_type: str) -> str: """Refine commit_type from diff content when branch prefix is ambiguous. @@ -126,8 +142,9 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_ for match in re.finditer(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", log_output): agent_name = match.group(1).lower() agent_uuid = match.group(2) + role = commit_type_to_role(refined_type) upsert_contributor( - conn, agent_name, agent_uuid, "extractor", today, + conn, agent_name, agent_uuid, role, today, ) agents_found.add(agent_name) @@ -167,13 +184,15 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_ for author_line in author_output.strip().split("\n"): author_name = author_line.strip().lower() if author_name and author_name not in _BOT_AUTHORS: - upsert_contributor(conn, author_name, None, "extractor", today) + role = commit_type_to_role(refined_type) + upsert_contributor(conn, author_name, None, role, today) agents_found.add(author_name) if not agents_found: row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone() if row and row["agent"] and row["agent"] != "external": - upsert_contributor(conn, row["agent"].lower(), None, "extractor", today) + role = commit_type_to_role(refined_type) + upsert_contributor(conn, row["agent"].lower(), None, role, today) def upsert_contributor( diff --git a/ops/backfill-contributor-roles.py b/ops/backfill-contributor-roles.py new file mode 100644 index 0000000..8705e08 --- /dev/null +++ b/ops/backfill-contributor-roles.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +"""Backfill contributor role counts from prs.commit_type. + +Resets all role counts to 0, then re-derives them from the prs table's +commit_type column using the COMMIT_TYPE_TO_ROLE mapping. This corrects +the bug where all contributors were recorded as 'extractor' regardless +of their actual commit_type. + +Usage: + python3 ops/backfill-contributor-roles.py [--dry-run] +""" + +import argparse +import sqlite3 +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from lib.contributor import COMMIT_TYPE_TO_ROLE, commit_type_to_role + +DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db") + + +def backfill(db_path: str, dry_run: bool = False): + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + + # Get all merged PRs with commit_type and agent + prs = conn.execute(""" + SELECT number, commit_type, agent, branch + FROM prs + WHERE status = 'merged' AND agent IS NOT NULL + ORDER BY number + """).fetchall() + + print(f"Processing {len(prs)} merged PRs...") + + # Reset all role counts + if not dry_run: + conn.execute(""" + UPDATE contributors SET + extractor_count = 0, + challenger_count = 0, + synthesizer_count = 0, + sourcer_count = 0 + """) + print("Reset all role counts to 0") + + # Tally roles from commit_type + role_counts: dict[str, dict[str, int]] = {} + for pr in prs: + agent = pr["agent"].lower() if pr["agent"] else None + if not agent or agent in ("external", "pipeline"): + continue + + commit_type = pr["commit_type"] or "extract" + role = commit_type_to_role(commit_type) + + if agent not in role_counts: + role_counts[agent] = { + "extractor_count": 0, "challenger_count": 0, + "synthesizer_count": 0, "sourcer_count": 0, + "reviewer_count": 0, + } + role_col = f"{role}_count" + if role_col in role_counts[agent]: + role_counts[agent][role_col] += 1 + + # Apply tallied counts + for handle, counts in sorted(role_counts.items()): + non_zero = {k: v for k, v in counts.items() if v > 0} + print(f" {handle}: {non_zero or '(no knowledge PRs)'}") + if not dry_run and non_zero: + set_clauses = ", ".join(f"{k} = {v}" for k, v in non_zero.items()) + conn.execute( + f"UPDATE contributors SET {set_clauses}, updated_at = datetime('now') WHERE handle = ?", + (handle,), + ) + + if not dry_run: + conn.commit() + print("\nBackfill committed.") + else: + print("\n[DRY RUN] No changes made.") + + # Print summary + print("\nRole distribution across all contributors:") + if not dry_run: + rows = conn.execute(""" + SELECT handle, extractor_count, challenger_count, synthesizer_count, + sourcer_count, reviewer_count + FROM contributors + ORDER BY (extractor_count + challenger_count + synthesizer_count) DESC + """).fetchall() + for r in rows: + parts = [] + if r["extractor_count"]: parts.append(f"extract:{r['extractor_count']}") + if r["challenger_count"]: parts.append(f"challenge:{r['challenger_count']}") + if r["synthesizer_count"]: parts.append(f"synthesize:{r['synthesizer_count']}") + if r["sourcer_count"]: parts.append(f"source:{r['sourcer_count']}") + if r["reviewer_count"]: parts.append(f"review:{r['reviewer_count']}") + if parts: + print(f" {r['handle']}: {', '.join(parts)}") + + conn.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("--db", default=DB_PATH) + args = parser.parse_args() + backfill(args.db, args.dry_run)