#!/usr/bin/env python3 """Backfill contributor role counts from prs.commit_type. Resets all role counts to 0, then re-derives them from the prs table's commit_type column using the COMMIT_TYPE_TO_ROLE mapping. This corrects the bug where all contributors were recorded as 'extractor' regardless of their actual commit_type. Usage: python3 ops/backfill-contributor-roles.py [--dry-run] """ import argparse import sqlite3 import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from lib.contributor import COMMIT_TYPE_TO_ROLE, commit_type_to_role DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db") def backfill(db_path: str, dry_run: bool = False): conn = sqlite3.connect(db_path) conn.row_factory = sqlite3.Row # Get all merged PRs with commit_type and agent prs = conn.execute(""" SELECT number, commit_type, agent, branch FROM prs WHERE status = 'merged' AND agent IS NOT NULL ORDER BY number """).fetchall() print(f"Processing {len(prs)} merged PRs...") # Reset all role counts if not dry_run: conn.execute(""" UPDATE contributors SET extractor_count = 0, challenger_count = 0, synthesizer_count = 0, sourcer_count = 0 """) print("Reset all role counts to 0") # Tally roles from commit_type role_counts: dict[str, dict[str, int]] = {} for pr in prs: agent = pr["agent"].lower() if pr["agent"] else None if not agent or agent in ("external", "pipeline"): continue commit_type = pr["commit_type"] or "extract" role = commit_type_to_role(commit_type) if agent not in role_counts: role_counts[agent] = { "extractor_count": 0, "challenger_count": 0, "synthesizer_count": 0, "sourcer_count": 0, "reviewer_count": 0, } role_col = f"{role}_count" if role_col in role_counts[agent]: role_counts[agent][role_col] += 1 # Apply tallied counts for handle, counts in sorted(role_counts.items()): non_zero = {k: v for k, v in counts.items() if v > 0} print(f" {handle}: {non_zero or '(no knowledge PRs)'}") if not dry_run and non_zero: set_clauses = ", ".join(f"{k} = {v}" for k, v in non_zero.items()) conn.execute( f"UPDATE contributors SET {set_clauses}, updated_at = datetime('now') WHERE handle = ?", (handle,), ) if not dry_run: conn.commit() print("\nBackfill committed.") else: print("\n[DRY RUN] No changes made.") # Print summary print("\nRole distribution across all contributors:") if not dry_run: rows = conn.execute(""" SELECT handle, extractor_count, challenger_count, synthesizer_count, sourcer_count, reviewer_count FROM contributors ORDER BY (extractor_count + challenger_count + synthesizer_count) DESC """).fetchall() for r in rows: parts = [] if r["extractor_count"]: parts.append(f"extract:{r['extractor_count']}") if r["challenger_count"]: parts.append(f"challenge:{r['challenger_count']}") if r["synthesizer_count"]: parts.append(f"synthesize:{r['synthesizer_count']}") if r["sourcer_count"]: parts.append(f"source:{r['sourcer_count']}") if r["reviewer_count"]: parts.append(f"review:{r['reviewer_count']}") if parts: print(f" {r['handle']}: {', '.join(parts)}") conn.close() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dry-run", action="store_true") parser.add_argument("--db", default=DB_PATH) args = parser.parse_args() backfill(args.db, args.dry_run)