The contributor attribution always recorded "extractor" regardless of the PR's refined commit_type. Added COMMIT_TYPE_TO_ROLE mapping and applied it in all three attribution paths (Pentagon-Agent trailer, git author fallback, PR agent fallback). Backfill script resets and re-derives role counts from prs.commit_type. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
113 lines
3.9 KiB
Python
113 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Backfill contributor role counts from prs.commit_type.
|
|
|
|
Resets all role counts to 0, then re-derives them from the prs table's
|
|
commit_type column using the COMMIT_TYPE_TO_ROLE mapping. This corrects
|
|
the bug where all contributors were recorded as 'extractor' regardless
|
|
of their actual commit_type.
|
|
|
|
Usage:
|
|
python3 ops/backfill-contributor-roles.py [--dry-run]
|
|
"""
|
|
|
|
import argparse
|
|
import sqlite3
|
|
import sys
|
|
import os
|
|
|
|
# Make the repo root importable so `lib.contributor` resolves when this
# script is run directly out of the ops/ directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from lib.contributor import COMMIT_TYPE_TO_ROLE, commit_type_to_role

# Default pipeline database location; override via the PIPELINE_DB env var
# or the --db command-line flag.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
|
|
|
|
|
def backfill(db_path: str, dry_run: bool = False):
    """Re-derive per-contributor role counts from prs.commit_type.

    Resets the extractor/challenger/synthesizer/sourcer counts to zero,
    tallies one role per merged PR (skipping the 'external' and 'pipeline'
    pseudo-agents), writes the new counts back, then prints a summary.

    Args:
        db_path: Path to the pipeline SQLite database.
        dry_run: When True, print what would change but write nothing.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        # Get all merged PRs with commit_type and agent.
        prs = conn.execute(
            """
            SELECT number, commit_type, agent, branch
            FROM prs
            WHERE status = 'merged' AND agent IS NOT NULL
            ORDER BY number
            """
        ).fetchall()

        print(f"Processing {len(prs)} merged PRs...")

        if not dry_run:
            _reset_role_counts(conn)
            print("Reset all role counts to 0")

        role_counts = _tally_roles(prs)
        _apply_counts(conn, role_counts, dry_run)

        if not dry_run:
            conn.commit()
            print("\nBackfill committed.")
        else:
            print("\n[DRY RUN] No changes made.")

        # Summary reflects committed DB state, so it is skipped on dry runs.
        print("\nRole distribution across all contributors:")
        if not dry_run:
            _print_summary(conn)
    finally:
        # Bug fix: the original leaked the connection if any query raised;
        # always close it.
        conn.close()


def _reset_role_counts(conn: sqlite3.Connection) -> None:
    """Zero the four commit_type-derived role counts for every contributor.

    NOTE(review): reviewer_count is deliberately left untouched even though
    the tally dict carries one — presumably it is derived from review
    activity rather than prs.commit_type. Confirm before changing.
    """
    conn.execute(
        """
        UPDATE contributors SET
            extractor_count = 0,
            challenger_count = 0,
            synthesizer_count = 0,
            sourcer_count = 0
        """
    )


def _tally_roles(prs) -> dict[str, dict[str, int]]:
    """Count one role per PR, keyed by lowercased agent handle."""
    role_counts: dict[str, dict[str, int]] = {}
    for pr in prs:
        agent = pr["agent"].lower() if pr["agent"] else None
        # Skip PRs attributed to non-contributor actors.
        if not agent or agent in ("external", "pipeline"):
            continue

        # A missing commit_type defaults to 'extract', matching the live
        # attribution paths this script backfills.
        role = commit_type_to_role(pr["commit_type"] or "extract")

        counts = role_counts.setdefault(agent, {
            "extractor_count": 0, "challenger_count": 0,
            "synthesizer_count": 0, "sourcer_count": 0,
            "reviewer_count": 0,
        })
        role_col = f"{role}_count"
        if role_col in counts:  # silently ignore any unexpected role
            counts[role_col] += 1
    return role_counts


def _apply_counts(conn, role_counts, dry_run: bool) -> None:
    """Print each agent's tallied counts and (unless dry_run) persist them."""
    for handle, counts in sorted(role_counts.items()):
        non_zero = {k: v for k, v in counts.items() if v > 0}
        print(f" {handle}: {non_zero or '(no knowledge PRs)'}")
        if not dry_run and non_zero:
            # Column names come from the fixed dict above so the f-string is
            # safe; values are bound as parameters (original interpolated
            # them directly into the SQL).
            set_clauses = ", ".join(f"{k} = ?" for k in non_zero)
            conn.execute(
                f"UPDATE contributors SET {set_clauses}, updated_at = datetime('now') WHERE handle = ?",
                (*non_zero.values(), handle),
            )


def _print_summary(conn) -> None:
    """Print non-zero role counts per contributor, busiest first."""
    rows = conn.execute(
        """
        SELECT handle, extractor_count, challenger_count, synthesizer_count,
               sourcer_count, reviewer_count
        FROM contributors
        ORDER BY (extractor_count + challenger_count + synthesizer_count) DESC
        """
    ).fetchall()
    labels = [
        ("extractor_count", "extract"),
        ("challenger_count", "challenge"),
        ("synthesizer_count", "synthesize"),
        ("sourcer_count", "source"),
        ("reviewer_count", "review"),
    ]
    for r in rows:
        parts = [f"{label}:{r[col]}" for col, label in labels if r[col]]
        if parts:
            print(f" {r['handle']}: {', '.join(parts)}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Improvement: argparse gains a description and per-flag help text so
    # `--help` actually documents the tool; behavior is otherwise unchanged.
    parser = argparse.ArgumentParser(
        description="Backfill contributor role counts from prs.commit_type."
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="report what would change without writing to the database",
    )
    parser.add_argument(
        "--db",
        default=DB_PATH,
        help="path to the pipeline SQLite database (default: %(default)s)",
    )
    args = parser.parse_args()
    backfill(args.db, args.dry_run)
|