teleo-infrastructure/scripts/normalize-submitted-by.py
Teleo Agents 74bf0461e8
fix(attribution): canonicalize submitted_by at write time + historical normalizer
Companion write-side fix to fix/activity-feed-canonical-handle.

The activity-feed canonicalization was a read-side guard. The bug at the
source is that extract.py and two backfill scripts write decorated
strings (Vida (self-directed), pipeline (reweave), @m3taversal) into
prs.submitted_by and sources.submitted_by. Downstream readers
(lib.contributor.insert_contribution_event, scripts/scoring_digest,
diagnostics/activity_feed_api) all strip the decorator on read — but
anything that reads the column verbatim (like /api/activity-feed before
the read-side fix) 404s on /contributors/{decorated-handle}.

Stop writing the decorator. The self-directed signal is already carried
by intake_tier == research-task plus the prs.agent column; the suffix
is redundant string noise that costs us correctness at every consumer
that forgets to strip.
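As an illustrative sketch of why the suffix is redundant: a reader can derive the self-directed flag from the structured columns the commit message names (intake_tier, the prs.agent column) instead of parsing it back out of the handle. The dict-shaped row and helper name here are hypothetical; the real readers live in lib.contributor and friends.

```python
# Hypothetical sketch: recover "self-directed" from structured columns
# (intake_tier + agent, per the commit message) rather than from a
# "(self-directed)" suffix baked into submitted_by.

def is_self_directed(row: dict) -> bool:
    # The structured signal: research-task intake with an agent recorded.
    return row.get("intake_tier") == "research-task" and bool(row.get("agent"))

row = {"submitted_by": "vida", "intake_tier": "research-task", "agent": "vida"}
assert is_self_directed(row)
# The handle itself stays canonical -- nothing to strip on read.
assert row["submitted_by"] == "vida"
```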

Changes:

- lib/extract.py:690 — write the canonical handle via attribution.normalize_handle.
  The intake_tier == research-task branch now stores just agent_name;
  @m3taversal -> m3taversal.

- diagnostics/backfill_submitted_by.py — same fix in two branches plus
  the reweave branch (pipeline (reweave) -> pipeline).

- scripts/backfill-research-session-attribution.py — UPDATE prs sets
  agent handle alone, no suffix. Docstring + log line updated.

- scripts/normalize-submitted-by.py (new) — one-time backfill that
  canonicalizes existing prs.submitted_by and sources.submitted_by rows.
  Strips trailing parenthetical decorators, lowercases, drops @. Defaults
  to dry-run; --apply to commit. Skips rows that would normalize to
  invalid handles (no garbage falls through silently).
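The transform the new script applies (strip trailing parenthetical, lowercase, drop @) works out as follows on the decorated forms named above; this mirrors the canonicalize helper in scripts/normalize-submitted-by.py:

```python
import re

# Strip a trailing parenthetical decorator, lowercase, drop the @ prefix --
# the canonicalization scripts/normalize-submitted-by.py performs per row.
_TRAILING_PAREN_RE = re.compile(r"\s*\([^)]*\)\s*$")

def canonicalize(raw):
    if raw is None:
        return None
    h = raw.strip().lower().lstrip("@")
    return _TRAILING_PAREN_RE.sub("", h).strip() or None

assert canonicalize("Vida (self-directed)") == "vida"
assert canonicalize("pipeline (reweave)") == "pipeline"
assert canonicalize("@m3taversal") == "m3taversal"
```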

Dry-run against live pipeline.db:
  prs:     3008 rows need normalization (clean mappings, 0 invalid)
  sources: 730 rows need normalization (clean mappings, 0 invalid)
  Total:   3738 rows. All map to existing handle column values.
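The "all map to existing handle column values" claim can be re-checked with a membership query; a minimal sketch against an in-memory stand-in for pipeline.db, noting that the contributors table and handle column names here are assumptions (the dry-run summary does not name the table holding canonical handles):

```python
import re
import sqlite3

# Assumed canonicalization, copied from scripts/normalize-submitted-by.py.
_TRAILING_PAREN_RE = re.compile(r"\s*\([^)]*\)\s*$")

def canonicalize(raw):
    h = raw.strip().lower().lstrip("@")
    return _TRAILING_PAREN_RE.sub("", h).strip() or None

conn = sqlite3.connect(":memory:")
# Hypothetical schema standing in for the real DB.
conn.execute("CREATE TABLE contributors (handle TEXT PRIMARY KEY)")
conn.executemany("INSERT INTO contributors VALUES (?)",
                 [("vida",), ("pipeline",), ("m3taversal",)])
conn.execute("CREATE TABLE prs (submitted_by TEXT)")
conn.executemany("INSERT INTO prs VALUES (?)",
                 [("Vida (self-directed)",), ("pipeline (reweave)",), ("@m3taversal",)])

known = {h for (h,) in conn.execute("SELECT handle FROM contributors")}
unmapped = [s for (s,) in conn.execute("SELECT submitted_by FROM prs")
            if canonicalize(s) not in known]
assert not unmapped  # every normalized value resolves to a known handle
```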

After this lands + auto-deploys, the operator should run
  python3 scripts/normalize-submitted-by.py --apply
once to clean historical rows. The read-side canonicalization in
diagnostics/activity_feed_api.py (fix/activity-feed-canonical-handle)
becomes redundant defense-in-depth instead of load-bearing.

No KB writes.
2026-05-13 02:56:50 +00:00


#!/usr/bin/env python3
"""One-time backfill: canonicalize prs.submitted_by and sources.submitted_by.

Strips legacy decorators ("(self-directed)", "(reweave)"), lowercases, drops
the @ prefix. After this runs, every value matches the contract documented
on diagnostics/activity_feed_api.py::_normalize_contributor — and the
companion read-side fix becomes redundant defense-in-depth instead of
load-bearing.

Defaults to --dry-run. Pass --apply to commit.

Usage:
    python3 normalize-submitted-by.py --dry-run
    python3 normalize-submitted-by.py --apply
"""
import argparse
import os
import re
import sqlite3
from collections import Counter

DEFAULT_DB = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Valid handle: lowercase alphanum + _-, 1-39 chars (matches GitHub rules,
# same as pipeline/lib/attribution._HANDLE_RE). Anything with parens, spaces,
# or uppercase needs canonicalization.
_TRAILING_PAREN_RE = re.compile(r"\s*\([^)]*\)\s*$")
_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")

def canonicalize(raw):
    if raw is None:
        return None
    h = raw.strip().lower().lstrip("@")
    h = _TRAILING_PAREN_RE.sub("", h).strip()
    return h or None

def normalize_table(conn, table, dry_run):
    cur = conn.execute(
        f"SELECT rowid, submitted_by FROM {table} WHERE submitted_by IS NOT NULL"
    )
    changes = []
    for row in cur.fetchall():
        old = row[1]
        new = canonicalize(old)
        if new != old:
            changes.append((row[0], old, new))
    print(f"\n{table}: {len(changes)} rows need normalization")
    if not changes:
        return 0
    # Distribution preview
    from_to = Counter((old, new) for _, old, new in changes)
    for (old, new), count in from_to.most_common(15):
        print(f"  {count:>5}  {old!r:40} -> {new!r}")
    if len(from_to) > 15:
        print(f"  ... ({len(from_to) - 15} more distinct mappings)")
    # Sanity: every result is a valid handle (no garbage falls through).
    invalid = [(rowid, old, new) for rowid, old, new in changes
               if new is not None and not _HANDLE_RE.match(new)]
    if invalid:
        print(f"\n  WARNING: {len(invalid)} rows would normalize to invalid handles:")
        for rowid, old, new in invalid[:10]:
            print(f"    rowid={rowid}  {old!r} -> {new!r}")
        print("  These rows will be SKIPPED (left as-is). Inspect manually.")
    valid_changes = [(rowid, old, new) for rowid, old, new in changes
                     if new is None or _HANDLE_RE.match(new)]
    if dry_run:
        print(f"  [dry-run] would update {len(valid_changes)} rows in {table}")
        return len(valid_changes)
    for rowid, _, new in valid_changes:
        conn.execute(
            f"UPDATE {table} SET submitted_by = ? WHERE rowid = ?",
            (new, rowid),
        )
    print(f"  updated {len(valid_changes)} rows in {table}")
    return len(valid_changes)

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=DEFAULT_DB)
    ap.add_argument("--apply", action="store_true",
                    help="Commit changes (default is dry-run)")
    ap.add_argument("--dry-run", action="store_true", help="Preview only (default)")
    args = ap.parse_args()
    dry_run = not args.apply
    print(f"DB: {args.db}")
    print(f"Mode: {'DRY-RUN' if dry_run else 'APPLY'}")
    conn = sqlite3.connect(args.db, timeout=30)
    try:
        total = 0
        total += normalize_table(conn, "prs", dry_run)
        total += normalize_table(conn, "sources", dry_run)
        if not dry_run:
            conn.commit()
            print(f"\nCommitted. Total rows updated: {total}")
        else:
            print(f"\nDry-run complete. Run with --apply to commit ({total} rows pending).")
    finally:
        conn.close()


if __name__ == "__main__":
    main()