#!/usr/bin/env python3
"""One-time backfill: canonicalize prs.submitted_by and sources.submitted_by.

Strips legacy decorators ("(self-directed)", "(reweave)"), lowercases, drops
the @ prefix. After this runs, every value that could be normalized matches
the contract documented on
diagnostics/activity_feed_api.py::_normalize_contributor — and the companion
read-side fix becomes redundant defense-in-depth instead of load-bearing.
Rows whose normalized value is still not a valid handle are reported and left
untouched for manual inspection.

Defaults to --dry-run. Pass --apply to commit. The two flags are mutually
exclusive.

Usage:
    python3 normalize-submitted-by.py --dry-run
    python3 normalize-submitted-by.py --apply
"""

import argparse
import os
import re
import sqlite3
import sys
from collections import Counter
from typing import Optional, Sequence

DEFAULT_DB = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Valid handle: lowercase alphanum + _-, 1-39 chars (matches GitHub rules,
# same as pipeline/lib/attribution._HANDLE_RE). Anything with parens, spaces,
# or uppercase needs canonicalization.
_TRAILING_PAREN_RE = re.compile(r"\s*\([^)]*\)\s*$")
_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")


def canonicalize(raw: Optional[str]) -> Optional[str]:
    """Return the canonical form of a ``submitted_by`` value.

    The value is stripped of surrounding whitespace, lowercased, has any
    leading ``@`` characters removed, and has one trailing parenthesized
    decorator (e.g. ``" (self-directed)"``) removed.

    Args:
        raw: The stored value, or ``None``.

    Returns:
        The canonical handle, or ``None`` when ``raw`` is ``None`` or nothing
        is left after cleaning. The result is NOT guaranteed to be a valid
        handle; callers validate it against ``_HANDLE_RE``.
    """
    if raw is None:
        return None
    h = raw.strip().lower().lstrip("@")
    h = _TRAILING_PAREN_RE.sub("", h).strip()
    return h or None


def normalize_table(conn: sqlite3.Connection, table: str, dry_run: bool) -> int:
    """Canonicalize ``submitted_by`` for every non-NULL row of ``table``.

    Prints a preview of the most common mappings and a warning for rows whose
    canonical form is still not a valid handle; those rows are skipped. Does
    not commit — the caller owns the transaction.

    Args:
        conn: Open SQLite connection.
        table: Table name. It is interpolated directly into the SQL text, so
            it must be a trusted identifier, never user input.
        dry_run: When true, only report what would change.

    Returns:
        The number of rows updated (or that would be updated in a dry run).
    """
    select_sql = f"SELECT rowid, submitted_by FROM {table} WHERE submitted_by IS NOT NULL"
    changes = []
    for rowid, old in conn.execute(select_sql).fetchall():
        new = canonicalize(old)
        if new != old:
            changes.append((rowid, old, new))

    print(f"\n{table}: {len(changes)} rows need normalization")
    if not changes:
        return 0

    # Distribution preview
    from_to = Counter((old, new) for _, old, new in changes)
    for (old, new), count in from_to.most_common(15):
        print(f" {count:>5} {old!r:40} -> {new!r}")
    if len(from_to) > 15:
        print(f" ... ({len(from_to) - 15} more distinct mappings)")

    # Sanity: every result is a valid handle (no garbage falls through).
    # A None result (value was only decoration/whitespace) is accepted and
    # written back as NULL.
    valid_changes = []
    invalid = []
    for change in changes:
        new = change[2]
        if new is None or _HANDLE_RE.match(new):
            valid_changes.append(change)
        else:
            invalid.append(change)

    if invalid:
        print(f"\n WARNING: {len(invalid)} rows would normalize to invalid handles:")
        for rowid, old, new in invalid[:10]:
            print(f" rowid={rowid} {old!r} -> {new!r}")
        print(" These rows will be SKIPPED (left as-is). Inspect manually.")

    if dry_run:
        print(f" [dry-run] would update {len(valid_changes)} rows in {table}")
        return len(valid_changes)

    conn.executemany(
        f"UPDATE {table} SET submitted_by = ? WHERE rowid = ?",
        [(new, rowid) for rowid, _, new in valid_changes],
    )
    print(f" updated {len(valid_changes)} rows in {table}")
    return len(valid_changes)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse command-line options and normalize the ``prs`` and ``sources`` tables.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, including passing both ``--apply``
            and ``--dry-run``, or a ``--db`` path that does not exist.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=DEFAULT_DB)
    # Contradictory flags must fail loudly: previously --dry-run was parsed
    # but ignored, so "--apply --dry-run" silently committed.
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--apply", action="store_true", help="Commit changes (default is dry-run)")
    mode.add_argument("--dry-run", action="store_true", help="Preview only (default)")
    args = ap.parse_args(argv)
    dry_run = not args.apply

    # sqlite3.connect() would silently create an empty database at a
    # mistyped path; refuse instead.
    if not os.path.isfile(args.db):
        ap.error(f"database file not found: {args.db}")

    print(f"DB: {args.db}")
    print(f"Mode: {'DRY-RUN' if dry_run else 'APPLY'}")

    conn = sqlite3.connect(args.db, timeout=30)
    try:
        total = 0
        total += normalize_table(conn, "prs", dry_run)
        total += normalize_table(conn, "sources", dry_run)
        if not dry_run:
            # Single commit so both tables change together or not at all.
            conn.commit()
            print(f"\nCommitted. Total rows updated: {total}")
        else:
            print(f"\nDry-run complete. Run with --apply to commit ({total} rows pending).")
    finally:
        # Closing without commit discards any uncommitted updates.
        conn.close()


if __name__ == "__main__":
    main()