# teleo-infrastructure/scripts/reset-m3taversal-sourcer.py

#!/usr/bin/env python3
"""Reset m3taversal.sourcer_count from inflated legacy value to file-truth count.

Background: pre-Phase-A extract.py had a `submitted_by` fallback that credited
m3taversal as sourcer for every Telegram-ingested source, accumulating to 1011
sourcer_count in the contributors table. The actual file-truth count (sourcer
frontmatter equal to "m3taversal" in claim files) is 21. The 990-row delta is
infrastructure attribution that doesn't reflect content authorship.

The Phase A event-sourced ledger (contribution_events) computed the correct
389.55 CI from author events; /api/leaderboard reads from there directly.
But the legacy /api/contributors endpoint reads contributors.claims_merged
which carries the inflated 1011. Until that endpoint is deprecated, the
divergence shows two different numbers depending on which surface the UI
queries.

This script applies the surgical UPDATE that was run on VPS on 2026-04-27
during the leaderboard cutover. Committed as a script per Ganymede review:
"DB mutations go through reviewable code paths matters more than the
convenience of one-shot SQL. The artifact explains what was done and why."

Idempotent — safe to re-run. If sourcer_count and claims_merged are both already 21, no change.

Usage:
  python3 scripts/reset-m3taversal-sourcer.py --dry-run
  python3 scripts/reset-m3taversal-sourcer.py
"""
import argparse
import json
import os
import sqlite3
import sys
from pathlib import Path

# SQLite database file to operate on; the PIPELINE_DB environment variable
# overrides the default path.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
# Value of contributors.handle identifying the single row this script resets.
TARGET_HANDLE = "m3taversal"
# Values written to contributors.sourcer_count and contributors.claims_merged.
# 21 is the file-truth sourcer count stated in the module docstring.
# NOTE(review): the docstring derives 21 for sourcer_count only — confirm the
# same figure is intended for claims_merged.
TRUTH_SOURCER_COUNT = 21
TRUTH_CLAIMS_MERGED = 21


def main():
    """Reset the target contributor's counters to their file-truth values.

    Looks up the ``contributors`` row whose handle is ``TARGET_HANDLE``. Unless
    the row is missing or already holds the target values, sets
    ``sourcer_count`` and ``claims_merged`` to ``TRUTH_SOURCER_COUNT`` and
    ``TRUTH_CLAIMS_MERGED`` and inserts a matching ``audit_log`` record. The
    UPDATE and the audit INSERT are committed together or not at all.

    Command-line flags:
        --dry-run: print the current and target values without writing.

    Exits with status 1 when no file exists at ``DB_PATH``; otherwise returns
    ``None``. ``sqlite3`` errors propagate to the caller after the connection
    has been closed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # sqlite3.connect() would silently create an empty database at a missing
    # path, so refuse to continue instead.
    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    try:
        conn.row_factory = sqlite3.Row

        row = conn.execute(
            "SELECT handle, sourcer_count, claims_merged FROM contributors WHERE handle = ?",
            (TARGET_HANDLE,),
        ).fetchone()
        if not row:
            print(f"  No contributors row for {TARGET_HANDLE} — nothing to reset.")
            return

        print(
            f"  Current: {row['handle']} sourcer_count={row['sourcer_count']} "
            f"claims_merged={row['claims_merged']}"
        )
        print(f"  Target:  sourcer_count={TRUTH_SOURCER_COUNT} claims_merged={TRUTH_CLAIMS_MERGED}")

        if (row["sourcer_count"] == TRUTH_SOURCER_COUNT
                and row["claims_merged"] == TRUTH_CLAIMS_MERGED):
            print("  Already at target values — no-op.")
            return

        if args.dry_run:
            print("  (dry-run) UPDATE would be applied. Re-run without --dry-run.")
            return

        # Record the values actually observed in this run rather than a
        # hard-coded "before", so the audit trail stays truthful even when the
        # row was not at the historically inflated figure.
        detail = json.dumps(
            {
                "reason": (
                    "Pre-Phase-A submitted_by fallback inflated to 1011; "
                    "file-truth is 21"
                ),
                "sourcer_count_before": row["sourcer_count"],
                "sourcer_count_after": TRUTH_SOURCER_COUNT,
                "claims_merged_before": row["claims_merged"],
                "claims_merged_after": TRUTH_CLAIMS_MERGED,
            },
            separators=(",", ":"),
        )

        # `with conn` commits on success and rolls back if either statement
        # raises, so the counter change never lands without its audit record.
        with conn:
            conn.execute(
                """UPDATE contributors SET
                    sourcer_count = ?,
                    claims_merged = ?,
                    updated_at = datetime('now')
                   WHERE handle = ?""",
                (TRUTH_SOURCER_COUNT, TRUTH_CLAIMS_MERGED, TARGET_HANDLE),
            )
            conn.execute(
                """INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)""",
                ("manual", "m3taversal_sourcer_reset", detail),
            )

        after = conn.execute(
            "SELECT sourcer_count, claims_merged FROM contributors WHERE handle = ?",
            (TARGET_HANDLE,),
        ).fetchone()
        print(
            f"  Applied. Now: sourcer_count={after['sourcer_count']} "
            f"claims_merged={after['claims_merged']}"
        )
    finally:
        # The connection context manager only ends the transaction; the
        # handle itself must be closed explicitly on every exit path.
        conn.close()


# Run the reset only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()