teleo-infrastructure/scripts/reattribute-by-branch-prefix.py
Teleo Agents c9515c770a
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
fix(attribution): classify submitted_by by branch prefix at PR discovery
reweave.py and ingestion run as the operator Forgejo token, so the prior
opener-based classifier set submitted_by=m3taversal for every system
maintenance PR. backfill_submitted_by.py never overrides non-NULL rows,
so this misattribution accumulated: ~2,748 reweave/ingestion PRs and
~3,706 <agent>/ research/entity PRs were credited to the operator on
the leaderboard and contribution_events table.

Two parts:

1. lib/merge.py: at PR discovery, classify by branch prefix first.
     reweave/, ingestion/             -> submitted_by = 'pipeline'
     <agent>/ (per _AGENT_NAMES)      -> submitted_by = '<agent>'
     otherwise human                  -> submitted_by = author.lower()
     otherwise pipeline               -> submitted_by = None
                                         (extract.py sets from proposed_by)
   Origin flag updated so domain detection and priority still fire for
   branch-classified pipeline PRs. Human PRs lowercased to maintain the
   canonical-handle contract enforced in PR #9.

2. scripts/reattribute-by-branch-prefix.py: historical cleanup.
   Per affected PR (atomic):
     - UPDATE prs.submitted_by  -> target
     - UPDATE sources.submitted_by where source_path matches
     - UPDATE contribution_events handle ('m3taversal',role='author')
       -> target, kind='agent'. Collision (target already has author
       event for PR) deletes the m3ta row; target wins.

   Scope is deliberately conservative: extract/ branches stay attributed
   to m3taversal because proposed_by-missing legitimately defaults to the
   operator (telegram drops). Only reweave/, ingestion/, and <agent>/.

   Dry-run shows 6,454 PRs + 284 events to move. Pre-flight collision
   query returns 0; pre-flight kind check confirms m3ta has only role=author
   events on this set (no challenger/synthesizer/evaluator).

   Idempotent. Dry-run by default. Run with --apply after deploy + DB
   snapshot.
2026-05-13 03:49:10 +00:00

168 lines
5.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""Reattribute PRs and their author events from m3taversal to the true author.

Scope (intentionally conservative):
  - branch reweave/*    -> pipeline  (system maintenance, no human author)
  - branch ingestion/*  -> pipeline  (pipeline-internal source intake)
  - branch <agent>/*    -> <agent>   (autonomous agent work)
    for agent in {leo, vida, rio, astra, clay, theseus}.

NOT in scope:
  - branch extract/* -- proposed_by may legitimately be absent
    (telegram source drops default to operator).

Per affected PR (atomic):
  1. UPDATE prs.submitted_by -> target
  2. UPDATE sources.submitted_by where path = pr.source_path
  3. UPDATE contribution_events.handle for every m3ta author event on this PR
     (kind set to 'agent', since pipeline + the six agents are all kind='agent'
     per attribution.PENTAGON_AGENTS). If that UPDATE raises an integrity
     error, the m3ta row is deleted instead.

Only rows currently attributed to 'm3taversal' are touched.

The database is taken from --db, which defaults to the DB_PATH environment
variable or, failing that, /opt/teleo-eval/pipeline/pipeline.db.

Idempotent. Dry-run by default; --apply commits.
Run AFTER scripts/normalize-submitted-by.py.
"""
import argparse
import os
import sqlite3
import sys
from collections import Counter
# Pipeline SQLite database; the DB_PATH environment variable overrides the
# built-in default location.
DB_PATH = os.getenv("DB_PATH", "/opt/teleo-eval/pipeline/pipeline.db")

# Branch prefixes that identify work done by one of the autonomous agents.
AGENT_PREFIXES = (
    "leo/",
    "vida/",
    "rio/",
    "astra/",
    "clay/",
    "theseus/",
)

# Branch prefixes that identify pipeline-internal maintenance/intake work.
PIPELINE_PREFIXES = (
    "reweave/",
    "ingestion/",
)
def target_for(branch):
    """Map a branch name to the handle that should own its PR.

    Returns "pipeline" when the branch starts with one of PIPELINE_PREFIXES,
    the agent name (the matching AGENT_PREFIXES entry without its trailing
    slash) when it starts with an agent prefix, and None for an empty/missing
    branch or any branch outside those two groups.
    """
    if not branch:
        return None
    if branch.startswith(PIPELINE_PREFIXES):
        return "pipeline"
    # First agent prefix that matches, in declaration order.
    matched = next(
        (candidate for candidate in AGENT_PREFIXES if branch.startswith(candidate)),
        None,
    )
    if matched is None:
        return None
    return matched.rstrip("/")
# SQLite limits the number of bound parameters per statement
# (SQLITE_MAX_VARIABLE_NUMBER; 999 by default before SQLite 3.32.0), so
# IN (...) lookups are issued in batches that stay well under that limit.
_IN_BATCH_SIZE = 500


def _fetch_in_batches(conn, sql_template, values):
    """Run an ``IN (...)`` SELECT in batches and return every row.

    Args:
        conn: open sqlite3 connection.
        sql_template: SELECT text with a single ``{}`` marking where the
            comma-separated ``?`` placeholders are inserted.
        values: list of values to bind. It must be free of duplicates if the
            caller needs each matching row exactly once.

    Returns:
        List with the rows of all batches, in batch order.
    """
    rows = []
    for start in range(0, len(values), _IN_BATCH_SIZE):
        batch = values[start:start + _IN_BATCH_SIZE]
        placeholders = ",".join("?" * len(batch))
        rows.extend(
            conn.execute(sql_template.format(placeholders), batch).fetchall()
        )
    return rows


def _apply_changes(conn, pr_targets):
    """Write the reattribution for every planned PR; the caller commits.

    Args:
        conn: open sqlite3 connection.
        pr_targets: list of (pr_number, branch, source_path, target) tuples.

    Returns:
        Tuple (pr_updated, src_updated, ev_updated, ev_collisions).
    """
    pr_updated = 0
    src_updated = 0
    ev_updated = 0
    ev_collisions = 0
    for pr_num, _branch, source_path, target in pr_targets:
        # The submitted_by guard keeps re-runs from touching rows that were
        # already moved.
        cur = conn.execute(
            "UPDATE prs SET submitted_by = ? "
            "WHERE number = ? AND submitted_by = 'm3taversal'",
            (target, pr_num),
        )
        pr_updated += cur.rowcount
        if source_path:
            cur = conn.execute(
                "UPDATE sources SET submitted_by = ? "
                "WHERE path = ? AND submitted_by = 'm3taversal'",
                (target, source_path),
            )
            src_updated += cur.rowcount
        author_events = conn.execute(
            "SELECT id FROM contribution_events "
            "WHERE handle = 'm3taversal' AND role = 'author' AND pr_number = ?",
            (pr_num,),
        ).fetchall()
        for ev in author_events:
            try:
                conn.execute(
                    "UPDATE contribution_events SET handle = ?, kind = 'agent' "
                    "WHERE id = ?",
                    (target, ev["id"]),
                )
                ev_updated += 1
            except sqlite3.IntegrityError:
                # Treated as "target already has this event": drop the m3ta
                # row so the target's existing row wins. With SQLite's default
                # ABORT conflict resolution only the failed UPDATE is undone,
                # so the enclosing transaction is still usable here.
                # NOTE(review): this fires for ANY integrity error, not only a
                # uniqueness collision -- confirm against the table schema.
                conn.execute(
                    "DELETE FROM contribution_events WHERE id = ?",
                    (ev["id"],),
                )
                ev_collisions += 1
    return pr_updated, src_updated, ev_updated, ev_collisions


def _run(conn, db_path, apply):
    """Print the reattribution plan and, when *apply* is true, execute it."""
    mode = "APPLY" if apply else "DRY-RUN"
    print("DB: {}\nMode: {}\n".format(db_path, mode))

    rows = conn.execute("""
        SELECT number, branch, source_path
        FROM prs
        WHERE submitted_by = 'm3taversal'
          AND branch IS NOT NULL
    """).fetchall()

    # Keep only PRs whose branch prefix maps to a pipeline/agent target.
    pr_targets = []
    pr_counts = Counter()
    for r in rows:
        tgt = target_for(r["branch"])
        if tgt is None:
            continue
        pr_targets.append((r["number"], r["branch"], r["source_path"], tgt))
        pr_counts[tgt] += 1
    print("prs to reattribute: {}".format(len(pr_targets)))
    for tgt, n in pr_counts.most_common():
        print(" {:6d} -> {!r}".format(n, tgt))

    # De-duplicate so a path shared by several PRs is counted once even when
    # its occurrences would land in different batches.
    src_paths = list(dict.fromkeys(t[2] for t in pr_targets if t[2]))
    src_count = sum(
        row[0]
        for row in _fetch_in_batches(
            conn,
            "SELECT COUNT(*) FROM sources "
            "WHERE submitted_by = 'm3taversal' AND path IN ({})",
            src_paths,
        )
    )
    print("\nsources rows that will be re-pointed: {}".format(src_count))

    pr_to_target = {p[0]: p[3] for p in pr_targets}
    events = _fetch_in_batches(
        conn,
        "SELECT id, pr_number FROM contribution_events "
        "WHERE handle = 'm3taversal' AND role = 'author' "
        "AND pr_number IN ({})",
        list(pr_to_target),
    )
    print("contribution_events author rows to move: {}".format(len(events)))
    ev_counts = Counter(pr_to_target[e["pr_number"]] for e in events)
    for tgt, n in ev_counts.most_common():
        print(" {:6d} events -> {!r}".format(n, tgt))

    if not apply:
        print("\nDry-run complete. Run with --apply to commit "
              "({} PRs + {} sources + {} events).".format(
                  len(pr_targets), src_count, len(events)))
        return 0

    # All writes share one transaction: a single commit after the last PR,
    # and a rollback of everything if any statement fails.
    try:
        pr_updated, src_updated, ev_updated, ev_collisions = _apply_changes(
            conn, pr_targets
        )
        conn.commit()
    except Exception:
        conn.rollback()
        raise

    print("\nCommitted.")
    print(" prs.submitted_by moves: {}".format(pr_updated))
    print(" sources.submitted_by moves: {}".format(src_updated))
    print(" contribution_events moves: {}".format(ev_updated))
    print(" ce collisions deleted: {}".format(ev_collisions))
    return 0


def main():
    """Command-line entry point: plan and optionally apply the reattribution.

    Options:
        --apply  commit the changes (without it the run is a dry-run that only
                 prints what would change).
        --db     path of the SQLite database (default: DB_PATH).

    Returns:
        0 on completion, for use as the process exit status.

    Raises:
        Any exception raised while applying changes is re-raised after the
        transaction has been rolled back.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--apply", action="store_true", help="commit changes (default: dry-run)")
    ap.add_argument("--db", default=DB_PATH)
    args = ap.parse_args()

    conn = sqlite3.connect(args.db, timeout=30)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA busy_timeout = 30000")
        return _run(conn, args.db, args.apply)
    finally:
        # Release the database handle on every path (dry-run, success, error).
        conn.close()
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())