Merge pull request 'fix(attribution): classify submitted_by by branch prefix at PR discovery' (#11) from fix/reattribute-by-branch-prefix into main
Some checks are pending
CI / lint-and-test (push) Waiting to run

Reviewed-on: #11
This commit is contained in:
fwazb 2026-05-13 03:57:04 +00:00
commit a54f52234a
2 changed files with 203 additions and 7 deletions

View file

@ -112,16 +112,44 @@ async def discover_external_prs(conn) -> int:
# Detect origin: pipeline agents have per-agent Forgejo users
pipeline_users = {"teleo", "rio", "clay", "theseus", "vida", "astra", "leo"}
author = pr.get("user", {}).get("login", "")
is_pipeline = author.lower() in pipeline_users
branch_ref = pr["head"]["ref"]
# Pre-classify by branch prefix — pipeline-shape branches are
# pipeline regardless of Forgejo opener. reweave.py and
# ingestion run as the operator's token, so opener-based
# classification mis-credited system maintenance to the
# operator (~2.7k PRs on m3ta's contributor row before the
# 2026-05-12 reattribute fix). Branch prefix is the canonical
# signal: reweave/ingestion -> 'pipeline', <agent>/ -> agent.
branch_target = None
if branch_ref.startswith(("reweave/", "ingestion/")):
branch_target = "pipeline"
elif branch_ref.startswith(_AGENT_NAMES):
# _AGENT_NAMES is a tuple of bare names; agent branches
# are "<name>/..." so use the tuple as a startswith prefix
# set after appending '/'.
for name in _AGENT_NAMES:
if branch_ref.startswith(name + "/"):
branch_target = name
break
is_pipeline = author.lower() in pipeline_users or branch_target is not None
origin = "pipeline" if is_pipeline else "human"
priority = "high" if origin == "human" else None
domain = None if not is_pipeline else detect_domain_from_branch(pr["head"]["ref"])
agent, commit_type = classify_branch(pr["head"]["ref"])
source_channel = classify_source_channel(pr["head"]["ref"])
domain = None if not is_pipeline else detect_domain_from_branch(branch_ref)
agent, commit_type = classify_branch(branch_ref)
source_channel = classify_source_channel(branch_ref)
# For human PRs, submitted_by is the Forgejo author.
# For pipeline PRs, submitted_by is set later by extract.py (from source proposed_by).
submitted_by = author if origin == "human" else None
# submitted_by precedence (canonical handles only):
# 1. branch prefix (pipeline/agent) — set here at discovery
# 2. Forgejo opener for human PRs — set here, lowercased
# 3. extract.py later (from source proposed_by) — left None
if branch_target is not None:
submitted_by = branch_target
elif origin == "human":
submitted_by = author.lower().lstrip("@") if author else None
else:
submitted_by = None
conn.execute(
"""INSERT OR IGNORE INTO prs

View file

@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""Reattribute PRs and their author events from m3taversal to the true author.
Scope (intentionally conservative):
- branch reweave/* -> pipeline (system maintenance, no human author)
- branch ingestion/* -> pipeline (pipeline-internal source intake)
- branch <agent>/* -> <agent> (autonomous agent work)
for agent in {leo, vida, rio, astra, clay, theseus}.
NOT in scope:
- branch extract/* -- proposed_by may legitimately be absent
(telegram source drops default to operator).
Per affected PR (atomic):
1. UPDATE prs.submitted_by -> target
2. UPDATE sources.submitted_by where path = pr.source_path
3. UPDATE contribution_events.handle for every m3ta author event on this PR
(kind set to 'agent', since pipeline + the six agents are all kind='agent'
per attribution.PENTAGON_AGENTS).
Idempotent. Dry-run by default; --apply commits.
Run AFTER scripts/normalize-submitted-by.py.
"""
import argparse
import os
import sqlite3
import sys
from collections import Counter
DB_PATH = os.environ.get("DB_PATH", "/opt/teleo-eval/pipeline/pipeline.db")
AGENT_PREFIXES = ("leo/", "vida/", "rio/", "astra/", "clay/", "theseus/")
PIPELINE_PREFIXES = ("reweave/", "ingestion/")
def target_for(branch):
if not branch:
return None
if branch.startswith(PIPELINE_PREFIXES):
return "pipeline"
for prefix in AGENT_PREFIXES:
if branch.startswith(prefix):
return prefix.rstrip("/")
return None
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--apply", action="store_true", help="commit changes (default: dry-run)")
ap.add_argument("--db", default=DB_PATH)
args = ap.parse_args()
conn = sqlite3.connect(args.db, timeout=30)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA busy_timeout = 30000")
mode = "APPLY" if args.apply else "DRY-RUN"
print("DB: {}\nMode: {}\n".format(args.db, mode))
rows = conn.execute("""
SELECT number, branch, source_path
FROM prs
WHERE submitted_by = 'm3taversal'
AND branch IS NOT NULL
""").fetchall()
pr_targets = []
pr_counts = Counter()
for r in rows:
tgt = target_for(r["branch"])
if tgt is None:
continue
pr_targets.append((r["number"], r["branch"], r["source_path"], tgt))
pr_counts[tgt] += 1
print("prs to reattribute: {}".format(len(pr_targets)))
for tgt, n in pr_counts.most_common():
print(" {:6d} -> {!r}".format(n, tgt))
src_paths = [t[2] for t in pr_targets if t[2]]
src_count = 0
if src_paths:
placeholders = ",".join("?" * len(src_paths))
src_count = conn.execute(
"SELECT COUNT(*) FROM sources "
"WHERE submitted_by = 'm3taversal' AND path IN ({})".format(placeholders),
src_paths,
).fetchone()[0]
print("\nsources rows that will be re-pointed: {}".format(src_count))
pr_to_target = {p[0]: p[3] for p in pr_targets}
events = []
if pr_to_target:
pr_placeholders = ",".join("?" * len(pr_to_target))
events = conn.execute(
"SELECT id, pr_number FROM contribution_events "
"WHERE handle = 'm3taversal' AND role = 'author' "
"AND pr_number IN ({})".format(pr_placeholders),
list(pr_to_target.keys()),
).fetchall()
print("contribution_events author rows to move: {}".format(len(events)))
ev_counts = Counter(pr_to_target[e["pr_number"]] for e in events)
for tgt, n in ev_counts.most_common():
print(" {:6d} events -> {!r}".format(n, tgt))
if not args.apply:
print("\nDry-run complete. Run with --apply to commit "
"({} PRs + {} sources + {} events).".format(
len(pr_targets), src_count, len(events)))
return 0
pr_updated = 0
src_updated = 0
ev_updated = 0
ev_collisions = 0
try:
for pr_num, branch, source_path, target in pr_targets:
cur = conn.execute(
"UPDATE prs SET submitted_by = ? "
"WHERE number = ? AND submitted_by = 'm3taversal'",
(target, pr_num),
)
pr_updated += cur.rowcount
if source_path:
cur = conn.execute(
"UPDATE sources SET submitted_by = ? "
"WHERE path = ? AND submitted_by = 'm3taversal'",
(target, source_path),
)
src_updated += cur.rowcount
for ev in conn.execute(
"SELECT id FROM contribution_events "
"WHERE handle = 'm3taversal' AND role = 'author' AND pr_number = ?",
(pr_num,),
).fetchall():
try:
conn.execute(
"UPDATE contribution_events SET handle = ?, kind = 'agent' "
"WHERE id = ?",
(target, ev["id"]),
)
ev_updated += 1
except sqlite3.IntegrityError:
conn.execute(
"DELETE FROM contribution_events WHERE id = ?",
(ev["id"],),
)
ev_collisions += 1
conn.commit()
except Exception:
conn.rollback()
raise
print("\nCommitted.")
print(" prs.submitted_by moves: {}".format(pr_updated))
print(" sources.submitted_by moves: {}".format(src_updated))
print(" contribution_events moves: {}".format(ev_updated))
print(" ce collisions deleted: {}".format(ev_collisions))
return 0
if __name__ == "__main__":
sys.exit(main())