From 74bf0461e811d87a7b98b7480c24661aacc64c36 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Wed, 13 May 2026 02:56:50 +0000 Subject: [PATCH] fix(attribution): canonicalize submitted_by at write time + historical normalizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion / write-side fix to fix/activity-feed-canonical-handle. The activity-feed canonicalization was a read-side guard. The bug at the source is that extract.py and two backfill scripts write decorated strings (Vida (self-directed), pipeline (reweave), @m3taversal) into prs.submitted_by and sources.submitted_by. Downstream readers (lib.contributor.insert_contribution_event, scripts/scoring_digest, diagnostics/activity_feed_api) all strip the decorator on read — but anything that reads the column verbatim (like /api/activity-feed before the read-side fix) 404s on /contributors/{decorated-handle}. Stop writing the decorator. The self-directed signal is already carried by intake_tier == research-task plus the prs.agent column; the suffix is redundant string noise that costs us correctness at every consumer that forgets to strip. Changes: - lib/extract.py:690 — write canonical handle via attribution.normalize_handle. Direct elif for intake_tier == research-task now stores just agent_name. @m3taversal -> m3taversal. - diagnostics/backfill_submitted_by.py — same fix in two branches plus the reweave branch (pipeline (reweave) -> pipeline). - scripts/backfill-research-session-attribution.py — UPDATE prs sets agent handle alone, no suffix. Docstring + log line updated. - scripts/normalize-submitted-by.py (new) — one-time backfill that canonicalizes existing prs.submitted_by and sources.submitted_by rows. Strips trailing parenthetical decorators, lowercases, drops @. Defaults to dry-run; --apply to commit. Skips rows that would normalize to invalid handles (no garbage falls through silently). Dry-run against live pipeline.db: prs: 3008 rows need normalization (clean mappings, 0 invalid) sources: 730 rows need normalization (clean mappings, 0 invalid) Total: 3738 rows. All map to existing handle column values. After this lands + auto-deploys, the operator should run python3 scripts/normalize-submitted-by.py --apply once to clean historical rows. The read-side canonicalization in diagnostics/activity_feed_api.py (fix/activity-feed-canonical-handle) becomes redundant defense-in-depth instead of load-bearing. No KB writes. --- diagnostics/backfill_submitted_by.py | 23 ++-- lib/extract.py | 22 +++- .../backfill-research-session-attribution.py | 10 +- scripts/normalize-submitted-by.py | 114 ++++++++++++++++++ 4 files changed, 149 insertions(+), 20 deletions(-) create mode 100755 scripts/normalize-submitted-by.py diff --git a/diagnostics/backfill_submitted_by.py b/diagnostics/backfill_submitted_by.py index 7e1b44d..2549d65 100644 --- a/diagnostics/backfill_submitted_by.py +++ b/diagnostics/backfill_submitted_by.py @@ -79,12 +79,16 @@ def main(): fm = sfm break + # `submitted_by` is stored as a canonical handle (lowercase, no @, no + # "(self-directed)" / "(reweave)" suffix). Read consumers normalize via + # attribution.normalize_handle, so writing decorated strings produces + # downstream 404s on /contributors/{handle} (livingip-web timeline). if fm: proposed_by = fm.get("proposed_by") intake_tier = fm.get("intake_tier") if proposed_by: - contributor = proposed_by.strip().strip('"').strip("'") + contributor = proposed_by.strip().strip('"').strip("'").lower().lstrip("@") elif intake_tier == "research-task": # Derive agent from branch prefix prefix = branch.split("/", 1)[0] if "/" in branch else "unknown" @@ -94,13 +98,12 @@ def main(): "clay": "clay", "astra": "astra", "leo": "leo", "reweave": "pipeline", } - agent = agent_map.get(prefix, prefix) - contributor = f"{agent} (self-directed)" + contributor = agent_map.get(prefix, prefix) elif intake_tier == "directed": - contributor = "@m3taversal" + contributor = "m3taversal" else: - # Default: if source exists but no proposed_by, it was Cory's submission - contributor = "@m3taversal" + # Default: if source exists but no proposed_by, operator submitted it. + contributor = "m3taversal" if contributor: conn.execute( @@ -114,19 +117,19 @@ def main(): agent = branch.split("/", 1)[0] conn.execute( "UPDATE prs SET submitted_by = ? WHERE number = ?", - (f"{agent} (self-directed)", pr["number"]), + (agent, pr["number"]), ) updated += 1 elif branch.startswith("reweave/"): conn.execute( - "UPDATE prs SET submitted_by = 'pipeline (reweave)' WHERE number = ?", + "UPDATE prs SET submitted_by = 'pipeline' WHERE number = ?", (pr["number"],), ) updated += 1 else: - # Everything else (extract/, ingestion/, unknown) → Cory directed it + # Everything else (extract/, ingestion/, unknown) → operator directed it conn.execute( - "UPDATE prs SET submitted_by = '@m3taversal' WHERE number = ?", + "UPDATE prs SET submitted_by = 'm3taversal' WHERE number = ?", (pr["number"],), ) updated += 1 diff --git a/lib/extract.py b/lib/extract.py index 6f825f4..76e887d 100644 --- a/lib/extract.py +++ b/lib/extract.py @@ -32,6 +32,7 @@ from datetime import date from pathlib import Path from . import config +from .attribution import normalize_handle from .costs import record_usage from .db import classify_source_channel from .domains import agent_for_domain @@ -683,16 +684,25 @@ async def _extract_one_source( logger.info("PR #%d created for %s (%d claims, %d entities)", pr_num, source_file, len(claim_files), len(entity_files)) # Store contributor attribution: who submitted this source? - # Priority: proposed_by field → intake_tier inference → "unknown" + # Priority: proposed_by field → intake_tier inference → operator default. + # NB: `submitted_by` is a CANONICAL HANDLE — lowercase, no @, no + # trailing "(self-directed)" decorator. The "self-directed" signal is + # already carried by intake_tier == "research-task" + the prs.agent + # column; persisting it here as a string suffix produced decorated + # values like "Vida (self-directed)" that broke /contributors/{handle} + # lookups downstream (livingip-web timeline → 404). Read consumers + # (lib/contributor.insert_contribution_event, scripts/scoring_digest, + # diagnostics/activity_feed_api) all normalize via attribution.normalize_handle + # anyway, so writing the canonical form is the source-of-truth fix. if proposed_by: - contributor = proposed_by.strip().strip('"').strip("'") + contributor = normalize_handle(proposed_by, conn=conn) elif intake_tier == "research-task": - contributor = f"{agent_name} (self-directed)" + contributor = normalize_handle(agent_name, conn=conn) elif intake_tier == "directed": - contributor = "@m3taversal" + contributor = "m3taversal" else: - # Default: if no proposed_by and not a research task, Cory submitted it - contributor = "@m3taversal" + # Default: if no proposed_by and not a research task, operator submitted it. + contributor = "m3taversal" # Build pipe-separated claim titles for the description field claim_titles = " | ".join( diff --git a/scripts/backfill-research-session-attribution.py b/scripts/backfill-research-session-attribution.py index 21bf63d..d276889 100644 --- a/scripts/backfill-research-session-attribution.py +++ b/scripts/backfill-research-session-attribution.py @@ -17,7 +17,9 @@ Identification: PR whose `source_path` matches one of those filenames is research-derived. Touch list (per matched PR): - 1. UPDATE prs SET submitted_by = ' (self-directed)' + 1. UPDATE prs SET submitted_by = '' (canonical handle, lowercase, no + trailing "(self-directed)" suffix — see lib/extract.py and + diagnostics/activity_feed_api.py for why decorators leak into 404s) 2. DELETE FROM contribution_events WHERE handle='m3taversal' AND role='author' AND pr_number=? 3. INSERT OR IGNORE INTO contribution_events with handle=, @@ -169,7 +171,7 @@ def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool) if dry_run: logger.info( - "would update pr=%d submitted_by '%s' → '%s (self-directed)' " + "would update pr=%d submitted_by '%s' → '%s' " "[m3ta_event=%s, agent_event=%s]", pr, m["current_submitted_by"], agent, old_event_exists, new_event_exists, @@ -181,10 +183,10 @@ def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool) counters["events_to_insert"] += 1 continue - # 1. UPDATE prs.submitted_by + # 1. UPDATE prs.submitted_by — canonical handle only. conn.execute( "UPDATE prs SET submitted_by = ? WHERE number = ?", - (f"{agent} (self-directed)", pr), + (agent, pr), ) counters["prs"] += 1 diff --git a/scripts/normalize-submitted-by.py b/scripts/normalize-submitted-by.py new file mode 100755 index 0000000..4991dc4 --- /dev/null +++ b/scripts/normalize-submitted-by.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""One-time backfill: canonicalize prs.submitted_by and sources.submitted_by. + +Strips legacy decorators ("(self-directed)", "(reweave)"), lowercases, drops +the @ prefix. After this runs, every value matches the contract documented +on diagnostics/activity_feed_api.py::_normalize_contributor — and the +companion read-side fix becomes redundant defense-in-depth instead of +load-bearing. + +Defaults to --dry-run. Pass --apply to commit. + +Usage: + python3 normalize-submitted-by.py --dry-run + python3 normalize-submitted-by.py --apply +""" + +import argparse +import os +import re +import sqlite3 +import sys +from collections import Counter + +DEFAULT_DB = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db") + +# Valid handle: lowercase alphanum + _-, 1-39 chars (matches GitHub rules, +# same as pipeline/lib/attribution._HANDLE_RE). Anything with parens, spaces, +# or uppercase needs canonicalization. +_TRAILING_PAREN_RE = re.compile(r"\s*\([^)]*\)\s*$") +_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$") + + +def canonicalize(raw): + if raw is None: + return None + h = raw.strip().lower().lstrip("@") + h = _TRAILING_PAREN_RE.sub("", h).strip() + return h or None + + +def normalize_table(conn, table, dry_run): + cur = conn.execute( + f"SELECT rowid, submitted_by FROM {table} WHERE submitted_by IS NOT NULL" + ) + changes = [] + for row in cur.fetchall(): + old = row[1] + new = canonicalize(old) + if new != old: + changes.append((row[0], old, new)) + + print(f"\n{table}: {len(changes)} rows need normalization") + if not changes: + return 0 + + # Distribution preview + from_to = Counter((old, new) for _, old, new in changes) + for (old, new), count in from_to.most_common(15): + print(f" {count:>5} {old!r:40} -> {new!r}") + if len(from_to) > 15: + print(f" ... ({len(from_to) - 15} more distinct mappings)") + + # Sanity: every result is a valid handle (no garbage falls through). + invalid = [(rowid, old, new) for rowid, old, new in changes + if new is not None and not _HANDLE_RE.match(new)] + if invalid: + print(f"\n WARNING: {len(invalid)} rows would normalize to invalid handles:") + for rowid, old, new in invalid[:10]: + print(f" rowid={rowid} {old!r} -> {new!r}") + print(" These rows will be SKIPPED (left as-is). Inspect manually.") + + valid_changes = [(rowid, old, new) for rowid, old, new in changes + if new is None or _HANDLE_RE.match(new)] + + if dry_run: + print(f" [dry-run] would update {len(valid_changes)} rows in {table}") + return len(valid_changes) + + for rowid, _, new in valid_changes: + conn.execute( + f"UPDATE {table} SET submitted_by = ? WHERE rowid = ?", + (new, rowid), + ) + print(f" updated {len(valid_changes)} rows in {table}") + return len(valid_changes) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--db", default=DEFAULT_DB) + ap.add_argument("--apply", action="store_true", help="Commit changes (default is dry-run)") + ap.add_argument("--dry-run", action="store_true", help="Preview only (default)") + args = ap.parse_args() + + dry_run = not args.apply + print(f"DB: {args.db}") + print(f"Mode: {'DRY-RUN' if dry_run else 'APPLY'}") + + conn = sqlite3.connect(args.db, timeout=30) + try: + total = 0 + total += normalize_table(conn, "prs", dry_run) + total += normalize_table(conn, "sources", dry_run) + if not dry_run: + conn.commit() + print(f"\nCommitted. Total rows updated: {total}") + else: + print(f"\nDry-run complete. Run with --apply to commit ({total} rows pending).") + finally: + conn.close() + + +if __name__ == "__main__": + main()