From d0fb4c96e3e009ccf3a96ef98495e325cec0767a Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sun, 26 Apr 2026 14:21:10 +0100 Subject: [PATCH 1/5] fix(attribution): gate writer on publishers table (regression prevention) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema v26 (commit 3fe524d) split orgs/citations from contributors into the publishers table. Without a writer-side gate, every merged PR with `sourcer: cnbc` (or similar) re-creates CNBC as a contributor and undoes the v26 classifier cleanup. Once normal pipeline traffic resumes, the contributors table re-pollutes within hours. Fix: belt-and-suspenders gate at both writer surfaces. 1. `lib/attribution.py::is_publisher_handle(handle, conn)` — returns publisher.id if handle exists in publishers.name, else None. Falls back gracefully on pre-v26 DBs (no publishers table → returns None → writer behaves like before, no regression). 2. `lib/contributor.py::insert_contribution_event` — checks is_publisher_handle on canonical handle before INSERT. If it's a publisher, debug-log + return False. Prevents originator events for CNBC/SpaceNews/etc. 3. `lib/contributor.py::upsert_contributor` — same gate at top. Prevents the contributors table from re-acquiring publisher rows. Verified end-to-end against live VPS DB snapshot: - CNBC originator event: blocked (insert returns False) - CNBC contributors row: blocked (no row created) - alexastrum, thesensatore, newhandle_xyz: pass through unchanged - is_publisher_handle handles case-insensitive lookup correctly (CNBC and cnbc both match publisher_id=3) Pre-deploy event count was 3705. Post-classifier cleanup: 3623 (82 org events purged). Going forward, no new org events accumulate. Branch 2 of the schema-v26 rollout. Branch 3 (auto-create at tier='cited', extract.py sources.publisher_id wiring) is separate scope and not required for regression prevention. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 27 +++++++++++++++++++++++++++ lib/contributor.py | 16 +++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/lib/attribution.py b/lib/attribution.py index 664c39c..68e69eb 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -108,6 +108,33 @@ def classify_kind(handle: str) -> str: return "person" +def is_publisher_handle(handle: str, conn) -> int | None: + """Return publisher.id if the handle exists as a publisher name, else None. + + Schema v26 split orgs/citations into the publishers table. Writer code + (upsert_contributor, insert_contribution_event) calls this to gate creating + contributor rows or events for handles that belong to publishers. + + Without this gate, every merged PR with `sourcer: cnbc` (for example) would + re-create CNBC as a contributor and undo the v26 classifier cleanup. + + Falls back gracefully on pre-v26 DBs: returns None if publishers table + doesn't exist yet (writer behaves like before, no regression). + """ + if not handle or conn is None: + return None + h = handle.strip().lower().lstrip("@") + try: + row = conn.execute( + "SELECT id FROM publishers WHERE name = ?", (h,), + ).fetchone() + if row: + return row["id"] if hasattr(row, "keys") else row[0] + except Exception: + logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True) + return None + + # ─── Parse attribution from claim content ────────────────────────────────── diff --git a/lib/contributor.py b/lib/contributor.py index a2117d6..b2cc11d 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -14,7 +14,7 @@ import logging import re from . 
import config, db -from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle +from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, is_publisher_handle, normalize_handle from .forgejo import get_pr_diff logger = logging.getLogger("pipeline.contributor") @@ -62,6 +62,12 @@ def insert_contribution_event( canonical = normalize_handle(handle, conn=conn) if not canonical: return False + # Schema v26 gate: handles classified as publishers (CNBC, SpaceNews, arxiv, + # etc.) are provenance metadata, not contributors. Don't credit them. Without + # this gate every merge re-creates org events and undoes the v26 cleanup. + if is_publisher_handle(canonical, conn) is not None: + logger.debug("insert_contribution_event: %r is a publisher — skipping event", canonical) + return False kind = classify_kind(canonical) try: cur = conn.execute( @@ -419,6 +425,14 @@ def upsert_contributor( logger.warning("Unknown contributor role: %s", role) return + # Schema v26 gate: orgs/citations live in publishers table, not contributors. + # Skip without writing so the v26 classifier cleanup isn't undone by every + # merge that has `sourcer: cnbc` (or similar) in claim frontmatter. + canonical_handle = handle.strip().lower().lstrip("@") if handle else "" + if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None: + logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle) + return + existing = conn.execute( "SELECT handle FROM contributors WHERE handle = ?", (handle,) ).fetchone() From dea1b02aa6d930876e3b5cce02cd1aca923ef7e5 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sun, 26 Apr 2026 14:25:24 +0100 Subject: [PATCH 2/5] fix(attribution): narrow exception + document gate asymmetry (Ganymede review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up fixes from Ganymede's review of d0fb4c9: 1. 
is_publisher_handle: narrow `except Exception` to sqlite3.OperationalError. Pre-v26 DB fallback only needs to catch the "table doesn't exist" case; broader exceptions (programming errors, locks, corruption) should propagate. 2. upsert_contributor gate: add comment documenting the alias-resolution asymmetry between insert_contribution_event (alias-resolved via normalize_handle) and upsert_contributor (bare lower+lstrip-@). Today this is fine because the v26 classifier produced one publisher row per canonical handle. Branch 3 will normalize alias→canonical at writer entry points, tightening this gate transparently. Unit tests for the gates (positive + negative + alias resolution) deferred to Branch 3 alongside the auto-create flow tests. Smoke-tested: - pre-v26 fallback (no publishers table) → None (correct) - case-insensitive match (CNBC → id=1) → correct - @ prefix strip (@cnbc → id=1) → correct - non-publisher handle (alexastrum) → None (correct) Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 8 ++++++-- lib/contributor.py | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/attribution.py b/lib/attribution.py index 68e69eb..8b571b9 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -15,6 +15,7 @@ Epimetheus owns this module. Leo reviews changes. import logging import re +import sqlite3 from pathlib import Path logger = logging.getLogger("pipeline.attribution") @@ -130,8 +131,11 @@ def is_publisher_handle(handle: str, conn) -> int | None: ).fetchone() if row: return row["id"] if hasattr(row, "keys") else row[0] - except Exception: - logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True) + except sqlite3.OperationalError: + # Pre-v26 DB: publishers table doesn't exist yet. Fall through to None + # so writer behaves as before. Any other exception class is real signal + # (programming error, lock contention, corruption) — let it propagate. 
+ logger.debug("is_publisher_handle: publishers table not present (pre-v26?)", exc_info=True) return None diff --git a/lib/contributor.py b/lib/contributor.py index b2cc11d..983fe6b 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -428,6 +428,13 @@ def upsert_contributor( # Schema v26 gate: orgs/citations live in publishers table, not contributors. # Skip without writing so the v26 classifier cleanup isn't undone by every # merge that has `sourcer: cnbc` (or similar) in claim frontmatter. + # + # Note: bare normalization (lower + lstrip @), no alias resolution. This is + # consistent with the existing `SELECT handle FROM contributors WHERE handle = ?` + # below — both look up by canonical-form-as-stored. Today's classifier produces + # one publisher row per canonical handle, so bare lookup hits. Branch 3 will + # normalize alias→canonical at writer entry points (extract.py, post_extract); + # at that point this gate auto-tightens because callers pass canonical handles. canonical_handle = handle.strip().lower().lstrip("@") if handle else "" if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None: logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle) From 2d332c66d4ba1496d0b749557eb1cef6b15d0df8 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 27 Apr 2026 12:38:53 +0100 Subject: [PATCH 3/5] fix(attribution): credit research-session sources to agents, not m3taversal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-part fix for a bug where every claim extracted from agent overnight research sessions was being credited to m3taversal in contribution_events (visible in the activity feed as "@m3taversal" on agent-derived claims). Forward fix (research/research-session.sh): The frontmatter template the agent prompt instructs Claude to use now includes `proposed_by: ${AGENT}` and `intake_tier: research-task`. 
With those fields present, extract.py path 1 (line 687) takes precedence and sets prs.submitted_by to the agent handle, which then propagates into contribution_events as a kind='agent' author event for the agent. Without the fields, extract.py fell through to the default branch on line 695 and set submitted_by='@m3taversal'. Backfill (scripts/backfill-research-session-attribution.py): Identifies research-session-derived PRs by finding teleo-codex commits matching `^<agent>: research session YYYY-MM-DD —`, listing the inbox/queue/*.md files added in each commit's diff, and matching those filename basenames against prs.source_path. Only PRs currently submitted_by='@m3taversal' AND merged within the configurable window are touched. Default --dry-run; --apply to commit. For each match the script: 1. UPDATE prs SET submitted_by = '<agent> (self-directed)' 2. INSERT OR IGNORE the agent author event (kind='agent', weight=0.30) with the original PR's domain, channel, merged_at preserved 3. DELETE the misattributed m3taversal author event Applied 30-day backfill on VPS: - 304 PRs re-attributed (rio 74, clay 70, astra 53, vida 48, theseus 30, leo 29) - 297 m3taversal author events deleted, 304 agent author events inserted (delta of 7 = pre-v24 PRs that never had m3ta events in the first place; we still create the new agent event) - m3taversal author count: 1368 → 1071 (−22%) - Pre-backfill DB snapshot: pipeline.db.bak-pre-research-attribution Co-Authored-By: Claude Opus 4.7 (1M context) --- research/research-session.sh | 2 + .../backfill-research-session-attribution.py | 280 ++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 scripts/backfill-research-session-attribution.py diff --git a/research/research-session.sh b/research/research-session.sh index abc6ab8..4f6703a 100755 --- a/research/research-session.sh +++ b/research/research-session.sh @@ -267,6 +267,8 @@ format: tweet | thread status: unprocessed priority: high | medium | low tags: [topic1, topic2] 
+proposed_by: ${AGENT} +intake_tier: research-task --- ## Content diff --git a/scripts/backfill-research-session-attribution.py b/scripts/backfill-research-session-attribution.py new file mode 100644 index 0000000..21bf63d --- /dev/null +++ b/scripts/backfill-research-session-attribution.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""Backfill: re-attribute research-session-derived PRs from m3taversal to agent. + +Problem: research-session.sh used to write source frontmatter without +`proposed_by` / `intake_tier`, so extract.py's contributor-classification +fallback set `prs.submitted_by = '@m3taversal'`, which propagated into +`contribution_events` as a `handle='m3taversal', role='author'` row per +research-derived claim. Result: agent research credited to the human. + +Forward fix is a frontmatter-template patch to research-session.sh. +This script corrects historical records. + +Identification: + Research-session source archives are committed to teleo-codex with a + message matching `^<agent>: research session YYYY-MM-DD —`. The diff + for that commit lists `inbox/queue/*.md` files the agent created. Any + PR whose `source_path` matches one of those filenames is research-derived. + +Touch list (per matched PR): + 1. UPDATE prs SET submitted_by = '<agent> (self-directed)' + 2. DELETE FROM contribution_events + WHERE handle='m3taversal' AND role='author' AND pr_number=? + 3. INSERT OR IGNORE INTO contribution_events with handle=<agent>, + kind='agent', role='author', weight=0.30, original timestamp/domain/channel. + +Defaults to --dry-run. Pass --apply to commit changes. 
+ +Usage: + python3 backfill-research-session-attribution.py --dry-run --days 30 + python3 backfill-research-session-attribution.py --apply --days 30 +""" + +import argparse +import logging +import os +import re +import sqlite3 +import subprocess +import sys +from collections import defaultdict +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger("backfill-research-attr") + +DEFAULT_REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")) +DEFAULT_DB = Path(os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")) + +KNOWN_AGENTS = frozenset({"rio", "leo", "theseus", "vida", "clay", "astra"}) +COMMIT_HEADER_RE = re.compile(r"^([a-z]+):\s+research session\s+\d{4}-\d{2}-\d{2}\s+—") +AUTHOR_WEIGHT = 0.30 + + +def git(repo: Path, *args: str) -> str: + """Run a git command in repo, return stdout. Raises on non-zero.""" + result = subprocess.run( + ["git", "-C", str(repo), *args], + capture_output=True, text=True, check=True, + ) + return result.stdout + + +def discover_research_session_archives(repo: Path, days: int) -> dict[str, str]: + """Return {source_filename_basename: agent_handle} for last N days. + + Walks teleo-codex `git log --since`, filters to research-session commits, + parses agent from message header, lists inbox/queue/*.md files added in + that commit's diff. Maps the basename (which becomes source_path on extract) + to the agent who created it. 
+ """ + log = git(repo, "log", f"--since={days} days ago", "--pretty=%H|%s", "--no-merges") + file_to_agent: dict[str, str] = {} + commits_seen = 0 + commits_matched = 0 + for line in log.splitlines(): + if not line or "|" not in line: + continue + commits_seen += 1 + sha, _, subject = line.partition("|") + m = COMMIT_HEADER_RE.match(subject) + if not m: + continue + agent = m.group(1) + if agent not in KNOWN_AGENTS: + logger.debug("skipping commit %s — unknown agent %r", sha[:8], agent) + continue + commits_matched += 1 + # List files added in this commit (inbox/queue/*.md only) + try: + added = git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r", + "--diff-filter=A", sha) + except subprocess.CalledProcessError: + logger.warning("diff-tree failed for %s", sha[:8]) + continue + for f in added.splitlines(): + if f.startswith("inbox/queue/") and f.endswith(".md"): + basename = Path(f).name + if basename in file_to_agent and file_to_agent[basename] != agent: + logger.warning( + "filename collision: %s — was %s, now %s (keeping first)", + basename, file_to_agent[basename], agent, + ) + continue + file_to_agent.setdefault(basename, agent) + logger.info( + "scanned %d commits, %d research-session matches, %d unique source files", + commits_seen, commits_matched, len(file_to_agent), + ) + return file_to_agent + + +def find_misattributed_prs(conn: sqlite3.Connection, file_to_agent: dict[str, str], days: int): + """Return list of (pr_number, current_submitted_by, source_path, agent, domain, channel, merged_at). 
+ + Only includes PRs: + - with source_path basename in our research-session map + - currently attributed to '@m3taversal' + - merged within the last N days (cap on temporal scope) + """ + rows = conn.execute( + """SELECT number, submitted_by, source_path, domain, source_channel, merged_at + FROM prs + WHERE submitted_by = '@m3taversal' + AND source_path IS NOT NULL + AND status = 'merged' + AND merged_at > datetime('now', ?)""", + (f"-{days} days",), + ).fetchall() + matches = [] + for row in rows: + basename = Path(row["source_path"]).name + agent = file_to_agent.get(basename) + if agent: + matches.append({ + "pr": row["number"], + "current_submitted_by": row["submitted_by"], + "source_path": row["source_path"], + "basename": basename, + "agent": agent, + "domain": row["domain"], + "channel": row["source_channel"], + "merged_at": row["merged_at"], + }) + return matches + + +def existing_event_count(conn: sqlite3.Connection, pr: int, handle: str, role: str) -> int: + """Return count of contribution_events rows matching (handle, role, pr_number, claim_path IS NULL).""" + return conn.execute( + """SELECT COUNT(*) FROM contribution_events + WHERE handle = ? AND role = ? AND pr_number = ? AND claim_path IS NULL""", + (handle, role, pr), + ).fetchone()[0] + + +def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool) -> dict: + """Apply the backfill. 
Returns counters.""" + counters = defaultdict(int) + if not dry_run: + conn.execute("BEGIN") + try: + for m in matches: + pr = m["pr"] + agent = m["agent"] + + # Pre-checks for accurate dry-run reporting + old_event_exists = existing_event_count(conn, pr, "m3taversal", "author") > 0 + new_event_exists = existing_event_count(conn, pr, agent, "author") > 0 + + if dry_run: + logger.info( + "would update pr=%d submitted_by '%s' → '%s (self-directed)' " + "[m3ta_event=%s, agent_event=%s]", + pr, m["current_submitted_by"], agent, + old_event_exists, new_event_exists, + ) + counters["prs"] += 1 + if old_event_exists: + counters["events_to_delete"] += 1 + if not new_event_exists: + counters["events_to_insert"] += 1 + continue + + # 1. UPDATE prs.submitted_by + conn.execute( + "UPDATE prs SET submitted_by = ? WHERE number = ?", + (f"{agent} (self-directed)", pr), + ) + counters["prs"] += 1 + + # 2. INSERT new agent author event (idempotent via UNIQUE index) + cur = conn.execute( + """INSERT OR IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path, domain, channel, timestamp) + VALUES (?, 'agent', 'author', ?, ?, NULL, ?, ?, COALESCE(?, datetime('now')))""", + (agent, AUTHOR_WEIGHT, pr, m["domain"], m["channel"], m["merged_at"]), + ) + if cur.rowcount > 0: + counters["events_inserted"] += 1 + + # 3. DELETE old m3taversal author event + cur = conn.execute( + """DELETE FROM contribution_events + WHERE handle = 'm3taversal' AND role = 'author' + AND pr_number = ? 
AND claim_path IS NULL""", + (pr,), + ) + if cur.rowcount > 0: + counters["events_deleted"] += 1 + + if not dry_run: + conn.execute("COMMIT") + except Exception: + if not dry_run: + conn.execute("ROLLBACK") + raise + + return dict(counters) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--repo", type=Path, default=DEFAULT_REPO) + parser.add_argument("--db", type=Path, default=DEFAULT_DB) + parser.add_argument("--days", type=int, default=30) + parser.add_argument("--apply", action="store_true", help="commit changes (default: dry-run)") + parser.add_argument("--limit", type=int, default=0, + help="cap PR updates (0 = no cap; useful for testing on a small slice)") + args = parser.parse_args() + dry_run = not args.apply + + logger.info("repo=%s db=%s days=%d mode=%s", + args.repo, args.db, args.days, "DRY-RUN" if dry_run else "APPLY") + + if not args.repo.exists(): + logger.error("repo not found: %s", args.repo) + sys.exit(1) + if not args.db.exists(): + logger.error("db not found: %s", args.db) + sys.exit(1) + + file_to_agent = discover_research_session_archives(args.repo, args.days) + if not file_to_agent: + logger.warning("no research-session source files found in last %d days", args.days) + sys.exit(0) + + # Per-agent breakdown + by_agent = defaultdict(int) + for agent in file_to_agent.values(): + by_agent[agent] += 1 + for agent, count in sorted(by_agent.items()): + logger.info(" research-session sources by %s: %d", agent, count) + + conn = sqlite3.connect(args.db) + conn.row_factory = sqlite3.Row + matches = find_misattributed_prs(conn, file_to_agent, args.days) + logger.info("misattributed PRs found: %d", len(matches)) + + if args.limit and len(matches) > args.limit: + logger.info("--limit=%d — truncating from %d", args.limit, len(matches)) + matches = matches[:args.limit] + + if not matches: + logger.info("nothing to do") + return + + # Per-agent breakdown of misattribution + miss_by_agent = defaultdict(int) + for m in matches: + 
miss_by_agent[m["agent"]] += 1 + logger.info("misattributed PR breakdown:") + for agent, count in sorted(miss_by_agent.items()): + logger.info(" %s: %d", agent, count) + + counters = apply_backfill(conn, matches, dry_run) + logger.info("RESULT (%s): %s", "DRY-RUN" if dry_run else "APPLIED", counters) + + +if __name__ == "__main__": + main() From 319e03e2c6676a1d622a8e34fabdf01002b31692 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 27 Apr 2026 12:50:17 +0100 Subject: [PATCH 4/5] test(attribution): prove research-backfill replay is idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five tests against the real contribution_events schema (lib/db.py:181-209): - pr-level dedup with NULL claim_path via idx_ce_unique_pr partial index - per-claim dedup with non-NULL claim_path via idx_ce_unique_claim partial index - pr-level and per-claim events coexist on the same pr_number - backfill (INSERT correct + DELETE wrong) is a true no-op on replay - replay against already-backfilled state preserves unrelated events Schema case identified: case 2 with partial-index split solution already in place. Two partial UNIQUE indexes target disjoint row sets (claim_path IS NULL vs IS NOT NULL), bypassing SQLite's NULL-not-equal-NULL UNIQUE quirk. Production replay verified: re-running backfill --apply against the live DB returns "misattributed PRs found: 0" because the first-run UPDATE flipped the WHERE predicate. Total contribution_events count: 3839 → 3839. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_research_backfill_idempotent.py | 167 +++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 tests/test_research_backfill_idempotent.py diff --git a/tests/test_research_backfill_idempotent.py b/tests/test_research_backfill_idempotent.py new file mode 100644 index 0000000..fac5969 --- /dev/null +++ b/tests/test_research_backfill_idempotent.py @@ -0,0 +1,167 @@ +"""Verify research-attribution backfill is replay-safe against real schema. + +Three things to prove: +1. (handle, role, pr_number) with claim_path=NULL deduplicates correctly + (idx_ce_unique_pr partial index handles SQLite NULL-not-equal-NULL). +2. Re-inserting an existing (handle, role, pr_number, NULL) row via INSERT OR IGNORE + is a true no-op — does not create a phantom duplicate. +3. The backfill script's specific operation (DELETE then INSERT for same key) + nets zero rows when run twice in sequence. +""" + +import sqlite3 +import sys + +# Schema lifted verbatim from lib/db.py:181-209 +SCHEMA = """ +CREATE TABLE contribution_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + handle TEXT NOT NULL, + kind TEXT NOT NULL DEFAULT 'person', + role TEXT NOT NULL, + weight REAL NOT NULL, + pr_number INTEGER NOT NULL, + claim_path TEXT, + domain TEXT, + channel TEXT, + timestamp TEXT NOT NULL DEFAULT (datetime('now')) +); +CREATE UNIQUE INDEX idx_ce_unique_claim ON contribution_events( + handle, role, pr_number, claim_path +) WHERE claim_path IS NOT NULL; +CREATE UNIQUE INDEX idx_ce_unique_pr ON contribution_events( + handle, role, pr_number +) WHERE claim_path IS NULL; +""" + + +def setup() -> sqlite3.Connection: + conn = sqlite3.connect(":memory:") + conn.row_factory = sqlite3.Row + conn.executescript(SCHEMA) + return conn + + +def insert_event(conn, handle, role, pr_number, claim_path=None): + cur = conn.execute( + """INSERT OR IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path) + VALUES (?, 
'agent', ?, 0.30, ?, ?)""", + (handle, role, pr_number, claim_path), + ) + return cur.rowcount + + +def count(conn) -> int: + return conn.execute("SELECT COUNT(*) FROM contribution_events").fetchone()[0] + + +def test_pr_level_dedup_with_null_claim_path(): + """Two inserts of same (handle, role, pr_number, NULL) → 1 row.""" + conn = setup() + r1 = insert_event(conn, "rio", "author", 4061) + r2 = insert_event(conn, "rio", "author", 4061) + n = count(conn) + assert r1 == 1, f"first insert should write, got rowcount={r1}" + assert r2 == 0, f"second insert should be ignored, got rowcount={r2}" + assert n == 1, f"expected 1 row, got {n}" + print("PASS: pr-level dedup with NULL claim_path") + + +def test_per_claim_dedup_with_path(): + """Two inserts of same (handle, role, pr_number, path) → 1 row.""" + conn = setup() + r1 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md") + r2 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md") + n = count(conn) + assert r1 == 1 and r2 == 0 and n == 1 + print("PASS: per-claim dedup with claim_path") + + +def test_pr_level_and_per_claim_coexist(): + """A (handle, role, pr_number, NULL) and (handle, role, pr_number, 'x.md') coexist + because the partial indexes target different rows.""" + conn = setup() + r1 = insert_event(conn, "rio", "author", 4061, claim_path=None) + r2 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md") + n = count(conn) + assert r1 == 1 and r2 == 1 and n == 2 + print("PASS: pr-level and per-claim events coexist on same pr_number") + + +def test_backfill_replay_is_noop(): + """Simulate the exact backfill operation: INSERT correct event, DELETE wrong event. + Run twice. 
Expect identical state — no phantom rows, no double-deletions.""" + conn = setup() + + # Initial state: m3taversal has the wrong author event for pr=4061 + insert_event(conn, "m3taversal", "author", 4061) + assert count(conn) == 1 + + def backfill_pr_4061(): + # Insert the correct event (rio is the real author) + conn.execute( + """INSERT OR IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path) + VALUES (?, 'agent', 'author', 0.30, 4061, NULL)""", + ("rio (self-directed)",), + ) + # Delete the wrong event + conn.execute( + """DELETE FROM contribution_events + WHERE handle='m3taversal' AND role='author' + AND pr_number=4061 AND claim_path IS NULL""", + ) + conn.commit() + + backfill_pr_4061() + state_after_first = sorted( + (r["handle"], r["role"], r["pr_number"], r["claim_path"]) + for r in conn.execute("SELECT * FROM contribution_events") + ) + assert state_after_first == [("rio (self-directed)", "author", 4061, None)], state_after_first + + # Replay + backfill_pr_4061() + state_after_second = sorted( + (r["handle"], r["role"], r["pr_number"], r["claim_path"]) + for r in conn.execute("SELECT * FROM contribution_events") + ) + assert state_after_first == state_after_second, "replay should be idempotent" + assert count(conn) == 1, f"expected 1 row after replay, got {count(conn)}" + print("PASS: backfill replay is a true no-op") + + +def test_replay_against_already_backfilled_pr_does_not_double_delete(): + """If m3taversal event was already deleted, running backfill again must not error + or affect anything else.""" + conn = setup() + # Already-correct state: rio has the author event, m3taversal does not + insert_event(conn, "rio (self-directed)", "author", 4061) + insert_event(conn, "leo", "evaluator", 4061) # noise — should not be touched + + # Run backfill: tries to INSERT (rio, author, 4061) — already exists, no-op + # Tries to DELETE (m3taversal, author, 4061) — already absent, 0 rows affected + cur1 = conn.execute( + """INSERT OR 
IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path) + VALUES ('rio (self-directed)', 'agent', 'author', 0.30, 4061, NULL)""", + ) + cur2 = conn.execute( + """DELETE FROM contribution_events + WHERE handle='m3taversal' AND role='author' + AND pr_number=4061 AND claim_path IS NULL""", + ) + assert cur1.rowcount == 0, f"insert should be no-op, got {cur1.rowcount}" + assert cur2.rowcount == 0, f"delete should be no-op, got {cur2.rowcount}" + assert count(conn) == 2, f"expected 2 rows preserved, got {count(conn)}" + print("PASS: replay against already-backfilled state preserves unrelated events") + + +if __name__ == "__main__": + test_pr_level_dedup_with_null_claim_path() + test_per_claim_dedup_with_path() + test_pr_level_and_per_claim_coexist() + test_backfill_replay_is_noop() + test_replay_against_already_backfilled_pr_does_not_double_delete() + print("\nAll 5 tests passed against real schema.") From 6aff03ff56cae46b37137584b3ce6aae5bd57982 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 27 Apr 2026 12:53:52 +0100 Subject: [PATCH 5/5] fix(attribution): unify research-session format on "(self-directed)" suffix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves the format inconsistency between the forward fix and the 304-row backfill. Both halves now produce prs.submitted_by = "rio (self-directed)": - research-session.sh: drop proposed_by from the frontmatter template. extract.py path 1 (proposed_by-driven) no longer fires; path 2 fires instead and constructs f"{agent} (self-directed)" — matches backfill. - attribution.py: normalize_handle now strips "(self-directed)" suffix immediately after lowercase+@-strip, before alias lookup. Closes the phantom-person-event class on any future replay through record_contributor_attribution. Round-trips through alias rules keyed on bare agent names. 
Test (5 cases) still passes; suffix-strip behavior verified against hostile inputs (whitespace, casing, mid-string occurrences must NOT match — only trailing pattern). Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 1 + research/research-session.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/attribution.py b/lib/attribution.py index 8b571b9..694bd1f 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -82,6 +82,7 @@ def normalize_handle(handle: str, conn=None) -> str: if not handle: return "" h = handle.strip().lower().lstrip("@") + h = re.sub(r"\s*\(self-directed\)\s*$", "", h) if conn is None: return h try: diff --git a/research/research-session.sh b/research/research-session.sh index 4f6703a..dc40e07 100755 --- a/research/research-session.sh +++ b/research/research-session.sh @@ -267,7 +267,6 @@ format: tweet | thread status: unprocessed priority: high | medium | low tags: [topic1, topic2] -proposed_by: ${AGENT} intake_tier: research-task ---