From d0fb4c96e3e009ccf3a96ef98495e325cec0767a Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sun, 26 Apr 2026 14:21:10 +0100 Subject: [PATCH 1/5] fix(attribution): gate writer on publishers table (regression prevention) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema v26 (commit 3fe524d) split orgs/citations from contributors into the publishers table. Without a writer-side gate, every merged PR with `sourcer: cnbc` (or similar) re-creates CNBC as a contributor and undoes the v26 classifier cleanup. Once normal pipeline traffic resumes, the contributors table re-pollutes within hours. Fix: belt-and-suspenders gate at both writer surfaces. 1. `lib/attribution.py::is_publisher_handle(handle, conn)` — returns publisher.id if handle exists in publishers.name, else None. Falls back gracefully on pre-v26 DBs (no publishers table → returns None → writer behaves like before, no regression). 2. `lib/contributor.py::insert_contribution_event` — checks is_publisher_handle on canonical handle before INSERT. If it's a publisher, debug-log + return False. Prevents originator events for CNBC/SpaceNews/etc. 3. `lib/contributor.py::upsert_contributor` — same gate at top. Prevents the contributors table from re-acquiring publisher rows. Verified end-to-end against live VPS DB snapshot: - CNBC originator event: blocked (insert returns False) - CNBC contributors row: blocked (no row created) - alexastrum, thesensatore, newhandle_xyz: pass through unchanged - is_publisher_handle handles case-insensitive lookup correctly (CNBC and cnbc both match publisher_id=3) Pre-deploy event count was 3705. Post-classifier cleanup: 3623 (82 org events purged). Going forward, no new org events accumulate. Branch 2 of the schema-v26 rollout. Branch 3 (auto-create at tier='cited', extract.py sources.publisher_id wiring) is separate scope and not required for regression prevention. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 27 +++++++++++++++++++++++++++ lib/contributor.py | 16 +++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/lib/attribution.py b/lib/attribution.py index 664c39c..68e69eb 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -108,6 +108,33 @@ def classify_kind(handle: str) -> str: return "person" +def is_publisher_handle(handle: str, conn) -> int | None: + """Return publisher.id if the handle exists as a publisher name, else None. + + Schema v26 split orgs/citations into the publishers table. Writer code + (upsert_contributor, insert_contribution_event) calls this to gate creating + contributor rows or events for handles that belong to publishers. + + Without this gate, every merged PR with `sourcer: cnbc` (for example) would + re-create CNBC as a contributor and undo the v26 classifier cleanup. + + Falls back gracefully on pre-v26 DBs: returns None if publishers table + doesn't exist yet (writer behaves like before, no regression). + """ + if not handle or conn is None: + return None + h = handle.strip().lower().lstrip("@") + try: + row = conn.execute( + "SELECT id FROM publishers WHERE name = ?", (h,), + ).fetchone() + if row: + return row["id"] if hasattr(row, "keys") else row[0] + except Exception: + logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True) + return None + + # ─── Parse attribution from claim content ────────────────────────────────── diff --git a/lib/contributor.py b/lib/contributor.py index a2117d6..b2cc11d 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -14,7 +14,7 @@ import logging import re from . 
import config, db -from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle +from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, is_publisher_handle, normalize_handle from .forgejo import get_pr_diff logger = logging.getLogger("pipeline.contributor") @@ -62,6 +62,12 @@ def insert_contribution_event( canonical = normalize_handle(handle, conn=conn) if not canonical: return False + # Schema v26 gate: handles classified as publishers (CNBC, SpaceNews, arxiv, + # etc.) are provenance metadata, not contributors. Don't credit them. Without + # this gate every merge re-creates org events and undoes the v26 cleanup. + if is_publisher_handle(canonical, conn) is not None: + logger.debug("insert_contribution_event: %r is a publisher — skipping event", canonical) + return False kind = classify_kind(canonical) try: cur = conn.execute( @@ -419,6 +425,14 @@ def upsert_contributor( logger.warning("Unknown contributor role: %s", role) return + # Schema v26 gate: orgs/citations live in publishers table, not contributors. + # Skip without writing so the v26 classifier cleanup isn't undone by every + # merge that has `sourcer: cnbc` (or similar) in claim frontmatter. + canonical_handle = handle.strip().lower().lstrip("@") if handle else "" + if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None: + logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle) + return + existing = conn.execute( "SELECT handle FROM contributors WHERE handle = ?", (handle,) ).fetchone() From dea1b02aa6d930876e3b5cce02cd1aca923ef7e5 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sun, 26 Apr 2026 14:25:24 +0100 Subject: [PATCH 2/5] fix(attribution): narrow exception + document gate asymmetry (Ganymede review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up fixes from Ganymede's review of d0fb4c9: 1. 
is_publisher_handle: narrow `except Exception` to sqlite3.OperationalError. Pre-v26 DB fallback only needs to catch the "table doesn't exist" case; broader exceptions (programming errors, locks, corruption) should propagate. 2. upsert_contributor gate: add comment documenting the alias-resolution asymmetry between insert_contribution_event (alias-resolved via normalize_handle) and upsert_contributor (bare lower+lstrip-@). Today this is fine because the v26 classifier produced one publisher row per canonical handle. Branch 3 will normalize alias→canonical at writer entry points, tightening this gate transparently. Unit tests for the gates (positive + negative + alias resolution) deferred to Branch 3 alongside the auto-create flow tests. Smoke-tested: - pre-v26 fallback (no publishers table) → None (correct) - case-insensitive match (CNBC → id=1) → correct - @ prefix strip (@cnbc → id=1) → correct - non-publisher handle (alexastrum) → None (correct) Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 8 ++++++-- lib/contributor.py | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/attribution.py b/lib/attribution.py index 68e69eb..8b571b9 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -15,6 +15,7 @@ Epimetheus owns this module. Leo reviews changes. import logging import re +import sqlite3 from pathlib import Path logger = logging.getLogger("pipeline.attribution") @@ -130,8 +131,11 @@ def is_publisher_handle(handle: str, conn) -> int | None: ).fetchone() if row: return row["id"] if hasattr(row, "keys") else row[0] - except Exception: - logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True) + except sqlite3.OperationalError: + # Pre-v26 DB: publishers table doesn't exist yet. Fall through to None + # so writer behaves as before. Any other exception class is real signal + # (programming error, lock contention, corruption) — let it propagate. 
+ logger.debug("is_publisher_handle: publishers table not present (pre-v26?)", exc_info=True) return None diff --git a/lib/contributor.py b/lib/contributor.py index b2cc11d..983fe6b 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -428,6 +428,13 @@ def upsert_contributor( # Schema v26 gate: orgs/citations live in publishers table, not contributors. # Skip without writing so the v26 classifier cleanup isn't undone by every # merge that has `sourcer: cnbc` (or similar) in claim frontmatter. + # + # Note: bare normalization (lower + lstrip @), no alias resolution. This is + # consistent with the existing `SELECT handle FROM contributors WHERE handle = ?` + # below — both look up by canonical-form-as-stored. Today's classifier produces + # one publisher row per canonical handle, so bare lookup hits. Branch 3 will + # normalize alias→canonical at writer entry points (extract.py, post_extract); + # at that point this gate auto-tightens because callers pass canonical handles. canonical_handle = handle.strip().lower().lstrip("@") if handle else "" if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None: logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle) From 2d332c66d4ba1496d0b749557eb1cef6b15d0df8 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 27 Apr 2026 12:38:53 +0100 Subject: [PATCH 3/5] fix(attribution): credit research-session sources to agents, not m3taversal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-part fix for a bug where every claim extracted from agent overnight research sessions was being credited to m3taversal in contribution_events (visible in the activity feed as "@m3taversal" on agent-derived claims). Forward fix (research/research-session.sh): The frontmatter template the agent prompt instructs Claude to use now includes `proposed_by: ${AGENT}` and `intake_tier: research-task`. 
With those fields present, extract.py path 1 (line 687) takes precedence and sets prs.submitted_by to the agent handle, which then propagates into contribution_events as a kind='agent' author event for the agent. Without the fields, extract.py fell through to the default branch on line 695 and set submitted_by='@m3taversal'. Backfill (scripts/backfill-research-session-attribution.py): Identifies research-session-derived PRs by finding teleo-codex commits matching `^<agent>: research session YYYY-MM-DD —`, listing the inbox/queue/*.md files added in each commit's diff, and matching those filename basenames against prs.source_path. Only PRs currently submitted_by='@m3taversal' AND merged within the configurable window are touched. Default --dry-run; --apply to commit. For each match the script: 1. UPDATE prs SET submitted_by = '<agent> (self-directed)' 2. INSERT OR IGNORE the agent author event (kind='agent', weight=0.30) with the original PR's domain, channel, merged_at preserved 3. DELETE the misattributed m3taversal author event Applied 30-day backfill on VPS: - 304 PRs re-attributed (rio 74, clay 70, astra 53, vida 48, theseus 30, leo 29) - 297 m3taversal author events deleted, 304 agent author events inserted (delta of 7 = pre-v24 PRs that never had m3ta events in the first place; we still create the new agent event) - m3taversal author count: 1368 → 1071 (−22%) - Pre-backfill DB snapshot: pipeline.db.bak-pre-research-attribution Co-Authored-By: Claude Opus 4.7 (1M context) --- research/research-session.sh | 2 + .../backfill-research-session-attribution.py | 280 ++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 scripts/backfill-research-session-attribution.py diff --git a/research/research-session.sh b/research/research-session.sh index abc6ab8..4f6703a 100755 --- a/research/research-session.sh +++ b/research/research-session.sh @@ -267,6 +267,8 @@ format: tweet | thread status: unprocessed priority: high | medium | low tags: [topic1, topic2] 
+proposed_by: ${AGENT} +intake_tier: research-task --- ## Content diff --git a/scripts/backfill-research-session-attribution.py b/scripts/backfill-research-session-attribution.py new file mode 100644 index 0000000..21bf63d --- /dev/null +++ b/scripts/backfill-research-session-attribution.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""Backfill: re-attribute research-session-derived PRs from m3taversal to agent. + +Problem: research-session.sh used to write source frontmatter without +`proposed_by` / `intake_tier`, so extract.py's contributor-classification +fallback set `prs.submitted_by = '@m3taversal'`, which propagated into +`contribution_events` as a `handle='m3taversal', role='author'` row per +research-derived claim. Result: agent research credited to the human. + +Forward fix is a frontmatter-template patch to research-session.sh. +This script corrects historical records. + +Identification: + Research-session source archives are committed to teleo-codex with a + message matching `^<agent>: research session YYYY-MM-DD —`. The diff + for that commit lists `inbox/queue/*.md` files the agent created. Any + PR whose `source_path` matches one of those filenames is research-derived. + +Touch list (per matched PR): + 1. UPDATE prs SET submitted_by = '<agent> (self-directed)' + 2. DELETE FROM contribution_events + WHERE handle='m3taversal' AND role='author' AND pr_number=? + 3. INSERT OR IGNORE INTO contribution_events with handle=<agent>, + kind='agent', role='author', weight=0.30, original timestamp/domain/channel. + +Defaults to --dry-run. Pass --apply to commit changes. 
+ +Usage: + python3 backfill-research-session-attribution.py --dry-run --days 30 + python3 backfill-research-session-attribution.py --apply --days 30 +""" + +import argparse +import logging +import os +import re +import sqlite3 +import subprocess +import sys +from collections import defaultdict +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger("backfill-research-attr") + +DEFAULT_REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")) +DEFAULT_DB = Path(os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")) + +KNOWN_AGENTS = frozenset({"rio", "leo", "theseus", "vida", "clay", "astra"}) +COMMIT_HEADER_RE = re.compile(r"^([a-z]+):\s+research session\s+\d{4}-\d{2}-\d{2}\s+—") +AUTHOR_WEIGHT = 0.30 + + +def git(repo: Path, *args: str) -> str: + """Run a git command in repo, return stdout. Raises on non-zero.""" + result = subprocess.run( + ["git", "-C", str(repo), *args], + capture_output=True, text=True, check=True, + ) + return result.stdout + + +def discover_research_session_archives(repo: Path, days: int) -> dict[str, str]: + """Return {source_filename_basename: agent_handle} for last N days. + + Walks teleo-codex `git log --since`, filters to research-session commits, + parses agent from message header, lists inbox/queue/*.md files added in + that commit's diff. Maps the basename (which becomes source_path on extract) + to the agent who created it. 
+ """ + log = git(repo, "log", f"--since={days} days ago", "--pretty=%H|%s", "--no-merges") + file_to_agent: dict[str, str] = {} + commits_seen = 0 + commits_matched = 0 + for line in log.splitlines(): + if not line or "|" not in line: + continue + commits_seen += 1 + sha, _, subject = line.partition("|") + m = COMMIT_HEADER_RE.match(subject) + if not m: + continue + agent = m.group(1) + if agent not in KNOWN_AGENTS: + logger.debug("skipping commit %s — unknown agent %r", sha[:8], agent) + continue + commits_matched += 1 + # List files added in this commit (inbox/queue/*.md only) + try: + added = git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r", + "--diff-filter=A", sha) + except subprocess.CalledProcessError: + logger.warning("diff-tree failed for %s", sha[:8]) + continue + for f in added.splitlines(): + if f.startswith("inbox/queue/") and f.endswith(".md"): + basename = Path(f).name + if basename in file_to_agent and file_to_agent[basename] != agent: + logger.warning( + "filename collision: %s — was %s, now %s (keeping first)", + basename, file_to_agent[basename], agent, + ) + continue + file_to_agent.setdefault(basename, agent) + logger.info( + "scanned %d commits, %d research-session matches, %d unique source files", + commits_seen, commits_matched, len(file_to_agent), + ) + return file_to_agent + + +def find_misattributed_prs(conn: sqlite3.Connection, file_to_agent: dict[str, str], days: int): + """Return list of (pr_number, current_submitted_by, source_path, agent, domain, channel, merged_at). 
+ + Only includes PRs: + - with source_path basename in our research-session map + - currently attributed to '@m3taversal' + - merged within the last N days (cap on temporal scope) + """ + rows = conn.execute( + """SELECT number, submitted_by, source_path, domain, source_channel, merged_at + FROM prs + WHERE submitted_by = '@m3taversal' + AND source_path IS NOT NULL + AND status = 'merged' + AND merged_at > datetime('now', ?)""", + (f"-{days} days",), + ).fetchall() + matches = [] + for row in rows: + basename = Path(row["source_path"]).name + agent = file_to_agent.get(basename) + if agent: + matches.append({ + "pr": row["number"], + "current_submitted_by": row["submitted_by"], + "source_path": row["source_path"], + "basename": basename, + "agent": agent, + "domain": row["domain"], + "channel": row["source_channel"], + "merged_at": row["merged_at"], + }) + return matches + + +def existing_event_count(conn: sqlite3.Connection, pr: int, handle: str, role: str) -> int: + """Return count of contribution_events rows matching (handle, role, pr_number, claim_path IS NULL).""" + return conn.execute( + """SELECT COUNT(*) FROM contribution_events + WHERE handle = ? AND role = ? AND pr_number = ? AND claim_path IS NULL""", + (handle, role, pr), + ).fetchone()[0] + + +def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool) -> dict: + """Apply the backfill. 
Returns counters.""" + counters = defaultdict(int) + if not dry_run: + conn.execute("BEGIN") + try: + for m in matches: + pr = m["pr"] + agent = m["agent"] + + # Pre-checks for accurate dry-run reporting + old_event_exists = existing_event_count(conn, pr, "m3taversal", "author") > 0 + new_event_exists = existing_event_count(conn, pr, agent, "author") > 0 + + if dry_run: + logger.info( + "would update pr=%d submitted_by '%s' → '%s (self-directed)' " + "[m3ta_event=%s, agent_event=%s]", + pr, m["current_submitted_by"], agent, + old_event_exists, new_event_exists, + ) + counters["prs"] += 1 + if old_event_exists: + counters["events_to_delete"] += 1 + if not new_event_exists: + counters["events_to_insert"] += 1 + continue + + # 1. UPDATE prs.submitted_by + conn.execute( + "UPDATE prs SET submitted_by = ? WHERE number = ?", + (f"{agent} (self-directed)", pr), + ) + counters["prs"] += 1 + + # 2. INSERT new agent author event (idempotent via UNIQUE index) + cur = conn.execute( + """INSERT OR IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path, domain, channel, timestamp) + VALUES (?, 'agent', 'author', ?, ?, NULL, ?, ?, COALESCE(?, datetime('now')))""", + (agent, AUTHOR_WEIGHT, pr, m["domain"], m["channel"], m["merged_at"]), + ) + if cur.rowcount > 0: + counters["events_inserted"] += 1 + + # 3. DELETE old m3taversal author event + cur = conn.execute( + """DELETE FROM contribution_events + WHERE handle = 'm3taversal' AND role = 'author' + AND pr_number = ? 
AND claim_path IS NULL""", + (pr,), + ) + if cur.rowcount > 0: + counters["events_deleted"] += 1 + + if not dry_run: + conn.execute("COMMIT") + except Exception: + if not dry_run: + conn.execute("ROLLBACK") + raise + + return dict(counters) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--repo", type=Path, default=DEFAULT_REPO) + parser.add_argument("--db", type=Path, default=DEFAULT_DB) + parser.add_argument("--days", type=int, default=30) + parser.add_argument("--apply", action="store_true", help="commit changes (default: dry-run)") + parser.add_argument("--limit", type=int, default=0, + help="cap PR updates (0 = no cap; useful for testing on a small slice)") + args = parser.parse_args() + dry_run = not args.apply + + logger.info("repo=%s db=%s days=%d mode=%s", + args.repo, args.db, args.days, "DRY-RUN" if dry_run else "APPLY") + + if not args.repo.exists(): + logger.error("repo not found: %s", args.repo) + sys.exit(1) + if not args.db.exists(): + logger.error("db not found: %s", args.db) + sys.exit(1) + + file_to_agent = discover_research_session_archives(args.repo, args.days) + if not file_to_agent: + logger.warning("no research-session source files found in last %d days", args.days) + sys.exit(0) + + # Per-agent breakdown + by_agent = defaultdict(int) + for agent in file_to_agent.values(): + by_agent[agent] += 1 + for agent, count in sorted(by_agent.items()): + logger.info(" research-session sources by %s: %d", agent, count) + + conn = sqlite3.connect(args.db) + conn.row_factory = sqlite3.Row + matches = find_misattributed_prs(conn, file_to_agent, args.days) + logger.info("misattributed PRs found: %d", len(matches)) + + if args.limit and len(matches) > args.limit: + logger.info("--limit=%d — truncating from %d", args.limit, len(matches)) + matches = matches[:args.limit] + + if not matches: + logger.info("nothing to do") + return + + # Per-agent breakdown of misattribution + miss_by_agent = defaultdict(int) + for m in matches: + 
miss_by_agent[m["agent"]] += 1 + logger.info("misattributed PR breakdown:") + for agent, count in sorted(miss_by_agent.items()): + logger.info(" %s: %d", agent, count) + + counters = apply_backfill(conn, matches, dry_run) + logger.info("RESULT (%s): %s", "DRY-RUN" if dry_run else "APPLIED", counters) + + +if __name__ == "__main__": + main() From 319e03e2c6676a1d622a8e34fabdf01002b31692 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 27 Apr 2026 12:50:17 +0100 Subject: [PATCH 4/5] test(attribution): prove research-backfill replay is idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five tests against the real contribution_events schema (lib/db.py:181-209): - pr-level dedup with NULL claim_path via idx_ce_unique_pr partial index - per-claim dedup with non-NULL claim_path via idx_ce_unique_claim partial index - pr-level and per-claim events coexist on the same pr_number - backfill (INSERT correct + DELETE wrong) is a true no-op on replay - replay against already-backfilled state preserves unrelated events Schema case identified: case 2 with partial-index split solution already in place. Two partial UNIQUE indexes target disjoint row sets (claim_path IS NULL vs IS NOT NULL), bypassing SQLite's NULL-not-equal-NULL UNIQUE quirk. Production replay verified: re-running backfill --apply against the live DB returns "misattributed PRs found: 0" because the first-run UPDATE flipped the WHERE predicate. Total contribution_events count: 3839 → 3839. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_research_backfill_idempotent.py | 167 +++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 tests/test_research_backfill_idempotent.py diff --git a/tests/test_research_backfill_idempotent.py b/tests/test_research_backfill_idempotent.py new file mode 100644 index 0000000..fac5969 --- /dev/null +++ b/tests/test_research_backfill_idempotent.py @@ -0,0 +1,167 @@ +"""Verify research-attribution backfill is replay-safe against real schema. + +Three things to prove: +1. (handle, role, pr_number) with claim_path=NULL deduplicates correctly + (idx_ce_unique_pr partial index handles SQLite NULL-not-equal-NULL). +2. Re-inserting an existing (handle, role, pr_number, NULL) row via INSERT OR IGNORE + is a true no-op — does not create a phantom duplicate. +3. The backfill script's specific operation (DELETE then INSERT for same key) + nets zero rows when run twice in sequence. +""" + +import sqlite3 +import sys + +# Schema lifted verbatim from lib/db.py:181-209 +SCHEMA = """ +CREATE TABLE contribution_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + handle TEXT NOT NULL, + kind TEXT NOT NULL DEFAULT 'person', + role TEXT NOT NULL, + weight REAL NOT NULL, + pr_number INTEGER NOT NULL, + claim_path TEXT, + domain TEXT, + channel TEXT, + timestamp TEXT NOT NULL DEFAULT (datetime('now')) +); +CREATE UNIQUE INDEX idx_ce_unique_claim ON contribution_events( + handle, role, pr_number, claim_path +) WHERE claim_path IS NOT NULL; +CREATE UNIQUE INDEX idx_ce_unique_pr ON contribution_events( + handle, role, pr_number +) WHERE claim_path IS NULL; +""" + + +def setup() -> sqlite3.Connection: + conn = sqlite3.connect(":memory:") + conn.row_factory = sqlite3.Row + conn.executescript(SCHEMA) + return conn + + +def insert_event(conn, handle, role, pr_number, claim_path=None): + cur = conn.execute( + """INSERT OR IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path) + VALUES (?, 
'agent', ?, 0.30, ?, ?)""", + (handle, role, pr_number, claim_path), + ) + return cur.rowcount + + +def count(conn) -> int: + return conn.execute("SELECT COUNT(*) FROM contribution_events").fetchone()[0] + + +def test_pr_level_dedup_with_null_claim_path(): + """Two inserts of same (handle, role, pr_number, NULL) → 1 row.""" + conn = setup() + r1 = insert_event(conn, "rio", "author", 4061) + r2 = insert_event(conn, "rio", "author", 4061) + n = count(conn) + assert r1 == 1, f"first insert should write, got rowcount={r1}" + assert r2 == 0, f"second insert should be ignored, got rowcount={r2}" + assert n == 1, f"expected 1 row, got {n}" + print("PASS: pr-level dedup with NULL claim_path") + + +def test_per_claim_dedup_with_path(): + """Two inserts of same (handle, role, pr_number, path) → 1 row.""" + conn = setup() + r1 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md") + r2 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md") + n = count(conn) + assert r1 == 1 and r2 == 0 and n == 1 + print("PASS: per-claim dedup with claim_path") + + +def test_pr_level_and_per_claim_coexist(): + """A (handle, role, pr_number, NULL) and (handle, role, pr_number, 'x.md') coexist + because the partial indexes target different rows.""" + conn = setup() + r1 = insert_event(conn, "rio", "author", 4061, claim_path=None) + r2 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md") + n = count(conn) + assert r1 == 1 and r2 == 1 and n == 2 + print("PASS: pr-level and per-claim events coexist on same pr_number") + + +def test_backfill_replay_is_noop(): + """Simulate the exact backfill operation: INSERT correct event, DELETE wrong event. + Run twice. 
Expect identical state — no phantom rows, no double-deletions.""" + conn = setup() + + # Initial state: m3taversal has the wrong author event for pr=4061 + insert_event(conn, "m3taversal", "author", 4061) + assert count(conn) == 1 + + def backfill_pr_4061(): + # Insert the correct event (rio is the real author) + conn.execute( + """INSERT OR IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path) + VALUES (?, 'agent', 'author', 0.30, 4061, NULL)""", + ("rio (self-directed)",), + ) + # Delete the wrong event + conn.execute( + """DELETE FROM contribution_events + WHERE handle='m3taversal' AND role='author' + AND pr_number=4061 AND claim_path IS NULL""", + ) + conn.commit() + + backfill_pr_4061() + state_after_first = sorted( + (r["handle"], r["role"], r["pr_number"], r["claim_path"]) + for r in conn.execute("SELECT * FROM contribution_events") + ) + assert state_after_first == [("rio (self-directed)", "author", 4061, None)], state_after_first + + # Replay + backfill_pr_4061() + state_after_second = sorted( + (r["handle"], r["role"], r["pr_number"], r["claim_path"]) + for r in conn.execute("SELECT * FROM contribution_events") + ) + assert state_after_first == state_after_second, "replay should be idempotent" + assert count(conn) == 1, f"expected 1 row after replay, got {count(conn)}" + print("PASS: backfill replay is a true no-op") + + +def test_replay_against_already_backfilled_pr_does_not_double_delete(): + """If m3taversal event was already deleted, running backfill again must not error + or affect anything else.""" + conn = setup() + # Already-correct state: rio has the author event, m3taversal does not + insert_event(conn, "rio (self-directed)", "author", 4061) + insert_event(conn, "leo", "evaluator", 4061) # noise — should not be touched + + # Run backfill: tries to INSERT (rio, author, 4061) — already exists, no-op + # Tries to DELETE (m3taversal, author, 4061) — already absent, 0 rows affected + cur1 = conn.execute( + """INSERT OR 
IGNORE INTO contribution_events + (handle, kind, role, weight, pr_number, claim_path) + VALUES ('rio (self-directed)', 'agent', 'author', 0.30, 4061, NULL)""", + ) + cur2 = conn.execute( + """DELETE FROM contribution_events + WHERE handle='m3taversal' AND role='author' + AND pr_number=4061 AND claim_path IS NULL""", + ) + assert cur1.rowcount == 0, f"insert should be no-op, got {cur1.rowcount}" + assert cur2.rowcount == 0, f"delete should be no-op, got {cur2.rowcount}" + assert count(conn) == 2, f"expected 2 rows preserved, got {count(conn)}" + print("PASS: replay against already-backfilled state preserves unrelated events") + + +if __name__ == "__main__": + test_pr_level_dedup_with_null_claim_path() + test_per_claim_dedup_with_path() + test_pr_level_and_per_claim_coexist() + test_backfill_replay_is_noop() + test_replay_against_already_backfilled_pr_does_not_double_delete() + print("\nAll 5 tests passed against real schema.") From 6aff03ff56cae46b37137584b3ce6aae5bd57982 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 27 Apr 2026 12:53:52 +0100 Subject: [PATCH 5/5] fix(attribution): unify research-session format on "(self-directed)" suffix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves the format inconsistency between the forward fix and the 304-row backfill. Both halves now produce prs.submitted_by = "rio (self-directed)": - research-session.sh: drop proposed_by from the frontmatter template. extract.py path 1 (proposed_by-driven) no longer fires; path 2 fires instead and constructs f"{agent} (self-directed)" — matches backfill. - attribution.py: normalize_handle now strips "(self-directed)" suffix immediately after lowercase+@-strip, before alias lookup. Closes the phantom-person-event class on any future replay through record_contributor_attribution. Round-trips through alias rules keyed on bare agent names. 
Test (5 cases) still passes; suffix-strip behavior verified against hostile inputs (whitespace, casing, mid-string occurrences must NOT match — only trailing pattern). Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 1 + research/research-session.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/attribution.py b/lib/attribution.py index 8b571b9..694bd1f 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -82,6 +82,7 @@ def normalize_handle(handle: str, conn=None) -> str: if not handle: return "" h = handle.strip().lower().lstrip("@") + h = re.sub(r"\s*\(self-directed\)\s*$", "", h) if conn is None: return h try: diff --git a/research/research-session.sh b/research/research-session.sh index 4f6703a..dc40e07 100755 --- a/research/research-session.sh +++ b/research/research-session.sh @@ -267,7 +267,6 @@ format: tweet | thread status: unprocessed priority: high | medium | low tags: [topic1, topic2] -proposed_by: ${AGENT} intake_tier: research-task ---