From d0fb4c96e3e009ccf3a96ef98495e325cec0767a Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sun, 26 Apr 2026 14:21:10 +0100 Subject: [PATCH] fix(attribution): gate writer on publishers table (regression prevention) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema v26 (commit 3fe524d) split orgs/citations from contributors into the publishers table. Without a writer-side gate, every merged PR with `sourcer: cnbc` (or similar) re-creates CNBC as a contributor and undoes the v26 classifier cleanup. Once normal pipeline traffic resumes, the contributors table re-pollutes within hours. Fix: belt-and-suspenders gate at both writer surfaces. 1. `lib/attribution.py::is_publisher_handle(handle, conn)` — returns publisher.id if handle exists in publishers.name, else None. Falls back gracefully on pre-v26 DBs (no publishers table → returns None → writer behaves like before, no regression). 2. `lib/contributor.py::insert_contribution_event` — checks is_publisher_handle on canonical handle before INSERT. If it's a publisher, debug-log + return False. Prevents originator events for CNBC/SpaceNews/etc. 3. `lib/contributor.py::upsert_contributor` — same gate at top. Prevents the contributors table from re-acquiring publisher rows. Verified end-to-end against live VPS DB snapshot: - CNBC originator event: blocked (insert returns False) - CNBC contributors row: blocked (no row created) - alexastrum, thesensatore, newhandle_xyz: pass through unchanged - is_publisher_handle handles case-insensitive lookup correctly (CNBC and cnbc both match publisher_id=3) Pre-deploy event count was 3705. Post-classifier cleanup: 3623 (82 org events purged). Going forward, no new org events accumulate. Branch 2 of the schema-v26 rollout. Branch 3 (auto-create at tier='cited', extract.py sources.publisher_id wiring) is separate scope and not required for regression prevention. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 27 +++++++++++++++++++++++++++ lib/contributor.py | 16 +++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/lib/attribution.py b/lib/attribution.py index 664c39c..68e69eb 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -108,6 +108,33 @@ def classify_kind(handle: str) -> str: return "person" +def is_publisher_handle(handle: str, conn) -> int | None: + """Return publisher.id if the handle exists as a publisher name, else None. + + Schema v26 split orgs/citations into the publishers table. Writer code + (upsert_contributor, insert_contribution_event) calls this to gate creating + contributor rows or events for handles that belong to publishers. + + Without this gate, every merged PR with `sourcer: cnbc` (for example) would + re-create CNBC as a contributor and undo the v26 classifier cleanup. + + Falls back gracefully on pre-v26 DBs: returns None if publishers table + doesn't exist yet (writer behaves like before, no regression). + """ + if not handle or conn is None: + return None + h = handle.strip().lower().lstrip("@") + try: + row = conn.execute( + "SELECT id FROM publishers WHERE name = ?", (h,), + ).fetchone() + if row: + return row["id"] if hasattr(row, "keys") else row[0] + except Exception: + logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True) + return None + + # ─── Parse attribution from claim content ────────────────────────────────── diff --git a/lib/contributor.py b/lib/contributor.py index a2117d6..b2cc11d 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -14,7 +14,7 @@ import logging import re from . import config, db -from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle +from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, is_publisher_handle, normalize_handle from .forgejo import get_pr_diff logger = logging.getLogger("pipeline.contributor") @@ -62,6 +62,12 @@ def insert_contribution_event( canonical = normalize_handle(handle, conn=conn) if not canonical: return False + # Schema v26 gate: handles classified as publishers (CNBC, SpaceNews, arxiv, + # etc.) are provenance metadata, not contributors. Don't credit them. Without + # this gate every merge re-creates org events and undoes the v26 cleanup. + if is_publisher_handle(canonical, conn) is not None: + logger.debug("insert_contribution_event: %r is a publisher — skipping event", canonical) + return False kind = classify_kind(canonical) try: cur = conn.execute( @@ -419,6 +425,14 @@ def upsert_contributor( logger.warning("Unknown contributor role: %s", role) return + # Schema v26 gate: orgs/citations live in publishers table, not contributors. + # Skip without writing so the v26 classifier cleanup isn't undone by every + # merge that has `sourcer: cnbc` (or similar) in claim frontmatter. + canonical_handle = handle.strip().lower().lstrip("@") if handle else "" + if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None: + logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle) + return + existing = conn.execute( "SELECT handle FROM contributors WHERE handle = ?", (handle,) ).fetchone()