fix(attribution): gate writer on publishers table (regression prevention)
Schema v26 (commit 3fe524d) split orgs/citations from contributors into
the publishers table. Without a writer-side gate, every merged PR with
`sourcer: cnbc` (or similar) re-creates CNBC as a contributor and
undoes the v26 classifier cleanup. Once normal pipeline traffic resumes,
the contributors table re-pollutes within hours.
Fix: belt-and-suspenders gate at both writer surfaces.
1. `lib/attribution.py::is_publisher_handle(handle, conn)` — returns
publisher.id if handle exists in publishers.name, else None. Falls
back gracefully on pre-v26 DBs (no publishers table → returns None →
writer behaves like before, no regression).
2. `lib/contributor.py::insert_contribution_event` — checks
is_publisher_handle on canonical handle before INSERT. If it's a
publisher, debug-log + return False. Prevents originator events for
CNBC/SpaceNews/etc.
3. `lib/contributor.py::upsert_contributor` — same gate at top. Prevents
the contributors table from re-acquiring publisher rows.
Verified end-to-end against live VPS DB snapshot:
- CNBC originator event: blocked (insert returns False)
- CNBC contributors row: blocked (no row created)
- alexastrum, thesensatore, newhandle_xyz: pass through unchanged
- is_publisher_handle lowercases the input handle before the lookup, so
CNBC and cnbc both match publisher_id=3
Pre-deploy event count was 3705. Post-classifier cleanup: 3623 (82 org
events purged). Going forward, no new org events accumulate.
Branch 2 of the schema-v26 rollout. Branch 3 (auto-create at tier='cited',
extract.py sources.publisher_id wiring) is separate scope and not required
for regression prevention.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
926a397839
commit
d0fb4c96e3
2 changed files with 42 additions and 1 deletions
|
|
@ -108,6 +108,33 @@ def classify_kind(handle: str) -> str:
|
||||||
return "person"
|
return "person"
|
||||||
|
|
||||||
|
|
||||||
|
def is_publisher_handle(handle: str, conn) -> int | None:
|
||||||
|
"""Return publisher.id if the handle exists as a publisher name, else None.
|
||||||
|
|
||||||
|
Schema v26 split orgs/citations into the publishers table. Writer code
|
||||||
|
(upsert_contributor, insert_contribution_event) calls this to gate creating
|
||||||
|
contributor rows or events for handles that belong to publishers.
|
||||||
|
|
||||||
|
Without this gate, every merged PR with `sourcer: cnbc` (for example) would
|
||||||
|
re-create CNBC as a contributor and undo the v26 classifier cleanup.
|
||||||
|
|
||||||
|
Falls back gracefully on pre-v26 DBs: returns None if publishers table
|
||||||
|
doesn't exist yet (writer behaves like before, no regression).
|
||||||
|
"""
|
||||||
|
if not handle or conn is None:
|
||||||
|
return None
|
||||||
|
h = handle.strip().lower().lstrip("@")
|
||||||
|
try:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT id FROM publishers WHERE name = ?", (h,),
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
return row["id"] if hasattr(row, "keys") else row[0]
|
||||||
|
except Exception:
|
||||||
|
logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
# ─── Parse attribution from claim content ──────────────────────────────────
|
# ─── Parse attribution from claim content ──────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from . import config, db
|
from . import config, db
|
||||||
from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle
|
from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, is_publisher_handle, normalize_handle
|
||||||
from .forgejo import get_pr_diff
|
from .forgejo import get_pr_diff
|
||||||
|
|
||||||
logger = logging.getLogger("pipeline.contributor")
|
logger = logging.getLogger("pipeline.contributor")
|
||||||
|
|
@ -62,6 +62,12 @@ def insert_contribution_event(
|
||||||
canonical = normalize_handle(handle, conn=conn)
|
canonical = normalize_handle(handle, conn=conn)
|
||||||
if not canonical:
|
if not canonical:
|
||||||
return False
|
return False
|
||||||
|
# Schema v26 gate: handles classified as publishers (CNBC, SpaceNews, arxiv,
|
||||||
|
# etc.) are provenance metadata, not contributors. Don't credit them. Without
|
||||||
|
# this gate every merge re-creates org events and undoes the v26 cleanup.
|
||||||
|
if is_publisher_handle(canonical, conn) is not None:
|
||||||
|
logger.debug("insert_contribution_event: %r is a publisher — skipping event", canonical)
|
||||||
|
return False
|
||||||
kind = classify_kind(canonical)
|
kind = classify_kind(canonical)
|
||||||
try:
|
try:
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
|
|
@ -419,6 +425,14 @@ def upsert_contributor(
|
||||||
logger.warning("Unknown contributor role: %s", role)
|
logger.warning("Unknown contributor role: %s", role)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Schema v26 gate: orgs/citations live in publishers table, not contributors.
|
||||||
|
# Skip without writing so the v26 classifier cleanup isn't undone by every
|
||||||
|
# merge that has `sourcer: cnbc` (or similar) in claim frontmatter.
|
||||||
|
canonical_handle = handle.strip().lower().lstrip("@") if handle else ""
|
||||||
|
if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None:
|
||||||
|
logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle)
|
||||||
|
return
|
||||||
|
|
||||||
existing = conn.execute(
|
existing = conn.execute(
|
||||||
"SELECT handle FROM contributors WHERE handle = ?", (handle,)
|
"SELECT handle FROM contributors WHERE handle = ?", (handle,)
|
||||||
).fetchone()
|
).fetchone()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue