fix(attribution): credit research-session sources to agents, not m3taversal #7

Merged
m3taversal merged 5 commits from ship/research-attribution-fix into main 2026-04-27 11:59:55 +00:00
2 changed files with 42 additions and 1 deletions
Showing only changes of commit d0fb4c96e3 - Show all commits

View file

@ -108,6 +108,33 @@ def classify_kind(handle: str) -> str:
return "person"
def is_publisher_handle(handle: str, conn) -> int | None:
"""Return publisher.id if the handle exists as a publisher name, else None.
Schema v26 split orgs/citations into the publishers table. Writer code
(upsert_contributor, insert_contribution_event) calls this to gate creating
contributor rows or events for handles that belong to publishers.
Without this gate, every merged PR with `sourcer: cnbc` (for example) would
re-create CNBC as a contributor and undo the v26 classifier cleanup.
Falls back gracefully on pre-v26 DBs: returns None if publishers table
doesn't exist yet (writer behaves like before, no regression).
"""
if not handle or conn is None:
return None
h = handle.strip().lower().lstrip("@")
try:
row = conn.execute(
"SELECT id FROM publishers WHERE name = ?", (h,),
).fetchone()
if row:
return row["id"] if hasattr(row, "keys") else row[0]
except Exception:
logger.debug("is_publisher_handle: lookup failed for %r", h, exc_info=True)
return None
# ─── Parse attribution from claim content ──────────────────────────────────

View file

@ -14,7 +14,7 @@ import logging
import re
from . import config, db
from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle
from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, is_publisher_handle, normalize_handle
from .forgejo import get_pr_diff
logger = logging.getLogger("pipeline.contributor")
@ -62,6 +62,12 @@ def insert_contribution_event(
canonical = normalize_handle(handle, conn=conn)
if not canonical:
return False
# Schema v26 gate: handles classified as publishers (CNBC, SpaceNews, arxiv,
# etc.) are provenance metadata, not contributors. Don't credit them. Without
# this gate every merge re-creates org events and undoes the v26 cleanup.
if is_publisher_handle(canonical, conn) is not None:
logger.debug("insert_contribution_event: %r is a publisher — skipping event", canonical)
return False
kind = classify_kind(canonical)
try:
cur = conn.execute(
@ -419,6 +425,14 @@ def upsert_contributor(
logger.warning("Unknown contributor role: %s", role)
return
# Schema v26 gate: orgs/citations live in publishers table, not contributors.
# Skip without writing so the v26 classifier cleanup isn't undone by every
# merge that has `sourcer: cnbc` (or similar) in claim frontmatter.
canonical_handle = handle.strip().lower().lstrip("@") if handle else ""
if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None:
logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle)
return
existing = conn.execute(
"SELECT handle FROM contributors WHERE handle = ?", (handle,)
).fetchone()