From 93917f9fc2514eb3e5e0cfc7564d5c5980d03e15 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Fri, 24 Apr 2026 12:58:55 +0100
Subject: [PATCH] fix(attribution): --diff-filter=A + handle sanity filter +
 remove legacy fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ganymede review findings on epimetheus/contributor-attribution-fix branch:

1. BUG: record_contributor_attribution used `git diff --name-only` (all modified
   files), not just added. Enrich/challenge PRs re-credited the sourcer on every
   subsequent modification. Fixed: --diff-filter=A restricts to new files only.
   The synthesizer/challenger/reviewer roles for enrich PRs are still credited
   via the Pentagon-Agent trailer path, so this doesn't lose any correct credit.

2. WARNING: Legacy `source`-field heuristic fabricated garbage handles from
   descriptive strings ("sec-interpretive-release-s7-2026-09-(march-17",
   "governance---meritocratic-voting-+-futarchy"). Removed outright + added
   regex handle sanity filter (`^[a-z0-9][a-z0-9_-]{0,38}$`). Applied before
   every return path in parse_attribution (the nested-block early return was
   previously bypassing the filter).

   Dry-run impact: unique handles 83→70 (13 garbage filtered), NEW contributors
   49→48, EXISTING drift rows 34→22. The filter drops rows where the literal
   garbage string lives in frontmatter (Slotkin case: attribution.sourcer.handle
   was written as "senator-elissa-slotkin-/-the-hill" by the buggy legacy path).

3. NIT: Aligned knowledge_prefixes in the file walker to match is_knowledge_pr
   (removed entities/, convictions/). Widening those requires Cory sign-off
   since is_knowledge_pr currently gates entity-only PRs out of CI.

Tests: 17 pass (added test_bad_handles_filtered, test_valid_handle_with_hyphen_passes,
updated test_legacy_source_fallback → test_legacy_source_fallback_removed).

Ganymede review — 3-message protocol msg 3 pending.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lib/attribution.py        | 56 ++++++++++++++++++++++++++++-----------
 lib/contributor.py        | 20 +++++++++-----
 tests/test_attribution.py | 25 +++++++++++++++--
 3 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/lib/attribution.py b/lib/attribution.py
index 05da485..2034e1f 100644
--- a/lib/attribution.py
+++ b/lib/attribution.py
@@ -21,6 +21,32 @@ logger = logging.getLogger("pipeline.attribution")
 
 VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
 
+# Handle sanity: lowercase alphanumerics, hyphens, underscores. 1-39 chars (matches
+# GitHub's handle rules). Rejects garbage like "governance---meritocratic-voting-+-futarchy"
+# or "sec-interpretive-release-s7-2026-09-(march-17" that upstream frontmatter hygiene
+# bugs produce. Apply at parse time so bad handles never reach the contributors table.
+_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")
+
+
+def _valid_handle(handle: str) -> bool:
+    """Return True if handle matches the handle format (alphanum + _-, ≤39 chars)."""
+    if not handle or not isinstance(handle, str):
+        return False
+    h = handle.strip().lower().lstrip("@")
+    if h.endswith("-") or h.endswith("_"):
+        return False
+    return bool(_HANDLE_RE.match(h))
+
+
+def _filter_valid_handles(result: dict) -> dict:
+    """Drop entries with invalid handles from a parsed attribution dict."""
+    filtered: dict[str, list[dict]] = {role: [] for role in VALID_ROLES}
+    for role, entries in result.items():
+        for entry in entries:
+            if _valid_handle(entry.get("handle", "")):
+                filtered[role].append(entry)
+    return filtered
+
 
 # ─── Parse attribution from claim content ──────────────────────────────────
 
@@ -51,7 +77,11 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
             elif isinstance(entries, str):
                 # Single entry as string
                 result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
-        return result
+        # Fall through to the filter at the end (don't early-return). The nested
+        # block path was skipping the handle sanity filter, letting garbage like
+        # "senator-elissa-slotkin-/-the-hill" through when it was written into
+        # frontmatter during the legacy-fallback era.
+        return _filter_valid_handles(result)
 
     # Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
     for role in VALID_ROLES:
@@ -88,22 +118,16 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
                         "context": v.get("context"),
                     })
 
-    # Legacy fallback: infer from source field
-    if not any(result[r] for r in VALID_ROLES):
-        source = fm.get("source", "")
-        if isinstance(source, str) and source:
-            # Try to extract author handle from source string
-            # Patterns: "@handle", "Author Name", "org, description"
-            handle_match = re.search(r"@(\w+)", source)
-            if handle_match:
-                result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source})
-            else:
-                # Use first word/phrase before comma as sourcer handle
-                author = source.split(",")[0].strip().lower().replace(" ", "-")
-                if author and len(author) > 1:
-                    result["sourcer"].append({"handle": author, "agent_id": None, "context": source})
+    # Legacy `source` heuristic REMOVED (Ganymede review, Apr 24). It fabricated
+    # handles from descriptive source strings — "governance---meritocratic-voting-+-
+    # futarchy", "cameron-(contributor)", "sec-interpretive-release-s7-2026-09-
+    # (march-17". Hit rate on real handles was near-zero, false-positive rate was
+    # high. Claims without explicit attribution now return empty (better surface as
+    # data hygiene than invent fake contributors).
 
-    return result
+    # Filter to valid handles only. Bad handles (garbage from upstream frontmatter
+    # bugs) get dropped rather than written to the contributors table.
+    return _filter_valid_handles(result)
 
 
 def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
diff --git a/lib/contributor.py b/lib/contributor.py
index b6c3a8f..5150c67 100644
--- a/lib/contributor.py
+++ b/lib/contributor.py
@@ -148,14 +148,20 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
             )
             agents_found.add(agent_name)
 
-    # Parse attribution from changed knowledge files via the canonical attribution
+    # Parse attribution from NEWLY ADDED knowledge files via the canonical attribution
     # parser (lib/attribution.py). The previous diff-line regex parser dropped
     # both the bare-key flat format (`sourcer: alexastrum`) and the nested
     # `attribution:` block format because it only matched `- handle: "X"` lines.
     # The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
     # thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
+    #
+    # --diff-filter=A restricts to added files only (Ganymede review): enrich and
+    # challenge PRs modify existing claims, and re-crediting the existing sourcer on
+    # every modification would inflate counts. The synthesizer/challenger/reviewer
+    # roles for those PRs are credited via the Pentagon-Agent trailer path above.
     rc_files, files_output = await git_fn(
-        "diff", "--name-only", f"origin/main...origin/{branch}", timeout=10,
+        "diff", "--name-only", "--diff-filter=A",
+        f"origin/main...origin/{branch}", timeout=10,
     )
     if rc_files == 0 and files_output:
         from pathlib import Path
@@ -163,10 +169,12 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
         from .attribution import parse_attribution_from_file
 
         main_root = Path(config.MAIN_WORKTREE)
-        knowledge_prefixes = (
-            "domains/", "entities/", "decisions/", "foundations/",
-            "convictions/", "core/",
-        )
+        # Match is_knowledge_pr's gate exactly. Entities/convictions are excluded
+        # here because is_knowledge_pr skips entity-only PRs at line 123 — so a
+        # broader list here only matters for mixed PRs where the narrower list
+        # already matches via the claim file. Widening requires Cory sign-off
+        # since it would change leaderboard accounting (entity-only PRs → CI credit).
+        knowledge_prefixes = ("domains/", "core/", "foundations/", "decisions/")
         for rel_path in files_output.strip().split("\n"):
             rel_path = rel_path.strip()
             if not rel_path.endswith(".md"):
diff --git a/tests/test_attribution.py b/tests/test_attribution.py
index b46e8dd..2124c7d 100644
--- a/tests/test_attribution.py
+++ b/tests/test_attribution.py
@@ -34,13 +34,34 @@ class TestParseAttribution:
         assert result["extractor"][0]["handle"] == "rio"
         assert result["sourcer"][0]["handle"] == "theiaresearch"
 
-    def test_legacy_source_fallback(self):
+    def test_legacy_source_fallback_removed(self):
+        """Legacy `source` heuristic removed (Ganymede review, Apr 24).
+
+        It fabricated handles from descriptive strings (garbage like
+        'sec-interpretive-release-s7-2026-09-(march-17'). Claims without
+        explicit attribution now return empty — better to surface as data
+        hygiene than invent contributors.
+        """
         fm = {
             "type": "claim",
             "source": "@pineanalytics, Q4 2025 report",
         }
         result = parse_attribution(fm)
-        assert result["sourcer"][0]["handle"] == "pineanalytics"
+        assert all(len(v) == 0 for v in result.values())
+
+    def test_bad_handles_filtered(self):
+        """Handles with spaces, parens, or garbage chars are dropped."""
+        fm = {
+            "sourcer": "governance---meritocratic-voting-+-futarchy",
+        }
+        result = parse_attribution(fm)
+        assert len(result["sourcer"]) == 0
+
+    def test_valid_handle_with_hyphen_passes(self):
+        """Legitimate handles like 'cameron-s1' survive the filter."""
+        fm = {"sourcer": "cameron-s1"}
+        result = parse_attribution(fm)
+        assert result["sourcer"][0]["handle"] == "cameron-s1"
 
     def test_empty_attribution(self):
         fm = {"type": "claim"}