From 93917f9fc2514eb3e5e0cfc7564d5c5980d03e15 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Fri, 24 Apr 2026 12:58:55 +0100 Subject: [PATCH] fix(attribution): --diff-filter=A + handle sanity filter + remove legacy fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ganymede review findings on epimetheus/contributor-attribution-fix branch: 1. BUG: record_contributor_attribution used `git diff --name-only` (all modified files), not just added. Enrich/challenge PRs re-credited the sourcer on every subsequent modification. Fixed: --diff-filter=A restricts to new files only. The synthesizer/challenger/reviewer roles for enrich PRs are still credited via the Pentagon-Agent trailer path, so this doesn't lose any correct credit. 2. WARNING: Legacy `source`-field heuristic fabricated garbage handles from descriptive strings ("sec-interpretive-release-s7-2026-09-(march-17", "governance---meritocratic-voting-+-futarchy"). Removed outright + added regex handle sanity filter (`^[a-z0-9][a-z0-9_-]{0,38}$`). Applied before every return path in parse_attribution (the nested-block early return was previously bypassing the filter). Dry-run impact: unique handles 83→70 (13 garbage filtered), NEW contributors 49→48, EXISTING drift rows 34→22. The filter drops rows where the literal garbage string lives in frontmatter (Slotkin case: attribution.sourcer.handle was written as "senator-elissa-slotkin-/-the-hill" by the buggy legacy path). 3. NIT: Aligned knowledge_prefixes in the file walker to match is_knowledge_pr (removed entities/, convictions/). Widening those requires Cory sign-off since is_knowledge_pr currently gates entity-only PRs out of CI. Tests: 17 pass (added test_bad_handles_filtered, test_valid_handle_with_hyphen_passes, updated test_legacy_source_fallback → test_legacy_source_fallback_removed). Ganymede review — 3-message protocol msg 3 pending. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 56 ++++++++++++++++++++++++++++----------- lib/contributor.py | 20 +++++++++----- tests/test_attribution.py | 25 +++++++++++++++-- 3 files changed, 77 insertions(+), 24 deletions(-) diff --git a/lib/attribution.py b/lib/attribution.py index 05da485..2034e1f 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -21,6 +21,32 @@ logger = logging.getLogger("pipeline.attribution") VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"}) +# Handle sanity: lowercase alphanumerics, hyphens, underscores. 1-39 chars (matches +# GitHub's handle rules). Rejects garbage like "governance---meritocratic-voting-+-futarchy" +# or "sec-interpretive-release-s7-2026-09-(march-17" that upstream frontmatter hygiene +# bugs produce. Apply at parse time so bad handles never reach the contributors table. +_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$") + + +def _valid_handle(handle: str) -> bool: + """Return True if handle matches the handle format (alphanum + _-, ≤39 chars).""" + if not handle or not isinstance(handle, str): + return False + h = handle.strip().lower().lstrip("@") + if h.endswith("-") or h.endswith("_"): + return False + return bool(_HANDLE_RE.match(h)) + + +def _filter_valid_handles(result: dict) -> dict: + """Drop entries with invalid handles from a parsed attribution dict.""" + filtered: dict[str, list[dict]] = {role: [] for role in VALID_ROLES} + for role, entries in result.items(): + for entry in entries: + if _valid_handle(entry.get("handle", "")): + filtered[role].append(entry) + return filtered + # ─── Parse attribution from claim content ────────────────────────────────── @@ -51,7 +77,11 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]: elif isinstance(entries, str): # Single entry as string result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None}) - return result + # Fall through to the filter at the end (don't early-return). The nested + # block path was skipping the handle sanity filter, letting garbage like + # "senator-elissa-slotkin-/-the-hill" through when it was written into + # frontmatter during the legacy-fallback era. + return _filter_valid_handles(result) # Flat format fallback (attribution_sourcer, attribution_extractor, etc.) for role in VALID_ROLES: @@ -88,22 +118,16 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]: "context": v.get("context"), }) - # Legacy fallback: infer from source field - if not any(result[r] for r in VALID_ROLES): - source = fm.get("source", "") - if isinstance(source, str) and source: - # Try to extract author handle from source string - # Patterns: "@handle", "Author Name", "org, description" - handle_match = re.search(r"@(\w+)", source) - if handle_match: - result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source}) - else: - # Use first word/phrase before comma as sourcer handle - author = source.split(",")[0].strip().lower().replace(" ", "-") - if author and len(author) > 1: - result["sourcer"].append({"handle": author, "agent_id": None, "context": source}) + # Legacy `source` heuristic REMOVED (Ganymede review, Apr 24). It fabricated + # handles from descriptive source strings — "governance---meritocratic-voting-+- + # futarchy", "cameron-(contributor)", "sec-interpretive-release-s7-2026-09- + # (march-17". Hit rate on real handles was near-zero, false-positive rate was + # high. Claims without explicit attribution now return empty (better surface as + # data hygiene than invent fake contributors). - return result + # Filter to valid handles only. Bad handles (garbage from upstream frontmatter + # bugs) get dropped rather than written to the contributors table. + return _filter_valid_handles(result) def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]: diff --git a/lib/contributor.py b/lib/contributor.py index b6c3a8f..5150c67 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -148,14 +148,20 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_ ) agents_found.add(agent_name) - # Parse attribution from changed knowledge files via the canonical attribution + # Parse attribution from NEWLY ADDED knowledge files via the canonical attribution # parser (lib/attribution.py). The previous diff-line regex parser dropped # both the bare-key flat format (`sourcer: alexastrum`) and the nested # `attribution:` block format because it only matched `- handle: "X"` lines. # The Apr 24 incident traced missing leaderboard entries (alexastrum=0, # thesensatore=0, cameron-s1=0) directly to this parser's blind spots. + # + # --diff-filter=A restricts to added files only (Ganymede review): enrich and + # challenge PRs modify existing claims, and re-crediting the existing sourcer on + # every modification would inflate counts. The synthesizer/challenger/reviewer + # roles for those PRs are credited via the Pentagon-Agent trailer path above. rc_files, files_output = await git_fn( - "diff", "--name-only", f"origin/main...origin/{branch}", timeout=10, + "diff", "--name-only", "--diff-filter=A", + f"origin/main...origin/{branch}", timeout=10, ) if rc_files == 0 and files_output: from pathlib import Path @@ -163,10 +169,12 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_ from .attribution import parse_attribution_from_file main_root = Path(config.MAIN_WORKTREE) - knowledge_prefixes = ( - "domains/", "entities/", "decisions/", "foundations/", - "convictions/", "core/", - ) + # Match is_knowledge_pr's gate exactly. Entities/convictions are excluded + # here because is_knowledge_pr skips entity-only PRs at line 123 — so a + # broader list here only matters for mixed PRs where the narrower list + # already matches via the claim file. Widening requires Cory sign-off + # since it would change leaderboard accounting (entity-only PRs → CI credit). + knowledge_prefixes = ("domains/", "core/", "foundations/", "decisions/") for rel_path in files_output.strip().split("\n"): rel_path = rel_path.strip() if not rel_path.endswith(".md"): diff --git a/tests/test_attribution.py b/tests/test_attribution.py index b46e8dd..2124c7d 100644 --- a/tests/test_attribution.py +++ b/tests/test_attribution.py @@ -34,13 +34,34 @@ class TestParseAttribution: assert result["extractor"][0]["handle"] == "rio" assert result["sourcer"][0]["handle"] == "theiaresearch" - def test_legacy_source_fallback(self): + def test_legacy_source_fallback_removed(self): + """Legacy `source` heuristic removed (Ganymede review, Apr 24). + + It fabricated handles from descriptive strings (garbage like + 'sec-interpretive-release-s7-2026-09-(march-17'). Claims without + explicit attribution now return empty — better to surface as data + hygiene than invent contributors. + """ fm = { "type": "claim", "source": "@pineanalytics, Q4 2025 report", } result = parse_attribution(fm) - assert result["sourcer"][0]["handle"] == "pineanalytics" + assert all(len(v) == 0 for v in result.values()) + + def test_bad_handles_filtered(self): + """Handles with spaces, parens, or garbage chars are dropped.""" + fm = { + "sourcer": "governance---meritocratic-voting-+-futarchy", + } + result = parse_attribution(fm) + assert len(result["sourcer"]) == 0 + + def test_valid_handle_with_hyphen_passes(self): + """Legitimate handles like 'cameron-s1' survive the filter.""" + fm = {"sourcer": "cameron-s1"} + result = parse_attribution(fm) + assert result["sourcer"][0]["handle"] == "cameron-s1" def test_empty_attribution(self): fm = {"type": "claim"}