fix(attribution): --diff-filter=A + handle sanity filter + remove legacy fallback
Some checks are pending
CI / lint-and-test (push) Waiting to run
Some checks are pending
CI / lint-and-test (push) Waiting to run
Ganymede review findings on epimetheus/contributor-attribution-fix branch:
1. BUG: record_contributor_attribution used `git diff --name-only` (all modified
files), not just added. Enrich/challenge PRs re-credited the sourcer on every
subsequent modification. Fixed: --diff-filter=A restricts to new files only.
The synthesizer/challenger/reviewer roles for enrich PRs are still credited
via the Pentagon-Agent trailer path, so this doesn't lose any correct credit.
2. WARNING: Legacy `source`-field heuristic fabricated garbage handles from
descriptive strings ("sec-interpretive-release-s7-2026-09-(march-17",
"governance---meritocratic-voting-+-futarchy"). Removed outright + added
regex handle sanity filter (`^[a-z0-9][a-z0-9_-]{0,38}$`). Applied before
every return path in parse_attribution (the nested-block early return was
previously bypassing the filter).
Dry-run impact: unique handles 83→70 (13 garbage filtered), NEW contributors
49→48, EXISTING drift rows 34→22. The filter drops rows where the literal
garbage string lives in frontmatter (Slotkin case: attribution.sourcer.handle
was written as "senator-elissa-slotkin-/-the-hill" by the buggy legacy path).
3. NIT: Aligned knowledge_prefixes in the file walker to match is_knowledge_pr
(removed entities/, convictions/). Widening those requires Cory sign-off
since is_knowledge_pr currently gates entity-only PRs out of CI.
Tests: 17 pass (added test_bad_handles_filtered, test_valid_handle_with_hyphen_passes,
updated test_legacy_source_fallback → test_legacy_source_fallback_removed).
Ganymede review — 3-message protocol msg 3 pending.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3fe0f4b744
commit
93917f9fc2
3 changed files with 77 additions and 24 deletions
|
|
@ -21,6 +21,32 @@ logger = logging.getLogger("pipeline.attribution")
|
||||||
|
|
||||||
VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
|
VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
|
||||||
|
|
||||||
|
# Handle sanity: lowercase alphanumerics, hyphens, underscores. 1-39 chars (matches
|
||||||
|
# GitHub's handle rules). Rejects garbage like "governance---meritocratic-voting-+-futarchy"
|
||||||
|
# or "sec-interpretive-release-s7-2026-09-(march-17" that upstream frontmatter hygiene
|
||||||
|
# bugs produce. Apply at parse time so bad handles never reach the contributors table.
|
||||||
|
_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_handle(handle: str) -> bool:
|
||||||
|
"""Return True if handle matches the handle format (alphanum + _-, ≤39 chars)."""
|
||||||
|
if not handle or not isinstance(handle, str):
|
||||||
|
return False
|
||||||
|
h = handle.strip().lower().lstrip("@")
|
||||||
|
if h.endswith("-") or h.endswith("_"):
|
||||||
|
return False
|
||||||
|
return bool(_HANDLE_RE.match(h))
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_valid_handles(result: dict) -> dict:
|
||||||
|
"""Drop entries with invalid handles from a parsed attribution dict."""
|
||||||
|
filtered: dict[str, list[dict]] = {role: [] for role in VALID_ROLES}
|
||||||
|
for role, entries in result.items():
|
||||||
|
for entry in entries:
|
||||||
|
if _valid_handle(entry.get("handle", "")):
|
||||||
|
filtered[role].append(entry)
|
||||||
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
# ─── Parse attribution from claim content ──────────────────────────────────
|
# ─── Parse attribution from claim content ──────────────────────────────────
|
||||||
|
|
||||||
|
|
@ -51,7 +77,11 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
||||||
elif isinstance(entries, str):
|
elif isinstance(entries, str):
|
||||||
# Single entry as string
|
# Single entry as string
|
||||||
result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||||
return result
|
# Fall through to the filter at the end (don't early-return). The nested
|
||||||
|
# block path was skipping the handle sanity filter, letting garbage like
|
||||||
|
# "senator-elissa-slotkin-/-the-hill" through when it was written into
|
||||||
|
# frontmatter during the legacy-fallback era.
|
||||||
|
return _filter_valid_handles(result)
|
||||||
|
|
||||||
# Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
|
# Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
|
||||||
for role in VALID_ROLES:
|
for role in VALID_ROLES:
|
||||||
|
|
@ -88,22 +118,16 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
||||||
"context": v.get("context"),
|
"context": v.get("context"),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Legacy fallback: infer from source field
|
# Legacy `source` heuristic REMOVED (Ganymede review, Apr 24). It fabricated
|
||||||
if not any(result[r] for r in VALID_ROLES):
|
# handles from descriptive source strings — "governance---meritocratic-voting-+-
|
||||||
source = fm.get("source", "")
|
# futarchy", "cameron-(contributor)", "sec-interpretive-release-s7-2026-09-
|
||||||
if isinstance(source, str) and source:
|
# (march-17". Hit rate on real handles was near-zero, false-positive rate was
|
||||||
# Try to extract author handle from source string
|
# high. Claims without explicit attribution now return empty (better surface as
|
||||||
# Patterns: "@handle", "Author Name", "org, description"
|
# data hygiene than invent fake contributors).
|
||||||
handle_match = re.search(r"@(\w+)", source)
|
|
||||||
if handle_match:
|
|
||||||
result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source})
|
|
||||||
else:
|
|
||||||
# Use first word/phrase before comma as sourcer handle
|
|
||||||
author = source.split(",")[0].strip().lower().replace(" ", "-")
|
|
||||||
if author and len(author) > 1:
|
|
||||||
result["sourcer"].append({"handle": author, "agent_id": None, "context": source})
|
|
||||||
|
|
||||||
return result
|
# Filter to valid handles only. Bad handles (garbage from upstream frontmatter
|
||||||
|
# bugs) get dropped rather than written to the contributors table.
|
||||||
|
return _filter_valid_handles(result)
|
||||||
|
|
||||||
|
|
||||||
def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
|
def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
|
||||||
|
|
|
||||||
|
|
@ -148,14 +148,20 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
||||||
)
|
)
|
||||||
agents_found.add(agent_name)
|
agents_found.add(agent_name)
|
||||||
|
|
||||||
# Parse attribution from changed knowledge files via the canonical attribution
|
# Parse attribution from NEWLY ADDED knowledge files via the canonical attribution
|
||||||
# parser (lib/attribution.py). The previous diff-line regex parser dropped
|
# parser (lib/attribution.py). The previous diff-line regex parser dropped
|
||||||
# both the bare-key flat format (`sourcer: alexastrum`) and the nested
|
# both the bare-key flat format (`sourcer: alexastrum`) and the nested
|
||||||
# `attribution:` block format because it only matched `- handle: "X"` lines.
|
# `attribution:` block format because it only matched `- handle: "X"` lines.
|
||||||
# The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
|
# The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
|
||||||
# thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
|
# thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
|
||||||
|
#
|
||||||
|
# --diff-filter=A restricts to added files only (Ganymede review): enrich and
|
||||||
|
# challenge PRs modify existing claims, and re-crediting the existing sourcer on
|
||||||
|
# every modification would inflate counts. The synthesizer/challenger/reviewer
|
||||||
|
# roles for those PRs are credited via the Pentagon-Agent trailer path above.
|
||||||
rc_files, files_output = await git_fn(
|
rc_files, files_output = await git_fn(
|
||||||
"diff", "--name-only", f"origin/main...origin/{branch}", timeout=10,
|
"diff", "--name-only", "--diff-filter=A",
|
||||||
|
f"origin/main...origin/{branch}", timeout=10,
|
||||||
)
|
)
|
||||||
if rc_files == 0 and files_output:
|
if rc_files == 0 and files_output:
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -163,10 +169,12 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
||||||
from .attribution import parse_attribution_from_file
|
from .attribution import parse_attribution_from_file
|
||||||
|
|
||||||
main_root = Path(config.MAIN_WORKTREE)
|
main_root = Path(config.MAIN_WORKTREE)
|
||||||
knowledge_prefixes = (
|
# Match is_knowledge_pr's gate exactly. Entities/convictions are excluded
|
||||||
"domains/", "entities/", "decisions/", "foundations/",
|
# here because is_knowledge_pr skips entity-only PRs at line 123 — so a
|
||||||
"convictions/", "core/",
|
# broader list here only matters for mixed PRs where the narrower list
|
||||||
)
|
# already matches via the claim file. Widening requires Cory sign-off
|
||||||
|
# since it would change leaderboard accounting (entity-only PRs → CI credit).
|
||||||
|
knowledge_prefixes = ("domains/", "core/", "foundations/", "decisions/")
|
||||||
for rel_path in files_output.strip().split("\n"):
|
for rel_path in files_output.strip().split("\n"):
|
||||||
rel_path = rel_path.strip()
|
rel_path = rel_path.strip()
|
||||||
if not rel_path.endswith(".md"):
|
if not rel_path.endswith(".md"):
|
||||||
|
|
|
||||||
|
|
@ -34,13 +34,34 @@ class TestParseAttribution:
|
||||||
assert result["extractor"][0]["handle"] == "rio"
|
assert result["extractor"][0]["handle"] == "rio"
|
||||||
assert result["sourcer"][0]["handle"] == "theiaresearch"
|
assert result["sourcer"][0]["handle"] == "theiaresearch"
|
||||||
|
|
||||||
def test_legacy_source_fallback(self):
|
def test_legacy_source_fallback_removed(self):
|
||||||
|
"""Legacy `source` heuristic removed (Ganymede review, Apr 24).
|
||||||
|
|
||||||
|
It fabricated handles from descriptive strings (garbage like
|
||||||
|
'sec-interpretive-release-s7-2026-09-(march-17'). Claims without
|
||||||
|
explicit attribution now return empty — better to surface as data
|
||||||
|
hygiene than invent contributors.
|
||||||
|
"""
|
||||||
fm = {
|
fm = {
|
||||||
"type": "claim",
|
"type": "claim",
|
||||||
"source": "@pineanalytics, Q4 2025 report",
|
"source": "@pineanalytics, Q4 2025 report",
|
||||||
}
|
}
|
||||||
result = parse_attribution(fm)
|
result = parse_attribution(fm)
|
||||||
assert result["sourcer"][0]["handle"] == "pineanalytics"
|
assert all(len(v) == 0 for v in result.values())
|
||||||
|
|
||||||
|
def test_bad_handles_filtered(self):
|
||||||
|
"""Handles with spaces, parens, or garbage chars are dropped."""
|
||||||
|
fm = {
|
||||||
|
"sourcer": "governance---meritocratic-voting-+-futarchy",
|
||||||
|
}
|
||||||
|
result = parse_attribution(fm)
|
||||||
|
assert len(result["sourcer"]) == 0
|
||||||
|
|
||||||
|
def test_valid_handle_with_hyphen_passes(self):
|
||||||
|
"""Legitimate handles like 'cameron-s1' survive the filter."""
|
||||||
|
fm = {"sourcer": "cameron-s1"}
|
||||||
|
result = parse_attribution(fm)
|
||||||
|
assert result["sourcer"][0]["handle"] == "cameron-s1"
|
||||||
|
|
||||||
def test_empty_attribution(self):
|
def test_empty_attribution(self):
|
||||||
fm = {"type": "claim"}
|
fm = {"type": "claim"}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue