fix(attribution): credit sourcer/extractor from claim frontmatter
Three layers of contributor-attribution bug surfaced by Apr 24 leaderboard
investigation. alexastrum, thesensatore, cameron-s1 all had real merged
contributions but zero credit in the contributors table.
1. lib/attribution.py: parse_attribution() only read `attribution_sourcer:`
prefix-keyed flat fields. ~42% of claim files (535/1280) use the bare-key
form `sourcer: alexastrum` written by extract.py. Added bare-key handling
between the prefixed-flat path and the legacy-source-field fallback.
Block format (`attribution: { sourcer: [...] }`) still wins when present.
2. lib/contributor.py: record_contributor_attribution() parsed the diff text
with regex looking for `+- handle: "X"` lines. This matched neither the
bare-key flat format nor the `attribution: { sourcer: [...] }` block
format Leo uses for manual extractions. Replaced the regex parser with
a file walker that calls attribution.parse_attribution_from_file() on
each changed knowledge file — single source of truth for both formats.
3. scripts/backfill-sourcer-attribution.py: walks all merged knowledge files,
re-attributes via the canonical parser, upserts contributors. Default
additive mode preserves existing high counts (e.g. m3taversal.sourcer=1011
reflects Telegram-curator credit accumulated via a different code path
that this fix does not touch). --reset flag for the destructive case.
Dry-run preview (additive mode):
- 670 NEW contributors to insert (mostly source-citation handles)
- 77 EXISTING contributors with under-counted role columns
- alexastrum: 0 → 6, thesensatore: 0 → 5, cameron-s1: 0 → 2
- astra.sourcer: 0 → 96, leo.sourcer: 0 → 44, theseus.sourcer: 0 → 18
- m3taversal.sourcer: 1011 (preserved, not 22 from file walk)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
05d15cea56
commit
3fe0f4b744
3 changed files with 320 additions and 20 deletions
|
|
@ -64,6 +64,30 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
|||
if isinstance(v, str):
|
||||
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||
|
||||
# Bare-key flat format: `sourcer: alexastrum`, `extractor: leo`, etc.
|
||||
# This is what extract.py writes (line 290: f'sourcer: "{sourcer}"') — the most
|
||||
# common format in practice (~42% of claim files). The Apr 24 incident traced
|
||||
# missing leaderboard entries to this format being silently dropped because the
|
||||
# parser only checked the `attribution_*` prefix.
|
||||
# Only fill if the role wasn't already populated by the prefixed form, to avoid
|
||||
# double-counting when both formats coexist on the same claim.
|
||||
for role in VALID_ROLES:
|
||||
if result[role]:
|
||||
continue
|
||||
bare_val = fm.get(role)
|
||||
if isinstance(bare_val, str) and bare_val.strip():
|
||||
result[role].append({"handle": bare_val.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||
elif isinstance(bare_val, list):
|
||||
for v in bare_val:
|
||||
if isinstance(v, str) and v.strip():
|
||||
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||
elif isinstance(v, dict) and v.get("handle"):
|
||||
result[role].append({
|
||||
"handle": v["handle"].strip().lower().lstrip("@"),
|
||||
"agent_id": v.get("agent_id"),
|
||||
"context": v.get("context"),
|
||||
})
|
||||
|
||||
# Legacy fallback: infer from source field
|
||||
if not any(result[r] for r in VALID_ROLES):
|
||||
source = fm.get("source", "")
|
||||
|
|
|
|||
|
|
@ -148,27 +148,42 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
|||
)
|
||||
agents_found.add(agent_name)
|
||||
|
||||
# Parse attribution blocks from claim frontmatter in diff
|
||||
# Look for added lines with attribution YAML
|
||||
current_role = None
|
||||
for line in diff.split("\n"):
|
||||
if not line.startswith("+") or line.startswith("+++"):
|
||||
continue
|
||||
stripped = line[1:].strip()
|
||||
# Parse attribution from changed knowledge files via the canonical attribution
|
||||
# parser (lib/attribution.py). The previous diff-line regex parser dropped
|
||||
# both the bare-key flat format (`sourcer: alexastrum`) and the nested
|
||||
# `attribution:` block format because it only matched `- handle: "X"` lines.
|
||||
# The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
|
||||
# thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
|
||||
rc_files, files_output = await git_fn(
|
||||
"diff", "--name-only", f"origin/main...origin/{branch}", timeout=10,
|
||||
)
|
||||
if rc_files == 0 and files_output:
|
||||
from pathlib import Path
|
||||
from . import config
|
||||
from .attribution import parse_attribution_from_file
|
||||
|
||||
# Detect role sections in attribution block
|
||||
for role in ("sourcer", "extractor", "challenger", "synthesizer", "reviewer"):
|
||||
if stripped.startswith(f"{role}:"):
|
||||
current_role = role
|
||||
break
|
||||
|
||||
# Extract handle from attribution entries
|
||||
handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped)
|
||||
if handle_match and current_role:
|
||||
handle = handle_match.group(1).strip().lower()
|
||||
agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped)
|
||||
agent_id = agent_id_match.group(1).strip() if agent_id_match else None
|
||||
upsert_contributor(conn, handle, agent_id, current_role, today)
|
||||
main_root = Path(config.MAIN_WORKTREE)
|
||||
knowledge_prefixes = (
|
||||
"domains/", "entities/", "decisions/", "foundations/",
|
||||
"convictions/", "core/",
|
||||
)
|
||||
for rel_path in files_output.strip().split("\n"):
|
||||
rel_path = rel_path.strip()
|
||||
if not rel_path.endswith(".md"):
|
||||
continue
|
||||
if not rel_path.startswith(knowledge_prefixes):
|
||||
continue
|
||||
full = main_root / rel_path
|
||||
if not full.exists():
|
||||
continue # file removed in this PR
|
||||
attribution = parse_attribution_from_file(str(full))
|
||||
for role, entries in attribution.items():
|
||||
for entry in entries:
|
||||
handle = entry.get("handle")
|
||||
if handle:
|
||||
upsert_contributor(
|
||||
conn, handle, entry.get("agent_id"), role, today,
|
||||
)
|
||||
|
||||
# Fallback: if no Pentagon-Agent trailer found, try git commit authors
|
||||
_BOT_AUTHORS = frozenset({
|
||||
|
|
|
|||
261
scripts/backfill-sourcer-attribution.py
Executable file
261
scripts/backfill-sourcer-attribution.py
Executable file
|
|
@ -0,0 +1,261 @@
|
|||
#!/usr/bin/env python3
"""Backfill sourcer/extractor/etc. attribution from claim frontmatter.

Walks every merged knowledge file under domains/, entities/, decisions/,
foundations/, convictions/, core/ and re-runs the canonical attribution
parser (lib/attribution.py). For each parsed (handle, role) pair, increments
the corresponding *_count column on the contributors table.

Why this is needed (Apr 24 incident):
- lib/contributor.py used a diff-line regex parser that handled neither
  the bare-key flat format (`sourcer: alexastrum`, ~42% of claims) nor
  the nested `attribution: { sourcer: [...] }` block format used by Leo's
  manual extractions (Shaga's claims).
- Result: alexastrum, thesensatore, cameron-s1, and similar handles were
  silently dropped at merge time. Their contributor rows either don't
  exist or are stuck at zero counts.

Usage:
    python3 backfill-sourcer-attribution.py --dry-run   # report deltas, no writes
    python3 backfill-sourcer-attribution.py             # apply (additive: max(db, truth))
    python3 backfill-sourcer-attribution.py --reset     # destructive: set absolute truth

Default mode is ADDITIVE for safety: per-role count is set to max(current_db, truth).
This preserves any existing high counts that came from non-frontmatter sources
(e.g., m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via
a different code path; truncating to the file-walk truth would be destructive).

Use --reset to set absolute truth from the file walk only — this clobbers
all existing role counts including legitimate non-frontmatter credit.

Idempotency: additive mode is safe to re-run. --reset run is gated by an
audit_log marker; pass --force to override.
"""
import argparse
import os
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path

# Allow running from anywhere — point at pipeline lib
# (scripts/ lives one level below the pipeline root, so parent.parent is the root)
PIPELINE_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PIPELINE_ROOT))

# Project-local import; must come after the sys.path shim above, hence the noqa.
from lib.attribution import parse_attribution_from_file, VALID_ROLES  # noqa: E402

# Deployment defaults, overridable via environment for local/test runs.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
# Top-level knowledge directories scanned for claim files (relative to REPO).
KNOWLEDGE_PREFIXES = (
    "domains", "entities", "decisions", "foundations", "convictions", "core",
)
|
||||
|
||||
|
||||
def collect_attributions(repo_root: Path) -> dict[str, dict[str, int]]:
    """Walk all knowledge files; return {handle: {role: count}}.

    Scans every non-underscore-prefixed ``*.md`` file under the
    KNOWLEDGE_PREFIXES directories and tallies one count per parsed
    (handle, role) attribution entry. Progress stats go to stderr.
    """
    counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    files_scanned = 0
    files_with_attribution = 0

    def _candidate_files():
        # Yield every claim file worth parsing; `_`-prefixed names are
        # skipped (templates/partials), missing prefix dirs are tolerated.
        for prefix in KNOWLEDGE_PREFIXES:
            base = repo_root / prefix
            if not base.exists():
                continue
            yield from (p for p in base.rglob("*.md") if not p.name.startswith("_"))

    for path in _candidate_files():
        files_scanned += 1
        parsed = parse_attribution_from_file(str(path))
        # Flatten to (handle, role) pairs, dropping entries without a handle.
        pairs = [
            (entry.get("handle"), role)
            for role, entries in parsed.items()
            for entry in entries
            if entry.get("handle")
        ]
        for handle, role in pairs:
            counts[handle][role] += 1
        if pairs:
            files_with_attribution += 1

    print(f" Scanned {files_scanned} knowledge files", file=sys.stderr)
    print(f" {files_with_attribution} had parseable attribution", file=sys.stderr)
    return counts
|
||||
|
||||
|
||||
def existing_contributors(conn) -> dict[str, dict[str, int]]:
    """Return {handle: {role: count}} from the current DB.

    NULL columns are normalized to 0 so callers can compare the result
    directly against the file-walk truth counts.
    """
    column_map = (
        ("sourcer", "sourcer_count"),
        ("extractor", "extractor_count"),
        ("challenger", "challenger_count"),
        ("synthesizer", "synthesizer_count"),
        ("reviewer", "reviewer_count"),
        ("claims_merged", "claims_merged"),
    )
    query = (
        "SELECT handle, sourcer_count, extractor_count, challenger_count, "
        "synthesizer_count, reviewer_count, claims_merged FROM contributors"
    )
    return {
        row["handle"]: {key: row[col] or 0 for key, col in column_map}
        for row in conn.execute(query).fetchall()
    }
|
||||
|
||||
|
||||
def claims_merged_for(role_counts: dict[str, int]) -> int:
    """Mirror upsert_contributor logic: claims_merged += sourcer + extractor."""
    merge_roles = ("sourcer", "extractor")
    return sum(role_counts.get(role, 0) for role in merge_roles)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: compute file-walk truth, preview deltas, then write.

    Flags:
      --dry-run  print the delta report and exit without touching the DB
      (default)  additive mode — each role count becomes max(db, truth)
      --reset    destructive — truth overwrites db counts (reviewer excepted)
      --force    bypass the audit-log gate on a repeated --reset
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Report deltas without writing")
    parser.add_argument("--reset", action="store_true",
                        help="Destructive: set absolute truth from file walk "
                        "(default is additive max(db, truth))")
    parser.add_argument("--force", action="store_true",
                        help="Re-run even if a previous --reset marker exists")
    args = parser.parse_args()

    if not REPO.exists():
        print(f"ERROR: repo not found at {REPO}", file=sys.stderr)
        sys.exit(1)

    print(f"DB: {DB_PATH}", file=sys.stderr)
    print(f"Repo: {REPO}", file=sys.stderr)
    print("", file=sys.stderr)
    print("Walking knowledge tree...", file=sys.stderr)

    # Ground truth from the canonical frontmatter parser: {handle: {role: count}}.
    truth = collect_attributions(REPO)
    print(f" Found attributions for {len(truth)} unique handles", file=sys.stderr)
    print("", file=sys.stderr)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    current = existing_contributors(conn)

    # Compute deltas: new handles + handles with role-count mismatches
    new_handles: list[tuple[str, dict[str, int]]] = []
    role_deltas: list[tuple[str, dict[str, int], dict[str, int]]] = []

    for handle, roles in truth.items():
        if handle not in current:
            new_handles.append((handle, dict(roles)))
        else:
            cur = current[handle]
            # Mismatch in either direction; note additive mode will only ever
            # raise counts, so a db>truth mismatch here is reported but kept.
            mismatches = {r: roles.get(r, 0) for r in VALID_ROLES
                          if roles.get(r, 0) != cur.get(r, 0)}
            if mismatches:
                role_deltas.append((handle, dict(roles), cur))

    # Preview: top 20 of each category, ordered by total attributed count.
    print(f"=== {len(new_handles)} NEW contributors to insert ===")
    for handle, roles in sorted(new_handles, key=lambda x: -sum(x[1].values()))[:20]:
        roles_str = ", ".join(f"{r}={c}" for r, c in roles.items() if c > 0)
        print(f" + {handle}: {roles_str} (claims_merged={claims_merged_for(roles)})")
    if len(new_handles) > 20:
        print(f" ... and {len(new_handles) - 20} more")
    print()

    print(f"=== {len(role_deltas)} EXISTING contributors with count drift ===")
    for handle, truth_roles, cur_roles in sorted(
        role_deltas,
        key=lambda x: -sum(x[1].values()),
    )[:20]:
        for role in VALID_ROLES:
            t = truth_roles.get(role, 0)
            c = cur_roles.get(role, 0)
            if t != c:
                print(f" ~ {handle}.{role}: db={c} → truth={t} (Δ{t - c:+d})")
    if len(role_deltas) > 20:
        print(f" ... and {len(role_deltas) - 20} more")
    print()

    if args.dry_run:
        mode = "RESET" if args.reset else "ADDITIVE"
        print(f"Dry run ({mode} mode) — no changes written.")
        if not args.reset:
            print("Default is ADDITIVE: existing high counts (e.g. m3taversal=1011) preserved.")
            print("Pass --reset to clobber existing counts with file-walk truth.")
        return

    # Idempotency: --reset is gated by audit marker. Additive mode is always safe.
    if args.reset:
        marker = conn.execute(
            "SELECT 1 FROM audit_log WHERE event = 'sourcer_attribution_backfill_reset' LIMIT 1"
        ).fetchone()
        if marker and not args.force:
            print("ERROR: --reset has already run (audit marker present).")
            print("Pass --force to re-run.")
            sys.exit(2)

    inserted = 0
    updated = 0
    preserved_higher = 0  # handles where additive mode kept a higher DB count
    for handle, roles in truth.items():
        truth_counts = {
            "sourcer": roles.get("sourcer", 0),
            "extractor": roles.get("extractor", 0),
            "challenger": roles.get("challenger", 0),
            "synthesizer": roles.get("synthesizer", 0),
            "reviewer": roles.get("reviewer", 0),
        }

        if handle in current:
            cur = current[handle]
            if args.reset:
                # Preserve reviewer_count even on reset (PR-level not file-level)
                final = dict(truth_counts)
                final["reviewer"] = max(truth_counts["reviewer"], cur.get("reviewer", 0))
            else:
                # Additive: max of db vs truth, per role
                final = {
                    role: max(truth_counts[role], cur.get(role, 0))
                    for role in truth_counts
                }
                if any(cur.get(r, 0) > truth_counts[r] for r in truth_counts):
                    preserved_higher += 1

            # NOTE(review): claims_merged is recomputed as sourcer + extractor
            # (mirroring upsert_contributor). If an existing row ever violated
            # that invariant, its stored claims_merged could shrink here —
            # confirm that is acceptable before running in additive mode.
            cm = final["sourcer"] + final["extractor"]
            conn.execute(
                """UPDATE contributors SET
                       sourcer_count = ?,
                       extractor_count = ?,
                       challenger_count = ?,
                       synthesizer_count = ?,
                       reviewer_count = ?,
                       claims_merged = ?,
                       updated_at = datetime('now')
                   WHERE handle = ?""",
                (final["sourcer"], final["extractor"], final["challenger"],
                 final["synthesizer"], final["reviewer"], cm, handle),
            )
            updated += 1
        else:
            cm = truth_counts["sourcer"] + truth_counts["extractor"]
            conn.execute(
                """INSERT INTO contributors (
                       handle, sourcer_count, extractor_count, challenger_count,
                       synthesizer_count, reviewer_count, claims_merged,
                       first_contribution, last_contribution, tier
                   ) VALUES (?, ?, ?, ?, ?, ?, ?, date('now'), date('now'), 'new')""",
                (handle, truth_counts["sourcer"], truth_counts["extractor"],
                 truth_counts["challenger"], truth_counts["synthesizer"],
                 truth_counts["reviewer"], cm),
            )
            inserted += 1

    # Audit trail: distinct event name per mode so the --reset gate above
    # (which queries for the *_reset event) only trips on actual resets.
    event = "sourcer_attribution_backfill_reset" if args.reset else "sourcer_attribution_backfill"
    conn.execute(
        "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)",
        ("contributor", event,
         f'{{"inserted": {inserted}, "updated": {updated}, '
         f'"preserved_higher": {preserved_higher}, "mode": '
         f'"{"reset" if args.reset else "additive"}"}}'),
    )
    conn.commit()
    print(f"Done ({'RESET' if args.reset else 'ADDITIVE'}). "
          f"Inserted {inserted} new, updated {updated} existing, "
          f"preserved {preserved_higher} higher-than-truth values.")
|
||||
|
||||
|
||||
# Script entry point — keeps the module importable (e.g. for tests) without
# triggering the backfill.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue