From 3fe0f4b744d9e5d3202b29c16c8739d4b9354121 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Fri, 24 Apr 2026 12:48:41 +0100 Subject: [PATCH] fix(attribution): credit sourcer/extractor from claim frontmatter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three layers of contributor-attribution bug surfaced by Apr 24 leaderboard investigation. alexastrum, thesensatore, cameron-s1 all had real merged contributions but zero credit in the contributors table. 1. lib/attribution.py: parse_attribution() only read `attribution_sourcer:` prefix-keyed flat fields. ~42% of claim files (535/1280) use the bare-key form `sourcer: alexastrum` written by extract.py. Added bare-key handling between the prefixed-flat path and the legacy-source-field fallback. Block format (`attribution: { sourcer: [...] }`) still wins when present. 2. lib/contributor.py: record_contributor_attribution() parsed the diff text with regex looking for `+- handle: "X"` lines. This matched neither the bare-key flat format nor the `attribution: { sourcer: [...] }` block format Leo uses for manual extractions. Replaced the regex parser with a file walker that calls attribution.parse_attribution_from_file() on each changed knowledge file — single source of truth for both formats. 3. scripts/backfill-sourcer-attribution.py: walks all merged knowledge files, re-attributes via the canonical parser, upserts contributors. Default additive mode preserves existing high counts (e.g. m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via a different code path that this fix does not touch). --reset flag for the destructive case. Dry-run preview (additive mode): - 670 NEW contributors to insert (mostly source-citation handles) - 77 EXISTING contributors with under-counted role columns - alexastrum: 0 → 6, thesensatore: 0 → 5, cameron-s1: 0 → 2 - astra.sourcer: 0 → 96, leo.sourcer: 0 → 44, theseus.sourcer: 0 → 18 - m3taversal.sourcer: 1011 (preserved, not 22 from file walk) Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/attribution.py | 24 +++ lib/contributor.py | 55 +++-- scripts/backfill-sourcer-attribution.py | 261 ++++++++++++++++++++++++ 3 files changed, 320 insertions(+), 20 deletions(-) create mode 100755 scripts/backfill-sourcer-attribution.py diff --git a/lib/attribution.py b/lib/attribution.py index 7ca5233..05da485 100644 --- a/lib/attribution.py +++ b/lib/attribution.py @@ -64,6 +64,30 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]: if isinstance(v, str): result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None}) + # Bare-key flat format: `sourcer: alexastrum`, `extractor: leo`, etc. + # This is what extract.py writes (line 290: f'sourcer: "{sourcer}"') — the most + # common format in practice (~42% of claim files). The Apr 24 incident traced + # missing leaderboard entries to this format being silently dropped because the + # parser only checked the `attribution_*` prefix. + # Only fill if the role wasn't already populated by the prefixed form, to avoid + # double-counting when both formats coexist on the same claim. 
+ for role in VALID_ROLES: + if result[role]: + continue + bare_val = fm.get(role) + if isinstance(bare_val, str) and bare_val.strip(): + result[role].append({"handle": bare_val.strip().lower().lstrip("@"), "agent_id": None, "context": None}) + elif isinstance(bare_val, list): + for v in bare_val: + if isinstance(v, str) and v.strip(): + result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None}) + elif isinstance(v, dict) and v.get("handle"): + result[role].append({ + "handle": v["handle"].strip().lower().lstrip("@"), + "agent_id": v.get("agent_id"), + "context": v.get("context"), + }) + # Legacy fallback: infer from source field if not any(result[r] for r in VALID_ROLES): source = fm.get("source", "") diff --git a/lib/contributor.py b/lib/contributor.py index 713dab4..b6c3a8f 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -148,27 +148,42 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_ ) agents_found.add(agent_name) - # Parse attribution blocks from claim frontmatter in diff - # Look for added lines with attribution YAML - current_role = None - for line in diff.split("\n"): - if not line.startswith("+") or line.startswith("+++"): - continue - stripped = line[1:].strip() + # Parse attribution from changed knowledge files via the canonical attribution + # parser (lib/attribution.py). The previous diff-line regex parser dropped + # both the bare-key flat format (`sourcer: alexastrum`) and the nested + # `attribution:` block format because it only matched `- handle: "X"` lines. + # The Apr 24 incident traced missing leaderboard entries (alexastrum=0, + # thesensatore=0, cameron-s1=0) directly to this parser's blind spots. + rc_files, files_output = await git_fn( + "diff", "--name-only", f"origin/main...origin/{branch}", timeout=10, + ) + if rc_files == 0 and files_output: + from pathlib import Path + from . 
import config + from .attribution import parse_attribution_from_file - # Detect role sections in attribution block - for role in ("sourcer", "extractor", "challenger", "synthesizer", "reviewer"): - if stripped.startswith(f"{role}:"): - current_role = role - break - - # Extract handle from attribution entries - handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped) - if handle_match and current_role: - handle = handle_match.group(1).strip().lower() - agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped) - agent_id = agent_id_match.group(1).strip() if agent_id_match else None - upsert_contributor(conn, handle, agent_id, current_role, today) + main_root = Path(config.MAIN_WORKTREE) + knowledge_prefixes = ( + "domains/", "entities/", "decisions/", "foundations/", + "convictions/", "core/", + ) + for rel_path in files_output.strip().split("\n"): + rel_path = rel_path.strip() + if not rel_path.endswith(".md"): + continue + if not rel_path.startswith(knowledge_prefixes): + continue + full = main_root / rel_path + if not full.exists(): + continue # file removed in this PR + attribution = parse_attribution_from_file(str(full)) + for role, entries in attribution.items(): + for entry in entries: + handle = entry.get("handle") + if handle: + upsert_contributor( + conn, handle, entry.get("agent_id"), role, today, + ) # Fallback: if no Pentagon-Agent trailer found, try git commit authors _BOT_AUTHORS = frozenset({ diff --git a/scripts/backfill-sourcer-attribution.py b/scripts/backfill-sourcer-attribution.py new file mode 100755 index 0000000..4be6149 --- /dev/null +++ b/scripts/backfill-sourcer-attribution.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +"""Backfill sourcer/extractor/etc. attribution from claim frontmatter. + +Walks every merged knowledge file under domains/, entities/, decisions/, +foundations/, convictions/, core/ and re-runs the canonical attribution +parser (lib/attribution.py). For each parsed (handle, role) pair, increments +the corresponding *_count column on the contributors table. + +Why this is needed (Apr 24 incident): + - lib/contributor.py used a diff-line regex parser that handled neither + the bare-key flat format (`sourcer: alexastrum`, ~42% of claims) nor + the nested `attribution: { sourcer: [...] }` block format used by Leo's + manual extractions (Shaga's claims). + - Result: alexastrum, thesensatore, cameron-s1, and similar handles were + silently dropped at merge time. Their contributor rows either don't + exist or are stuck at zero counts. + +Usage: + python3 backfill-sourcer-attribution.py --dry-run # report deltas, no writes + python3 backfill-sourcer-attribution.py # apply (additive: max(db, truth)) + python3 backfill-sourcer-attribution.py --reset # destructive: set absolute truth + +Default mode is ADDITIVE for safety: per-role count is set to max(current_db, truth). +This preserves any existing high counts that came from non-frontmatter sources +(e.g., m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via +a different code path; truncating to the file-walk truth would be destructive). + +Use --reset to set absolute truth from the file walk only — this clobbers +all existing role counts including legitimate non-frontmatter credit. + +Idempotency: additive mode is safe to re-run. --reset run is gated by an +audit_log marker; pass --force to override. 
+""" +import argparse +import os +import sqlite3 +import sys +from collections import defaultdict +from pathlib import Path + +# Allow running from anywhere — point at pipeline lib +PIPELINE_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(PIPELINE_ROOT)) + +from lib.attribution import parse_attribution_from_file, VALID_ROLES # noqa: E402 + +DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db") +REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")) +KNOWLEDGE_PREFIXES = ( + "domains", "entities", "decisions", "foundations", "convictions", "core", +) + + +def collect_attributions(repo_root: Path) -> dict[str, dict[str, int]]: + """Walk all knowledge files; return {handle: {role: count}}.""" + counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) + files_scanned = 0 + files_with_attribution = 0 + + for prefix in KNOWLEDGE_PREFIXES: + base = repo_root / prefix + if not base.exists(): + continue + for path in base.rglob("*.md"): + if path.name.startswith("_"): + continue + files_scanned += 1 + attr = parse_attribution_from_file(str(path)) + had_any = False + for role, entries in attr.items(): + for entry in entries: + handle = entry.get("handle") + if handle: + counts[handle][role] += 1 + had_any = True + if had_any: + files_with_attribution += 1 + + print(f" Scanned {files_scanned} knowledge files", file=sys.stderr) + print(f" {files_with_attribution} had parseable attribution", file=sys.stderr) + return counts + + +def existing_contributors(conn) -> dict[str, dict[str, int]]: + """Return {handle: {role: count}} from the current DB.""" + rows = conn.execute( + "SELECT handle, sourcer_count, extractor_count, challenger_count, " + "synthesizer_count, reviewer_count, claims_merged FROM contributors" + ).fetchall() + out = {} + for r in rows: + out[r["handle"]] = { + "sourcer": r["sourcer_count"] or 0, + "extractor": r["extractor_count"] or 0, + "challenger": r["challenger_count"] or 0, + "synthesizer": r["synthesizer_count"] or 0, + "reviewer": r["reviewer_count"] or 0, + "claims_merged": r["claims_merged"] or 0, + } + return out + + +def claims_merged_for(role_counts: dict[str, int]) -> int: + """Mirror upsert_contributor logic: claims_merged += sourcer + extractor.""" + return role_counts.get("sourcer", 0) + role_counts.get("extractor", 0) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--dry-run", action="store_true", + help="Report deltas without writing") + parser.add_argument("--reset", action="store_true", + help="Destructive: set absolute truth from file walk " + "(default is additive max(db, truth))") + parser.add_argument("--force", action="store_true", + help="Re-run even if a previous --reset marker exists") + args = parser.parse_args() + + if not REPO.exists(): + print(f"ERROR: repo not found at {REPO}", file=sys.stderr) + sys.exit(1) + + print(f"DB: {DB_PATH}", file=sys.stderr) + print(f"Repo: {REPO}", file=sys.stderr) + print("", file=sys.stderr) + print("Walking knowledge tree...", file=sys.stderr) + + truth = collect_attributions(REPO) + print(f" Found attributions for {len(truth)} unique handles", file=sys.stderr) + print("", file=sys.stderr) + + conn = sqlite3.connect(DB_PATH, timeout=30) + conn.row_factory = sqlite3.Row + current = existing_contributors(conn) + + # Compute deltas: new handles + handles with role-count mismatches + new_handles: list[tuple[str, dict[str, int]]] = [] + role_deltas: list[tuple[str, dict[str, int], dict[str, int]]] = [] + + for 
handle, roles in truth.items(): + if handle not in current: + new_handles.append((handle, dict(roles))) + else: + cur = current[handle] + mismatches = {r: roles.get(r, 0) for r in VALID_ROLES + if roles.get(r, 0) != cur.get(r, 0)} + if mismatches: + role_deltas.append((handle, dict(roles), cur)) + + print(f"=== {len(new_handles)} NEW contributors to insert ===") + for handle, roles in sorted(new_handles, key=lambda x: -sum(x[1].values()))[:20]: + roles_str = ", ".join(f"{r}={c}" for r, c in roles.items() if c > 0) + print(f" + {handle}: {roles_str} (claims_merged={claims_merged_for(roles)})") + if len(new_handles) > 20: + print(f" ... and {len(new_handles) - 20} more") + print() + + print(f"=== {len(role_deltas)} EXISTING contributors with count drift ===") + for handle, truth_roles, cur_roles in sorted( + role_deltas, + key=lambda x: -sum(x[1].values()), + )[:20]: + for role in VALID_ROLES: + t = truth_roles.get(role, 0) + c = cur_roles.get(role, 0) + if t != c: + print(f" ~ {handle}.{role}: db={c} → truth={t} (Δ{t - c:+d})") + if len(role_deltas) > 20: + print(f" ... and {len(role_deltas) - 20} more") + print() + + if args.dry_run: + mode = "RESET" if args.reset else "ADDITIVE" + print(f"Dry run ({mode} mode) — no changes written.") + if not args.reset: + print("Default is ADDITIVE: existing high counts (e.g. m3taversal=1011) preserved.") + print("Pass --reset to clobber existing counts with file-walk truth.") + return + + # Idempotency: --reset is gated by audit marker. Additive mode is always safe. + if args.reset: + marker = conn.execute( + "SELECT 1 FROM audit_log WHERE event = 'sourcer_attribution_backfill_reset' LIMIT 1" + ).fetchone() + if marker and not args.force: + print("ERROR: --reset has already run (audit marker present).") + print("Pass --force to re-run.") + sys.exit(2) + + inserted = 0 + updated = 0 + preserved_higher = 0 + for handle, roles in truth.items(): + truth_counts = { + "sourcer": roles.get("sourcer", 0), + "extractor": roles.get("extractor", 0), + "challenger": roles.get("challenger", 0), + "synthesizer": roles.get("synthesizer", 0), + "reviewer": roles.get("reviewer", 0), + } + + if handle in current: + cur = current[handle] + if args.reset: + # Preserve reviewer_count even on reset (PR-level not file-level) + final = dict(truth_counts) + final["reviewer"] = max(truth_counts["reviewer"], cur.get("reviewer", 0)) + else: + # Additive: max of db vs truth, per role + final = { + role: max(truth_counts[role], cur.get(role, 0)) + for role in truth_counts + } + if any(cur.get(r, 0) > truth_counts[r] for r in truth_counts): + preserved_higher += 1 + + cm = final["sourcer"] + final["extractor"] + conn.execute( + """UPDATE contributors SET + sourcer_count = ?, + extractor_count = ?, + challenger_count = ?, + synthesizer_count = ?, + reviewer_count = ?, + claims_merged = ?, + updated_at = datetime('now') + WHERE handle = ?""", + (final["sourcer"], final["extractor"], final["challenger"], + final["synthesizer"], final["reviewer"], cm, handle), + ) + updated += 1 + else: + cm = truth_counts["sourcer"] + truth_counts["extractor"] + conn.execute( + """INSERT INTO contributors ( + handle, sourcer_count, extractor_count, challenger_count, + synthesizer_count, reviewer_count, claims_merged, + first_contribution, last_contribution, tier + ) VALUES (?, ?, ?, ?, ?, ?, ?, date('now'), date('now'), 'new')""", + (handle, truth_counts["sourcer"], truth_counts["extractor"], + truth_counts["challenger"], truth_counts["synthesizer"], + truth_counts["reviewer"], cm), + ) + inserted += 1 
+ + event = "sourcer_attribution_backfill_reset" if args.reset else "sourcer_attribution_backfill" + conn.execute( + "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)", + ("contributor", event, + f'{{"inserted": {inserted}, "updated": {updated}, ' + f'"preserved_higher": {preserved_higher}, "mode": ' + f'"{"reset" if args.reset else "additive"}"}}'), + ) + conn.commit() + print(f"Done ({'RESET' if args.reset else 'ADDITIVE'}). " + f"Inserted {inserted} new, updated {updated} existing, " + f"preserved {preserved_higher} higher-than-truth values.") + + +if __name__ == "__main__": + main()
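
Reviewer note, not part of the diff: a minimal sketch of what the new bare-key path in
parse_attribution() returns, assuming it is run from the pipeline root so lib/ resolves.
The handles are the ones named in the message and are used here purely as illustration.

    from lib.attribution import parse_attribution

    # Bare-key flat form as written by extract.py (the ~42% case)
    fm = {"sourcer": "alexastrum", "extractor": ["leo", {"handle": "@theseus"}]}
    result = parse_attribution(fm)
    # result["sourcer"]   == [{"handle": "alexastrum", "agent_id": None, "context": None}]
    # result["extractor"] == [{"handle": "leo", "agent_id": None, "context": None},
    #                         {"handle": "theseus", "agent_id": None, "context": None}]

    # If the same claim also carries an `attribution:` block or an
    # `attribution_sourcer:` field, those are parsed first and the bare key is
    # skipped for that role (the `if result[role]: continue` guard), so the two
    # formats coexisting on one claim does not double-count anyone.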