fix(attribution): credit sourcer/extractor from claim frontmatter

Three layers of contributor-attribution bugs surfaced during the Apr 24
leaderboard investigation: alexastrum, thesensatore, and cameron-s1 all had
real merged contributions but zero credit in the contributors table.

1. lib/attribution.py: parse_attribution() only read `attribution_sourcer:`
   prefix-keyed flat fields. ~42% of claim files (535/1280) use the bare-key
   form `sourcer: alexastrum` written by extract.py. Added bare-key handling
   between the prefixed-flat path and the legacy-source-field fallback.
   Block format (`attribution: { sourcer: [...] }`) still wins when present.

2. lib/contributor.py: record_contributor_attribution() parsed the diff text
   with regex looking for `+- handle: "X"` lines. This matched neither the
   bare-key flat format nor the `attribution: { sourcer: [...] }` block
   format Leo uses for manual extractions. Replaced the regex parser with
   a file walker that calls attribution.parse_attribution_from_file() on
   each changed knowledge file — single source of truth for both formats.

3. scripts/backfill-sourcer-attribution.py: walks all merged knowledge files,
   re-attributes via the canonical parser, upserts contributors. Default
   additive mode preserves existing high counts (e.g. m3taversal.sourcer=1011
   reflects Telegram-curator credit accumulated via a different code path
   that this fix does not touch). --reset flag for the destructive case.

Dry-run preview (additive mode):
  - 670 NEW contributors to insert (mostly source-citation handles)
  - 77 EXISTING contributors with under-counted role columns
  - alexastrum: 0 → 6, thesensatore: 0 → 5, cameron-s1: 0 → 2
  - astra.sourcer: 0 → 96, leo.sourcer: 0 → 44, theseus.sourcer: 0 → 18
  - m3taversal.sourcer: 1011 (preserved, not 22 from file walk)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-24 12:48:41 +01:00
parent 05d15cea56
commit 3fe0f4b744
3 changed files with 320 additions and 20 deletions

View file

@ -64,6 +64,30 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
if isinstance(v, str):
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
# Bare-key flat format: `sourcer: alexastrum`, `extractor: leo`, etc.
# This is what extract.py writes (line 290: f'sourcer: "{sourcer}"') — the most
# common format in practice (~42% of claim files). The Apr 24 incident traced
# missing leaderboard entries to this format being silently dropped because the
# parser only checked the `attribution_*` prefix.
# Only fill if the role wasn't already populated by the prefixed form, to avoid
# double-counting when both formats coexist on the same claim.
for role in VALID_ROLES:
if result[role]:
continue
bare_val = fm.get(role)
if isinstance(bare_val, str) and bare_val.strip():
result[role].append({"handle": bare_val.strip().lower().lstrip("@"), "agent_id": None, "context": None})
elif isinstance(bare_val, list):
for v in bare_val:
if isinstance(v, str) and v.strip():
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
elif isinstance(v, dict) and v.get("handle"):
result[role].append({
"handle": v["handle"].strip().lower().lstrip("@"),
"agent_id": v.get("agent_id"),
"context": v.get("context"),
})
# Legacy fallback: infer from source field
if not any(result[r] for r in VALID_ROLES):
source = fm.get("source", "")

View file

@ -148,27 +148,42 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
)
agents_found.add(agent_name)
# Parse attribution blocks from claim frontmatter in diff
# Look for added lines with attribution YAML
current_role = None
for line in diff.split("\n"):
if not line.startswith("+") or line.startswith("+++"):
continue
stripped = line[1:].strip()
# Parse attribution from changed knowledge files via the canonical attribution
# parser (lib/attribution.py). The previous diff-line regex parser dropped
# both the bare-key flat format (`sourcer: alexastrum`) and the nested
# `attribution:` block format because it only matched `- handle: "X"` lines.
# The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
# thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
rc_files, files_output = await git_fn(
"diff", "--name-only", f"origin/main...origin/{branch}", timeout=10,
)
if rc_files == 0 and files_output:
from pathlib import Path
from . import config
from .attribution import parse_attribution_from_file
# Detect role sections in attribution block
for role in ("sourcer", "extractor", "challenger", "synthesizer", "reviewer"):
if stripped.startswith(f"{role}:"):
current_role = role
break
# Extract handle from attribution entries
handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped)
if handle_match and current_role:
handle = handle_match.group(1).strip().lower()
agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped)
agent_id = agent_id_match.group(1).strip() if agent_id_match else None
upsert_contributor(conn, handle, agent_id, current_role, today)
main_root = Path(config.MAIN_WORKTREE)
knowledge_prefixes = (
"domains/", "entities/", "decisions/", "foundations/",
"convictions/", "core/",
)
for rel_path in files_output.strip().split("\n"):
rel_path = rel_path.strip()
if not rel_path.endswith(".md"):
continue
if not rel_path.startswith(knowledge_prefixes):
continue
full = main_root / rel_path
if not full.exists():
continue # file removed in this PR
attribution = parse_attribution_from_file(str(full))
for role, entries in attribution.items():
for entry in entries:
handle = entry.get("handle")
if handle:
upsert_contributor(
conn, handle, entry.get("agent_id"), role, today,
)
# Fallback: if no Pentagon-Agent trailer found, try git commit authors
_BOT_AUTHORS = frozenset({

View file

@ -0,0 +1,261 @@
#!/usr/bin/env python3
"""Backfill sourcer/extractor/etc. attribution from claim frontmatter.
Walks every merged knowledge file under domains/, entities/, decisions/,
foundations/, convictions/, core/ and re-runs the canonical attribution
parser (lib/attribution.py). Each parsed (handle, role) pair is tallied, and the
per-role totals are written to the corresponding *_count column on the contributors table.
Why this is needed (Apr 24 incident):
- lib/contributor.py used a diff-line regex parser that handled neither
the bare-key flat format (`sourcer: alexastrum`, ~42% of claims) nor
the nested `attribution: { sourcer: [...] }` block format used by Leo's
manual extractions (Shaga's claims).
- Result: alexastrum, thesensatore, cameron-s1, and similar handles were
silently dropped at merge time. Their contributor rows either don't
exist or are stuck at zero counts.
Usage:
python3 backfill-sourcer-attribution.py --dry-run # report deltas, no writes
python3 backfill-sourcer-attribution.py # apply (additive: max(db, truth))
python3 backfill-sourcer-attribution.py --reset # destructive: set absolute truth
Default mode is ADDITIVE for safety: per-role count is set to max(current_db, truth).
This preserves any existing high counts that came from non-frontmatter sources
(e.g., m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via
a different code path; truncating to the file-walk truth would be destructive).
Use --reset to set absolute truth from the file walk only — for every handle found
in the walk this clobbers existing role counts (reviewer_count excepted), including
legitimate non-frontmatter credit.
Idempotency: additive mode is safe to re-run. --reset run is gated by an
audit_log marker; pass --force to override.
"""
import argparse
import os
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path
# Allow running from anywhere — point at pipeline lib
PIPELINE_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PIPELINE_ROOT))
from lib.attribution import parse_attribution_from_file, VALID_ROLES # noqa: E402
# SQLite database holding the contributors and audit_log tables; override with PIPELINE_DB.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
# Checkout whose knowledge files are walked; override with REPO_DIR.
REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
# Top-level directories (relative to REPO) that are scanned for *.md knowledge files.
KNOWLEDGE_PREFIXES = (
    "domains", "entities", "decisions", "foundations", "convictions", "core",
)
def collect_attributions(repo_root: Path) -> dict[str, dict[str, int]]:
    """Tally attribution credits found in the knowledge tree.

    Every ``*.md`` file under each KNOWLEDGE_PREFIXES directory of *repo_root*
    is parsed with ``parse_attribution_from_file``; files whose name starts
    with ``_`` are skipped. Prefix directories that do not exist are ignored.

    Returns:
        A nested mapping ``{handle: {role: count}}`` where *count* is the
        number of parsed entries crediting *handle* with *role*.

    Side effects:
        Writes two summary lines (files scanned, files with attribution)
        to stderr.
    """
    tally: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    scanned = 0
    attributed = 0

    for top_dir in KNOWLEDGE_PREFIXES:
        root = repo_root / top_dir
        if not root.exists():
            continue
        for md_file in root.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            scanned += 1
            parsed = parse_attribution_from_file(str(md_file))
            # Flatten to (role, handle) pairs, dropping entries without a handle.
            credited = [
                (role, entry.get("handle"))
                for role, entries in parsed.items()
                for entry in entries
                if entry.get("handle")
            ]
            for role, handle in credited:
                tally[handle][role] += 1
            if credited:
                attributed += 1

    print(f" Scanned {scanned} knowledge files", file=sys.stderr)
    print(f" {attributed} had parseable attribution", file=sys.stderr)
    return tally
def existing_contributors(conn) -> dict[str, dict[str, int]]:
    """Load the per-role counters currently stored in the contributors table.

    Args:
        conn: Database connection whose rows support lookup by column name
            (the caller sets ``row_factory = sqlite3.Row``).

    Returns:
        ``{handle: {key: count}}`` with keys ``sourcer``, ``extractor``,
        ``challenger``, ``synthesizer``, ``reviewer`` and ``claims_merged``.
        NULL (or otherwise falsy) column values are reported as 0.
    """
    # Output key -> source column, in the order the keys appear in each result dict.
    column_for = {
        "sourcer": "sourcer_count",
        "extractor": "extractor_count",
        "challenger": "challenger_count",
        "synthesizer": "synthesizer_count",
        "reviewer": "reviewer_count",
        "claims_merged": "claims_merged",
    }
    cursor = conn.execute(
        "SELECT handle, sourcer_count, extractor_count, challenger_count, "
        "synthesizer_count, reviewer_count, claims_merged FROM contributors"
    )
    return {
        row["handle"]: {key: row[column] or 0 for key, column in column_for.items()}
        for row in cursor.fetchall()
    }
def claims_merged_for(role_counts: dict[str, int]) -> int:
    """Return the claims_merged value implied by *role_counts*.

    The result is the sourcer count plus the extractor count; a missing role
    counts as 0. This is intended to mirror how upsert_contributor maintains
    claims_merged (that function is not visible from this script — verify
    against lib/contributor.py if its rule changes).
    """
    return sum(role_counts.get(role, 0) for role in ("sourcer", "extractor"))
def _print_delta_report(new_handles, role_deltas, limit=20):
    """Print the new-contributor and count-drift summaries (top *limit* each) to stdout."""

    def by_truth_total_desc(item):
        # item[1] is always the file-walk {role: count} dict for that handle.
        return -sum(item[1].values())

    print(f"=== {len(new_handles)} NEW contributors to insert ===")
    for handle, roles in sorted(new_handles, key=by_truth_total_desc)[:limit]:
        roles_str = ", ".join(f"{r}={c}" for r, c in roles.items() if c > 0)
        print(f" + {handle}: {roles_str} (claims_merged={claims_merged_for(roles)})")
    if len(new_handles) > limit:
        print(f" ... and {len(new_handles) - limit} more")
    print()

    print(f"=== {len(role_deltas)} EXISTING contributors with count drift ===")
    for handle, truth_roles, cur_roles in sorted(role_deltas, key=by_truth_total_desc)[:limit]:
        for role in VALID_ROLES:
            t = truth_roles.get(role, 0)
            c = cur_roles.get(role, 0)
            if t != c:
                # Signed delta in its own parentheses, e.g. "db=0 → truth=6 (+6)".
                print(f" ~ {handle}.{role}: db={c} → truth={t} ({t - c:+d})")
    if len(role_deltas) > limit:
        print(f" ... and {len(role_deltas) - limit} more")
    print()


def _apply_backfill(conn, truth, current, reset):
    """Write file-walk counts into contributors; return (inserted, updated, preserved_higher).

    Only handles present in *truth* are touched. The caller owns the
    transaction: nothing is committed here.
    """
    roles = ("sourcer", "extractor", "challenger", "synthesizer", "reviewer")
    inserted = 0
    updated = 0
    preserved_higher = 0

    for handle, walked in truth.items():
        truth_counts = {role: walked.get(role, 0) for role in roles}
        cur = current.get(handle)

        if cur is None:
            cm = truth_counts["sourcer"] + truth_counts["extractor"]
            conn.execute(
                """INSERT INTO contributors (
                handle, sourcer_count, extractor_count, challenger_count,
                synthesizer_count, reviewer_count, claims_merged,
                first_contribution, last_contribution, tier
                ) VALUES (?, ?, ?, ?, ?, ?, ?, date('now'), date('now'), 'new')""",
                (handle, truth_counts["sourcer"], truth_counts["extractor"],
                 truth_counts["challenger"], truth_counts["synthesizer"],
                 truth_counts["reviewer"], cm),
            )
            inserted += 1
            continue

        if reset:
            # Preserve reviewer_count even on reset (PR-level not file-level)
            final = dict(truth_counts)
            final["reviewer"] = max(truth_counts["reviewer"], cur.get("reviewer", 0))
        else:
            # Additive: max of db vs truth, per role
            final = {role: max(truth_counts[role], cur.get(role, 0)) for role in roles}
            if any(cur.get(role, 0) > truth_counts[role] for role in roles):
                preserved_higher += 1

        cm = final["sourcer"] + final["extractor"]
        conn.execute(
            """UPDATE contributors SET
            sourcer_count = ?,
            extractor_count = ?,
            challenger_count = ?,
            synthesizer_count = ?,
            reviewer_count = ?,
            claims_merged = ?,
            updated_at = datetime('now')
            WHERE handle = ?""",
            (final["sourcer"], final["extractor"], final["challenger"],
             final["synthesizer"], final["reviewer"], cm, handle),
        )
        updated += 1

    return inserted, updated, preserved_higher


def main():
    """Command-line entry point for the attribution backfill.

    Flags:
        --dry-run  Print the delta report and exit without writing.
        --reset    Overwrite role counts with the file-walk values instead of
                   taking max(db, truth); reviewer_count is still kept at the
                   higher of the two.
        --force    Allow --reset to run again when its audit_log marker exists.

    Exit status:
        1 if the repo directory (REPO) does not exist;
        2 if --reset was requested, its audit marker is present and --force
        was not given.

    The delta report goes to stdout; progress and error messages go to stderr.
    All writes plus the audit_log row are committed together at the end, and
    the database connection is closed on every exit path.
    """
    import json  # local import: only needed to serialise the audit_log detail

    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Report deltas without writing")
    parser.add_argument("--reset", action="store_true",
                        help="Destructive: set absolute truth from file walk "
                             "(default is additive max(db, truth))")
    parser.add_argument("--force", action="store_true",
                        help="Re-run even if a previous --reset marker exists")
    args = parser.parse_args()

    if not REPO.exists():
        print(f"ERROR: repo not found at {REPO}", file=sys.stderr)
        sys.exit(1)

    print(f"DB: {DB_PATH}", file=sys.stderr)
    print(f"Repo: {REPO}", file=sys.stderr)
    print("", file=sys.stderr)
    print("Walking knowledge tree...", file=sys.stderr)
    truth = collect_attributions(REPO)
    print(f" Found attributions for {len(truth)} unique handles", file=sys.stderr)
    print("", file=sys.stderr)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    try:
        conn.row_factory = sqlite3.Row
        current = existing_contributors(conn)

        # Compute deltas: new handles + handles with role-count mismatches
        new_handles: list[tuple[str, dict[str, int]]] = []
        role_deltas: list[tuple[str, dict[str, int], dict[str, int]]] = []
        for handle, roles in truth.items():
            if handle not in current:
                new_handles.append((handle, dict(roles)))
                continue
            cur = current[handle]
            if any(roles.get(r, 0) != cur.get(r, 0) for r in VALID_ROLES):
                role_deltas.append((handle, dict(roles), cur))

        _print_delta_report(new_handles, role_deltas)

        mode = "reset" if args.reset else "additive"

        if args.dry_run:
            print(f"Dry run ({mode.upper()} mode) — no changes written.")
            if not args.reset:
                print("Default is ADDITIVE: existing high counts (e.g. m3taversal=1011) preserved.")
                print("Pass --reset to clobber existing counts with file-walk truth.")
            return

        # Idempotency: --reset is gated by audit marker. Additive mode is always safe.
        if args.reset:
            marker = conn.execute(
                "SELECT 1 FROM audit_log WHERE event = 'sourcer_attribution_backfill_reset' LIMIT 1"
            ).fetchone()
            if marker and not args.force:
                print("ERROR: --reset has already run (audit marker present).", file=sys.stderr)
                print("Pass --force to re-run.", file=sys.stderr)
                sys.exit(2)

        inserted, updated, preserved_higher = _apply_backfill(
            conn, truth, current, args.reset,
        )

        event = "sourcer_attribution_backfill_reset" if args.reset else "sourcer_attribution_backfill"
        detail = json.dumps({
            "inserted": inserted,
            "updated": updated,
            "preserved_higher": preserved_higher,
            "mode": mode,
        })
        conn.execute(
            "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)",
            ("contributor", event, detail),
        )
        conn.commit()
    finally:
        # Closing without commit discards any partial writes if we bailed out early.
        conn.close()

    print(f"Done ({mode.upper()}). "
          f"Inserted {inserted} new, updated {updated} existing, "
          f"preserved {preserved_higher} higher-than-truth values.")
# Run the backfill only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()