fix(attribution): credit sourcer/extractor from claim frontmatter
Three layers of contributor-attribution bug surfaced by Apr 24 leaderboard
investigation. alexastrum, thesensatore, cameron-s1 all had real merged
contributions but zero credit in the contributors table.
1. lib/attribution.py: parse_attribution() only read `attribution_sourcer:`
prefix-keyed flat fields. ~42% of claim files (535/1280) use the bare-key
form `sourcer: alexastrum` written by extract.py. Added bare-key handling
between the prefixed-flat path and the legacy-source-field fallback.
Block format (`attribution: { sourcer: [...] }`) still wins when present.
2. lib/contributor.py: record_contributor_attribution() parsed the diff text
with regex looking for `+- handle: "X"` lines. This matched neither the
bare-key flat format nor the `attribution: { sourcer: [...] }` block
format Leo uses for manual extractions. Replaced the regex parser with
a file walker that calls attribution.parse_attribution_from_file() on
each changed knowledge file — single source of truth for both formats.
3. scripts/backfill-sourcer-attribution.py: walks all merged knowledge files,
re-attributes via the canonical parser, upserts contributors. Default
additive mode preserves existing high counts (e.g. m3taversal.sourcer=1011
reflects Telegram-curator credit accumulated via a different code path
that this fix does not touch). --reset flag for the destructive case.
Dry-run preview (additive mode):
- 670 NEW contributors to insert (mostly source-citation handles)
- 77 EXISTING contributors with under-counted role columns
- alexastrum: 0 → 6, thesensatore: 0 → 5, cameron-s1: 0 → 2
- astra.sourcer: 0 → 96, leo.sourcer: 0 → 44, theseus.sourcer: 0 → 18
- m3taversal.sourcer: 1011 (preserved, not 22 from file walk)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
05d15cea56
commit
3fe0f4b744
3 changed files with 320 additions and 20 deletions
|
|
@ -64,6 +64,30 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
|||
if isinstance(v, str):
|
||||
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||
|
||||
# Bare-key flat format: `sourcer: alexastrum`, `extractor: leo`, etc.
|
||||
# This is what extract.py writes (line 290: f'sourcer: "{sourcer}"') — the most
|
||||
# common format in practice (~42% of claim files). The Apr 24 incident traced
|
||||
# missing leaderboard entries to this format being silently dropped because the
|
||||
# parser only checked the `attribution_*` prefix.
|
||||
# Only fill if the role wasn't already populated by the prefixed form, to avoid
|
||||
# double-counting when both formats coexist on the same claim.
|
||||
for role in VALID_ROLES:
|
||||
if result[role]:
|
||||
continue
|
||||
bare_val = fm.get(role)
|
||||
if isinstance(bare_val, str) and bare_val.strip():
|
||||
result[role].append({"handle": bare_val.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||
elif isinstance(bare_val, list):
|
||||
for v in bare_val:
|
||||
if isinstance(v, str) and v.strip():
|
||||
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||
elif isinstance(v, dict) and v.get("handle"):
|
||||
result[role].append({
|
||||
"handle": v["handle"].strip().lower().lstrip("@"),
|
||||
"agent_id": v.get("agent_id"),
|
||||
"context": v.get("context"),
|
||||
})
|
||||
|
||||
# Legacy fallback: infer from source field
|
||||
if not any(result[r] for r in VALID_ROLES):
|
||||
source = fm.get("source", "")
|
||||
|
|
|
|||
|
|
@ -148,27 +148,42 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
|||
)
|
||||
agents_found.add(agent_name)
|
||||
|
||||
# Parse attribution blocks from claim frontmatter in diff
|
||||
# Look for added lines with attribution YAML
|
||||
current_role = None
|
||||
for line in diff.split("\n"):
|
||||
if not line.startswith("+") or line.startswith("+++"):
|
||||
continue
|
||||
stripped = line[1:].strip()
|
||||
# Parse attribution from changed knowledge files via the canonical attribution
|
||||
# parser (lib/attribution.py). The previous diff-line regex parser dropped
|
||||
# both the bare-key flat format (`sourcer: alexastrum`) and the nested
|
||||
# `attribution:` block format because it only matched `- handle: "X"` lines.
|
||||
# The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
|
||||
# thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
|
||||
rc_files, files_output = await git_fn(
|
||||
"diff", "--name-only", f"origin/main...origin/{branch}", timeout=10,
|
||||
)
|
||||
if rc_files == 0 and files_output:
|
||||
from pathlib import Path
|
||||
from . import config
|
||||
from .attribution import parse_attribution_from_file
|
||||
|
||||
# Detect role sections in attribution block
|
||||
for role in ("sourcer", "extractor", "challenger", "synthesizer", "reviewer"):
|
||||
if stripped.startswith(f"{role}:"):
|
||||
current_role = role
|
||||
break
|
||||
|
||||
# Extract handle from attribution entries
|
||||
handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped)
|
||||
if handle_match and current_role:
|
||||
handle = handle_match.group(1).strip().lower()
|
||||
agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped)
|
||||
agent_id = agent_id_match.group(1).strip() if agent_id_match else None
|
||||
upsert_contributor(conn, handle, agent_id, current_role, today)
|
||||
main_root = Path(config.MAIN_WORKTREE)
|
||||
knowledge_prefixes = (
|
||||
"domains/", "entities/", "decisions/", "foundations/",
|
||||
"convictions/", "core/",
|
||||
)
|
||||
for rel_path in files_output.strip().split("\n"):
|
||||
rel_path = rel_path.strip()
|
||||
if not rel_path.endswith(".md"):
|
||||
continue
|
||||
if not rel_path.startswith(knowledge_prefixes):
|
||||
continue
|
||||
full = main_root / rel_path
|
||||
if not full.exists():
|
||||
continue # file removed in this PR
|
||||
attribution = parse_attribution_from_file(str(full))
|
||||
for role, entries in attribution.items():
|
||||
for entry in entries:
|
||||
handle = entry.get("handle")
|
||||
if handle:
|
||||
upsert_contributor(
|
||||
conn, handle, entry.get("agent_id"), role, today,
|
||||
)
|
||||
|
||||
# Fallback: if no Pentagon-Agent trailer found, try git commit authors
|
||||
_BOT_AUTHORS = frozenset({
|
||||
|
|
|
|||
261
scripts/backfill-sourcer-attribution.py
Executable file
261
scripts/backfill-sourcer-attribution.py
Executable file
|
|
@ -0,0 +1,261 @@
|
|||
#!/usr/bin/env python3
"""Backfill sourcer/extractor/etc. attribution from claim frontmatter.

Walks every merged knowledge file under domains/, entities/, decisions/,
foundations/, convictions/, core/ and re-runs the canonical attribution
parser (lib/attribution.py). For each parsed (handle, role) pair, increments
the corresponding *_count column on the contributors table.

Why this is needed (Apr 24 incident):
- lib/contributor.py used a diff-line regex parser that handled neither
  the bare-key flat format (`sourcer: alexastrum`, ~42% of claims) nor
  the nested `attribution: { sourcer: [...] }` block format used by Leo's
  manual extractions (Shaga's claims).
- Result: alexastrum, thesensatore, cameron-s1, and similar handles were
  silently dropped at merge time. Their contributor rows either don't
  exist or are stuck at zero counts.

Usage:
    python3 backfill-sourcer-attribution.py --dry-run   # report deltas, no writes
    python3 backfill-sourcer-attribution.py             # apply (additive: max(db, truth))
    python3 backfill-sourcer-attribution.py --reset     # destructive: set absolute truth

Default mode is ADDITIVE for safety: per-role count is set to max(current_db, truth).
This preserves any existing high counts that came from non-frontmatter sources
(e.g., m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via
a different code path; truncating to the file-walk truth would be destructive).

Use --reset to set absolute truth from the file walk only — this clobbers
all existing role counts including legitimate non-frontmatter credit.

Idempotency: additive mode is safe to re-run. --reset run is gated by an
audit_log marker; pass --force to override.
"""
import argparse
import os
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path

# Allow running from anywhere — point at pipeline lib
# (scripts/ lives one level below the pipeline root, so parent.parent is the root)
PIPELINE_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PIPELINE_ROOT))

# Project-local import; must come after the sys.path shim above, hence the noqa.
from lib.attribution import parse_attribution_from_file, VALID_ROLES  # noqa: E402

# Deployment defaults, overridable via environment for local/test runs.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
# Top-level knowledge directories scanned for claim files (relative to REPO).
KNOWLEDGE_PREFIXES = (
    "domains", "entities", "decisions", "foundations", "convictions", "core",
)
|
||||
|
||||
|
||||
def collect_attributions(repo_root: Path) -> dict[str, dict[str, int]]:
    """Walk all knowledge files; return {handle: {role: count}}.

    Scans every non-underscore-prefixed ``*.md`` file under the
    KNOWLEDGE_PREFIXES directories and tallies one count per parsed
    (handle, role) attribution entry. Progress stats go to stderr.
    """
    counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    files_scanned = 0
    files_with_attribution = 0

    def _candidate_files():
        # Yield every claim file worth parsing; `_`-prefixed names are
        # skipped (templates/partials), missing prefix dirs are tolerated.
        for prefix in KNOWLEDGE_PREFIXES:
            base = repo_root / prefix
            if not base.exists():
                continue
            yield from (p for p in base.rglob("*.md") if not p.name.startswith("_"))

    for path in _candidate_files():
        files_scanned += 1
        parsed = parse_attribution_from_file(str(path))
        # Flatten to (handle, role) pairs, dropping entries without a handle.
        pairs = [
            (entry.get("handle"), role)
            for role, entries in parsed.items()
            for entry in entries
            if entry.get("handle")
        ]
        for handle, role in pairs:
            counts[handle][role] += 1
        if pairs:
            files_with_attribution += 1

    print(f" Scanned {files_scanned} knowledge files", file=sys.stderr)
    print(f" {files_with_attribution} had parseable attribution", file=sys.stderr)
    return counts
|
||||
|
||||
|
||||
def existing_contributors(conn) -> dict[str, dict[str, int]]:
    """Return {handle: {role: count}} from the current DB.

    NULL columns are normalized to 0 so callers can compare the result
    directly against the file-walk truth counts.
    """
    column_map = (
        ("sourcer", "sourcer_count"),
        ("extractor", "extractor_count"),
        ("challenger", "challenger_count"),
        ("synthesizer", "synthesizer_count"),
        ("reviewer", "reviewer_count"),
        ("claims_merged", "claims_merged"),
    )
    query = (
        "SELECT handle, sourcer_count, extractor_count, challenger_count, "
        "synthesizer_count, reviewer_count, claims_merged FROM contributors"
    )
    return {
        row["handle"]: {key: row[col] or 0 for key, col in column_map}
        for row in conn.execute(query).fetchall()
    }
|
||||
|
||||
|
||||
def claims_merged_for(role_counts: dict[str, int]) -> int:
    """Mirror upsert_contributor logic: claims_merged += sourcer + extractor."""
    merge_roles = ("sourcer", "extractor")
    return sum(role_counts.get(role, 0) for role in merge_roles)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: compute file-walk truth, preview deltas, then write.

    Flags:
      --dry-run  print the delta report and exit without touching the DB
      (default)  additive mode — each role count becomes max(db, truth)
      --reset    destructive — truth overwrites db counts (reviewer excepted)
      --force    bypass the audit-log gate on a repeated --reset
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Report deltas without writing")
    parser.add_argument("--reset", action="store_true",
                        help="Destructive: set absolute truth from file walk "
                        "(default is additive max(db, truth))")
    parser.add_argument("--force", action="store_true",
                        help="Re-run even if a previous --reset marker exists")
    args = parser.parse_args()

    if not REPO.exists():
        print(f"ERROR: repo not found at {REPO}", file=sys.stderr)
        sys.exit(1)

    print(f"DB: {DB_PATH}", file=sys.stderr)
    print(f"Repo: {REPO}", file=sys.stderr)
    print("", file=sys.stderr)
    print("Walking knowledge tree...", file=sys.stderr)

    # Ground truth from the canonical frontmatter parser: {handle: {role: count}}.
    truth = collect_attributions(REPO)
    print(f" Found attributions for {len(truth)} unique handles", file=sys.stderr)
    print("", file=sys.stderr)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    current = existing_contributors(conn)

    # Compute deltas: new handles + handles with role-count mismatches
    new_handles: list[tuple[str, dict[str, int]]] = []
    role_deltas: list[tuple[str, dict[str, int], dict[str, int]]] = []

    for handle, roles in truth.items():
        if handle not in current:
            new_handles.append((handle, dict(roles)))
        else:
            cur = current[handle]
            # Mismatch in either direction; note additive mode will only ever
            # raise counts, so a db>truth mismatch here is reported but kept.
            mismatches = {r: roles.get(r, 0) for r in VALID_ROLES
                          if roles.get(r, 0) != cur.get(r, 0)}
            if mismatches:
                role_deltas.append((handle, dict(roles), cur))

    # Preview: top 20 of each category, ordered by total attributed count.
    print(f"=== {len(new_handles)} NEW contributors to insert ===")
    for handle, roles in sorted(new_handles, key=lambda x: -sum(x[1].values()))[:20]:
        roles_str = ", ".join(f"{r}={c}" for r, c in roles.items() if c > 0)
        print(f" + {handle}: {roles_str} (claims_merged={claims_merged_for(roles)})")
    if len(new_handles) > 20:
        print(f" ... and {len(new_handles) - 20} more")
    print()

    print(f"=== {len(role_deltas)} EXISTING contributors with count drift ===")
    for handle, truth_roles, cur_roles in sorted(
        role_deltas,
        key=lambda x: -sum(x[1].values()),
    )[:20]:
        for role in VALID_ROLES:
            t = truth_roles.get(role, 0)
            c = cur_roles.get(role, 0)
            if t != c:
                print(f" ~ {handle}.{role}: db={c} → truth={t} (Δ{t - c:+d})")
    if len(role_deltas) > 20:
        print(f" ... and {len(role_deltas) - 20} more")
    print()

    if args.dry_run:
        mode = "RESET" if args.reset else "ADDITIVE"
        print(f"Dry run ({mode} mode) — no changes written.")
        if not args.reset:
            print("Default is ADDITIVE: existing high counts (e.g. m3taversal=1011) preserved.")
            print("Pass --reset to clobber existing counts with file-walk truth.")
        return

    # Idempotency: --reset is gated by audit marker. Additive mode is always safe.
    if args.reset:
        marker = conn.execute(
            "SELECT 1 FROM audit_log WHERE event = 'sourcer_attribution_backfill_reset' LIMIT 1"
        ).fetchone()
        if marker and not args.force:
            print("ERROR: --reset has already run (audit marker present).")
            print("Pass --force to re-run.")
            sys.exit(2)

    inserted = 0
    updated = 0
    preserved_higher = 0  # handles where additive mode kept a higher DB count
    for handle, roles in truth.items():
        truth_counts = {
            "sourcer": roles.get("sourcer", 0),
            "extractor": roles.get("extractor", 0),
            "challenger": roles.get("challenger", 0),
            "synthesizer": roles.get("synthesizer", 0),
            "reviewer": roles.get("reviewer", 0),
        }

        if handle in current:
            cur = current[handle]
            if args.reset:
                # Preserve reviewer_count even on reset (PR-level not file-level)
                final = dict(truth_counts)
                final["reviewer"] = max(truth_counts["reviewer"], cur.get("reviewer", 0))
            else:
                # Additive: max of db vs truth, per role
                final = {
                    role: max(truth_counts[role], cur.get(role, 0))
                    for role in truth_counts
                }
                if any(cur.get(r, 0) > truth_counts[r] for r in truth_counts):
                    preserved_higher += 1

            # NOTE(review): claims_merged is recomputed as sourcer + extractor
            # (mirroring upsert_contributor). If an existing row ever violated
            # that invariant, its stored claims_merged could shrink here —
            # confirm that is acceptable before running in additive mode.
            cm = final["sourcer"] + final["extractor"]
            conn.execute(
                """UPDATE contributors SET
                       sourcer_count = ?,
                       extractor_count = ?,
                       challenger_count = ?,
                       synthesizer_count = ?,
                       reviewer_count = ?,
                       claims_merged = ?,
                       updated_at = datetime('now')
                   WHERE handle = ?""",
                (final["sourcer"], final["extractor"], final["challenger"],
                 final["synthesizer"], final["reviewer"], cm, handle),
            )
            updated += 1
        else:
            cm = truth_counts["sourcer"] + truth_counts["extractor"]
            conn.execute(
                """INSERT INTO contributors (
                       handle, sourcer_count, extractor_count, challenger_count,
                       synthesizer_count, reviewer_count, claims_merged,
                       first_contribution, last_contribution, tier
                   ) VALUES (?, ?, ?, ?, ?, ?, ?, date('now'), date('now'), 'new')""",
                (handle, truth_counts["sourcer"], truth_counts["extractor"],
                 truth_counts["challenger"], truth_counts["synthesizer"],
                 truth_counts["reviewer"], cm),
            )
            inserted += 1

    # Audit trail: distinct event name per mode so the --reset gate above
    # (which queries for the *_reset event) only trips on actual resets.
    event = "sourcer_attribution_backfill_reset" if args.reset else "sourcer_attribution_backfill"
    conn.execute(
        "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)",
        ("contributor", event,
         f'{{"inserted": {inserted}, "updated": {updated}, '
         f'"preserved_higher": {preserved_higher}, "mode": '
         f'"{"reset" if args.reset else "additive"}"}}'),
    )
    conn.commit()
    print(f"Done ({'RESET' if args.reset else 'ADDITIVE'}). "
          f"Inserted {inserted} new, updated {updated} existing, "
          f"preserved {preserved_higher} higher-than-truth values.")
|
||||
|
||||
|
||||
# Script entry point — keeps the module importable (e.g. for tests) without
# triggering the backfill.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue