Three layers of contributor-attribution bugs surfaced by the Apr 24 leaderboard
investigation. alexastrum, thesensatore, and cameron-s1 all had real merged
contributions but zero credit in the contributors table.
1. lib/attribution.py: parse_attribution() only read `attribution_sourcer:`
prefix-keyed flat fields. ~42% of claim files (535/1280) use the bare-key
form `sourcer: alexastrum` written by extract.py. Added bare-key handling
between the prefixed-flat path and the legacy-source-field fallback.
Block format (`attribution: { sourcer: [...] }`) still wins when present.
2. lib/contributor.py: record_contributor_attribution() parsed the diff text
with regex looking for `+- handle: "X"` lines. This matched neither the
bare-key flat format nor the `attribution: { sourcer: [...] }` block
format Leo uses for manual extractions. Replaced the regex parser with
a file walker that calls attribution.parse_attribution_from_file() on
each changed knowledge file — single source of truth for both formats.
3. scripts/backfill-sourcer-attribution.py: walks all merged knowledge files,
re-attributes via the canonical parser, upserts contributors. Default
additive mode preserves existing high counts (e.g. m3taversal.sourcer=1011
reflects Telegram-curator credit accumulated via a different code path
that this fix does not touch). --reset flag for the destructive case.
Dry-run preview (additive mode):
- 670 NEW contributors to insert (mostly source-citation handles)
- 77 EXISTING contributors with under-counted role columns
- alexastrum: 0 → 6, thesensatore: 0 → 5, cameron-s1: 0 → 2
- astra.sourcer: 0 → 96, leo.sourcer: 0 → 44, theseus.sourcer: 0 → 18
- m3taversal.sourcer: 1011 (preserved, not 22 from file walk)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
261 lines
11 KiB
Python
Executable file
261 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
"""Backfill sourcer/extractor/etc. attribution from claim frontmatter.

Walks every merged knowledge file under domains/, entities/, decisions/,
foundations/, convictions/, core/ and re-runs the canonical attribution
parser (lib/attribution.py). For each parsed (handle, role) pair, increments
the corresponding *_count column on the contributors table.

Why this is needed (Apr 24 incident):
- lib/contributor.py used a diff-line regex parser that handled neither
  the bare-key flat format (`sourcer: alexastrum`, ~42% of claims) nor
  the nested `attribution: { sourcer: [...] }` block format used by Leo's
  manual extractions (Shaga's claims).
- Result: alexastrum, thesensatore, cameron-s1, and similar handles were
  silently dropped at merge time. Their contributor rows either don't
  exist or are stuck at zero counts.

Usage:
    python3 backfill-sourcer-attribution.py --dry-run  # report deltas, no writes
    python3 backfill-sourcer-attribution.py            # apply (additive: max(db, truth))
    python3 backfill-sourcer-attribution.py --reset    # destructive: set absolute truth

Default mode is ADDITIVE for safety: per-role count is set to max(current_db, truth).
This preserves any existing high counts that came from non-frontmatter sources
(e.g., m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via
a different code path; truncating to the file-walk truth would be destructive).

Use --reset to set absolute truth from the file walk only — this clobbers
all existing role counts including legitimate non-frontmatter credit.

Idempotency: additive mode is safe to re-run. --reset run is gated by an
audit_log marker; pass --force to override.
"""
import argparse
import os
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path

# Allow running from anywhere — point at pipeline lib
PIPELINE_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PIPELINE_ROOT))

# Must come after the sys.path tweak above, hence the noqa.
from lib.attribution import parse_attribution_from_file, VALID_ROLES  # noqa: E402

# Contributor-pipeline SQLite database; override via env for staging runs.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
# Checkout of the knowledge repo to walk.
REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
# Top-level directories that contain merged knowledge files.
KNOWLEDGE_PREFIXES = (
    "domains", "entities", "decisions", "foundations", "convictions", "core",
)
|
|
|
|
|
|
def collect_attributions(repo_root: Path) -> dict[str, dict[str, int]]:
    """Tally attribution credit per handle across the knowledge tree.

    Runs the canonical parser on every non-underscore ``*.md`` file under
    the known prefixes and returns ``{handle: {role: count}}`` where count
    is the number of files crediting that handle in that role.
    """
    tallies: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    scanned = 0
    attributed = 0

    for prefix in KNOWLEDGE_PREFIXES:
        root = repo_root / prefix
        if not root.exists():
            continue
        for md_path in root.rglob("*.md"):
            # Underscore-prefixed files are templates/indexes, not claims.
            if md_path.name.startswith("_"):
                continue
            scanned += 1
            parsed = parse_attribution_from_file(str(md_path))
            # Flatten to (handle, role) pairs; entries without a handle
            # carry no credit and are dropped.
            pairs = [
                (entry.get("handle"), role)
                for role, entries in parsed.items()
                for entry in entries
                if entry.get("handle")
            ]
            for who, role in pairs:
                tallies[who][role] += 1
            if pairs:
                attributed += 1

    print(f" Scanned {scanned} knowledge files", file=sys.stderr)
    print(f" {attributed} had parseable attribution", file=sys.stderr)
    return tallies
|
|
|
|
|
|
def existing_contributors(conn) -> dict[str, dict[str, int]]:
    """Snapshot the contributors table as ``{handle: {role: count}}``.

    NULL counts are normalized to 0 so callers can compare directly
    against the file-walk truth. Requires ``conn.row_factory`` to be
    ``sqlite3.Row`` (the caller sets it).
    """
    role_columns = {
        "sourcer": "sourcer_count",
        "extractor": "extractor_count",
        "challenger": "challenger_count",
        "synthesizer": "synthesizer_count",
        "reviewer": "reviewer_count",
        "claims_merged": "claims_merged",
    }
    query = (
        "SELECT handle, sourcer_count, extractor_count, challenger_count, "
        "synthesizer_count, reviewer_count, claims_merged FROM contributors"
    )
    snapshot: dict[str, dict[str, int]] = {}
    for row in conn.execute(query):
        snapshot[row["handle"]] = {
            role: row[column] or 0 for role, column in role_columns.items()
        }
    return snapshot
|
|
|
|
|
|
def claims_merged_for(role_counts: dict[str, int]) -> int:
    """Mirror upsert_contributor: claims_merged counts only the
    sourcer and extractor roles."""
    return sum(role_counts.get(role, 0) for role in ("sourcer", "extractor"))
|
|
|
|
|
|
# Role columns written by this script, in the order used by the SQL below.
_ROLES = ("sourcer", "extractor", "challenger", "synthesizer", "reviewer")


def _compute_deltas(truth, current):
    """Split file-walk truth against the DB snapshot.

    Returns (new_handles, role_deltas):
      new_handles: [(handle, {role: count})] for handles absent from the DB.
      role_deltas: [(handle, truth_roles, db_roles)] where any role count
                   differs between the file walk and the DB.
    """
    new_handles: list[tuple[str, dict[str, int]]] = []
    role_deltas: list[tuple[str, dict[str, int], dict[str, int]]] = []
    for handle, roles in truth.items():
        if handle not in current:
            new_handles.append((handle, dict(roles)))
            continue
        cur = current[handle]
        if any(roles.get(r, 0) != cur.get(r, 0) for r in VALID_ROLES):
            role_deltas.append((handle, dict(roles), cur))
    return new_handles, role_deltas


def _print_preview(new_handles, role_deltas):
    """Print the NEW-contributor and count-drift report sections (stdout)."""
    print(f"=== {len(new_handles)} NEW contributors to insert ===")
    # Biggest total credit first; cap the listing at 20 handles.
    for handle, roles in sorted(new_handles, key=lambda x: -sum(x[1].values()))[:20]:
        roles_str = ", ".join(f"{r}={c}" for r, c in roles.items() if c > 0)
        print(f" + {handle}: {roles_str} (claims_merged={claims_merged_for(roles)})")
    if len(new_handles) > 20:
        print(f" ... and {len(new_handles) - 20} more")
    print()

    print(f"=== {len(role_deltas)} EXISTING contributors with count drift ===")
    for handle, truth_roles, cur_roles in sorted(
        role_deltas,
        key=lambda x: -sum(x[1].values()),
    )[:20]:
        for role in VALID_ROLES:
            t = truth_roles.get(role, 0)
            c = cur_roles.get(role, 0)
            if t != c:
                print(f" ~ {handle}.{role}: db={c} → truth={t} (Δ{t - c:+d})")
    if len(role_deltas) > 20:
        print(f" ... and {len(role_deltas) - 20} more")
    print()


def _check_reset_marker(conn, force):
    """Exit(2) if a previous --reset already ran and --force is absent."""
    marker = conn.execute(
        "SELECT 1 FROM audit_log WHERE event = 'sourcer_attribution_backfill_reset' LIMIT 1"
    ).fetchone()
    if marker and not force:
        print("ERROR: --reset has already run (audit marker present).")
        print("Pass --force to re-run.")
        sys.exit(2)


def _apply(conn, truth, current, reset):
    """Upsert every handle in truth into contributors (no commit).

    Returns (inserted, updated, preserved_higher) where preserved_higher
    counts handles whose existing DB value beat the file-walk truth in at
    least one role (additive mode only).
    """
    inserted = 0
    updated = 0
    preserved_higher = 0
    for handle, roles in truth.items():
        truth_counts = {role: roles.get(role, 0) for role in _ROLES}

        if handle in current:
            cur = current[handle]
            if reset:
                # Preserve reviewer_count even on reset (PR-level not file-level)
                final = dict(truth_counts)
                final["reviewer"] = max(truth_counts["reviewer"], cur.get("reviewer", 0))
            else:
                # Additive: max of db vs truth, per role
                final = {
                    role: max(truth_counts[role], cur.get(role, 0))
                    for role in truth_counts
                }
                if any(cur.get(r, 0) > truth_counts[r] for r in truth_counts):
                    preserved_higher += 1

            conn.execute(
                """UPDATE contributors SET
                       sourcer_count = ?,
                       extractor_count = ?,
                       challenger_count = ?,
                       synthesizer_count = ?,
                       reviewer_count = ?,
                       claims_merged = ?,
                       updated_at = datetime('now')
                   WHERE handle = ?""",
                (final["sourcer"], final["extractor"], final["challenger"],
                 final["synthesizer"], final["reviewer"],
                 claims_merged_for(final), handle),
            )
            updated += 1
        else:
            conn.execute(
                """INSERT INTO contributors (
                       handle, sourcer_count, extractor_count, challenger_count,
                       synthesizer_count, reviewer_count, claims_merged,
                       first_contribution, last_contribution, tier
                   ) VALUES (?, ?, ?, ?, ?, ?, ?, date('now'), date('now'), 'new')""",
                (handle, truth_counts["sourcer"], truth_counts["extractor"],
                 truth_counts["challenger"], truth_counts["synthesizer"],
                 truth_counts["reviewer"], claims_merged_for(truth_counts)),
            )
            inserted += 1
    return inserted, updated, preserved_higher


def main():
    """CLI entry point.

    Parses flags, walks the repo for attribution truth, diffs it against
    the contributors table, prints a preview, and — unless --dry-run —
    upserts the counts and records an audit_log event.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Report deltas without writing")
    parser.add_argument("--reset", action="store_true",
                        help="Destructive: set absolute truth from file walk "
                             "(default is additive max(db, truth))")
    parser.add_argument("--force", action="store_true",
                        help="Re-run even if a previous --reset marker exists")
    args = parser.parse_args()

    if not REPO.exists():
        print(f"ERROR: repo not found at {REPO}", file=sys.stderr)
        sys.exit(1)

    print(f"DB: {DB_PATH}", file=sys.stderr)
    print(f"Repo: {REPO}", file=sys.stderr)
    print("", file=sys.stderr)
    print("Walking knowledge tree...", file=sys.stderr)

    truth = collect_attributions(REPO)
    print(f" Found attributions for {len(truth)} unique handles", file=sys.stderr)
    print("", file=sys.stderr)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        current = existing_contributors(conn)
        new_handles, role_deltas = _compute_deltas(truth, current)
        _print_preview(new_handles, role_deltas)

        if args.dry_run:
            mode = "RESET" if args.reset else "ADDITIVE"
            print(f"Dry run ({mode} mode) — no changes written.")
            if not args.reset:
                print("Default is ADDITIVE: existing high counts (e.g. m3taversal=1011) preserved.")
                print("Pass --reset to clobber existing counts with file-walk truth.")
            return

        # Idempotency: --reset is gated by audit marker. Additive mode is always safe.
        if args.reset:
            _check_reset_marker(conn, args.force)

        inserted, updated, preserved_higher = _apply(conn, truth, current, args.reset)

        event = ("sourcer_attribution_backfill_reset" if args.reset
                 else "sourcer_attribution_backfill")
        conn.execute(
            "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)",
            ("contributor", event,
             f'{{"inserted": {inserted}, "updated": {updated}, '
             f'"preserved_higher": {preserved_higher}, "mode": '
             f'"{"reset" if args.reset else "additive"}"}}'),
        )
        conn.commit()
        print(f"Done ({'RESET' if args.reset else 'ADDITIVE'}). "
              f"Inserted {inserted} new, updated {updated} existing, "
              f"preserved {preserved_higher} higher-than-truth values.")
    finally:
        # Fix: the original never closed the connection on any exit path
        # (--dry-run early return, sys.exit in the marker check, or success).
        conn.close()
|
|
|
|
|
|
# Script entry point: run the backfill when executed directly.
if __name__ == "__main__":
    main()
|