feat: source author backfill — credits intellectual foundations of KB

Parses source: frontmatter across 616 claims, matches against entity
files + manual author map, credits sourcer_count. 33 authors matched,
8 new contributor entries created.

Bostrom (9), Shapiro (8), Hanson (6), Conitzer (7) etc. now visible
on the leaderboard as sourcers.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
m3taversal 2026-03-26 15:26:04 +00:00
parent 2b49b17eb2
commit 47fa33fd53

271
backfill-source-authors.py Normal file
View file

@ -0,0 +1,271 @@
#!/usr/bin/env python3
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
"""Backfill sourcer attribution from claim source: fields.
Parses every claim's source: frontmatter, matches against entity files
and known author patterns, credits sourcer_count in contributors table.
Usage:
python3 backfill-source-authors.py [--dry-run]
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""
import argparse
import os
import re
import sqlite3
from collections import Counter
from pathlib import Path
import yaml
# Absolute deployment paths — this is a one-shot script run on the pipeline host.
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
# Knowledge-base checkout containing entities/ and the claim directories.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
# Entity name → canonical handle mapping (built from entities/ files)
def _build_entity_map() -> dict[str, str]:
    """Build a lowercase name/alias -> canonical handle map from entity files.

    The canonical handle is the entity file's stem (filename without .md).
    Each file contributes its ``name``, its handle, any ``aliases``, and any
    ``handles`` (with a leading ``@`` stripped) as lowercase keys.

    Files with unreadable content or malformed frontmatter are skipped
    explicitly; the original bare ``except Exception: pass`` silently
    swallowed those errors and also abandoned a file halfway through if a
    single field (e.g. ``name: null`` or a non-string alias) blew up.
    """
    entity_map: dict[str, str] = {}
    entities_dir = REPO_DIR / "entities"
    for md_file in entities_dir.rglob("*.md"):
        try:
            text = md_file.read_text(errors="replace")
        except OSError:
            continue  # unreadable file — skip, don't crash the backfill
        if not text.startswith("---"):
            continue  # no frontmatter block
        end = text.find("\n---", 3)
        if end == -1:
            continue  # unterminated frontmatter
        try:
            fm = yaml.safe_load(text[3:end])
        except yaml.YAMLError:
            continue  # malformed YAML
        if not isinstance(fm, dict):
            continue  # frontmatter parsed but is not a mapping
        handle = md_file.stem
        # `or handle` also covers an explicit `name: null` in the YAML.
        name = fm.get("name") or handle
        # str() guards against non-string YAML values (numbers, dates, ...).
        entity_map[str(name).lower()] = handle
        entity_map[handle.lower()] = handle
        for alias in (fm.get("aliases") or []):
            entity_map[str(alias).lower()] = handle
        for h in (fm.get("handles") or []):
            entity_map[str(h).lower().lstrip("@")] = handle
    return entity_map
# Known author patterns that don't have entity files.
# Keys are lowercase source prefixes; values are canonical handles.
MANUAL_AUTHOR_MAP = {
    "bostrom": "bostrom",
    "nick bostrom": "bostrom",
    "hanson": "hanson",
    "robin hanson": "hanson",
    "doug shapiro": "doug-shapiro",
    "shapiro": "doug-shapiro",
    "matthew ball shapiro": "doug-shapiro",
    "heavey": "heavey",
    "noah smith": "noah-smith",
    "noahpinion": "noah-smith",
    "bak": "bak",
    "per bak": "bak",
    "ostrom": "ostrom",
    "elinor ostrom": "ostrom",
    "coase": "coase",
    "ronald coase": "coase",
    "hayek": "hayek",
    "f.a. hayek": "hayek",
    "friston": "friston",
    "karl friston": "friston",
    "dario amodei": "dario-amodei",
    "amodei": "dario-amodei",
    "karpathy": "karpathy",
    "andrej karpathy": "karpathy",
    "metaproph3t": "proph3t",
    "proph3t": "proph3t",
    "nallok": "nallok",
    "metanallok": "nallok",
    "ben hawkins": "ben-hawkins",
    "aquino-michaels": "aquino-michaels",
    "conitzer": "conitzer",
    "conitzer et al.": "conitzer",
    "ramstead": "ramstead",
    "maxwell ramstead": "ramstead",
    "christensen": "clayton-christensen",
    "clayton christensen": "clayton-christensen",
    "blackmore": "blackmore",
    "susan blackmore": "blackmore",
    "leopold aschenbrenner": "leopold-aschenbrenner",
    "aschenbrenner": "leopold-aschenbrenner",
    "bessemer venture partners": "bessemer-venture-partners",
    "kaiser family foundation": "kaiser-family-foundation",
    "theia research": "theia-research",
    "alea research": "alea-research",
    "architectural investing": "architectural-investing",
    "kaufmann": "kaufmann",
    "stuart kaufmann": "kaufmann",
    "stuart kauffman": "kaufmann",
    "knuth": "knuth",
    "donald knuth": "knuth",
    "ward whitt": "ward-whitt",
    "centola": "centola",
    "damon centola": "centola",
    "hidalgo": "hidalgo",
    "cesar hidalgo": "hidalgo",
    "juarrero": "juarrero",
    "alicia juarrero": "juarrero",
    "larsson": "larsson",
    "pine analytics": "pine-analytics",
    "pineanalytics": "pine-analytics",
    "@01resolved": "01resolved",
    "01resolved": "01resolved",
    "drew": "01resolved",
    "galaxy research": "galaxy-research",
    "fortune": "fortune",
}
# Skip these — they're agent synthesis, not external sources
SKIP_SOURCES = {
    "rio", "leo", "clay", "theseus", "vida", "astra",
    "web research compilation", "web research", "synthesis",
    "strategy session journal", "living capital thesis development",
    "attractor state historical backtesting", "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
}
# Hoisted loop invariants: str.startswith accepts a tuple of prefixes, and the
# first-author pattern is compiled once instead of per call.
_SKIP_PREFIXES = tuple(SKIP_SOURCES)
# First author is the text before a comma, "(", "et al", "based on",
# "analysis", or a 4-digit year:
#   "Bostrom, Superintelligence (2014)" -> "bostrom"
#   "Conitzer et al., 2024"             -> "conitzer"
_FIRST_AUTHOR_RE = re.compile(
    r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})'
)
def extract_authors(source_field: str) -> list[str]:
    """Extract author names from a claim's ``source:`` field.

    Returns a (0- or 1-element) list containing either a canonical handle
    (when the author is in MANUAL_AUTHOR_MAP) or the raw lowercase candidate
    name, which the caller resolves against the entity map.  Agent/internal
    sources (SKIP_SOURCES prefixes) yield [].

    Note: the original version ended with a no-op "analysis by X" regex
    check whose both branches did nothing — removed as dead code.
    """
    if not source_field:
        return []
    source = str(source_field).strip().strip('"').strip("'").lower()
    # Agent/internal synthesis is not an external source — no credit.
    if source.startswith(_SKIP_PREFIXES):
        return []
    # Whole field is a known author ("bostrom", "galaxy research", ...).
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]
    match = _FIRST_AUTHOR_RE.match(source)
    if not match:
        return []
    candidate = match.group(1).strip()
    if candidate in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[candidate]]
    if candidate in SKIP_SOURCES:
        return []
    if 2 < len(candidate) < 50:
        # Unresolved candidate — will be matched against the entity map later.
        return [candidate]
    return []
def main() -> None:
    """Tally source authors across all claims and credit them as sourcers.

    Walks claim frontmatter under the known claim directories, resolves
    authors via the entity map + MANUAL_AUTHOR_MAP, prints matched/unmatched
    tallies, then (unless --dry-run) updates the contributors table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # Build entity map; entity files win over the manual map on key collisions.
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}
    # Walk all claims
    claim_dirs = ["domains", "core", "foundations", "decisions"]
    author_counts = Counter()
    unmatched = Counter()
    skipped_files = 0  # unreadable / malformed claim files (was silently swallowed)
    for d in claim_dirs:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue  # templates/partials, not claims
            try:
                text = md_file.read_text(errors="replace")
                if not text.startswith("---"):
                    continue
                end = text.find("\n---", 3)
                if end == -1:
                    continue
                fm = yaml.safe_load(text[3:end])
            except (OSError, yaml.YAMLError):
                skipped_files += 1
                continue
            if not isinstance(fm, dict) or not fm.get("source"):
                continue
            for author in extract_authors(fm["source"]):
                # extract_authors may return either a canonical handle or a
                # raw candidate; resolve up to two hops through the full map.
                canonical = full_map.get(author, author)
                if canonical in full_map or canonical in full_map.values():
                    author_counts[full_map.get(canonical, canonical)] += 1
                else:
                    unmatched[author] += 1
    if skipped_files:
        print(f"Skipped {skipped_files} unreadable/invalid claim files")
    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f" {count}x: {author}")
    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f" {count}x: {author}")
    if args.dry_run:
        print("\nDry run — no DB changes")
        return
    # Update contributors table
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:  # ensure the connection is closed (original leaked it)
        updated = 0
        created = 0
        for handle, count in author_counts.items():
            existing = conn.execute(
                "SELECT handle, sourcer_count FROM contributors WHERE handle=?", (handle,)
            ).fetchone()
            if existing:
                new_count = (existing["sourcer_count"] or 0) + count
                conn.execute(
                    "UPDATE contributors SET sourcer_count=?, claims_merged=claims_merged+? WHERE handle=?",
                    (new_count, count, handle),
                )
                updated += 1
            else:
                conn.execute(
                    """INSERT INTO contributors
(handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier)
VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                    (handle, count, count),
                )
                created += 1
        conn.commit()
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")
        # Show results — contribution index (CI) uses the role weights below
        # instead of repeating the 0.15/0.05 magic numbers inline.
        weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35, "synthesizer": 0.25, "reviewer": 0.20}
        print("\n=== Top contributors after source-author backfill ===")
        for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count, claims_merged
FROM contributors ORDER BY claims_merged DESC LIMIT 15""").fetchall():
            ci = (r["sourcer_count"] or 0) * weights["sourcer"] + (r["extractor_count"] or 0) * weights["extractor"]
            p = f" -> {r['principal']}" if r['principal'] else ""
            print(f" {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")
    finally:
        conn.close()
if __name__ == "__main__":
    main()