feat: source author backfill — credits intellectual foundations of KB

Parses source: frontmatter across 616 claims, matches against entity
files + manual author map, credits sourcer_count. 33 authors matched,
8 new contributor entries created.

Bostrom (9), Shapiro (8), Hanson (6), Conitzer (7) etc. now visible
on the leaderboard as sourcers.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
m3taversal 2026-03-26 15:26:04 +00:00
parent 2b49b17eb2
commit 47fa33fd53

271
backfill-source-authors.py Normal file
View file

@ -0,0 +1,271 @@
#!/usr/bin/env python3
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
"""Backfill sourcer attribution from claim source: fields.
Parses every claim's source: frontmatter, matches against entity files
and known author patterns, credits sourcer_count in contributors table.
Usage:
python3 backfill-source-authors.py [--dry-run]
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""
import argparse
import os
import re
import sqlite3
from collections import Counter
from pathlib import Path
import yaml
# Absolute deployment paths — this is a one-shot script run on the pipeline host.
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
# Knowledge-base checkout containing entities/ and the claim directories.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
# Entity name → canonical handle mapping (built from entities/ files)
def _build_entity_map() -> dict[str, str]:
    """Build a lowercase name/alias -> canonical handle map from entity files.

    The canonical handle is the entity file's stem (filename without .md).
    Each file contributes its ``name``, its handle, any ``aliases``, and any
    ``handles`` (with a leading ``@`` stripped) as lowercase keys.

    Files with unreadable content or malformed frontmatter are skipped
    explicitly; the original bare ``except Exception: pass`` silently
    swallowed those errors and also abandoned a file halfway through if a
    single field (e.g. ``name: null`` or a non-string alias) blew up.
    """
    entity_map: dict[str, str] = {}
    entities_dir = REPO_DIR / "entities"
    for md_file in entities_dir.rglob("*.md"):
        try:
            text = md_file.read_text(errors="replace")
        except OSError:
            continue  # unreadable file — skip, don't crash the backfill
        if not text.startswith("---"):
            continue  # no frontmatter block
        end = text.find("\n---", 3)
        if end == -1:
            continue  # unterminated frontmatter
        try:
            fm = yaml.safe_load(text[3:end])
        except yaml.YAMLError:
            continue  # malformed YAML
        if not isinstance(fm, dict):
            continue  # frontmatter parsed but is not a mapping
        handle = md_file.stem
        # `or handle` also covers an explicit `name: null` in the YAML.
        name = fm.get("name") or handle
        # str() guards against non-string YAML values (numbers, dates, ...).
        entity_map[str(name).lower()] = handle
        entity_map[handle.lower()] = handle
        for alias in (fm.get("aliases") or []):
            entity_map[str(alias).lower()] = handle
        for h in (fm.get("handles") or []):
            entity_map[str(h).lower().lstrip("@")] = handle
    return entity_map
# Known author patterns that don't have entity files.
# Keys are lowercase source prefixes; values are canonical handles.
MANUAL_AUTHOR_MAP = {
    "bostrom": "bostrom",
    "nick bostrom": "bostrom",
    "hanson": "hanson",
    "robin hanson": "hanson",
    "doug shapiro": "doug-shapiro",
    "shapiro": "doug-shapiro",
    "matthew ball shapiro": "doug-shapiro",
    "heavey": "heavey",
    "noah smith": "noah-smith",
    "noahpinion": "noah-smith",
    "bak": "bak",
    "per bak": "bak",
    "ostrom": "ostrom",
    "elinor ostrom": "ostrom",
    "coase": "coase",
    "ronald coase": "coase",
    "hayek": "hayek",
    "f.a. hayek": "hayek",
    "friston": "friston",
    "karl friston": "friston",
    "dario amodei": "dario-amodei",
    "amodei": "dario-amodei",
    "karpathy": "karpathy",
    "andrej karpathy": "karpathy",
    "metaproph3t": "proph3t",
    "proph3t": "proph3t",
    "nallok": "nallok",
    "metanallok": "nallok",
    "ben hawkins": "ben-hawkins",
    "aquino-michaels": "aquino-michaels",
    "conitzer": "conitzer",
    "conitzer et al.": "conitzer",
    "ramstead": "ramstead",
    "maxwell ramstead": "ramstead",
    "christensen": "clayton-christensen",
    "clayton christensen": "clayton-christensen",
    "blackmore": "blackmore",
    "susan blackmore": "blackmore",
    "leopold aschenbrenner": "leopold-aschenbrenner",
    "aschenbrenner": "leopold-aschenbrenner",
    "bessemer venture partners": "bessemer-venture-partners",
    "kaiser family foundation": "kaiser-family-foundation",
    "theia research": "theia-research",
    "alea research": "alea-research",
    "architectural investing": "architectural-investing",
    "kaufmann": "kaufmann",
    "stuart kaufmann": "kaufmann",
    "stuart kauffman": "kaufmann",
    "knuth": "knuth",
    "donald knuth": "knuth",
    "ward whitt": "ward-whitt",
    "centola": "centola",
    "damon centola": "centola",
    "hidalgo": "hidalgo",
    "cesar hidalgo": "hidalgo",
    "juarrero": "juarrero",
    "alicia juarrero": "juarrero",
    "larsson": "larsson",
    "pine analytics": "pine-analytics",
    "pineanalytics": "pine-analytics",
    "@01resolved": "01resolved",
    "01resolved": "01resolved",
    "drew": "01resolved",
    "galaxy research": "galaxy-research",
    "fortune": "fortune",
}
# Skip these — they're agent synthesis, not external sources
SKIP_SOURCES = {
    "rio", "leo", "clay", "theseus", "vida", "astra",
    "web research compilation", "web research", "synthesis",
    "strategy session journal", "living capital thesis development",
    "attractor state historical backtesting", "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
}
# Hoisted loop invariants: str.startswith accepts a tuple of prefixes, and the
# first-author pattern is compiled once instead of per call.
_SKIP_PREFIXES = tuple(SKIP_SOURCES)
# First author is the text before a comma, "(", "et al", "based on",
# "analysis", or a 4-digit year:
#   "Bostrom, Superintelligence (2014)" -> "bostrom"
#   "Conitzer et al., 2024"             -> "conitzer"
_FIRST_AUTHOR_RE = re.compile(
    r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})'
)
def extract_authors(source_field: str) -> list[str]:
    """Extract author names from a claim's ``source:`` field.

    Returns a (0- or 1-element) list containing either a canonical handle
    (when the author is in MANUAL_AUTHOR_MAP) or the raw lowercase candidate
    name, which the caller resolves against the entity map.  Agent/internal
    sources (SKIP_SOURCES prefixes) yield [].

    Note: the original version ended with a no-op "analysis by X" regex
    check whose both branches did nothing — removed as dead code.
    """
    if not source_field:
        return []
    source = str(source_field).strip().strip('"').strip("'").lower()
    # Agent/internal synthesis is not an external source — no credit.
    if source.startswith(_SKIP_PREFIXES):
        return []
    # Whole field is a known author ("bostrom", "galaxy research", ...).
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]
    match = _FIRST_AUTHOR_RE.match(source)
    if not match:
        return []
    candidate = match.group(1).strip()
    if candidate in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[candidate]]
    if candidate in SKIP_SOURCES:
        return []
    if 2 < len(candidate) < 50:
        # Unresolved candidate — will be matched against the entity map later.
        return [candidate]
    return []
def main() -> None:
    """Tally source authors across all claims and credit them as sourcers.

    Walks claim frontmatter under the known claim directories, resolves
    authors via the entity map + MANUAL_AUTHOR_MAP, prints matched/unmatched
    tallies, then (unless --dry-run) updates the contributors table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # Build entity map; entity files win over the manual map on key collisions.
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}
    # Walk all claims
    claim_dirs = ["domains", "core", "foundations", "decisions"]
    author_counts = Counter()
    unmatched = Counter()
    skipped_files = 0  # unreadable / malformed claim files (was silently swallowed)
    for d in claim_dirs:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue  # templates/partials, not claims
            try:
                text = md_file.read_text(errors="replace")
                if not text.startswith("---"):
                    continue
                end = text.find("\n---", 3)
                if end == -1:
                    continue
                fm = yaml.safe_load(text[3:end])
            except (OSError, yaml.YAMLError):
                skipped_files += 1
                continue
            if not isinstance(fm, dict) or not fm.get("source"):
                continue
            for author in extract_authors(fm["source"]):
                # extract_authors may return either a canonical handle or a
                # raw candidate; resolve up to two hops through the full map.
                canonical = full_map.get(author, author)
                if canonical in full_map or canonical in full_map.values():
                    author_counts[full_map.get(canonical, canonical)] += 1
                else:
                    unmatched[author] += 1
    if skipped_files:
        print(f"Skipped {skipped_files} unreadable/invalid claim files")
    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f" {count}x: {author}")
    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f" {count}x: {author}")
    if args.dry_run:
        print("\nDry run — no DB changes")
        return
    # Update contributors table
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:  # ensure the connection is closed (original leaked it)
        updated = 0
        created = 0
        for handle, count in author_counts.items():
            existing = conn.execute(
                "SELECT handle, sourcer_count FROM contributors WHERE handle=?", (handle,)
            ).fetchone()
            if existing:
                new_count = (existing["sourcer_count"] or 0) + count
                conn.execute(
                    "UPDATE contributors SET sourcer_count=?, claims_merged=claims_merged+? WHERE handle=?",
                    (new_count, count, handle),
                )
                updated += 1
            else:
                conn.execute(
                    """INSERT INTO contributors
(handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier)
VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                    (handle, count, count),
                )
                created += 1
        conn.commit()
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")
        # Show results — contribution index (CI) uses the role weights below
        # instead of repeating the 0.15/0.05 magic numbers inline.
        weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35, "synthesizer": 0.25, "reviewer": 0.20}
        print("\n=== Top contributors after source-author backfill ===")
        for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count, claims_merged
FROM contributors ORDER BY claims_merged DESC LIMIT 15""").fetchall():
            ci = (r["sourcer_count"] or 0) * weights["sourcer"] + (r["extractor_count"] or 0) * weights["extractor"]
            p = f" -> {r['principal']}" if r['principal'] else ""
            print(f" {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")
    finally:
        conn.close()
if __name__ == "__main__":
    main()