feat: source author backfill — credits intellectual foundations of KB
Parses source: frontmatter across 616 claims, matches against entity files + manual author map, credits sourcer_count. 33 authors matched, 8 new contributor entries created. Bostrom (9), Shapiro (8), Hanson (6), Conitzer (7) etc. now visible on the leaderboard as sourcers. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
2b49b17eb2
commit
47fa33fd53
1 changed files with 271 additions and 0 deletions
271
backfill-source-authors.py
Normal file
271
backfill-source-authors.py
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
#!/usr/bin/env python3
|
||||
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
|
||||
"""Backfill sourcer attribution from claim source: fields.
|
||||
|
||||
Parses every claim's source: frontmatter, matches against entity files
|
||||
and known author patterns, credits sourcer_count in contributors table.
|
||||
|
||||
Usage:
|
||||
python3 backfill-source-authors.py [--dry-run]
|
||||
|
||||
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
# Pipeline SQLite database holding the `contributors` table this script updates.
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
# Root of the knowledge-base checkout (contains entities/, domains/, core/, ...).
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
||||
|
||||
# Entity name → canonical handle mapping (built from entities/ files)
|
||||
def _build_entity_map() -> dict[str, str]:
    """Build a lowercase name/alias -> canonical handle map from entity files.

    Scans every ``entities/**/*.md`` file, parses its YAML frontmatter, and
    registers the entity's name, filename stem, aliases, and handles (leading
    ``@`` stripped) as lowercase lookup keys mapping to the canonical handle
    (the filename without ``.md``).

    Files without frontmatter, with an unterminated frontmatter block, or
    with unparseable YAML are skipped; this is best-effort by design since
    entity files are human-edited.
    """
    entity_map: dict[str, str] = {}
    entities_dir = REPO_DIR / "entities"
    for md_file in entities_dir.rglob("*.md"):
        try:
            text = md_file.read_text(errors="replace")
            if not text.startswith("---"):
                continue  # no frontmatter block
            end = text.find("\n---", 3)
            if end == -1:
                continue  # unterminated frontmatter
            fm = yaml.safe_load(text[3:end])
            if not fm:
                continue
            handle = md_file.stem  # filename without .md is the canonical handle
            # Coerce to str: YAML may parse a bare `name:` as None or a
            # numeric-looking name as int; previously `.lower()` then raised
            # inside the blanket except and silently dropped the whole file.
            name = str(fm.get("name") or handle)
            entity_map[name.lower()] = handle
            entity_map[handle.lower()] = handle
            # Register aliases and social handles as additional lookup keys.
            for alias in (fm.get("aliases", []) or []):
                entity_map[str(alias).lower()] = handle
            for h in (fm.get("handles", []) or []):
                entity_map[str(h).lower().lstrip("@")] = handle
        except Exception:
            pass  # best-effort: one malformed entity file must not abort the backfill
    return entity_map
|
||||
|
||||
|
||||
# Known author patterns that don't have entity files.
# Grouped as (canonical handle, alias, alias, ...) tuples and flattened
# below into a flat lowercase-alias -> handle lookup table.
_MANUAL_AUTHOR_GROUPS: tuple[tuple[str, ...], ...] = (
    ("bostrom", "bostrom", "nick bostrom"),
    ("hanson", "hanson", "robin hanson"),
    ("doug-shapiro", "doug shapiro", "shapiro", "matthew ball shapiro"),
    ("heavey", "heavey"),
    ("noah-smith", "noah smith", "noahpinion"),
    ("bak", "bak", "per bak"),
    ("ostrom", "ostrom", "elinor ostrom"),
    ("coase", "coase", "ronald coase"),
    ("hayek", "hayek", "f.a. hayek"),
    ("friston", "friston", "karl friston"),
    ("dario-amodei", "dario amodei", "amodei"),
    ("karpathy", "karpathy", "andrej karpathy"),
    ("proph3t", "metaproph3t", "proph3t"),
    ("nallok", "nallok", "metanallok"),
    ("ben-hawkins", "ben hawkins"),
    ("aquino-michaels", "aquino-michaels"),
    ("conitzer", "conitzer", "conitzer et al."),
    ("ramstead", "ramstead", "maxwell ramstead"),
    ("clayton-christensen", "christensen", "clayton christensen"),
    ("blackmore", "blackmore", "susan blackmore"),
    ("leopold-aschenbrenner", "leopold aschenbrenner", "aschenbrenner"),
    ("bessemer-venture-partners", "bessemer venture partners"),
    ("kaiser-family-foundation", "kaiser family foundation"),
    ("theia-research", "theia research"),
    ("alea-research", "alea research"),
    ("architectural-investing", "architectural investing"),
    ("kaufmann", "kaufmann", "stuart kaufmann", "stuart kauffman"),
    ("knuth", "knuth", "donald knuth"),
    ("ward-whitt", "ward whitt"),
    ("centola", "centola", "damon centola"),
    ("hidalgo", "hidalgo", "cesar hidalgo"),
    ("juarrero", "juarrero", "alicia juarrero"),
    ("larsson", "larsson"),
    ("pine-analytics", "pine analytics", "pineanalytics"),
    ("01resolved", "@01resolved", "01resolved", "drew"),
    ("galaxy-research", "galaxy research"),
    ("fortune", "fortune"),
)

# Flat alias -> canonical-handle map consumed by extract_authors().
MANUAL_AUTHOR_MAP: dict[str, str] = {
    alias: group[0] for group in _MANUAL_AUTHOR_GROUPS for alias in group[1:]
}
|
||||
|
||||
# Skip these — they're agent synthesis, not external sources.
# Split into the agents' pen-names and internal synthesis documents,
# then merged into one lookup set.
_AGENT_NAMES = ("rio", "leo", "clay", "theseus", "vida", "astra")
_INTERNAL_DOCS = (
    "web research compilation",
    "web research",
    "synthesis",
    "strategy session journal",
    "living capital thesis development",
    "attractor state historical backtesting",
    "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
)
SKIP_SOURCES = set(_AGENT_NAMES) | set(_INTERNAL_DOCS)
|
||||
|
||||
|
||||
def extract_authors(source_field: str) -> list[str]:
    """Extract author names from a claim's ``source:`` field.

    Returns a list of canonical handles (resolved via MANUAL_AUTHOR_MAP)
    and/or raw lowercase candidate names that the caller resolves against
    the runtime-built entity map.  Agent/internal sources (SKIP_SOURCES)
    yield an empty list.
    """
    if not source_field:
        return []

    source = str(source_field).strip().strip('"').strip("'").lower()

    # Skip agent/internal sources.  Match on a word boundary, not a bare
    # prefix: the original startswith() check wrongly skipped e.g.
    # "clayton christensen" (prefix "clay") and "leopold aschenbrenner"
    # (prefix "leo"), discarding real authors as agent synthesis.
    for skip in SKIP_SOURCES:
        if source == skip or re.match(re.escape(skip) + r'[\s,:(]', source):
            return []

    # Try direct match first.
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]

    authors: list[str] = []

    # Extract first author (before comma, parenthesis, or connecting words):
    #   "Bostrom, Superintelligence (2014)" -> "bostrom"
    #   "Conitzer et al., 2024"             -> "conitzer"
    match = re.match(r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})', source)
    if match:
        candidate = match.group(1).strip()
        if candidate in MANUAL_AUTHOR_MAP:
            authors.append(MANUAL_AUTHOR_MAP[candidate])
        elif candidate in SKIP_SOURCES:
            pass  # agent/internal author — not credited
        elif 2 < len(candidate) < 50:
            # Unknown name of plausible length: pass it through for the
            # caller to resolve against the entity map built at runtime.
            authors.append(candidate)

    # NOTE: a previous "analysis by <agent>" re.search here was dead code —
    # both of its outcomes were no-ops — so it has been removed.
    return authors
|
||||
|
||||
|
||||
def main():
    """Parse claim sources, tally authors, and update the contributors table.

    With ``--dry-run``, prints the matched/unmatched tallies and exits
    without touching the database.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Build entity map and merge with the manual map.  Entity-file entries
    # win on key conflicts (they are the more authoritative source).
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}
    # Hoisted: `x in full_map.values()` is an O(n) scan per author.
    known_handles = set(full_map.values())

    # Walk all claim files and tally credited authors per canonical handle.
    claim_dirs = ["domains", "core", "foundations", "decisions"]
    author_counts = Counter()
    unmatched = Counter()

    for d in claim_dirs:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue  # skip templates / private files
            try:
                text = md_file.read_text(errors="replace")
                if not text.startswith("---"):
                    continue
                end = text.find("\n---", 3)
                if end == -1:
                    continue
                fm = yaml.safe_load(text[3:end])
                if not fm or not fm.get("source"):
                    continue

                for author in extract_authors(fm["source"]):
                    # Resolve through the merged map; may take two hops
                    # (alias -> name -> handle).
                    canonical = full_map.get(author, author)
                    if canonical in known_handles or canonical in full_map:
                        author_counts[full_map.get(canonical, canonical)] += 1
                    else:
                        unmatched[author] += 1
            except Exception:
                pass  # best-effort: one malformed claim must not abort the backfill

    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f"  {count}x: {author}")

    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f"  {count}x: {author}")

    if args.dry_run:
        print("\nDry run — no DB changes")
        return

    # Update contributors table.
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        updated = 0
        created = 0
        for handle, count in author_counts.items():
            existing = conn.execute(
                "SELECT handle, sourcer_count FROM contributors WHERE handle=?",
                (handle,),
            ).fetchone()
            if existing:
                new_count = (existing["sourcer_count"] or 0) + count
                # COALESCE: a NULL claims_merged would otherwise make the
                # whole sum NULL and silently wipe the column.
                conn.execute(
                    "UPDATE contributors SET sourcer_count=?, "
                    "claims_merged=COALESCE(claims_merged, 0)+? WHERE handle=?",
                    (new_count, count, handle),
                )
                updated += 1
            else:
                conn.execute(
                    """INSERT INTO contributors
                       (handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier)
                       VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                    (handle, count, count),
                )
                created += 1

        conn.commit()
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")

        # Show results.  Contribution-index (CI) weights per role; this
        # backfill only touches the sourcer/extractor components.  The
        # original defined this dict but recomputed CI from hard-coded
        # 0.15 / 0.05 literals — now read from the table so they can't drift.
        weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35,
                   "synthesizer": 0.25, "reviewer": 0.20}
        print("\n=== Top contributors after source-author backfill ===")
        for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count, claims_merged
                                 FROM contributors ORDER BY claims_merged DESC LIMIT 15""").fetchall():
            ci = ((r["sourcer_count"] or 0) * weights["sourcer"]
                  + (r["extractor_count"] or 0) * weights["extractor"])
            p = f" -> {r['principal']}" if r['principal'] else ""
            print(f"  {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")
    finally:
        conn.close()  # original leaked the connection
|
||||
|
||||
|
||||
# Script entry point — intended to be run directly as a one-shot backfill.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue