Parses source: frontmatter across 616 claims, matches against entity files + manual author map, credits sourcer_count. 33 authors matched, 8 new contributor entries created. Bostrom (9), Shapiro (8), Hanson (6), Conitzer (7) etc. now visible on the leaderboard as sourcers. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
271 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
|
|
"""Backfill sourcer attribution from claim source: fields.
|
|
|
|
Parses every claim's source: frontmatter, matches against entity files
|
|
and known author patterns, credits sourcer_count in contributors table.
|
|
|
|
Usage:
|
|
python3 backfill-source-authors.py [--dry-run]
|
|
|
|
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# SQLite pipeline database holding the contributors table that gets credited.
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"

# Root of the markdown repo containing entities/ and the claim directories.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
|
# Entity name → canonical handle mapping (built from entities/ files)
def _build_entity_map() -> dict[str, str]:
    """Build a lowercase name → canonical handle map from entity files.

    Scans every ``entities/**/*.md`` file under REPO_DIR, parses its YAML
    frontmatter, and maps the entity's name, filename stem, aliases, and
    handles (leading ``@`` stripped) to the canonical handle, which is the
    filename stem.

    Files without frontmatter, with an unterminated frontmatter block, or
    with unparseable YAML are skipped silently — this is a best-effort
    backfill, not a validator.
    """
    entity_map: dict[str, str] = {}
    entities_dir = REPO_DIR / "entities"
    for md_file in entities_dir.rglob("*.md"):
        try:
            text = md_file.read_text(errors="replace")
            if not text.startswith("---"):
                continue  # no frontmatter block at all
            end = text.find("\n---", 3)
            if end == -1:
                continue  # unterminated frontmatter
            fm = yaml.safe_load(text[3:end])
            if not fm:
                continue
            handle = md_file.stem  # filename without .md is the canonical handle
            # str() guards against YAML scalars that parse as non-strings
            # (numbers, dates, null): previously those crashed .lower() and
            # the broad except silently dropped the whole entity.
            name = str(fm.get("name") or handle)
            entity_map[name.lower()] = handle
            entity_map[handle.lower()] = handle
            # Aliases and social handles also resolve to the canonical handle.
            for alias in (fm.get("aliases", []) or []):
                entity_map[str(alias).lower()] = handle
            for h in (fm.get("handles", []) or []):
                entity_map[str(h).lower().lstrip("@")] = handle
        except Exception:
            # Best-effort: one malformed entity file must not abort the run.
            pass
    return entity_map
|
|
|
|
|
|
# Known author patterns that don't have entity files.
# Keys are matched against the lowercased source: field (whole-field or
# leading-name match in extract_authors); values are canonical handles
# credited in the contributors table. Multiple spellings of the same
# author intentionally map to one handle.
MANUAL_AUTHOR_MAP = {
    "bostrom": "bostrom",
    "nick bostrom": "bostrom",
    "hanson": "hanson",
    "robin hanson": "hanson",
    "doug shapiro": "doug-shapiro",
    "shapiro": "doug-shapiro",
    "matthew ball shapiro": "doug-shapiro",
    "heavey": "heavey",
    "noah smith": "noah-smith",
    "noahpinion": "noah-smith",
    "bak": "bak",
    "per bak": "bak",
    "ostrom": "ostrom",
    "elinor ostrom": "ostrom",
    "coase": "coase",
    "ronald coase": "coase",
    "hayek": "hayek",
    "f.a. hayek": "hayek",
    "friston": "friston",
    "karl friston": "friston",
    "dario amodei": "dario-amodei",
    "amodei": "dario-amodei",
    "karpathy": "karpathy",
    "andrej karpathy": "karpathy",
    "metaproph3t": "proph3t",
    "proph3t": "proph3t",
    "nallok": "nallok",
    "metanallok": "nallok",
    "ben hawkins": "ben-hawkins",
    "aquino-michaels": "aquino-michaels",
    "conitzer": "conitzer",
    "conitzer et al.": "conitzer",
    "ramstead": "ramstead",
    "maxwell ramstead": "ramstead",
    "christensen": "clayton-christensen",
    "clayton christensen": "clayton-christensen",
    "blackmore": "blackmore",
    "susan blackmore": "blackmore",
    "leopold aschenbrenner": "leopold-aschenbrenner",
    "aschenbrenner": "leopold-aschenbrenner",
    # Organizations / research shops cited as sources.
    "bessemer venture partners": "bessemer-venture-partners",
    "kaiser family foundation": "kaiser-family-foundation",
    "theia research": "theia-research",
    "alea research": "alea-research",
    "architectural investing": "architectural-investing",
    # Both the common misspelling and the correct "Kauffman" spelling map
    # to the same handle.
    "kaufmann": "kaufmann",
    "stuart kaufmann": "kaufmann",
    "stuart kauffman": "kaufmann",
    "knuth": "knuth",
    "donald knuth": "knuth",
    "ward whitt": "ward-whitt",
    "centola": "centola",
    "damon centola": "centola",
    "hidalgo": "hidalgo",
    "cesar hidalgo": "hidalgo",
    "juarrero": "juarrero",
    "alicia juarrero": "juarrero",
    "larsson": "larsson",
    "pine analytics": "pine-analytics",
    "pineanalytics": "pine-analytics",
    "@01resolved": "01resolved",
    "01resolved": "01resolved",
    "drew": "01resolved",
    "galaxy research": "galaxy-research",
    "fortune": "fortune",
}
|
|
|
|
# Skip these — they're agent synthesis, not external sources.
# Matched as lowercase prefixes of the source: field in extract_authors,
# so e.g. "rio, based on Solomon DAO" is skipped via the "rio" entry.
SKIP_SOURCES = {
    "rio", "leo", "clay", "theseus", "vida", "astra",
    "web research compilation", "web research", "synthesis",
    "strategy session journal", "living capital thesis development",
    "attractor state historical backtesting", "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
}
|
|
|
|
|
|
def extract_authors(source_field: str) -> list[str]:
    """Extract author names from a claim's ``source:`` field.

    Returns a list with at most one entry: a canonical handle when the
    source matches MANUAL_AUTHOR_MAP, otherwise a raw lowercase candidate
    name that the caller resolves against the entity map. Agent/internal
    sources (SKIP_SOURCES prefixes) and unparseable fields yield [].
    """
    if not source_field:
        return []

    source = str(source_field).strip().strip('"').strip("'").lower()

    # Skip agent/internal sources — synthesis, not external attribution.
    for skip in SKIP_SOURCES:
        if source.startswith(skip):
            return []

    # A whole-field match against the manual map wins outright.
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]

    # Otherwise take the leading author name — everything before a comma,
    # parenthesis, "et al", "based on", "analysis", or a 4-digit year:
    #   "Bostrom, Superintelligence (2014)" → "bostrom"
    #   "Conitzer et al., 2024"             → "conitzer"
    match = re.match(
        r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})',
        source,
    )
    if not match:
        return []

    candidate = match.group(1).strip()
    if candidate in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[candidate]]
    if candidate in SKIP_SOURCES:
        return []
    if 2 < len(candidate) < 50:
        # Unknown name — return it raw; main() resolves it against the
        # entity map and tallies it as unmatched otherwise.
        # (A dead "analysis by <agent>" regex check that did nothing on
        # either branch was removed here.)
        return [candidate]
    return []
|
|
|
|
|
|
def main():
    """Parse claim sources, tally per-author credits, and update the DB.

    Builds the entity map, walks all claim directories, extracts and
    resolves source authors, prints matched/unmatched tallies, then
    credits sourcer_count/claims_merged in the contributors table.
    With --dry-run, reports only and leaves the DB untouched.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Build entity map; entity files plus the manual map form the full
    # resolution table (entity files win on key collisions).
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}

    # Walk all claim files and tally credited authors.
    claim_dirs = ["domains", "core", "foundations", "decisions"]
    author_counts = Counter()
    unmatched = Counter()

    for d in claim_dirs:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue  # underscore files are indexes/templates, not claims
            try:
                text = md_file.read_text(errors="replace")
                if not text.startswith("---"):
                    continue
                end = text.find("\n---", 3)
                if end == -1:
                    continue
                fm = yaml.safe_load(text[3:end])
                if not fm or not fm.get("source"):
                    continue

                for author in extract_authors(fm["source"]):
                    # Resolve raw candidates through the combined map.
                    canonical = full_map.get(author, author)
                    if canonical in full_map.values() or canonical in full_map:
                        # Known author; resolve once more in case the manual
                        # map chains to an entity handle.
                        final = full_map.get(canonical, canonical)
                        author_counts[final] += 1
                    else:
                        unmatched[author] += 1

            except Exception:
                # Best-effort: one malformed claim file must not abort the run.
                pass

    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f" {count}x: {author}")

    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f" {count}x: {author}")

    if args.dry_run:
        print("\nDry run — no DB changes")
        return

    # Update contributors table.
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        updated = 0
        created = 0
        for handle, count in author_counts.items():
            existing = conn.execute(
                "SELECT handle, sourcer_count FROM contributors WHERE handle=?",
                (handle,)).fetchone()
            if existing:
                new_count = (existing["sourcer_count"] or 0) + count
                conn.execute(
                    "UPDATE contributors SET sourcer_count=?, claims_merged=claims_merged+? WHERE handle=?",
                    (new_count, count, handle))
                updated += 1
            else:
                conn.execute("""INSERT INTO contributors
                    (handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier)
                    VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                             (handle, count, count))
                created += 1

        conn.commit()
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")

        # Show results. Role weights for the contribution index; only the
        # sourcer/extractor roles apply to this backfill.
        weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35,
                   "synthesizer": 0.25, "reviewer": 0.20}
        print("\n=== Top contributors after source-author backfill ===")
        for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count, claims_merged
                                 FROM contributors ORDER BY claims_merged DESC LIMIT 15""").fetchall():
            # Compute CI from the weights dict (previously the dict was
            # defined but the 0.15/0.05 factors were duplicated inline).
            ci = ((r["sourcer_count"] or 0) * weights["sourcer"]
                  + (r["extractor_count"] or 0) * weights["extractor"])
            p = f" -> {r['principal']}" if r['principal'] else ""
            print(f" {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")
    finally:
        # Previously the connection was never closed (leaked on exceptions).
        conn.close()
|
|
|
|
|
|
# Script entry point — run the backfill only when executed directly.
if __name__ == "__main__":
    main()
|