#!/usr/bin/env python3
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
"""Backfill sourcer attribution from claim source: fields.

Parses every claim's source: frontmatter, matches against entity files
and known author patterns, credits sourcer_count in contributors table.

Usage:
    python3 backfill-source-authors.py [--dry-run]

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""

import argparse
import os
import re
import sqlite3
import sys
from collections import Counter
from pathlib import Path
from typing import Optional

try:
    import yaml
except ImportError:
    # Deferred failure: the pure parsing helpers and --help stay usable without
    # PyYAML; _read_frontmatter raises a clear error as soon as a file is parsed.
    yaml = None

DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")

# Top-level repo directories that are walked for claim files.
CLAIM_DIRS = ("domains", "core", "foundations", "decisions")


def _warn(message: str) -> None:
    """Print a non-fatal diagnostic to stderr."""
    print(f"warning: {message}", file=sys.stderr)


def _frontmatter_block(text: str) -> Optional[str]:
    """Return the raw YAML between the leading '---' and the next '\\n---', or None."""
    if not text.startswith("---"):
        return None
    end = text.find("\n---", 3)
    if end == -1:
        return None
    return text[3:end]


def _read_frontmatter(md_file: Path) -> Optional[dict]:
    """Parse the YAML frontmatter of *md_file*.

    Returns the frontmatter mapping, or None when the file is unreadable, has
    no frontmatter, fails to parse, or the frontmatter is empty / not a mapping.
    Per-file problems are reported on stderr and skipped (best-effort walk).

    Raises:
        RuntimeError: if PyYAML is not installed.
    """
    if yaml is None:
        raise RuntimeError("PyYAML is required to parse frontmatter (pip install pyyaml)")
    try:
        text = md_file.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        _warn(f"cannot read {md_file}: {exc}")
        return None
    raw = _frontmatter_block(text)
    if raw is None:
        return None
    try:
        fm = yaml.safe_load(raw)
    except yaml.YAMLError as exc:
        _warn(f"invalid frontmatter in {md_file}: {exc}")
        return None
    if not fm or not isinstance(fm, dict):
        return None
    return fm


def _as_list(value) -> list:
    """Normalise a frontmatter value to a list, dropping None entries.

    A scalar (e.g. ``aliases: Foo``) becomes a one-element list so that it is
    not iterated character by character.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return [item for item in value if item is not None]
    return [value]


# Entity name → canonical handle mapping (built from entities/ files)
def _build_entity_map() -> dict[str, str]:
    """Build lowercase name → handle map from entity files.

    For every ``*.md`` file under ``REPO_DIR / "entities"`` with parseable
    frontmatter, the file stem is the handle; the ``name``, the handle itself,
    every ``aliases`` entry and every ``handles`` entry (leading '@' stripped)
    are registered as lowercase keys pointing at that handle.
    """
    entity_map: dict[str, str] = {}
    entities_dir = REPO_DIR / "entities"
    for md_file in entities_dir.rglob("*.md"):
        fm = _read_frontmatter(md_file)
        if fm is None:
            continue
        handle = md_file.stem  # filename without .md
        name = fm.get("name") or handle
        entity_map[str(name).lower()] = handle
        entity_map[handle.lower()] = handle
        for alias in _as_list(fm.get("aliases")):
            entity_map[str(alias).lower()] = handle
        for h in _as_list(fm.get("handles")):
            entity_map[str(h).lower().lstrip("@")] = handle
    return entity_map


# Known author patterns that don't have entity files
MANUAL_AUTHOR_MAP = {
    "bostrom": "bostrom",
    "nick bostrom": "bostrom",
    "hanson": "hanson",
    "robin hanson": "hanson",
    "doug shapiro": "doug-shapiro",
    "shapiro": "doug-shapiro",
    "matthew ball shapiro": "doug-shapiro",
    "heavey": "heavey",
    "noah smith": "noah-smith",
    "noahpinion": "noah-smith",
    "bak": "bak",
    "per bak": "bak",
    "ostrom": "ostrom",
    "elinor ostrom": "ostrom",
    "coase": "coase",
    "ronald coase": "coase",
    "hayek": "hayek",
    "f.a. hayek": "hayek",
    "friston": "friston",
    "karl friston": "friston",
    "dario amodei": "dario-amodei",
    "amodei": "dario-amodei",
    "karpathy": "karpathy",
    "andrej karpathy": "karpathy",
    "metaproph3t": "proph3t",
    "proph3t": "proph3t",
    "nallok": "nallok",
    "metanallok": "nallok",
    "ben hawkins": "ben-hawkins",
    "aquino-michaels": "aquino-michaels",
    "conitzer": "conitzer",
    "conitzer et al.": "conitzer",
    "ramstead": "ramstead",
    "maxwell ramstead": "ramstead",
    "christensen": "clayton-christensen",
    "clayton christensen": "clayton-christensen",
    "blackmore": "blackmore",
    "susan blackmore": "blackmore",
    "leopold aschenbrenner": "leopold-aschenbrenner",
    "aschenbrenner": "leopold-aschenbrenner",
    "bessemer venture partners": "bessemer-venture-partners",
    "kaiser family foundation": "kaiser-family-foundation",
    "theia research": "theia-research",
    "alea research": "alea-research",
    "architectural investing": "architectural-investing",
    "kaufmann": "kaufmann",
    "stuart kaufmann": "kaufmann",
    "stuart kauffman": "kaufmann",
    "knuth": "knuth",
    "donald knuth": "knuth",
    "ward whitt": "ward-whitt",
    "centola": "centola",
    "damon centola": "centola",
    "hidalgo": "hidalgo",
    "cesar hidalgo": "hidalgo",
    "juarrero": "juarrero",
    "alicia juarrero": "juarrero",
    "larsson": "larsson",
    "pine analytics": "pine-analytics",
    "pineanalytics": "pine-analytics",
    "@01resolved": "01resolved",
    "01resolved": "01resolved",
    "drew": "01resolved",
    "galaxy research": "galaxy-research",
    "fortune": "fortune",
}

# Skip these — they're agent synthesis, not external sources
SKIP_SOURCES = {
    "rio",
    "leo",
    "clay",
    "theseus",
    "vida",
    "astra",
    "web research compilation",
    "web research",
    "synthesis",
    "strategy session journal",
    "living capital thesis development",
    "attractor state historical backtesting",
    "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
}

# First author = everything before a comma, parenthesis, or connecting words:
#   "Bostrom, Superintelligence (2014)" → "bostrom"
#   "Conitzer et al., 2024"             → "conitzer"
_FIRST_AUTHOR_RE = re.compile(
    r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})'
)


def _is_internal_source(source: str) -> bool:
    """Return True when *source* begins with a SKIP_SOURCES entry as a whole word.

    The prefix must be followed by end-of-string or a non-alphanumeric
    character, so "rio, based on ..." is internal while "leopold aschenbrenner"
    (prefix "leo") and "clayton christensen" (prefix "clay") are not.
    """
    for skip in SKIP_SOURCES:
        if source.startswith(skip):
            following = source[len(skip):len(skip) + 1]
            if not following.isalnum():  # "" (exact match) is not alnum either
                return True
    return False


def extract_authors(source_field: object) -> list[str]:
    """Extract author names from a source: field. Returns canonical handles.

    Args:
        source_field: The frontmatter ``source`` value. Usually a string; a
            list/tuple of sources is processed item by item and the results
            are de-duplicated in first-seen order. Other values are
            stringified.

    Returns:
        An empty list for empty or agent/internal sources; otherwise at most
        one entry per source string: the MANUAL_AUTHOR_MAP handle when the
        whole source or its first-author prefix is known, else the lowercase
        first-author candidate (3-49 chars) for the caller to resolve against
        the entity map.
    """
    if not source_field:
        return []

    if isinstance(source_field, (list, tuple)):
        merged: list[str] = []
        for item in source_field:
            for author in extract_authors(item):
                if author not in merged:
                    merged.append(author)
        return merged

    source = str(source_field).strip().strip('"').strip("'").lower()

    # Skip agent/internal sources, e.g. "rio, based on Solomon DAO"
    if _is_internal_source(source):
        return []

    # Try direct match first
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]

    match = _FIRST_AUTHOR_RE.match(source)
    if not match:
        return []

    candidate = match.group(1).strip()
    if candidate in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[candidate]]
    if candidate in SKIP_SOURCES:
        return []
    if 2 < len(candidate) < 50:
        # Unresolved here; the caller matches it against the entity map.
        return [candidate]
    return []


def _tally_claim_authors(full_map: dict[str, str]) -> tuple[Counter, Counter]:
    """Walk all claim files and count credited vs. unmatched authors.

    Returns ``(author_counts, unmatched)``: the first is keyed by canonical
    handle, the second by the raw candidate that could not be resolved
    through *full_map*.
    """
    known_handles = set(full_map.values())  # hoisted: O(1) membership per author
    author_counts: Counter = Counter()
    unmatched: Counter = Counter()

    for d in CLAIM_DIRS:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            fm = _read_frontmatter(md_file)
            if not fm or not fm.get("source"):
                continue
            for author in extract_authors(fm["source"]):
                # Resolve through full map
                canonical = full_map.get(author, author)
                if canonical in known_handles or canonical in full_map:
                    # Known author
                    final = full_map.get(canonical, canonical)
                    author_counts[final] += 1
                else:
                    unmatched[author] += 1

    return author_counts, unmatched


def _apply_author_counts(conn: sqlite3.Connection, author_counts: Counter) -> tuple[int, int]:
    """Add *author_counts* to the contributors table; return (updated, created).

    Expects ``conn.row_factory`` to be ``sqlite3.Row``. Does not commit.
    """
    updated = 0
    created = 0
    for handle, count in author_counts.items():
        existing = conn.execute(
            "SELECT handle, sourcer_count FROM contributors WHERE handle=?",
            (handle,),
        ).fetchone()
        if existing:
            new_count = (existing["sourcer_count"] or 0) + count
            # COALESCE: NULL + n is NULL in SQL, which would drop the credit.
            conn.execute(
                "UPDATE contributors SET sourcer_count=?, "
                "claims_merged=COALESCE(claims_merged, 0)+? WHERE handle=?",
                (new_count, count, handle),
            )
            updated += 1
        else:
            conn.execute(
                """INSERT INTO contributors (handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier) VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                (handle, count, count),
            )
            created += 1
    return updated, created


def _print_top_contributors(conn: sqlite3.Connection) -> None:
    """Print the 15 contributors with the most merged claims and a partial CI score."""
    # Only the sourcer and extractor counts are selected below, so only those
    # two weights contribute to the CI figure printed here.
    weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35, "synthesizer": 0.25, "reviewer": 0.20}
    print("\n=== Top contributors after source-author backfill ===")
    rows = conn.execute(
        """SELECT handle, principal, sourcer_count, extractor_count, claims_merged FROM contributors ORDER BY claims_merged DESC LIMIT 15"""
    ).fetchall()
    for r in rows:
        ci = (r["sourcer_count"] or 0) * weights["sourcer"] + (r["extractor_count"] or 0) * weights["extractor"]
        p = f" -> {r['principal']}" if r['principal'] else ""
        print(f" {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")


def main():
    """Run the backfill: tally claim authors, report, and (unless --dry-run) update the DB."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Build entity map
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")

    # Merge with manual map (entity files win on key collisions)
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}

    # Walk all claims
    author_counts, unmatched = _tally_claim_authors(full_map)

    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f" {count}x: {author}")

    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f" {count}x: {author}")

    if args.dry_run:
        print("\nDry run — no DB changes")
        return

    # Update contributors table. Nothing is committed unless every row
    # succeeds, and the connection is always closed.
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        updated, created = _apply_author_counts(conn, author_counts)
        conn.commit()
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")

        # Show results
        _print_top_contributors(conn)
    finally:
        conn.close()


if __name__ == "__main__":
    main()