#!/usr/bin/env python3
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
"""Backfill sourcer attribution from claim source: fields.

Parses every claim's source: frontmatter, matches against entity files
and known author patterns, credits sourcer_count in contributors table.

Running this twice credits every author twice: the DB step adds to the
existing counters rather than recomputing them.

Usage:
    python3 backfill-source-authors.py [--dry-run]

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""

import argparse
import os
import re
import sqlite3
import sys
from collections import Counter
from contextlib import closing
from pathlib import Path
from typing import Optional

DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")

# Top-level directories (relative to REPO_DIR) that are scanned for claims.
CLAIM_DIRS = ("domains", "core", "foundations", "decisions")

# Per-role weights for the contribution index printed after the backfill.
# Only the sourcer and extractor terms are used by the final report.
ROLE_WEIGHTS = {
    "sourcer": 0.15,
    "extractor": 0.05,
    "challenger": 0.35,
    "synthesizer": 0.25,
    "reviewer": 0.20,
}


def _warn(message: str) -> None:
    """Print a non-fatal diagnostic to stderr."""
    print(f"warning: {message}", file=sys.stderr)


def _read_frontmatter(md_file: Path) -> Optional[dict]:
    """Return the YAML frontmatter mapping of *md_file*, or None.

    None is returned when the file has no ``---`` delimited frontmatter,
    when the frontmatter is empty or not a mapping, or when the file cannot
    be read or parsed (a warning is printed in that last case so that
    skipped files are visible instead of silently ignored).
    """
    # PyYAML is imported here so the pure helpers in this module remain
    # importable (and testable) on machines without it.
    import yaml

    try:
        text = md_file.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        _warn(f"cannot read {md_file}: {exc}")
        return None

    if not text.startswith("---"):
        return None
    end = text.find("\n---", 3)
    if end == -1:
        return None

    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError as exc:
        _warn(f"invalid frontmatter in {md_file}: {exc}")
        return None
    return fm if isinstance(fm, dict) and fm else None


def _as_str_list(value) -> list[str]:
    """Normalize a frontmatter value to a list of strings.

    A scalar becomes a one-element list; iterating a bare string would
    otherwise yield its individual characters.
    """
    if not value:
        return []
    if isinstance(value, (list, tuple, set)):
        return [str(item) for item in value if item is not None]
    return [str(value)]


# Entity name → canonical handle mapping (built from entities/ files)
def _build_entity_map(repo_dir: Path = REPO_DIR) -> dict[str, str]:
    """Build lowercase name → handle map from entity files.

    The handle is the entity file's stem. Keys are the lowercased ``name``,
    the lowercased handle, every entry of ``aliases`` and every entry of
    ``handles`` (with leading ``@`` removed).
    """
    entity_map: dict[str, str] = {}
    for md_file in (repo_dir / "entities").rglob("*.md"):
        fm = _read_frontmatter(md_file)
        if fm is None:
            continue
        handle = md_file.stem  # filename without .md
        name = fm.get("name") or handle
        entity_map[str(name).lower()] = handle
        entity_map[handle.lower()] = handle
        for alias in _as_str_list(fm.get("aliases")):
            entity_map[alias.lower()] = handle
        for social in _as_str_list(fm.get("handles")):
            entity_map[social.lower().lstrip("@")] = handle
    return entity_map


# Known author patterns that don't have entity files
MANUAL_AUTHOR_MAP = {
    "bostrom": "bostrom",
    "nick bostrom": "bostrom",
    "hanson": "hanson",
    "robin hanson": "hanson",
    "doug shapiro": "doug-shapiro",
    "shapiro": "doug-shapiro",
    "matthew ball shapiro": "doug-shapiro",
    "heavey": "heavey",
    "noah smith": "noah-smith",
    "noahpinion": "noah-smith",
    "bak": "bak",
    "per bak": "bak",
    "ostrom": "ostrom",
    "elinor ostrom": "ostrom",
    "coase": "coase",
    "ronald coase": "coase",
    "hayek": "hayek",
    "f.a. hayek": "hayek",
    "friston": "friston",
    "karl friston": "friston",
    "dario amodei": "dario-amodei",
    "amodei": "dario-amodei",
    "karpathy": "karpathy",
    "andrej karpathy": "karpathy",
    "metaproph3t": "proph3t",
    "proph3t": "proph3t",
    "nallok": "nallok",
    "metanallok": "nallok",
    "ben hawkins": "ben-hawkins",
    "aquino-michaels": "aquino-michaels",
    "conitzer": "conitzer",
    "conitzer et al.": "conitzer",
    "ramstead": "ramstead",
    "maxwell ramstead": "ramstead",
    "christensen": "clayton-christensen",
    "clayton christensen": "clayton-christensen",
    "blackmore": "blackmore",
    "susan blackmore": "blackmore",
    "leopold aschenbrenner": "leopold-aschenbrenner",
    "aschenbrenner": "leopold-aschenbrenner",
    "bessemer venture partners": "bessemer-venture-partners",
    "kaiser family foundation": "kaiser-family-foundation",
    "theia research": "theia-research",
    "alea research": "alea-research",
    "architectural investing": "architectural-investing",
    "kaufmann": "kaufmann",
    "stuart kaufmann": "kaufmann",
    "stuart kauffman": "kaufmann",
    "knuth": "knuth",
    "donald knuth": "knuth",
    "ward whitt": "ward-whitt",
    "centola": "centola",
    "damon centola": "centola",
    "hidalgo": "hidalgo",
    "cesar hidalgo": "hidalgo",
    "juarrero": "juarrero",
    "alicia juarrero": "juarrero",
    "larsson": "larsson",
    "pine analytics": "pine-analytics",
    "pineanalytics": "pine-analytics",
    "@01resolved": "01resolved",
    "01resolved": "01resolved",
    "drew": "01resolved",
    "galaxy research": "galaxy-research",
    "fortune": "fortune",
}

# Skip these — they're agent synthesis, not external sources
SKIP_SOURCES = {
    "rio", "leo", "clay", "theseus", "vida", "astra",
    "web research compilation", "web research", "synthesis",
    "strategy session journal", "living capital thesis development",
    "attractor state historical backtesting", "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
}

# A skip entry only counts when it is a whole leading word or phrase.
# Without the lookahead "leo" would swallow "leopold aschenbrenner" and
# "clay" would swallow "clayton christensen".
_SKIP_PREFIX_RE = re.compile(
    "(?:%s)(?!\\w)"
    % "|".join(re.escape(s) for s in sorted(SKIP_SOURCES, key=len, reverse=True))
)

# First author = text before a comma, parenthesis, or connecting words:
#   "Bostrom, Superintelligence (2014)" → "bostrom"
#   "Conitzer et al., 2024"             → "conitzer"
_FIRST_AUTHOR_RE = re.compile(
    r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})'
)


def extract_authors(source_field: "str | list[str] | None") -> list[str]:
    """Extract author names from a source: field.

    Args:
        source_field: the raw ``source:`` frontmatter value. A list or
            tuple is processed item by item and the results are merged
            without duplicates.

    Returns:
        A list with zero or one entry per source string: the canonical
        handle when the author is in MANUAL_AUTHOR_MAP, otherwise the
        lowercased candidate name (3 to 49 characters) for the caller to
        resolve against the entity map. Empty for missing sources, for
        internal/agent sources listed in SKIP_SOURCES, and when no author
        delimiter is found.
    """
    if not source_field:
        return []

    if isinstance(source_field, (list, tuple)):
        merged: list[str] = []
        for item in source_field:
            for author in extract_authors(item):
                if author not in merged:
                    merged.append(author)
        return merged

    source = str(source_field).strip().strip('"').strip("'").lower()

    # Skip agent/internal sources ("rio, based on Solomon DAO" → nothing).
    if _SKIP_PREFIX_RE.match(source):
        return []

    # The whole field may already be a known author.
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]

    match = _FIRST_AUTHOR_RE.match(source)
    if not match:
        return []

    candidate = match.group(1).strip()
    if candidate in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[candidate]]
    if candidate in SKIP_SOURCES:
        return []
    if 2 < len(candidate) < 50:
        # Unknown name: returned raw, resolved against the entity map later.
        return [candidate]
    return []


def _count_authors(
    full_map: dict[str, str],
    repo_dir: Path = REPO_DIR,
    claim_dirs=CLAIM_DIRS,
) -> tuple[Counter, Counter]:
    """Walk all claim files and tally source authors.

    Returns ``(matched, unmatched)``: *matched* counts claims per canonical
    handle, *unmatched* counts candidate names that are in neither map.
    Files whose name starts with ``_`` and files without a ``source``
    field are ignored.
    """
    known_handles = set(full_map.values())  # built once, O(1) membership
    author_counts: Counter = Counter()
    unmatched: Counter = Counter()

    for dirname in claim_dirs:
        base = repo_dir / dirname
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            fm = _read_frontmatter(md_file)
            if not fm or not fm.get("source"):
                continue
            for author in extract_authors(fm["source"]):
                # Resolve through full map
                canonical = full_map.get(author, author)
                if canonical in known_handles or canonical in full_map:
                    author_counts[full_map.get(canonical, canonical)] += 1
                else:
                    unmatched[author] += 1

    return author_counts, unmatched


def credit_sourcers(conn: sqlite3.Connection, author_counts) -> tuple[int, int]:
    """Add *author_counts* to the contributors table in one transaction.

    For an existing handle both ``sourcer_count`` and ``claims_merged`` are
    incremented by the count (NULL is treated as 0). A missing handle is
    inserted with tier ``contributor`` and today's date as first and last
    contribution.

    Returns:
        ``(updated, created)`` — number of handles updated and inserted.
    """
    updated = 0
    created = 0
    # "with conn" commits on success and rolls back if any statement fails,
    # so a crash cannot leave a half-applied backfill behind.
    with conn:
        for handle, count in author_counts.items():
            cursor = conn.execute(
                "UPDATE contributors"
                " SET sourcer_count=COALESCE(sourcer_count, 0)+?,"
                " claims_merged=COALESCE(claims_merged, 0)+?"
                " WHERE handle=?",
                (count, count, handle),
            )
            if cursor.rowcount:
                updated += 1
                continue
            conn.execute(
                """INSERT INTO contributors
                   (handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier)
                   VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                (handle, count, count),
            )
            created += 1
    return updated, created


def _print_top_contributors(conn: sqlite3.Connection) -> None:
    """Print the 15 contributors with the most merged claims.

    Requires ``conn.row_factory = sqlite3.Row`` (columns are read by name).
    """
    print("\n=== Top contributors after source-author backfill ===")
    rows = conn.execute(
        """SELECT handle, principal, sourcer_count, extractor_count, claims_merged
           FROM contributors ORDER BY claims_merged DESC LIMIT 15"""
    ).fetchall()
    for r in rows:
        ci = ((r["sourcer_count"] or 0) * ROLE_WEIGHTS["sourcer"]
              + (r["extractor_count"] or 0) * ROLE_WEIGHTS["extractor"])
        p = f" -> {r['principal']}" if r['principal'] else ""
        print(f" {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")


def main():
    """Tally source authors, print a report, and update the DB.

    With ``--dry-run`` only the report is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Build entity map
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")

    # Merge with manual map; entity files win on conflicting keys.
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}

    author_counts, unmatched = _count_authors(full_map)

    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f" {count}x: {author}")

    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f" {count}x: {author}")

    if args.dry_run:
        print("\nDry run — no DB changes")
        return

    # Update contributors table
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.row_factory = sqlite3.Row
        updated, created = credit_sourcers(conn, author_counts)
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")
        _print_top_contributors(conn)


if __name__ == "__main__":
    main()