Parses source: frontmatter across 616 claims, matches against entity files + manual author map, credits sourcer_count. 33 authors matched, 8 new contributor entries created. Bostrom (9), Shapiro (8), Hanson (6), Conitzer (7) etc. now visible on the leaderboard as sourcers. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
271 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
# ONE-SHOT BACKFILL — do not cron. Credits source authors as sourcers.
|
|
"""Backfill sourcer attribution from claim source: fields.
|
|
|
|
Parses every claim's source: frontmatter, matches against entity files
|
|
and known author patterns, credits sourcer_count in contributors table.
|
|
|
|
Usage:
|
|
python3 backfill-source-authors.py [--dry-run]
|
|
|
|
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# SQLite pipeline database holding the contributors table that gets credited.
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"

# Root of the markdown repo containing entities/ and the claim directories.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
|
# Entity name → canonical handle mapping (built from entities/ files)
def _build_entity_map() -> dict[str, str]:
    """Build a lowercase name → canonical handle map from entity files.

    Scans every ``entities/**/*.md`` file under REPO_DIR, parses its YAML
    frontmatter, and maps the entity's name, filename stem, aliases, and
    handles (leading ``@`` stripped) to the canonical handle, which is the
    filename stem.

    Files without frontmatter, with an unterminated frontmatter block, or
    with unparseable YAML are skipped silently — this is a best-effort
    backfill, not a validator.
    """
    entity_map: dict[str, str] = {}
    entities_dir = REPO_DIR / "entities"
    for md_file in entities_dir.rglob("*.md"):
        try:
            text = md_file.read_text(errors="replace")
            if not text.startswith("---"):
                continue  # no frontmatter block at all
            end = text.find("\n---", 3)
            if end == -1:
                continue  # unterminated frontmatter
            fm = yaml.safe_load(text[3:end])
            if not fm:
                continue
            handle = md_file.stem  # filename without .md is the canonical handle
            # str() guards against YAML scalars that parse as non-strings
            # (numbers, dates, null): previously those crashed .lower() and
            # the broad except silently dropped the whole entity.
            name = str(fm.get("name") or handle)
            entity_map[name.lower()] = handle
            entity_map[handle.lower()] = handle
            # Aliases and social handles also resolve to the canonical handle.
            for alias in (fm.get("aliases", []) or []):
                entity_map[str(alias).lower()] = handle
            for h in (fm.get("handles", []) or []):
                entity_map[str(h).lower().lstrip("@")] = handle
        except Exception:
            # Best-effort: one malformed entity file must not abort the run.
            pass
    return entity_map
|
|
|
|
|
|
# Known author patterns that don't have entity files.
# Keys are matched against the lowercased source: field (whole-field or
# leading-name match in extract_authors); values are canonical handles
# credited in the contributors table. Multiple spellings of the same
# author intentionally map to one handle.
MANUAL_AUTHOR_MAP = {
    "bostrom": "bostrom",
    "nick bostrom": "bostrom",
    "hanson": "hanson",
    "robin hanson": "hanson",
    "doug shapiro": "doug-shapiro",
    "shapiro": "doug-shapiro",
    "matthew ball shapiro": "doug-shapiro",
    "heavey": "heavey",
    "noah smith": "noah-smith",
    "noahpinion": "noah-smith",
    "bak": "bak",
    "per bak": "bak",
    "ostrom": "ostrom",
    "elinor ostrom": "ostrom",
    "coase": "coase",
    "ronald coase": "coase",
    "hayek": "hayek",
    "f.a. hayek": "hayek",
    "friston": "friston",
    "karl friston": "friston",
    "dario amodei": "dario-amodei",
    "amodei": "dario-amodei",
    "karpathy": "karpathy",
    "andrej karpathy": "karpathy",
    "metaproph3t": "proph3t",
    "proph3t": "proph3t",
    "nallok": "nallok",
    "metanallok": "nallok",
    "ben hawkins": "ben-hawkins",
    "aquino-michaels": "aquino-michaels",
    "conitzer": "conitzer",
    "conitzer et al.": "conitzer",
    "ramstead": "ramstead",
    "maxwell ramstead": "ramstead",
    "christensen": "clayton-christensen",
    "clayton christensen": "clayton-christensen",
    "blackmore": "blackmore",
    "susan blackmore": "blackmore",
    "leopold aschenbrenner": "leopold-aschenbrenner",
    "aschenbrenner": "leopold-aschenbrenner",
    # Organizations / research shops cited as sources.
    "bessemer venture partners": "bessemer-venture-partners",
    "kaiser family foundation": "kaiser-family-foundation",
    "theia research": "theia-research",
    "alea research": "alea-research",
    "architectural investing": "architectural-investing",
    # Both the common misspelling and the correct "Kauffman" spelling map
    # to the same handle.
    "kaufmann": "kaufmann",
    "stuart kaufmann": "kaufmann",
    "stuart kauffman": "kaufmann",
    "knuth": "knuth",
    "donald knuth": "knuth",
    "ward whitt": "ward-whitt",
    "centola": "centola",
    "damon centola": "centola",
    "hidalgo": "hidalgo",
    "cesar hidalgo": "hidalgo",
    "juarrero": "juarrero",
    "alicia juarrero": "juarrero",
    "larsson": "larsson",
    "pine analytics": "pine-analytics",
    "pineanalytics": "pine-analytics",
    "@01resolved": "01resolved",
    "01resolved": "01resolved",
    "drew": "01resolved",
    "galaxy research": "galaxy-research",
    "fortune": "fortune",
}
|
|
|
|
# Skip these — they're agent synthesis, not external sources.
# Matched as lowercase prefixes of the source: field in extract_authors,
# so e.g. "rio, based on Solomon DAO" is skipped via the "rio" entry.
SKIP_SOURCES = {
    "rio", "leo", "clay", "theseus", "vida", "astra",
    "web research compilation", "web research", "synthesis",
    "strategy session journal", "living capital thesis development",
    "attractor state historical backtesting", "teleohumanity manifesto",
    "governance - meritocratic voting + futarchy",
}
|
|
|
|
|
|
def extract_authors(source_field: str) -> list[str]:
    """Extract author names from a claim's ``source:`` field.

    Returns a list with at most one entry: a canonical handle when the
    source matches MANUAL_AUTHOR_MAP, otherwise a raw lowercase candidate
    name that the caller resolves against the entity map. Agent/internal
    sources (SKIP_SOURCES prefixes) and unparseable fields yield [].
    """
    if not source_field:
        return []

    source = str(source_field).strip().strip('"').strip("'").lower()

    # Skip agent/internal sources — synthesis, not external attribution.
    for skip in SKIP_SOURCES:
        if source.startswith(skip):
            return []

    # A whole-field match against the manual map wins outright.
    if source in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[source]]

    # Otherwise take the leading author name — everything before a comma,
    # parenthesis, "et al", "based on", "analysis", or a 4-digit year:
    #   "Bostrom, Superintelligence (2014)" → "bostrom"
    #   "Conitzer et al., 2024"             → "conitzer"
    match = re.match(
        r'^([^,(]+?)(?:\s*,|\s*\(|\s+et al|\s+based on|\s+analysis|\s+\d{4})',
        source,
    )
    if not match:
        return []

    candidate = match.group(1).strip()
    if candidate in MANUAL_AUTHOR_MAP:
        return [MANUAL_AUTHOR_MAP[candidate]]
    if candidate in SKIP_SOURCES:
        return []
    if 2 < len(candidate) < 50:
        # Unknown name — return it raw; main() resolves it against the
        # entity map and tallies it as unmatched otherwise.
        # (A dead "analysis by <agent>" regex check that did nothing on
        # either branch was removed here.)
        return [candidate]
    return []
|
|
|
|
|
|
def main():
    """Parse claim sources, tally per-author credits, and update the DB.

    Builds the entity map, walks all claim directories, extracts and
    resolves source authors, prints matched/unmatched tallies, then
    credits sourcer_count/claims_merged in the contributors table.
    With --dry-run, reports only and leaves the DB untouched.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Build entity map; entity files plus the manual map form the full
    # resolution table (entity files win on key collisions).
    entity_map = _build_entity_map()
    print(f"Entity map: {len(entity_map)} entries")
    full_map = {**MANUAL_AUTHOR_MAP, **entity_map}

    # Walk all claim files and tally credited authors.
    claim_dirs = ["domains", "core", "foundations", "decisions"]
    author_counts = Counter()
    unmatched = Counter()

    for d in claim_dirs:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue  # underscore files are indexes/templates, not claims
            try:
                text = md_file.read_text(errors="replace")
                if not text.startswith("---"):
                    continue
                end = text.find("\n---", 3)
                if end == -1:
                    continue
                fm = yaml.safe_load(text[3:end])
                if not fm or not fm.get("source"):
                    continue

                for author in extract_authors(fm["source"]):
                    # Resolve raw candidates through the combined map.
                    canonical = full_map.get(author, author)
                    if canonical in full_map.values() or canonical in full_map:
                        # Known author; resolve once more in case the manual
                        # map chains to an entity handle.
                        final = full_map.get(canonical, canonical)
                        author_counts[final] += 1
                    else:
                        unmatched[author] += 1

            except Exception:
                # Best-effort: one malformed claim file must not abort the run.
                pass

    print(f"\n=== Matched authors ({len(author_counts)}) ===")
    for author, count in author_counts.most_common(25):
        print(f" {count}x: {author}")

    print(f"\n=== Unmatched ({len(unmatched)}) ===")
    for author, count in unmatched.most_common(15):
        print(f" {count}x: {author}")

    if args.dry_run:
        print("\nDry run — no DB changes")
        return

    # Update contributors table.
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        updated = 0
        created = 0
        for handle, count in author_counts.items():
            existing = conn.execute(
                "SELECT handle, sourcer_count FROM contributors WHERE handle=?",
                (handle,)).fetchone()
            if existing:
                new_count = (existing["sourcer_count"] or 0) + count
                conn.execute(
                    "UPDATE contributors SET sourcer_count=?, claims_merged=claims_merged+? WHERE handle=?",
                    (new_count, count, handle))
                updated += 1
            else:
                conn.execute("""INSERT INTO contributors
                    (handle, sourcer_count, claims_merged, first_contribution, last_contribution, tier)
                    VALUES (?, ?, ?, date('now'), date('now'), 'contributor')""",
                             (handle, count, count))
                created += 1

        conn.commit()
        print(f"\nDB updated: {updated} existing contributors updated, {created} new contributors created")

        # Show results. Role weights for the contribution index; only the
        # sourcer/extractor roles apply to this backfill.
        weights = {"sourcer": 0.15, "extractor": 0.05, "challenger": 0.35,
                   "synthesizer": 0.25, "reviewer": 0.20}
        print("\n=== Top contributors after source-author backfill ===")
        for r in conn.execute("""SELECT handle, principal, sourcer_count, extractor_count, claims_merged
                                 FROM contributors ORDER BY claims_merged DESC LIMIT 15""").fetchall():
            # Compute CI from the weights dict (previously the dict was
            # defined but the 0.15/0.05 factors were duplicated inline).
            ci = ((r["sourcer_count"] or 0) * weights["sourcer"]
                  + (r["extractor_count"] or 0) * weights["extractor"])
            p = f" -> {r['principal']}" if r['principal'] else ""
            print(f" {r['handle']}{p}: claims={r['claims_merged']}, src={r['sourcer_count']}, CI={round(ci, 2)}")
    finally:
        # Previously the connection was never closed (leaked on exceptions).
        conn.close()
|
|
|
|
|
|
# Script entry point — run the backfill only when executed directly.
if __name__ == "__main__":
    main()
|