teleo-infrastructure/scripts/bootstrap-contributors.py
m3taversal d2aec7fee3
Some checks are pending
CI / lint-and-test (push) Waiting to run
feat: reorganize repo with clear directory boundaries and agent ownership
Move scattered root-level files into categorized directories:
- deploy/ — deployment + mirror scripts (Ship)
- scripts/ — one-off backfills + migrations (Ship)
- research/ — nightly research + prompts (Ship)
- docs/ — all operational documentation (shared)

Delete 3 dead cron scripts replaced by pipeline daemon:
- batch-extract-50.sh, evaluate-trigger.sh, extract-cron.sh

Add CODEOWNERS mapping every path to its owning agent.
Add README with directory structure, ownership table, and VPS layout.
Update deploy.sh paths to match new structure.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 18:20:13 +01:00

315 lines
12 KiB
Python

#!/usr/bin/env python3
"""Bootstrap contributors table from git history + claim files.
One-time script. Idempotent (safe to re-run — upserts, doesn't duplicate).
Walks:
1. Git log on main — Pentagon-Agent trailers → extractor credit
2. Claim files in domains/ — source field → sourcer credit (best-effort)
3. PR review comments (if available) → reviewer credit
Run as teleo user on VPS:
cd /opt/teleo-eval/workspaces/main
python3 /opt/teleo-eval/pipeline/bootstrap-contributors.py
Epimetheus owns this script. Run once after initial deploy, then
post-merge callback handles ongoing attribution.
"""
import glob
import os
import re
import sqlite3
import subprocess
import sys
from datetime import date, datetime
from pathlib import Path
# Add pipeline lib/ to path
sys.path.insert(0, str(Path(__file__).parent))
from lib.attribution import parse_attribution, VALID_ROLES
from lib.post_extract import parse_frontmatter
# Pipeline SQLite database; override with PIPELINE_DB for local runs/testing.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
# Git checkout whose history and claim files are walked; override with REPO_DIR.
REPO_DIR = os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")
# Known agent handles — these are real contributors
AGENT_HANDLES = {"leo", "rio", "clay", "theseus", "vida", "astra", "ganymede", "epimetheus", "rhea"}
# m3taversal directed all agent research — credit as sourcer on agent-extracted claims
DIRECTOR_HANDLE = "m3taversal"
# Patterns that indicate a source slug, not a real contributor handle
_SLUG_SUFFIXES = {
"-thesis", "-analysis", "-development", "-compilation", "-journal",
"-manifesto", "-report", "-backtesting", "-plan", "-investing",
"-research", "-overview", "-session", "-strategy",
}
_SLUG_PATTERNS = [
re.compile(r".*\(.*\)"), # parentheses: "conitzer-et-al.-(2024)"
re.compile(r".*[&+].*"), # special chars
re.compile(r".*---.*"), # triple hyphen
re.compile(r".*\d{4}$"), # ends in year: "knuth-2026"
re.compile(r".*\d{4}-\d{2}.*"), # dates in handle
re.compile(r".*et-al\.?$"), # academic citations: "chakraborty-et-al."
re.compile(r".*-dao$"), # DAO names as handles: "areal-dao"
re.compile(r".*case-study$"), # "boardy-ai-case-study"
re.compile(r"^multiple-sources"), # "multiple-sources-(pymnts"
re.compile(r".*-for-humanity$"), # "grand-strategy-for-humanity"
]
# Known real people/orgs that might look like slugs but aren't
# Known real people and organizations — verified manually
_REAL_HANDLES = {
# People
"doug-shapiro", "noah-smith", "dario-amodei", "ward-whitt",
"clayton-christensen", "heavey", "bostrom", "hanson", "karpathy",
"metaproph3t", "metanallok", "mmdhrumil", "simonw", "swyx",
"ceterispar1bus", "oxranga", "tamim-ansary", "dan-slimmon",
"hayek", "blackmore", "ostrom", "kaufmann", "ramstead", "hidalgo",
"bak", "coase", "wiener", "juarrero", "centola", "larsson",
"corless", "vlahakis", "van-leeuwaarden", "spizzirri", "adams",
"marshall-mcluhan",
# Organizations
"bessemer-venture-partners", "kaiser-family-foundation",
"alea-research", "galaxy-research", "theiaresearch", "numerai",
"tubefilter", "anthropic", "fortune", "dagster",
}
def _is_valid_handle(handle: str) -> bool:
"""Check if a handle represents a real person/agent, not a source slug.
Inverted logic from _is_source_slug — WHITELIST approach.
Only accept: known agents, known real handles, and handles that look like
real X handles or human names (short, no special chars, few hyphens).
(Ganymede: tighten parser, stop extracting from free-text source fields)
"""
if handle in AGENT_HANDLES:
return True
if handle in _REAL_HANDLES:
return True
# Reject obvious garbage
if len(handle) > 30:
return False
if len(handle) < 2:
return False
# Reject anything with parentheses, ampersands, periods, numbers-only suffixes
if re.search(r"[()&+|]", handle):
return False
if re.search(r"\.\d", handle): # "et-al.-(2024)"
return False
if re.search(r"\d{4}$", handle): # ends in year
return False
# Reject content descriptor suffixes
for suffix in _SLUG_SUFFIXES:
if handle.endswith(suffix):
return False
# Reject 4+ hyphenated segments (source titles, not names)
if handle.count("-") >= 3:
return False
# Reject known non-person patterns
if re.search(r"et-al|case-study|multiple-sources|proposal-on|strategy-for", handle):
return False
# Reject handles containing content-type words
if re.search(r"proposal|token-structure|conversation$|launchpad$|capital$|^some-|^living-|/", handle):
return False
# Reject academic citation patterns "name-YYYY-journal"
if re.search(r"-\d{4}-", handle):
return False
return True
def get_connection():
    """Open the pipeline database with name-based row access and WAL mode.

    Connection timeout is 30s; busy_timeout retries lock contention for 10s.
    """
    connection = sqlite3.connect(DB_PATH, timeout=30)
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA busy_timeout=10000"):
        connection.execute(pragma)
    return connection
def upsert_contributor(conn, handle, role, contribution_date=None):
    """Upsert a contributor, incrementing the role count.

    Args:
        conn: open sqlite3 connection with a `contributors` table.
        handle: raw handle; trimmed, lowercased, leading '@' stripped here.
        role: must be one of VALID_ROLES, otherwise the credit is dropped.
        contribution_date: ISO date (YYYY-MM-DD) for the credit; today if None.

    'extractor'/'sourcer' credits also increment claims_merged.
    """
    if not handle:
        return
    # Normalize BEFORE the placeholder filter so "Unknown" / "@None" are
    # rejected exactly like "unknown".  (Bug fix: the raw string used to be
    # checked first, letting differently-cased placeholders slip through —
    # _is_valid_handle accepts the lowercased word "unknown".)
    handle = handle.strip().lower().lstrip("@")
    if len(handle) < 2 or handle in ("unknown", "none", "null"):
        return
    # Only accept valid handles — whitelist approach (Ganymede review)
    if not _is_valid_handle(handle):
        return
    # role is interpolated into the SQL below; this membership check is what
    # keeps that interpolation safe.
    if role not in VALID_ROLES:
        return
    role_col = f"{role}_count"
    today = contribution_date or date.today().isoformat()
    existing = conn.execute("SELECT handle FROM contributors WHERE handle = ?", (handle,)).fetchone()
    if existing:
        conn.execute(
            f"""UPDATE contributors SET
            {role_col} = {role_col} + 1,
            claims_merged = claims_merged + CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END,
            last_contribution = MAX(last_contribution, ?),
            updated_at = datetime('now')
            WHERE handle = ?""",
            (role, today, handle),
        )
    else:
        conn.execute(
            f"""INSERT INTO contributors (handle, first_contribution, last_contribution, {role_col}, claims_merged)
            VALUES (?, ?, ?, 1, CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END)""",
            (handle, today, today, role),
        )
def bootstrap_from_git_log(conn):
    """Walk git log for Pentagon-Agent trailers → extractor credit.

    Returns the number of extractor credits recorded.

    Bug fix: the old format "%H|%aI|%b%N" split on "\\n\\n" fragmented any
    commit whose body contains a blank line — and git trailers conventionally
    sit after a blank line, so those fragments had no hash|date header and
    were silently skipped, dropping the credit.  A '|' inside body text could
    also corrupt the date field.  Using ASCII unit/record separators
    (%x1f between fields, %x1e between commits) makes parsing unambiguous.
    """
    print("Phase 1: Walking git log for Pentagon-Agent trailers...")
    result = subprocess.run(
        ["git", "log", "--format=%H%x1f%aI%x1f%b%N%x1e", "main"],
        cwd=REPO_DIR, capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f" ERROR: git log failed: {result.stderr[:200]}")
        return 0
    count = 0
    trailer_re = re.compile(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>")
    for record in result.stdout.split("\x1e"):
        parts = record.strip().split("\x1f", 2)
        if len(parts) < 3:
            continue  # trailing empty record after the last separator
        commit_date = parts[1][:10]  # ISO-8601 author date → YYYY-MM-DD
        # Scan the whole body (+notes) for trailers; a commit may carry several.
        for match in trailer_re.finditer(parts[2]):
            agent_name = match.group(1).lower()
            upsert_contributor(conn, agent_name, "extractor", commit_date)
            count += 1
    print(f" Found {count} extractor credits from git trailers")
    return count
def _normalize_date(value):
    """Coerce a frontmatter `created` value to an ISO date string, or None."""
    if isinstance(value, date):
        return value.isoformat()
    if isinstance(value, str):
        return value
    return None


def bootstrap_from_claim_files(conn):
    """Walk claim files for source field → sourcer credit."""
    print("Phase 2: Walking claim files for sourcer attribution...")
    count = 0
    for pattern in ("domains/**/*.md", "core/**/*.md", "foundations/**/*.md"):
        for filepath in glob.glob(os.path.join(REPO_DIR, pattern), recursive=True):
            # Underscore-prefixed files are templates/internals, not claims.
            if os.path.basename(filepath).startswith("_"):
                continue
            try:
                text = Path(filepath).read_text()
            except Exception:
                continue
            fm, _ = parse_frontmatter(text)
            if fm is None or fm.get("type") not in ("claim", "framework"):
                continue
            created = _normalize_date(fm.get("created"))
            # Structured attribution blocks are the trusted signal.
            attribution = parse_attribution(fm)
            for role, entries in attribution.items():
                for entry in entries:
                    contributor = entry.get("handle")
                    if contributor:
                        upsert_contributor(conn, contributor, role, created)
                        count += 1
            # Only extract handles from structured attribution blocks, NOT from
            # free-text source: fields. Source fields produce garbage handles like
            # "nejm-flow-trial-(n=3" (Ganymede review — Priority 2 fix).
            # Exception: @ handles are reliable even in free text.
            if not any(attribution[r] for r in VALID_ROLES):
                source = fm.get("source", "")
                if isinstance(source, str):
                    at_handle = re.search(r"@(\w+)", source)
                    if at_handle:
                        upsert_contributor(conn, at_handle.group(1), "sourcer", created)
                        count += 1
            # Credit m3taversal as sourcer/director on all agent-extracted claims:
            # m3taversal directed every research mission that produced them.
            agent_extracted = any(
                entry.get("handle") in AGENT_HANDLES
                for entry in attribution.get("extractor", [])
            )
            if not agent_extracted:
                # Fallback: the free-text source field mentioning an agent name.
                raw_source = fm.get("source", "") or ""
                if not isinstance(raw_source, str):
                    raw_source = str(raw_source)
                lowered = raw_source.lower()
                agent_extracted = any(name in lowered for name in AGENT_HANDLES)
            if agent_extracted:
                upsert_contributor(conn, DIRECTOR_HANDLE, "sourcer", created)
                count += 1
    print(f" Found {count} attribution credits from claim files")
    return count
def main():
    """Run both bootstrap phases, commit, and print a summary leaderboard."""
    print(f"Bootstrap contributors from {REPO_DIR}")
    print(f"Database: {DB_PATH}")
    conn = get_connection()
    # Snapshot the row count before running, for the before/after delta.
    before = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    print(f"Current contributors: {before}")
    total = bootstrap_from_git_log(conn)
    total += bootstrap_from_claim_files(conn)
    conn.commit()
    # Summary: final count plus the top contributors by merged claims.
    after = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    leaders = conn.execute(
        """SELECT handle, claims_merged, sourcer_count, extractor_count,
        challenger_count, synthesizer_count, reviewer_count
        FROM contributors ORDER BY claims_merged DESC LIMIT 10"""
    ).fetchall()
    ruler = "=" * 60
    print(f"\n{ruler}")
    print(" BOOTSTRAP COMPLETE")
    print(f" Credits processed: {total}")
    print(f" Contributors before: {before}")
    print(f" Contributors after: {after}")
    print("\n Top 10 by claims_merged:")
    for row in leaders:
        counts = (
            f"S:{row['sourcer_count']} E:{row['extractor_count']} "
            f"C:{row['challenger_count']} Y:{row['synthesizer_count']} R:{row['reviewer_count']}"
        )
        print(f" {row['handle']:20s} merged:{row['claims_merged']:>4d} {counts}")
    print(ruler)
    conn.close()


if __name__ == "__main__":
    main()