#!/usr/bin/env python3
"""Bootstrap contributors table from git history + claim files.

One-time script. Idempotent (safe to re-run — upserts, doesn't duplicate).

Walks:
1. Git log on main — Pentagon-Agent trailers → extractor credit
2. Claim files in domains/ — source field → sourcer credit (best-effort)
3. PR review comments (if available) → reviewer credit

Run as teleo user on VPS:
    cd /opt/teleo-eval/workspaces/main
    python3 /opt/teleo-eval/pipeline/bootstrap-contributors.py

Epimetheus owns this script. Run once after initial deploy, then post-merge
callback handles ongoing attribution.
"""

import glob
import os
import re
import sqlite3
import subprocess
import sys
from datetime import date
from pathlib import Path

# Add pipeline lib/ to path so lib.* imports resolve when run as a script.
sys.path.insert(0, str(Path(__file__).parent))

from lib.attribution import parse_attribution, VALID_ROLES
from lib.post_extract import parse_frontmatter

DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
REPO_DIR = os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")

# Known agent handles — these are real contributors
AGENT_HANDLES = {
    "leo", "rio", "clay", "theseus", "vida", "astra",
    "ganymede", "epimetheus", "rhea",
}

# m3taversal directed all agent research — credit as sourcer on agent-extracted claims
DIRECTOR_HANDLE = "m3taversal"

# Suffixes that indicate a content descriptor (source title), not a person.
_SLUG_SUFFIXES = {
    "-thesis", "-analysis", "-development", "-compilation", "-journal",
    "-manifesto", "-report", "-backtesting", "-plan", "-investing",
    "-research", "-overview", "-session", "-strategy",
}

# Patterns that indicate a source slug, not a real contributor handle.
# Checked in _is_valid_handle() after the whitelists, so known-real names
# are never rejected by these.
_SLUG_PATTERNS = [
    re.compile(r".*\(.*\)"),         # parentheses: "conitzer-et-al.-(2024)"
    re.compile(r".*[&+].*"),         # special chars
    re.compile(r".*---.*"),          # triple hyphen
    re.compile(r".*\d{4}$"),         # ends in year: "knuth-2026"
    re.compile(r".*\d{4}-\d{2}.*"),  # dates in handle
    re.compile(r".*et-al\.?$"),      # academic citations: "chakraborty-et-al."
    re.compile(r".*-dao$"),          # DAO names as handles: "areal-dao"
    re.compile(r".*case-study$"),    # "boardy-ai-case-study"
    re.compile(r"^multiple-sources"),  # "multiple-sources-(pymnts"
    re.compile(r".*-for-humanity$"),   # "grand-strategy-for-humanity"
]

# Known real people and organizations — verified manually.
# These might look like slugs but aren't; always accepted.
_REAL_HANDLES = {
    # People
    "doug-shapiro", "noah-smith", "dario-amodei", "ward-whitt",
    "clayton-christensen", "heavey", "bostrom", "hanson", "karpathy",
    "metaproph3t", "metanallok", "mmdhrumil", "simonw", "swyx",
    "ceterispar1bus", "oxranga", "tamim-ansary", "dan-slimmon", "hayek",
    "blackmore", "ostrom", "kaufmann", "ramstead", "hidalgo", "bak",
    "coase", "wiener", "juarrero", "centola", "larsson", "corless",
    "vlahakis", "van-leeuwaarden", "spizzirri", "adams", "marshall-mcluhan",
    # Organizations
    "bessemer-venture-partners", "kaiser-family-foundation", "alea-research",
    "galaxy-research", "theiaresearch", "numerai", "tubefilter", "anthropic",
    "fortune", "dagster",
}


def _is_valid_handle(handle: str) -> bool:
    """Check if a handle represents a real person/agent, not a source slug.

    WHITELIST approach: accept known agents, known real handles, and handles
    that look like real X handles or human names (short, no special chars,
    few hyphens). Everything else is rejected.

    (Ganymede: tighten parser, stop extracting from free-text source fields)
    """
    # Whitelists win unconditionally — checked before any rejection rule.
    if handle in AGENT_HANDLES:
        return True
    if handle in _REAL_HANDLES:
        return True
    # Reject obvious garbage by length.
    if len(handle) > 30:
        return False
    if len(handle) < 2:
        return False
    # Reject anything with parentheses, ampersands, pipes, plus signs.
    if re.search(r"[()&+|]", handle):
        return False
    if re.search(r"\.\d", handle):  # "et-al.-(2024)"
        return False
    if re.search(r"\d{4}$", handle):  # ends in year
        return False
    # Reject content descriptor suffixes ("-thesis", "-report", ...).
    for suffix in _SLUG_SUFFIXES:
        if handle.endswith(suffix):
            return False
    # Reject 4+ hyphenated segments (source titles, not names).
    if handle.count("-") >= 3:
        return False
    # Reject known non-person patterns.
    if re.search(r"et-al|case-study|multiple-sources|proposal-on|strategy-for", handle):
        return False
    # Reject handles containing content-type words.
    if re.search(r"proposal|token-structure|conversation$|launchpad$|capital$|^some-|^living-|/", handle):
        return False
    # Reject academic citation patterns "name-YYYY-journal".
    if re.search(r"-\d{4}-", handle):
        return False
    # Reject anything matching the compiled slug patterns (catches cases the
    # inline rules above miss, e.g. "-dao" suffixed DAO names).
    for pattern in _SLUG_PATTERNS:
        if pattern.match(handle):
            return False
    return True


def get_connection():
    """Open the pipeline SQLite DB with WAL mode and a busy timeout."""
    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA busy_timeout=10000")
    return conn


def upsert_contributor(conn, handle, role, contribution_date=None):
    """Upsert a contributor, incrementing the role count.

    Silently ignores invalid handles (slugs, placeholders) and unknown roles.
    ``contribution_date`` is an ISO date string; defaults to today.
    """
    if not handle or handle in ("unknown", "none", "null"):
        return
    handle = handle.strip().lower().lstrip("@")
    if len(handle) < 2:
        return
    # Only accept valid handles — whitelist approach (Ganymede review)
    if not _is_valid_handle(handle):
        return
    # role_col is interpolated into SQL below, so it MUST be validated
    # against the closed VALID_ROLES set first.
    role_col = f"{role}_count"
    if role not in VALID_ROLES:
        return
    today = contribution_date or date.today().isoformat()
    existing = conn.execute(
        "SELECT handle FROM contributors WHERE handle = ?", (handle,)
    ).fetchone()
    if existing:
        # ISO dates compare correctly as strings, so MAX() keeps the latest.
        conn.execute(
            f"""UPDATE contributors
                SET {role_col} = {role_col} + 1,
                    claims_merged = claims_merged +
                        CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END,
                    last_contribution = MAX(last_contribution, ?),
                    updated_at = datetime('now')
                WHERE handle = ?""",
            (role, today, handle),
        )
    else:
        conn.execute(
            f"""INSERT INTO contributors
                (handle, first_contribution, last_contribution, {role_col}, claims_merged)
                VALUES (?, ?, ?, 1,
                        CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END)""",
            (handle, today, today, role),
        )


def bootstrap_from_git_log(conn):
    """Walk git log for Pentagon-Agent trailers → extractor credit.

    Returns the number of credits recorded.
    """
    print("Phase 1: Walking git log for Pentagon-Agent trailers...")
    # Use an explicit record separator (ASCII 0x1e) between commits.
    # Splitting on blank lines would break commits whose bodies contain
    # blank lines, silently dropping their trailers.
    result = subprocess.run(
        ["git", "log", "--format=%H|%aI|%B%N%x1e", "main"],
        cwd=REPO_DIR,
        capture_output=True,
        text=True,
        timeout=30,
    )
    if result.returncode != 0:
        print(f"  ERROR: git log failed: {result.stderr[:200]}")
        return 0

    count = 0
    for block in result.stdout.split("\x1e"):
        lines = block.strip().split("\n")
        if not lines:
            continue
        # First line has commit hash and date
        first = lines[0]
        parts = first.split("|", 2)
        if len(parts) < 2:
            continue
        commit_date = parts[1][:10]  # YYYY-MM-DD
        # Search all lines for Pentagon-Agent trailer
        for line in lines:
            match = re.search(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", line)
            if match:
                agent_name = match.group(1).lower()
                upsert_contributor(conn, agent_name, "extractor", commit_date)
                count += 1
    print(f"  Found {count} extractor credits from git trailers")
    return count


def bootstrap_from_claim_files(conn):
    """Walk claim files for source field → sourcer credit.

    Returns the number of credits recorded.
    """
    print("Phase 2: Walking claim files for sourcer attribution...")
    count = 0
    for pattern in ["domains/**/*.md", "core/**/*.md", "foundations/**/*.md"]:
        for filepath in glob.glob(os.path.join(REPO_DIR, pattern), recursive=True):
            basename = os.path.basename(filepath)
            if basename.startswith("_"):
                continue
            try:
                content = Path(filepath).read_text(encoding="utf-8")
            except Exception:
                # Best-effort walk: unreadable files are skipped, not fatal.
                continue
            fm, _ = parse_frontmatter(content)
            if fm is None or fm.get("type") not in ("claim", "framework"):
                continue
            created = fm.get("created")
            if isinstance(created, date):
                created = created.isoformat()
            elif isinstance(created, str):
                pass  # already string
            else:
                created = None

            # Try structured attribution first
            attribution = parse_attribution(fm)
            for role, entries in attribution.items():
                for entry in entries:
                    if entry.get("handle"):
                        upsert_contributor(conn, entry["handle"], role, created)
                        count += 1

            # Only extract handles from structured attribution blocks, NOT from
            # free-text source: fields. Source fields produce garbage handles like
            # "nejm-flow-trial-(n=3" (Ganymede review — Priority 2 fix).
            # Exception: @ handles are reliable even in free text.
            if not any(attribution[r] for r in VALID_ROLES):
                source = fm.get("source", "")
                if isinstance(source, str):
                    handle_match = re.search(r"@(\w+)", source)
                    if handle_match:
                        upsert_contributor(conn, handle_match.group(1), "sourcer", created)
                        count += 1

            # Credit m3taversal as sourcer/director on all agent-extracted claims.
            # m3taversal directed every research mission that produced these claims.
            # Check if any agent is the extractor — if so, m3taversal is the director.
            has_agent_extractor = any(
                entry.get("handle") in AGENT_HANDLES
                for entry in attribution.get("extractor", [])
            )
            if not has_agent_extractor:
                # Also check git trailer pattern — if source mentions an agent name
                raw_source = fm.get("source", "") or ""
                source_lower = (
                    raw_source if isinstance(raw_source, str) else str(raw_source)
                ).lower()
                has_agent_extractor = any(a in source_lower for a in AGENT_HANDLES)
            if has_agent_extractor:
                upsert_contributor(conn, DIRECTOR_HANDLE, "sourcer", created)
                count += 1

    print(f"  Found {count} attribution credits from claim files")
    return count


def main():
    """Run both bootstrap phases against the pipeline DB and print a summary."""
    print(f"Bootstrap contributors from {REPO_DIR}")
    print(f"Database: {DB_PATH}")
    conn = get_connection()

    # Check current state
    existing = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    print(f"Current contributors: {existing}")

    total = 0
    total += bootstrap_from_git_log(conn)
    total += bootstrap_from_claim_files(conn)
    conn.commit()

    # Summary
    final = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    top = conn.execute(
        """SELECT handle, claims_merged, sourcer_count, extractor_count,
                  challenger_count, synthesizer_count, reviewer_count
           FROM contributors ORDER BY claims_merged DESC LIMIT 10"""
    ).fetchall()

    print(f"\n{'='*60}")
    print(f"  BOOTSTRAP COMPLETE")
    print(f"  Credits processed: {total}")
    print(f"  Contributors before: {existing}")
    print(f"  Contributors after: {final}")
    print(f"\n  Top 10 by claims_merged:")
    for row in top:
        roles = (
            f"S:{row['sourcer_count']} E:{row['extractor_count']} "
            f"C:{row['challenger_count']} Y:{row['synthesizer_count']} "
            f"R:{row['reviewer_count']}"
        )
        print(f"    {row['handle']:20s} merged:{row['claims_merged']:>4d} {roles}")
    print(f"{'='*60}")
    conn.close()


if __name__ == "__main__":
    main()