teleo-infrastructure/scripts/bootstrap-contributors.py
m3taversal d2aec7fee3
Some checks are pending
CI / lint-and-test (push) Waiting to run
feat: reorganize repo with clear directory boundaries and agent ownership
Move scattered root-level files into categorized directories:
- deploy/ — deployment + mirror scripts (Ship)
- scripts/ — one-off backfills + migrations (Ship)
- research/ — nightly research + prompts (Ship)
- docs/ — all operational documentation (shared)

Delete 3 dead cron scripts replaced by pipeline daemon:
- batch-extract-50.sh, evaluate-trigger.sh, extract-cron.sh

Add CODEOWNERS mapping every path to its owning agent.
Add README with directory structure, ownership table, and VPS layout.
Update deploy.sh paths to match new structure.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 18:20:13 +01:00

315 lines
12 KiB
Python

#!/usr/bin/env python3
"""Bootstrap contributors table from git history + claim files.
One-time script. Idempotent (safe to re-run — upserts, doesn't duplicate).
Walks:
1. Git log on main — Pentagon-Agent trailers → extractor credit
2. Claim files in domains/ — source field → sourcer credit (best-effort)
3. PR review comments (if available) → reviewer credit
Run as teleo user on VPS:
cd /opt/teleo-eval/workspaces/main
python3 /opt/teleo-eval/pipeline/bootstrap-contributors.py
Epimetheus owns this script. Run once after initial deploy, then
post-merge callback handles ongoing attribution.
"""
import glob
import os
import re
import sqlite3
import subprocess
import sys
from datetime import date, datetime
from pathlib import Path
# Add pipeline lib/ to path
sys.path.insert(0, str(Path(__file__).parent))
from lib.attribution import parse_attribution, VALID_ROLES
from lib.post_extract import parse_frontmatter
# Pipeline SQLite database; override with PIPELINE_DB for local runs/testing.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
# Git checkout whose history and claim files are walked; override with REPO_DIR.
REPO_DIR = os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")
# Known agent handles — these are real contributors
AGENT_HANDLES = {"leo", "rio", "clay", "theseus", "vida", "astra", "ganymede", "epimetheus", "rhea"}
# m3taversal directed all agent research — credit as sourcer on agent-extracted claims
DIRECTOR_HANDLE = "m3taversal"
# Patterns that indicate a source slug, not a real contributor handle
_SLUG_SUFFIXES = {
"-thesis", "-analysis", "-development", "-compilation", "-journal",
"-manifesto", "-report", "-backtesting", "-plan", "-investing",
"-research", "-overview", "-session", "-strategy",
}
_SLUG_PATTERNS = [
re.compile(r".*\(.*\)"), # parentheses: "conitzer-et-al.-(2024)"
re.compile(r".*[&+].*"), # special chars
re.compile(r".*---.*"), # triple hyphen
re.compile(r".*\d{4}$"), # ends in year: "knuth-2026"
re.compile(r".*\d{4}-\d{2}.*"), # dates in handle
re.compile(r".*et-al\.?$"), # academic citations: "chakraborty-et-al."
re.compile(r".*-dao$"), # DAO names as handles: "areal-dao"
re.compile(r".*case-study$"), # "boardy-ai-case-study"
re.compile(r"^multiple-sources"), # "multiple-sources-(pymnts"
re.compile(r".*-for-humanity$"), # "grand-strategy-for-humanity"
]
# Known real people/orgs that might look like slugs but aren't
# Known real people and organizations — verified manually
_REAL_HANDLES = {
# People
"doug-shapiro", "noah-smith", "dario-amodei", "ward-whitt",
"clayton-christensen", "heavey", "bostrom", "hanson", "karpathy",
"metaproph3t", "metanallok", "mmdhrumil", "simonw", "swyx",
"ceterispar1bus", "oxranga", "tamim-ansary", "dan-slimmon",
"hayek", "blackmore", "ostrom", "kaufmann", "ramstead", "hidalgo",
"bak", "coase", "wiener", "juarrero", "centola", "larsson",
"corless", "vlahakis", "van-leeuwaarden", "spizzirri", "adams",
"marshall-mcluhan",
# Organizations
"bessemer-venture-partners", "kaiser-family-foundation",
"alea-research", "galaxy-research", "theiaresearch", "numerai",
"tubefilter", "anthropic", "fortune", "dagster",
}
def _is_valid_handle(handle: str) -> bool:
"""Check if a handle represents a real person/agent, not a source slug.
Inverted logic from _is_source_slug — WHITELIST approach.
Only accept: known agents, known real handles, and handles that look like
real X handles or human names (short, no special chars, few hyphens).
(Ganymede: tighten parser, stop extracting from free-text source fields)
"""
if handle in AGENT_HANDLES:
return True
if handle in _REAL_HANDLES:
return True
# Reject obvious garbage
if len(handle) > 30:
return False
if len(handle) < 2:
return False
# Reject anything with parentheses, ampersands, periods, numbers-only suffixes
if re.search(r"[()&+|]", handle):
return False
if re.search(r"\.\d", handle): # "et-al.-(2024)"
return False
if re.search(r"\d{4}$", handle): # ends in year
return False
# Reject content descriptor suffixes
for suffix in _SLUG_SUFFIXES:
if handle.endswith(suffix):
return False
# Reject 4+ hyphenated segments (source titles, not names)
if handle.count("-") >= 3:
return False
# Reject known non-person patterns
if re.search(r"et-al|case-study|multiple-sources|proposal-on|strategy-for", handle):
return False
# Reject handles containing content-type words
if re.search(r"proposal|token-structure|conversation$|launchpad$|capital$|^some-|^living-|/", handle):
return False
# Reject academic citation patterns "name-YYYY-journal"
if re.search(r"-\d{4}-", handle):
return False
return True
def get_connection():
    """Open the pipeline database with name-based row access and WAL mode.

    Connection timeout is 30s; busy_timeout retries lock contention for 10s.
    """
    connection = sqlite3.connect(DB_PATH, timeout=30)
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA busy_timeout=10000"):
        connection.execute(pragma)
    return connection
def upsert_contributor(conn, handle, role, contribution_date=None):
    """Upsert a contributor, incrementing the role count.

    Args:
        conn: open sqlite3 connection with a `contributors` table.
        handle: raw handle; trimmed, lowercased, leading '@' stripped here.
        role: must be one of VALID_ROLES, otherwise the credit is dropped.
        contribution_date: ISO date (YYYY-MM-DD) for the credit; today if None.

    'extractor'/'sourcer' credits also increment claims_merged.
    """
    if not handle:
        return
    # Normalize BEFORE the placeholder filter so "Unknown" / "@None" are
    # rejected exactly like "unknown".  (Bug fix: the raw string used to be
    # checked first, letting differently-cased placeholders slip through —
    # _is_valid_handle accepts the lowercased word "unknown".)
    handle = handle.strip().lower().lstrip("@")
    if len(handle) < 2 or handle in ("unknown", "none", "null"):
        return
    # Only accept valid handles — whitelist approach (Ganymede review)
    if not _is_valid_handle(handle):
        return
    # role is interpolated into the SQL below; this membership check is what
    # keeps that interpolation safe.
    if role not in VALID_ROLES:
        return
    role_col = f"{role}_count"
    today = contribution_date or date.today().isoformat()
    existing = conn.execute("SELECT handle FROM contributors WHERE handle = ?", (handle,)).fetchone()
    if existing:
        conn.execute(
            f"""UPDATE contributors SET
            {role_col} = {role_col} + 1,
            claims_merged = claims_merged + CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END,
            last_contribution = MAX(last_contribution, ?),
            updated_at = datetime('now')
            WHERE handle = ?""",
            (role, today, handle),
        )
    else:
        conn.execute(
            f"""INSERT INTO contributors (handle, first_contribution, last_contribution, {role_col}, claims_merged)
            VALUES (?, ?, ?, 1, CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END)""",
            (handle, today, today, role),
        )
def bootstrap_from_git_log(conn):
    """Walk git log for Pentagon-Agent trailers → extractor credit.

    Returns the number of extractor credits recorded.

    Bug fix: the old format "%H|%aI|%b%N" split on "\\n\\n" fragmented any
    commit whose body contains a blank line — and git trailers conventionally
    sit after a blank line, so those fragments had no hash|date header and
    were silently skipped, dropping the credit.  A '|' inside body text could
    also corrupt the date field.  Using ASCII unit/record separators
    (%x1f between fields, %x1e between commits) makes parsing unambiguous.
    """
    print("Phase 1: Walking git log for Pentagon-Agent trailers...")
    result = subprocess.run(
        ["git", "log", "--format=%H%x1f%aI%x1f%b%N%x1e", "main"],
        cwd=REPO_DIR, capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f" ERROR: git log failed: {result.stderr[:200]}")
        return 0
    count = 0
    trailer_re = re.compile(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>")
    for record in result.stdout.split("\x1e"):
        parts = record.strip().split("\x1f", 2)
        if len(parts) < 3:
            continue  # trailing empty record after the last separator
        commit_date = parts[1][:10]  # ISO-8601 author date → YYYY-MM-DD
        # Scan the whole body (+notes) for trailers; a commit may carry several.
        for match in trailer_re.finditer(parts[2]):
            agent_name = match.group(1).lower()
            upsert_contributor(conn, agent_name, "extractor", commit_date)
            count += 1
    print(f" Found {count} extractor credits from git trailers")
    return count
def _normalize_date(value):
    """Coerce a frontmatter `created` value to an ISO date string, or None."""
    if isinstance(value, date):
        return value.isoformat()
    if isinstance(value, str):
        return value
    return None


def bootstrap_from_claim_files(conn):
    """Walk claim files for source field → sourcer credit."""
    print("Phase 2: Walking claim files for sourcer attribution...")
    count = 0
    for pattern in ("domains/**/*.md", "core/**/*.md", "foundations/**/*.md"):
        for filepath in glob.glob(os.path.join(REPO_DIR, pattern), recursive=True):
            # Underscore-prefixed files are templates/internals, not claims.
            if os.path.basename(filepath).startswith("_"):
                continue
            try:
                text = Path(filepath).read_text()
            except Exception:
                continue
            fm, _ = parse_frontmatter(text)
            if fm is None or fm.get("type") not in ("claim", "framework"):
                continue
            created = _normalize_date(fm.get("created"))
            # Structured attribution blocks are the trusted signal.
            attribution = parse_attribution(fm)
            for role, entries in attribution.items():
                for entry in entries:
                    contributor = entry.get("handle")
                    if contributor:
                        upsert_contributor(conn, contributor, role, created)
                        count += 1
            # Only extract handles from structured attribution blocks, NOT from
            # free-text source: fields. Source fields produce garbage handles like
            # "nejm-flow-trial-(n=3" (Ganymede review — Priority 2 fix).
            # Exception: @ handles are reliable even in free text.
            if not any(attribution[r] for r in VALID_ROLES):
                source = fm.get("source", "")
                if isinstance(source, str):
                    at_handle = re.search(r"@(\w+)", source)
                    if at_handle:
                        upsert_contributor(conn, at_handle.group(1), "sourcer", created)
                        count += 1
            # Credit m3taversal as sourcer/director on all agent-extracted claims:
            # m3taversal directed every research mission that produced them.
            agent_extracted = any(
                entry.get("handle") in AGENT_HANDLES
                for entry in attribution.get("extractor", [])
            )
            if not agent_extracted:
                # Fallback: the free-text source field mentioning an agent name.
                raw_source = fm.get("source", "") or ""
                if not isinstance(raw_source, str):
                    raw_source = str(raw_source)
                lowered = raw_source.lower()
                agent_extracted = any(name in lowered for name in AGENT_HANDLES)
            if agent_extracted:
                upsert_contributor(conn, DIRECTOR_HANDLE, "sourcer", created)
                count += 1
    print(f" Found {count} attribution credits from claim files")
    return count
def main():
    """Run both bootstrap phases, commit, and print a summary leaderboard."""
    print(f"Bootstrap contributors from {REPO_DIR}")
    print(f"Database: {DB_PATH}")
    conn = get_connection()
    # Snapshot the row count before running, for the before/after delta.
    before = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    print(f"Current contributors: {before}")
    total = bootstrap_from_git_log(conn)
    total += bootstrap_from_claim_files(conn)
    conn.commit()
    # Summary: final count plus the top contributors by merged claims.
    after = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    leaders = conn.execute(
        """SELECT handle, claims_merged, sourcer_count, extractor_count,
        challenger_count, synthesizer_count, reviewer_count
        FROM contributors ORDER BY claims_merged DESC LIMIT 10"""
    ).fetchall()
    ruler = "=" * 60
    print(f"\n{ruler}")
    print(" BOOTSTRAP COMPLETE")
    print(f" Credits processed: {total}")
    print(f" Contributors before: {before}")
    print(f" Contributors after: {after}")
    print("\n Top 10 by claims_merged:")
    for row in leaders:
        counts = (
            f"S:{row['sourcer_count']} E:{row['extractor_count']} "
            f"C:{row['challenger_count']} Y:{row['synthesizer_count']} R:{row['reviewer_count']}"
        )
        print(f" {row['handle']:20s} merged:{row['claims_merged']:>4d} {counts}")
    print(ruler)
    conn.close()


if __name__ == "__main__":
    main()