Some checks are pending
CI / lint-and-test (push) Waiting to run
Move scattered root-level files into categorized directories: - deploy/ — deployment + mirror scripts (Ship) - scripts/ — one-off backfills + migrations (Ship) - research/ — nightly research + prompts (Ship) - docs/ — all operational documentation (shared) Delete 3 dead cron scripts replaced by pipeline daemon: - batch-extract-50.sh, evaluate-trigger.sh, extract-cron.sh Add CODEOWNERS mapping every path to its owning agent. Add README with directory structure, ownership table, and VPS layout. Update deploy.sh paths to match new structure. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
315 lines
12 KiB
Python
315 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Bootstrap contributors table from git history + claim files.
|
|
|
|
One-time script. Idempotent (safe to re-run — upserts, doesn't duplicate).
|
|
Walks:
|
|
1. Git log on main — Pentagon-Agent trailers → extractor credit
|
|
2. Claim files in domains/ — source field → sourcer credit (best-effort)
|
|
3. PR review comments (if available) → reviewer credit
|
|
|
|
Run as teleo user on VPS:
|
|
cd /opt/teleo-eval/workspaces/main
|
|
python3 /opt/teleo-eval/pipeline/bootstrap-contributors.py
|
|
|
|
Epimetheus owns this script. Run once after initial deploy, then
|
|
post-merge callback handles ongoing attribution.
|
|
"""
|
|
|
|
import glob
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import subprocess
|
|
import sys
|
|
from datetime import date, datetime
|
|
from pathlib import Path
|
|
|
|
# Add pipeline lib/ to path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from lib.attribution import parse_attribution, VALID_ROLES
|
|
from lib.post_extract import parse_frontmatter
|
|
|
|
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
|
REPO_DIR = os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")
|
|
|
|
# Known agent handles — these are real contributors
|
|
AGENT_HANDLES = {"leo", "rio", "clay", "theseus", "vida", "astra", "ganymede", "epimetheus", "rhea"}
|
|
|
|
# m3taversal directed all agent research — credit as sourcer on agent-extracted claims
|
|
DIRECTOR_HANDLE = "m3taversal"
|
|
|
|
# Patterns that indicate a source slug, not a real contributor handle
|
|
_SLUG_SUFFIXES = {
|
|
"-thesis", "-analysis", "-development", "-compilation", "-journal",
|
|
"-manifesto", "-report", "-backtesting", "-plan", "-investing",
|
|
"-research", "-overview", "-session", "-strategy",
|
|
}
|
|
|
|
_SLUG_PATTERNS = [
|
|
re.compile(r".*\(.*\)"), # parentheses: "conitzer-et-al.-(2024)"
|
|
re.compile(r".*[&+].*"), # special chars
|
|
re.compile(r".*---.*"), # triple hyphen
|
|
re.compile(r".*\d{4}$"), # ends in year: "knuth-2026"
|
|
re.compile(r".*\d{4}-\d{2}.*"), # dates in handle
|
|
re.compile(r".*et-al\.?$"), # academic citations: "chakraborty-et-al."
|
|
re.compile(r".*-dao$"), # DAO names as handles: "areal-dao"
|
|
re.compile(r".*case-study$"), # "boardy-ai-case-study"
|
|
re.compile(r"^multiple-sources"), # "multiple-sources-(pymnts"
|
|
re.compile(r".*-for-humanity$"), # "grand-strategy-for-humanity"
|
|
]
|
|
|
|
# Known real people/orgs that might look like slugs but aren't
|
|
# Known real people and organizations — verified manually
|
|
_REAL_HANDLES = {
|
|
# People
|
|
"doug-shapiro", "noah-smith", "dario-amodei", "ward-whitt",
|
|
"clayton-christensen", "heavey", "bostrom", "hanson", "karpathy",
|
|
"metaproph3t", "metanallok", "mmdhrumil", "simonw", "swyx",
|
|
"ceterispar1bus", "oxranga", "tamim-ansary", "dan-slimmon",
|
|
"hayek", "blackmore", "ostrom", "kaufmann", "ramstead", "hidalgo",
|
|
"bak", "coase", "wiener", "juarrero", "centola", "larsson",
|
|
"corless", "vlahakis", "van-leeuwaarden", "spizzirri", "adams",
|
|
"marshall-mcluhan",
|
|
# Organizations
|
|
"bessemer-venture-partners", "kaiser-family-foundation",
|
|
"alea-research", "galaxy-research", "theiaresearch", "numerai",
|
|
"tubefilter", "anthropic", "fortune", "dagster",
|
|
}
|
|
|
|
|
|
def _is_valid_handle(handle: str) -> bool:
|
|
"""Check if a handle represents a real person/agent, not a source slug.
|
|
|
|
Inverted logic from _is_source_slug — WHITELIST approach.
|
|
Only accept: known agents, known real handles, and handles that look like
|
|
real X handles or human names (short, no special chars, few hyphens).
|
|
(Ganymede: tighten parser, stop extracting from free-text source fields)
|
|
"""
|
|
if handle in AGENT_HANDLES:
|
|
return True
|
|
if handle in _REAL_HANDLES:
|
|
return True
|
|
# Reject obvious garbage
|
|
if len(handle) > 30:
|
|
return False
|
|
if len(handle) < 2:
|
|
return False
|
|
# Reject anything with parentheses, ampersands, periods, numbers-only suffixes
|
|
if re.search(r"[()&+|]", handle):
|
|
return False
|
|
if re.search(r"\.\d", handle): # "et-al.-(2024)"
|
|
return False
|
|
if re.search(r"\d{4}$", handle): # ends in year
|
|
return False
|
|
# Reject content descriptor suffixes
|
|
for suffix in _SLUG_SUFFIXES:
|
|
if handle.endswith(suffix):
|
|
return False
|
|
# Reject 4+ hyphenated segments (source titles, not names)
|
|
if handle.count("-") >= 3:
|
|
return False
|
|
# Reject known non-person patterns
|
|
if re.search(r"et-al|case-study|multiple-sources|proposal-on|strategy-for", handle):
|
|
return False
|
|
# Reject handles containing content-type words
|
|
if re.search(r"proposal|token-structure|conversation$|launchpad$|capital$|^some-|^living-|/", handle):
|
|
return False
|
|
# Reject academic citation patterns "name-YYYY-journal"
|
|
if re.search(r"-\d{4}-", handle):
|
|
return False
|
|
return True
|
|
|
|
|
|
def get_connection():
    """Open the pipeline SQLite database, configured for concurrent access.

    Enables WAL journaling and a 10s busy timeout so the script can share
    the database with the running pipeline daemon. Rows come back as
    sqlite3.Row for name-based column access.
    """
    connection = sqlite3.connect(DB_PATH, timeout=30)
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA busy_timeout=10000"):
        connection.execute(pragma)
    return connection
|
|
|
|
|
|
def upsert_contributor(conn, handle, role, contribution_date=None):
    """Upsert a contributor, incrementing the role count.

    Args:
        conn: open sqlite3 connection to the pipeline database.
        handle: contributor handle; normalized (stripped, lowercased,
            leading "@" removed) before any filtering.
        role: one of VALID_ROLES; anything else is silently ignored.
        contribution_date: ISO date string; defaults to today.

    Silently drops empty handles, sentinel values, and anything rejected
    by _is_valid_handle. Does not commit — caller owns the transaction.
    """
    if not handle:
        return

    # BUGFIX: normalize BEFORE the sentinel filter. The original checked
    # ("unknown", "none", "null") against the raw handle, so "Unknown",
    # "@none", or " NULL " slipped past and got inserted.
    handle = handle.strip().lower().lstrip("@")
    if handle in ("unknown", "none", "null"):
        return
    if len(handle) < 2:
        return

    # Only accept valid handles — whitelist approach (Ganymede review)
    if not _is_valid_handle(handle):
        return

    # Validate the role before interpolating it into column names below —
    # this is what keeps the f-string SQL safe.
    if role not in VALID_ROLES:
        return
    role_col = f"{role}_count"

    today = contribution_date or date.today().isoformat()

    existing = conn.execute("SELECT handle FROM contributors WHERE handle = ?", (handle,)).fetchone()
    if existing:
        # claims_merged only counts extraction/sourcing work, not reviews etc.
        conn.execute(
            f"""UPDATE contributors SET
                {role_col} = {role_col} + 1,
                claims_merged = claims_merged + CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END,
                last_contribution = MAX(last_contribution, ?),
                updated_at = datetime('now')
                WHERE handle = ?""",
            (role, today, handle),
        )
    else:
        conn.execute(
            f"""INSERT INTO contributors (handle, first_contribution, last_contribution, {role_col}, claims_merged)
                VALUES (?, ?, ?, 1, CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END)""",
            (handle, today, today, role),
        )
|
|
|
|
|
|
def bootstrap_from_git_log(conn):
    """Walk git log for Pentagon-Agent trailers → extractor credit.

    Returns the number of extractor credits recorded (0 on git failure).
    """
    print("Phase 1: Walking git log for Pentagon-Agent trailers...")

    # BUGFIX: terminate each commit record with NUL (%x00) and split on it.
    # The original split the output on "\n\n", so any commit body containing
    # a blank paragraph was broken into multiple pseudo-records — trailers
    # after the first paragraph landed in records whose first line had no
    # "hash|date" prefix and were dropped entirely.
    result = subprocess.run(
        ["git", "log", "--format=%H|%aI|%b%N%x00", "main"],
        cwd=REPO_DIR, capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f" ERROR: git log failed: {result.stderr[:200]}")
        return 0

    count = 0
    for record in result.stdout.split("\0"):
        record = record.strip()
        if not record:
            continue
        lines = record.split("\n")

        # First line of each record carries "hash|ISO-date|<start of body>".
        parts = lines[0].split("|", 2)
        if len(parts) < 2:
            continue
        commit_date = parts[1][:10]  # YYYY-MM-DD from the ISO-8601 author date

        # Search all lines of the body for Pentagon-Agent trailers.
        for line in lines:
            match = re.search(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", line)
            if match:
                agent_name = match.group(1).lower()
                upsert_contributor(conn, agent_name, "extractor", commit_date)
                count += 1

    print(f" Found {count} extractor credits from git trailers")
    return count
|
|
|
|
|
|
def bootstrap_from_claim_files(conn):
    """Walk claim files for source field → sourcer credit.

    Scans markdown files under domains/, core/ and foundations/ for YAML
    frontmatter of type "claim" or "framework", then credits contributors
    from structured attribution blocks (preferred) or, failing that, from a
    single @handle found in the free-text source field. Also credits
    DIRECTOR_HANDLE as sourcer whenever an agent extracted the claim.
    Returns the number of credits recorded.
    """
    print("Phase 2: Walking claim files for sourcer attribution...")

    count = 0
    for pattern in ["domains/**/*.md", "core/**/*.md", "foundations/**/*.md"]:
        for filepath in glob.glob(os.path.join(REPO_DIR, pattern), recursive=True):
            basename = os.path.basename(filepath)
            # Underscore-prefixed files are skipped (presumably templates/
            # indexes — convention not visible here; confirm).
            if basename.startswith("_"):
                continue

            # Best-effort read: unreadable files are skipped, not fatal.
            try:
                content = Path(filepath).read_text()
            except Exception:
                continue

            fm, _ = parse_frontmatter(content)
            if fm is None or fm.get("type") not in ("claim", "framework"):
                continue

            # Normalize the frontmatter "created" field: YAML may parse it as
            # a date object or leave it a string; anything else → None (today).
            created = fm.get("created")
            if isinstance(created, date):
                created = created.isoformat()
            elif isinstance(created, str):
                pass  # already string
            else:
                created = None

            # Try structured attribution first — one credit per (role, handle).
            attribution = parse_attribution(fm)
            for role, entries in attribution.items():
                for entry in entries:
                    if entry.get("handle"):
                        upsert_contributor(conn, entry["handle"], role, created)
                        count += 1

            # Only extract handles from structured attribution blocks, NOT from
            # free-text source: fields. Source fields produce garbage handles like
            # "nejm-flow-trial-(n=3" (Ganymede review — Priority 2 fix).
            # Exception: @ handles are reliable even in free text.
            if not any(attribution[r] for r in VALID_ROLES):
                source = fm.get("source", "")
                if isinstance(source, str):
                    handle_match = re.search(r"@(\w+)", source)
                    if handle_match:
                        upsert_contributor(conn, handle_match.group(1), "sourcer", created)
                        count += 1

            # Credit m3taversal as sourcer/director on all agent-extracted claims.
            # m3taversal directed every research mission that produced these claims.
            # Check if any agent is the extractor — if so, m3taversal is the director.
            has_agent_extractor = any(
                entry.get("handle") in AGENT_HANDLES
                for entry in attribution.get("extractor", [])
            )
            if not has_agent_extractor:
                # Also check git trailer pattern — if source mentions an agent name.
                # Substring match, so an agent name embedded in any word counts.
                raw_source = fm.get("source", "") or ""
                source_lower = (raw_source if isinstance(raw_source, str) else str(raw_source)).lower()
                has_agent_extractor = any(a in source_lower for a in AGENT_HANDLES)

            if has_agent_extractor:
                upsert_contributor(conn, DIRECTOR_HANDLE, "sourcer", created)
                count += 1

    print(f" Found {count} attribution credits from claim files")
    return count
|
|
|
|
|
|
def main():
    """Run both bootstrap phases against the pipeline DB and print a summary."""
    print(f"Bootstrap contributors from {REPO_DIR}")
    print(f"Database: {DB_PATH}")

    conn = get_connection()

    # Snapshot the contributor count before bootstrapping for the summary.
    before_count = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    print(f"Current contributors: {before_count}")

    # Phase 1 (git trailers) then phase 2 (claim files); single commit at the end.
    credits = bootstrap_from_git_log(conn)
    credits += bootstrap_from_claim_files(conn)

    conn.commit()

    # Summary: recount and show the leaderboard.
    after_count = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
    leaderboard = conn.execute(
        """SELECT handle, claims_merged, sourcer_count, extractor_count,
        challenger_count, synthesizer_count, reviewer_count
        FROM contributors ORDER BY claims_merged DESC LIMIT 10"""
    ).fetchall()

    bar = "=" * 60
    print(f"\n{bar}")
    print(" BOOTSTRAP COMPLETE")
    print(f" Credits processed: {credits}")
    print(f" Contributors before: {before_count}")
    print(f" Contributors after: {after_count}")
    print("\n Top 10 by claims_merged:")
    for row in leaderboard:
        roles = f"S:{row['sourcer_count']} E:{row['extractor_count']} C:{row['challenger_count']} Y:{row['synthesizer_count']} R:{row['reviewer_count']}"
        print(f" {row['handle']:20s} merged:{row['claims_merged']:>4d} {roles}")
    print(bar)

    conn.close()
|
|
|
|
|
|
# Script entry point — run directly on the VPS (see module docstring).
if __name__ == "__main__":
    main()
|