feat: add /api/contributor-growth endpoint + cumulative growth script

Adds async git-log-based endpoint for cumulative contributor and claim
tracking. 5-minute cache, excludes bot accounts, tags founding contributors.
Standalone CLI script also included for ad-hoc data generation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-20 22:19:42 +01:00
parent f0cf772182
commit 9505e5b40a
2 changed files with 333 additions and 0 deletions

View file

@ -10,6 +10,7 @@ Endpoints:
Owner: Argus Owner: Argus
""" """
import asyncio
import json import json
import logging import logging
import os import os
@ -17,6 +18,7 @@ import sqlite3
import statistics import statistics
import time import time
import urllib.request import urllib.request
from collections import defaultdict
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@ -1182,6 +1184,113 @@ async def handle_telegram_extractions(request):
conn.close() conn.close()
# ─── GET /api/contributor-growth ─────────────────────────────────────────
# Path to the main codex worktree scanned by git log; overridable via the
# MAIN_WORKTREE env var for tests and alternate deployments.
CODEX_WORKTREE = Path(os.environ.get("MAIN_WORKTREE", "/opt/teleo-eval/workspaces/main"))
# Authors whose first commit date is on/before this ISO date (string compare
# works because dates are zero-padded YYYY-MM-DD) get the "founding" badge.
FOUNDING_CUTOFF = "2026-03-15"
# Bot/service git author names excluded from contributor stats.
CONTRIBUTOR_EXCLUDE = {"Teleo Agents", "Teleo Pipeline"}
# Module-level response cache for the expensive git-log scan.
# NOTE(review): assumes a single asyncio event loop, so no locking — confirm.
_growth_cache: dict | None = None
_growth_cache_ts: float = 0
# Cache TTL in seconds (5 minutes).
GROWTH_CACHE_TTL = 300
async def handle_contributor_growth(request):
    """Cumulative unique contributors and claims over time from git log.

    Returns time-series data for Chart.js line charts:
    - ``cumulative_contributors``: per-date cumulative unique author count,
      plus each date's newly-seen authors (flagged ``founding`` when their
      first commit is on/before FOUNDING_CUTOFF).
    - ``cumulative_claims``: per-date cumulative count of ``domains/*.md``
      files added (``--diff-filter=A``).

    Cached for GROWTH_CACHE_TTL seconds since git log is expensive.
    Returns 404 if the worktree is missing, 500 if either git log fails.
    """
    global _growth_cache, _growth_cache_ts
    now = time.monotonic()
    if _growth_cache is not None and (now - _growth_cache_ts) < GROWTH_CACHE_TTL:
        return web.json_response(_growth_cache)

    codex_path = str(CODEX_WORKTREE)
    if not CODEX_WORKTREE.exists():
        return web.json_response(
            {"error": "codex worktree not found", "path": codex_path}, status=404
        )

    # Pass 1: one "date|author" line per commit across all refs.
    proc = await asyncio.create_subprocess_exec(
        "git", "log", "--format=%ad|%an", "--date=format:%Y-%m-%d", "--all",
        cwd=codex_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        return web.json_response(
            {"error": "git log failed", "detail": stderr.decode()[:500]}, status=500
        )

    # Earliest commit date per author, excluding bot accounts.
    first_seen: dict[str, str] = {}
    for line in stdout.decode().strip().split("\n"):
        if "|" not in line:
            continue
        date, author = line.split("|", 1)
        if author in CONTRIBUTOR_EXCLUDE:
            continue
        if author not in first_seen or date < first_seen[author]:
            first_seen[author] = date

    # Group authors by debut date, then walk dates in order to build the
    # cumulative series.
    by_date: dict[str, list[str]] = defaultdict(list)
    for author, date in first_seen.items():
        by_date[date].append(author)
    contributors_timeline = []
    seen: set[str] = set()
    for date in sorted(by_date.keys()):
        new_authors = by_date[date]
        seen.update(new_authors)
        contributors_timeline.append({
            "date": date,
            "cumulative": len(seen),
            "new": [{"name": a, "founding": date <= FOUNDING_CUTOFF} for a in sorted(new_authors)],
        })

    # Pass 2: dates on which claim files (domains/*.md) were added.
    proc2 = await asyncio.create_subprocess_exec(
        "git", "log", "--format=%ad", "--date=format:%Y-%m-%d",
        "--all", "--diff-filter=A", "--", "domains/*.md",
        cwd=codex_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout2, stderr2 = await proc2.communicate()
    # BUGFIX: the second git log's exit status was previously ignored, so a
    # failure silently produced a "zero claims" response. Surface it instead.
    if proc2.returncode != 0:
        return web.json_response(
            {"error": "git log failed", "detail": stderr2.decode()[:500]}, status=500
        )
    claim_counts: dict[str, int] = defaultdict(int)
    for line in stdout2.decode().strip().split("\n"):
        line = line.strip()
        if line:
            claim_counts[line] += 1
    claims_timeline = []
    cumulative = 0
    for date in sorted(claim_counts.keys()):
        cumulative += claim_counts[date]
        claims_timeline.append({"date": date, "cumulative": cumulative, "added": claim_counts[date]})

    # NOTE: a per-day commit-volume table was previously built here but never
    # included in the response; the dead accumulation has been removed.
    all_contributors = set(first_seen.keys())
    founding = sorted(a for a in all_contributors if first_seen[a] <= FOUNDING_CUTOFF)
    result = {
        "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "summary": {
            "total_contributors": len(all_contributors),
            "founding_contributors": founding,
            "total_claims": cumulative,
            "days_active": (datetime.now(timezone.utc) - datetime(2026, 3, 5, tzinfo=timezone.utc)).days,
        },
        "cumulative_contributors": contributors_timeline,
        "cumulative_claims": claims_timeline,
    }
    _growth_cache = result
    _growth_cache_ts = now
    return web.json_response(result)
# ─── Registration ────────────────────────────────────────────────────────── # ─── Registration ──────────────────────────────────────────────────────────
def register_dashboard_routes(app: web.Application, get_conn): def register_dashboard_routes(app: web.Application, get_conn):
@ -1199,3 +1308,4 @@ def register_dashboard_routes(app: web.Application, get_conn):
app.router.add_get("/api/growth", handle_growth) app.router.add_get("/api/growth", handle_growth)
app.router.add_get("/api/pr-lifecycle", handle_pr_lifecycle) app.router.add_get("/api/pr-lifecycle", handle_pr_lifecycle)
app.router.add_get("/api/telegram-extractions", handle_telegram_extractions) app.router.add_get("/api/telegram-extractions", handle_telegram_extractions)
app.router.add_get("/api/contributor-growth", handle_contributor_growth)

View file

@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""Generate cumulative growth time-series data for public dashboard.
Produces JSON with three series:
- cumulative_contributors: unique git authors over time
- cumulative_claims: domain claim files added over time
- github_stars: star count snapshots (requires GitHub API)
Data sources: git log (codex repo), GitHub API.
Output: JSON to stdout or file, suitable for Chart.js line charts.
Usage:
python3 cumulative-growth.py --codex-path /path/to/teleo-codex [--output /path/to/output.json]
python3 cumulative-growth.py --codex-path /path/to/teleo-codex --format csv
"""
import argparse
import json
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timedelta, timezone
# Map bot/service accounts to their human principal or exclude them.
# "Teleo Agents" and "Teleo Pipeline" are bot accounts — attribute to system.
# A value of None means "drop this author entirely from the stats".
CONTRIBUTOR_ALIASES: dict[str, str | None] = {
    "Teleo Agents": None,  # system automation, not a contributor
    "Teleo Pipeline": None,  # pipeline bot
}
# Founding contributors get a badge — anyone who contributed before this date.
# Plain string comparison is safe because dates are zero-padded YYYY-MM-DD.
FOUNDING_CUTOFF = "2026-03-15"
def git_log_contributors(codex_path: str) -> list[dict]:
    """Extract per-commit author and date from git log.

    Authors aliased to None in CONTRIBUTOR_ALIASES (bot accounts) are
    dropped; all other aliases are resolved to their canonical name.
    Exits the process with status 1 if git log fails.
    """
    proc = subprocess.run(
        ["git", "log", "--format=%ad|%an", "--date=format:%Y-%m-%d", "--all"],
        capture_output=True, text=True, cwd=codex_path
    )
    if proc.returncode != 0:
        print(f"git log failed: {proc.stderr}", file=sys.stderr)
        sys.exit(1)

    records: list[dict] = []
    for raw in proc.stdout.strip().split("\n"):
        if "|" not in raw:
            continue
        day, who = raw.split("|", 1)
        resolved = CONTRIBUTOR_ALIASES.get(who, who)
        if resolved is not None:
            records.append({"date": day, "author": resolved})
    return records
def git_log_claims(codex_path: str) -> list[dict]:
    """Extract claim file additions over time from git log.

    Counts ``domains/*.md`` file additions (``--diff-filter=A``) per date
    and returns date-sorted rows of {"date", "count"}. Exits the process
    with status 1 if git log fails.
    """
    proc = subprocess.run(
        ["git", "log", "--format=%ad", "--date=format:%Y-%m-%d",
         "--all", "--diff-filter=A", "--", "domains/*.md"],
        capture_output=True, text=True, cwd=codex_path
    )
    if proc.returncode != 0:
        print(f"git log failed: {proc.stderr}", file=sys.stderr)
        sys.exit(1)

    per_day: dict[str, int] = defaultdict(int)
    for raw in proc.stdout.strip().split("\n"):
        day = raw.strip()
        if day:
            per_day[day] += 1
    return [{"date": d, "count": n} for d, n in sorted(per_day.items())]
def github_stars(repo: str = "living-ip/teleo-codex") -> int | None:
"""Fetch current star count from GitHub API. Returns None on failure."""
try:
result = subprocess.run(
["gh", "api", f"repos/{repo}", "--jq", ".stargazers_count"],
capture_output=True, text=True, timeout=10
)
if result.returncode == 0:
return int(result.stdout.strip())
except (subprocess.TimeoutExpired, ValueError):
pass
return None
def build_cumulative_contributors(entries: list[dict]) -> list[dict]:
    """Build cumulative unique contributor count by date.

    Each timeline row carries the running total plus that date's debut
    authors (sorted), flagged ``founding`` when the debut date is on or
    before FOUNDING_CUTOFF.
    """
    # Earliest commit date per author.
    debut: dict[str, str] = {}
    for rec in entries:
        name, day = rec["author"], rec["date"]
        prior = debut.get(name)
        if prior is None or day < prior:
            debut[name] = day

    # Invert: which authors first appeared on each date.
    arrivals: dict[str, list[str]] = defaultdict(list)
    for name, day in debut.items():
        arrivals[day].append(name)

    series: list[dict] = []
    running = 0
    for day in sorted(arrivals):
        cohort = arrivals[day]
        running += len(cohort)
        series.append({
            "date": day,
            "cumulative": running,
            "new": [
                {"name": n, "founding": day <= FOUNDING_CUTOFF}
                for n in sorted(cohort)
            ],
        })
    return series
def build_cumulative_claims(claim_entries: list[dict]) -> list[dict]:
    """Build cumulative claim count by date.

    Input rows ({"date", "count"}) are consumed in the given order —
    git_log_claims emits them date-sorted — carrying a running total.
    """
    running = 0
    series: list[dict] = []
    for row in claim_entries:
        running += row["count"]
        series.append({
            "date": row["date"],
            "cumulative": running,
            "added": row["count"],
        })
    return series
def build_daily_commits(entries: list[dict]) -> list[dict]:
    """Build daily commit volume by contributor.

    Returns date-sorted rows with the day's total commit count and a
    per-author breakdown (keys sorted by author name).
    """
    per_day: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for rec in entries:
        per_day[rec["date"]][rec["author"]] += 1
    return [
        {
            "date": day,
            "total": sum(counts.values()),
            "by_contributor": dict(sorted(counts.items())),
        }
        for day, counts in sorted(per_day.items())
    ]
def generate_report(codex_path: str) -> dict:
    """Assemble the full growth report from git history and the GitHub API.

    Combines contributor, claim, and daily-activity timelines with a
    summary block. ``github_stars`` may be None when the API is
    unreachable.

    Fixes over the original: uses timezone-aware ``datetime.now(timezone.utc)``
    instead of the deprecated, naive ``datetime.utcnow()`` (matching the
    server endpoint), and computes the founding set in a single pass rather
    than an any()-scan of all entries per author (was O(authors * commits)).
    """
    entries = git_log_contributors(codex_path)
    claim_entries = git_log_claims(codex_path)
    stars = github_stars()
    contributors_timeline = build_cumulative_contributors(entries)
    claims_timeline = build_cumulative_claims(claim_entries)
    commits_timeline = build_daily_commits(entries)
    all_contributors = {e["author"] for e in entries}
    # An author is "founding" if any commit of theirs predates the cutoff
    # (equivalent to checking their first commit).
    founding = {e["author"] for e in entries if e["date"] <= FOUNDING_CUTOFF}
    now = datetime.now(timezone.utc)
    return {
        "generated_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "summary": {
            "total_contributors": len(all_contributors),
            "founding_contributors": sorted(founding),
            "total_claims": claims_timeline[-1]["cumulative"] if claims_timeline else 0,
            "github_stars": stars,
            "codex_start_date": "2026-03-05",
            "days_active": (now - datetime(2026, 3, 5, tzinfo=timezone.utc)).days,
        },
        "cumulative_contributors": contributors_timeline,
        "cumulative_claims": claims_timeline,
        "daily_activity": commits_timeline,
    }
def format_csv(report: dict) -> str:
    """Render the two cumulative series as a merged CSV string.

    Dates from both series are unioned; a missing value on a date carries
    the previous value forward (step-function semantics for charting).
    """
    contrib = {row["date"]: row["cumulative"] for row in report["cumulative_contributors"]}
    claims = {row["date"]: row["cumulative"] for row in report["cumulative_claims"]}
    out = ["date,cumulative_contributors,cumulative_claims"]
    c_val = 0
    k_val = 0
    for day in sorted(contrib.keys() | claims.keys()):
        c_val = contrib.get(day, c_val)
        k_val = claims.get(day, k_val)
        out.append(f"{day},{c_val},{k_val}")
    return "\n".join(out)
def main():
    """CLI entry point: parse arguments, build the report, emit JSON or CSV."""
    parser = argparse.ArgumentParser(description="Generate cumulative growth data")
    parser.add_argument("--codex-path", required=True, help="Path to teleo-codex repo")
    parser.add_argument("--output", help="Output file path (default: stdout)")
    parser.add_argument("--format", choices=["json", "csv"], default="json")
    args = parser.parse_args()

    report = generate_report(args.codex_path)
    rendered = format_csv(report) if args.format == "csv" else json.dumps(report, indent=2)

    if args.output:
        with open(args.output, "w") as fh:
            fh.write(rendered)
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(rendered)


if __name__ == "__main__":
    main()