Compare commits
38 commits
epimetheus
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 926a397839 | |||
| 3fe524dd14 | |||
| 45b2f6de20 | |||
| f0f9388c1f | |||
| 0f2b153c92 | |||
| 762fd4233e | |||
| 10d5c275da | |||
| 1d6b51527a | |||
| 540ba97b9d | |||
| 58fa8c5276 | |||
| 93917f9fc2 | |||
| 3fe0f4b744 | |||
| 05d15cea56 | |||
| cfcb06a6dc | |||
| 2f6424617b | |||
| 9a943e8460 | |||
| 84f6d3682c | |||
| 33c17f87a8 | |||
| a053a8ebf9 | |||
| 97b590acd6 | |||
| 469cb7f2da | |||
| 8de28d6ee0 | |||
| 05f375d775 | |||
| 4101048cd0 | |||
| af027d3ced | |||
| 1b27a2de31 | |||
| 11e026448a | |||
| c3d0b1f5a4 | |||
| 88e8e15c6d | |||
| 5463ca0b56 | |||
| e043cf98dc | |||
| 9c0be78620 | |||
| c29049924e | |||
| f463f49b46 | |||
| 9505e5b40a | |||
| f0cf772182 | |||
| 4fc541c656 | |||
| b7242d2206 |
28 changed files with 4965 additions and 99 deletions
|
|
@ -28,12 +28,9 @@ import sqlite3
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
# Map PR status to Clay's operation color palette
|
# Non-merged statuses map directly to operation — no semantic classification yet.
|
||||||
# extract (cyan), new (green), enrich (amber), challenge (red-orange),
|
NON_MERGED_STATUS_TO_OPERATION = {
|
||||||
# decision (violet), infra (grey)
|
'approved': 'new', # about to become knowledge
|
||||||
STATUS_TO_OPERATION = {
|
|
||||||
'merged': 'new', # green — new knowledge merged
|
|
||||||
'approved': 'enrich', # amber — approved, enriching KB
|
|
||||||
'open': 'extract', # cyan — new extraction in progress
|
'open': 'extract', # cyan — new extraction in progress
|
||||||
'validating': 'extract', # cyan — being validated
|
'validating': 'extract', # cyan — being validated
|
||||||
'reviewing': 'extract', # cyan — under review
|
'reviewing': 'extract', # cyan — under review
|
||||||
|
|
@ -43,6 +40,51 @@ STATUS_TO_OPERATION = {
|
||||||
'conflict': 'challenge', # red-orange — conflict detected
|
'conflict': 'challenge', # red-orange — conflict detected
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Maintenance commit_types that land on main but don't represent new knowledge.
_MAINTENANCE_COMMIT_TYPES = {'fix', 'pipeline', 'reweave'}


def classify_pr_operation(status, commit_type, branch, description=None):
    """Derive a Timeline operation from a PR row.

    Priority order for MERGED PRs (commit_type wins over branch prefix —
    extract/* branches with commit_type='enrich' or 'challenge' classify
    by commit_type, matching the contributor-role wiring fix):
    1. commit_type == 'challenge' OR branch.startswith('challenge/') OR
       description contains 'challenged_by' → 'challenge'
    2. commit_type == 'enrich' OR branch.startswith('enrich/' | 'reweave/')
       → 'enrich'
    3. commit_type in _MAINTENANCE_COMMIT_TYPES → 'infra'
    4. default (commit_type='knowledge'|'extract'|'research'|'entity' or
       anything else) → 'new'

    For non-merged PRs, falls back to NON_MERGED_STATUS_TO_OPERATION.
    """
    ctype = (commit_type or '').lower()
    branch_name = branch or ''
    desc = (description or '').lower()

    if status != 'merged':
        return NON_MERGED_STATUS_TO_OPERATION.get(status, 'infra')

    # Challenge takes precedence — the signal is inherently more specific.
    is_challenge = (
        ctype == 'challenge'
        or branch_name.startswith('challenge/')
        or 'challenged_by' in desc
    )
    if is_challenge:
        return 'challenge'

    if ctype == 'enrich' or branch_name.startswith(('enrich/', 'reweave/')):
        return 'enrich'

    if ctype in _MAINTENANCE_COMMIT_TYPES:
        return 'infra'

    # Default: legacy 'knowledge', new 'extract', 'research', 'entity',
    # unknown/null commit_type → treat as new knowledge.
    return 'new'
|
||||||
|
|
||||||
# Map audit_log stage to operation type
|
# Map audit_log stage to operation type
|
||||||
STAGE_TO_OPERATION = {
|
STAGE_TO_OPERATION = {
|
||||||
'ingest': 'extract',
|
'ingest': 'extract',
|
||||||
|
|
@ -118,6 +160,8 @@ async def handle_activity(request):
|
||||||
Query params:
|
Query params:
|
||||||
limit (int, default 100, max 500): number of events to return
|
limit (int, default 100, max 500): number of events to return
|
||||||
cursor (ISO timestamp): return events older than this timestamp
|
cursor (ISO timestamp): return events older than this timestamp
|
||||||
|
type (str, optional): comma-separated operation types to include
|
||||||
|
(extract|new|enrich|challenge|infra). If absent, returns all types.
|
||||||
|
|
||||||
Derives events from two sources:
|
Derives events from two sources:
|
||||||
1. prs table — per-PR events with domain, agent, status
|
1. prs table — per-PR events with domain, agent, status
|
||||||
|
|
@ -131,6 +175,13 @@ async def handle_activity(request):
|
||||||
limit = 100
|
limit = 100
|
||||||
|
|
||||||
cursor = request.query.get('cursor')
|
cursor = request.query.get('cursor')
|
||||||
|
type_param = request.query.get('type', '').strip()
|
||||||
|
allowed_ops = None
|
||||||
|
if type_param:
|
||||||
|
allowed_ops = {t.strip() for t in type_param.split(',') if t.strip()}
|
||||||
|
if not allowed_ops:
|
||||||
|
allowed_ops = None
|
||||||
|
|
||||||
db_path = request.app['db_path']
|
db_path = request.app['db_path']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -143,22 +194,27 @@ async def handle_activity(request):
|
||||||
# Each PR generates events at created_at and merged_at timestamps
|
# Each PR generates events at created_at and merged_at timestamps
|
||||||
pr_query = """
|
pr_query = """
|
||||||
SELECT number, status, domain, agent, branch, source_path,
|
SELECT number, status, domain, agent, branch, source_path,
|
||||||
created_at, merged_at
|
created_at, merged_at, source_channel, commit_type,
|
||||||
|
description
|
||||||
FROM prs
|
FROM prs
|
||||||
WHERE {where_clause}
|
WHERE {where_clause}
|
||||||
ORDER BY COALESCE(merged_at, created_at) DESC
|
ORDER BY COALESCE(merged_at, created_at) DESC
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Over-fetch when filtering by type so we have enough matching rows after
|
||||||
|
# post-build filtering. Cap at 2000 to avoid runaway queries.
|
||||||
|
fetch_limit = min(2000, limit * 5) if allowed_ops else limit + 1
|
||||||
|
|
||||||
if cursor:
|
if cursor:
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
pr_query.format(where_clause="COALESCE(merged_at, created_at) < ?"),
|
pr_query.format(where_clause="COALESCE(merged_at, created_at) < ?"),
|
||||||
(cursor, limit + 1)
|
(cursor, fetch_limit)
|
||||||
).fetchall()
|
).fetchall()
|
||||||
else:
|
else:
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
pr_query.format(where_clause="1=1"),
|
pr_query.format(where_clause="1=1"),
|
||||||
(limit + 1,)
|
(fetch_limit,)
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
# Known knowledge agents for branch-prefix inference
|
# Known knowledge agents for branch-prefix inference
|
||||||
|
|
@ -166,7 +222,14 @@ async def handle_activity(request):
|
||||||
|
|
||||||
for row in rows:
|
for row in rows:
|
||||||
row_dict = dict(row)
|
row_dict = dict(row)
|
||||||
operation = STATUS_TO_OPERATION.get(row_dict['status'], 'infra')
|
operation = classify_pr_operation(
|
||||||
|
row_dict['status'],
|
||||||
|
row_dict.get('commit_type'),
|
||||||
|
row_dict.get('branch'),
|
||||||
|
row_dict.get('description'),
|
||||||
|
)
|
||||||
|
if allowed_ops and operation not in allowed_ops:
|
||||||
|
continue
|
||||||
description = pr_description(row_dict)
|
description = pr_description(row_dict)
|
||||||
|
|
||||||
# Use merged_at if available (more interesting event), else created_at
|
# Use merged_at if available (more interesting event), else created_at
|
||||||
|
|
@ -189,6 +252,7 @@ async def handle_activity(request):
|
||||||
'description': description,
|
'description': description,
|
||||||
'status': row_dict['status'],
|
'status': row_dict['status'],
|
||||||
'pr_number': row_dict['number'],
|
'pr_number': row_dict['number'],
|
||||||
|
'source_channel': row_dict.get('source_channel') or 'unknown',
|
||||||
})
|
})
|
||||||
|
|
||||||
# Source 2: Audit log events (secondary — pipeline-level)
|
# Source 2: Audit log events (secondary — pipeline-level)
|
||||||
|
|
@ -217,6 +281,8 @@ async def handle_activity(request):
|
||||||
for row in audit_rows:
|
for row in audit_rows:
|
||||||
row_dict = dict(row)
|
row_dict = dict(row)
|
||||||
operation = STAGE_TO_OPERATION.get(row_dict['stage'], 'infra')
|
operation = STAGE_TO_OPERATION.get(row_dict['stage'], 'infra')
|
||||||
|
if allowed_ops and operation not in allowed_ops:
|
||||||
|
continue
|
||||||
description = audit_description(row_dict)
|
description = audit_description(row_dict)
|
||||||
|
|
||||||
events.append({
|
events.append({
|
||||||
|
|
@ -228,6 +294,7 @@ async def handle_activity(request):
|
||||||
'description': description,
|
'description': description,
|
||||||
'status': None,
|
'status': None,
|
||||||
'pr_number': None,
|
'pr_number': None,
|
||||||
|
'source_channel': None, # audit events not tied to a PR
|
||||||
})
|
})
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
|
||||||
288
diagnostics/activity_feed_api.py
Normal file
288
diagnostics/activity_feed_api.py
Normal file
|
|
@ -0,0 +1,288 @@
|
||||||
|
"""Activity feed API — serves contribution events from pipeline.db."""
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
from aiohttp import web
|
||||||
|
|
||||||
|
DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
|
||||||
|
_cache = {"data": None, "ts": 0}
|
||||||
|
CACHE_TTL = 60 # 1 minute — activity should feel fresh
|
||||||
|
|
||||||
|
# commit_types we surface in the activity feed. `pipeline` is system
|
||||||
|
# maintenance (reweave/fix auto-runs, zombie cleanup) and stays hidden.
|
||||||
|
_FEED_COMMIT_TYPES = ("knowledge", "enrich", "challenge", "research", "entity", "extract", "reweave")
|
||||||
|
|
||||||
|
# Source-archive slugs follow YYYY-MM-DD-publisher-topic-HASH4 — they're
|
||||||
|
# inbox archive filenames, not claim slugs. Used as a fallback signal when
|
||||||
|
# branch/description heuristics miss (e.g. populated descriptions that
|
||||||
|
# happen to be source titles, not claim insights).
|
||||||
|
_SOURCE_SLUG_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}-.+-[a-f0-9]{4}$")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_conn():
    """Open a connection to pipeline.db with dict-like row access."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    # Wait up to 10s on a locked database instead of failing immediately.
    connection.execute("PRAGMA busy_timeout = 10000")
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def _is_source_slug(slug):
|
||||||
|
return bool(slug and _SOURCE_SLUG_PATTERN.match(slug))
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_event(branch, description, commit_type, candidate_slug=None):
    """Return one of: create | enrich | challenge | source | None.

    Source-archive PRs are extract/* branches that filed a source into
    inbox/archive/ but didn't produce a claim. Two signals classify them
    as 'source' (defense in depth):
    1. extract/* branch with empty description (no claim title produced)
    2. candidate_slug matches YYYY-MM-DD-...-HASH4 (inbox filename pattern)
    """
    ctype = (commit_type or "").lower()
    branch_name = branch or ""
    desc = (description or "").lower()
    has_claim_title = bool(description and description.strip())

    # Hidden commit types (e.g. 'pipeline' maintenance) never reach the feed.
    if ctype not in _FEED_COMMIT_TYPES:
        return None

    # Explicit challenge signals win first.
    if (ctype == "challenge"
            or branch_name.startswith("challenge/")
            or "challenged_by" in desc):
        return "challenge"

    # Enrichment: reweave edge-connects, enrich/ branches, or commit_type=enrich.
    if ctype == "enrich" or branch_name.startswith(("enrich/", "reweave/")):
        return "enrich"

    # Source-only: extract/* with no claim description means inbox archive
    # landed but no domain claim was written.
    if branch_name.startswith("extract/") and not has_claim_title:
        return "source"

    # Belt-and-suspenders: a slug shaped like an inbox archive filename
    # (date-prefix-hash) is treated as source regardless of branch state.
    if _is_source_slug(candidate_slug):
        return "source"

    # Everything else with a description is a new claim.
    return "create"
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_contributor(submitted_by, agent):
|
||||||
|
if submitted_by and submitted_by.strip():
|
||||||
|
name = submitted_by.strip().lstrip("@")
|
||||||
|
return name
|
||||||
|
if agent and agent.strip() and agent != "pipeline":
|
||||||
|
return agent.strip()
|
||||||
|
return "pipeline"
|
||||||
|
|
||||||
|
|
||||||
|
def _summary_from_branch(branch):
|
||||||
|
if not branch:
|
||||||
|
return ""
|
||||||
|
parts = branch.split("/", 1)
|
||||||
|
if len(parts) < 2:
|
||||||
|
return ""
|
||||||
|
slug = parts[1]
|
||||||
|
slug = re.sub(r"^[\d-]+-", "", slug) # strip date prefix
|
||||||
|
slug = re.sub(r"-[a-f0-9]{4}$", "", slug) # strip hash suffix
|
||||||
|
return slug.replace("-", " ").strip().capitalize()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_claim_slugs(description, branch=None):
|
||||||
|
if not description:
|
||||||
|
if branch:
|
||||||
|
parts = branch.split("/", 1)
|
||||||
|
if len(parts) > 1:
|
||||||
|
return [parts[1][:120]]
|
||||||
|
return []
|
||||||
|
titles = [t.strip() for t in description.split("|") if t.strip()]
|
||||||
|
slugs = []
|
||||||
|
for title in titles:
|
||||||
|
slug = title.lower().strip()
|
||||||
|
slug = "".join(c if c.isalnum() or c in (" ", "-") else "" for c in slug)
|
||||||
|
slug = slug.replace(" ", "-").strip("-")
|
||||||
|
if len(slug) > 10:
|
||||||
|
slugs.append(slug[:120])
|
||||||
|
return slugs
|
||||||
|
|
||||||
|
|
||||||
|
def _hot_score(challenge_count, enrich_count, signal_count, hours_since):
|
||||||
|
numerator = challenge_count * 3 + enrich_count * 2 + signal_count
|
||||||
|
denominator = max(hours_since, 0.5) ** 1.5
|
||||||
|
return numerator / denominator
|
||||||
|
|
||||||
|
|
||||||
|
# CI credit awarded per event type. Hoisted to module level — the original
# rebuilt this dict on every row inside the loop.
_CI_BY_EVENT_TYPE = {"create": 0.35, "enrich": 0.25, "challenge": 0.40, "source": 0.15}


def _build_events():
    """Load merged PRs from pipeline.db and shape them into feed events.

    Returns:
        (events, claim_activity): events is a list of feed dicts (newest
        first, per the query ORDER BY); claim_activity maps claim_slug ->
        {"challenges", "enriches", "signals", "first_seen"} counters used
        by the 'hot' sort.
    """
    conn = _get_conn()
    try:
        placeholders = ",".join("?" * len(_FEED_COMMIT_TYPES))
        rows = conn.execute(f"""
            SELECT p.number, p.branch, p.domain, p.agent, p.submitted_by,
                   p.merged_at, p.description, p.commit_type, p.cost_usd,
                   p.source_channel, p.source_path
            FROM prs p
            WHERE p.status = 'merged'
              AND p.commit_type IN ({placeholders})
              AND p.merged_at IS NOT NULL
            ORDER BY p.merged_at DESC
            LIMIT 2000
        """, _FEED_COMMIT_TYPES).fetchall()

        events = []
        claim_activity = {}  # slug -> {challenges, enriches, signals, first_seen}

        for row in rows:
            slugs = _extract_claim_slugs(row["description"], row["branch"])
            candidate_slug = slugs[0] if slugs else ""
            event_type = _classify_event(
                row["branch"], row["description"], row["commit_type"],
                candidate_slug=candidate_slug,
            )
            if not event_type:
                continue  # hidden commit_type (e.g. pipeline maintenance)

            contributor = _normalize_contributor(row["submitted_by"], row["agent"])
            merged_at = row["merged_at"] or ""
            ci_earned = _CI_BY_EVENT_TYPE.get(event_type, 0)

            # Source events never carry a claim_slug — no claim was written —
            # so the frontend can't produce a 404-ing claim link.
            if event_type == "source":
                # Compute the branch summary once (original called it twice).
                summary_text = _summary_from_branch(row["branch"])
                source_slug = (
                    summary_text.lower().replace(" ", "-")
                    or row["branch"]
                )
                events.append({
                    "type": "source",
                    "claim_slug": "",
                    "source_slug": source_slug,
                    "domain": row["domain"] or "unknown",
                    "contributor": contributor,
                    "timestamp": merged_at,
                    "ci_earned": round(ci_earned, 2),
                    "summary": summary_text,
                    "pr_number": row["number"],
                    "source_channel": row["source_channel"] or "unknown",
                })
                continue

            # Tally per-claim engagement for the 'hot' ranking.
            for slug in slugs:
                activity = claim_activity.setdefault(slug, {
                    "challenges": 0, "enriches": 0, "signals": 0,
                    "first_seen": merged_at,
                })
                if event_type == "challenge":
                    activity["challenges"] += 1
                elif event_type == "enrich":
                    activity["enriches"] += 1
                else:
                    activity["signals"] += 1

            # Summary: first claim title (truncated to 120 chars), else
            # a humanized branch name.
            summary_text = ""
            if row["description"]:
                first_title = row["description"].split("|")[0].strip()
                if len(first_title) > 120:
                    first_title = first_title[:117] + "..."
                summary_text = first_title
            elif row["branch"]:
                summary_text = _summary_from_branch(row["branch"])

            # Emit at most one event per PR, keyed to its first claim slug.
            for slug in (slugs[:1] if slugs else [""]):
                events.append({
                    "type": event_type,
                    "claim_slug": slug,
                    "domain": row["domain"] or "unknown",
                    "contributor": contributor,
                    "timestamp": merged_at,
                    "ci_earned": round(ci_earned, 2),
                    "summary": summary_text,
                    "pr_number": row["number"],
                    "source_channel": row["source_channel"] or "unknown",
                })

        return events, claim_activity
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _sort_events(events, claim_activity, sort_mode, now_ts):
|
||||||
|
if sort_mode == "recent":
|
||||||
|
events.sort(key=lambda e: e["timestamp"], reverse=True)
|
||||||
|
elif sort_mode == "hot":
|
||||||
|
def hot_key(e):
|
||||||
|
slug = e["claim_slug"]
|
||||||
|
ca = claim_activity.get(slug, {"challenges": 0, "enriches": 0, "signals": 0})
|
||||||
|
try:
|
||||||
|
from datetime import datetime
|
||||||
|
evt_time = datetime.fromisoformat(e["timestamp"].replace("Z", "+00:00"))
|
||||||
|
hours = (now_ts - evt_time.timestamp()) / 3600
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
hours = 9999
|
||||||
|
return _hot_score(ca["challenges"], ca["enriches"], ca["signals"], hours)
|
||||||
|
events.sort(key=hot_key, reverse=True)
|
||||||
|
elif sort_mode == "important":
|
||||||
|
type_rank = {"challenge": 0, "enrich": 1, "create": 2, "source": 3}
|
||||||
|
events.sort(key=lambda e: (type_rank.get(e["type"], 4), -len(e["summary"])))
|
||||||
|
return events
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_activity_feed(request):
    """GET /api/activity-feed — gamified event feed with sort + filters.

    Query params: sort (hot|recent|important, default recent), domain,
    contributor, type (comma-separated event types), limit (<=100,
    default 20), offset (>=0). Events are rebuilt from pipeline.db at
    most once per CACHE_TTL seconds.
    """
    sort_mode = request.query.get("sort", "recent")
    if sort_mode not in ("hot", "recent", "important"):
        sort_mode = "recent"

    domain = request.query.get("domain", "")
    contributor = request.query.get("contributor", "")
    type_param = request.query.get("type", "")
    type_filter = {t.strip() for t in type_param.split(",") if t.strip()} if type_param else None

    # Malformed numeric params silently fall back to defaults.
    try:
        limit = min(int(request.query.get("limit", "20")), 100)
    except ValueError:
        limit = 20
    try:
        offset = max(int(request.query.get("offset", "0")), 0)
    except ValueError:
        offset = 0

    # Refresh the module-level event cache when stale.
    now = time.time()
    if _cache["data"] is None or (now - _cache["ts"]) > CACHE_TTL:
        _cache["data"] = _build_events()
        _cache["ts"] = now
    events, claim_activity = _cache["data"]

    filtered = events
    if domain:
        filtered = [ev for ev in filtered if ev["domain"] == domain]
    if contributor:
        filtered = [ev for ev in filtered if ev["contributor"] == contributor]
    if type_filter:
        filtered = [ev for ev in filtered if ev["type"] in type_filter]

    # Sort a copy so the cached list keeps its original order.
    ordered = _sort_events(list(filtered), claim_activity, sort_mode, now)
    page = ordered[offset:offset + limit]

    return web.json_response({
        "events": page,
        "total": len(ordered),
        "sort": sort_mode,
        "offset": offset,
        "limit": limit,
    }, headers={"Access-Control-Allow-Origin": "*"})
|
||||||
|
|
||||||
|
|
||||||
|
def register(app):
    """Mount the activity-feed route on an aiohttp application."""
    app.router.add_get("/api/activity-feed", handle_activity_feed)
|
||||||
|
|
@ -42,7 +42,7 @@ API_KEY_FILE = Path(os.environ.get("ARGUS_API_KEY_FILE", "/opt/teleo-eval/secret
|
||||||
|
|
||||||
# Endpoints that skip auth (dashboard is public for now, can lock later)
|
# Endpoints that skip auth (dashboard is public for now, can lock later)
|
||||||
_PUBLIC_PATHS = frozenset({"/", "/prs", "/ops", "/health", "/agents", "/epistemic", "/legacy", "/audit", "/api/metrics", "/api/snapshots", "/api/vital-signs",
|
_PUBLIC_PATHS = frozenset({"/", "/prs", "/ops", "/health", "/agents", "/epistemic", "/legacy", "/audit", "/api/metrics", "/api/snapshots", "/api/vital-signs",
|
||||||
"/api/contributors", "/api/domains", "/api/audit", "/api/yield", "/api/cost-per-claim", "/api/fix-rates", "/api/compute-profile", "/api/review-queue", "/api/daily-digest"})
|
"/api/contributors", "/api/domains", "/api/audit", "/api/yield", "/api/cost-per-claim", "/api/fix-rates", "/api/compute-profile", "/api/review-queue", "/api/daily-digest", "/api/search"})
|
||||||
|
|
||||||
|
|
||||||
def _get_db() -> sqlite3.Connection:
|
def _get_db() -> sqlite3.Connection:
|
||||||
|
|
@ -663,38 +663,115 @@ async def handle_api_domains(request):
|
||||||
return web.json_response({"domains": breakdown})
|
return web.json_response({"domains": breakdown})
|
||||||
|
|
||||||
|
|
||||||
async def handle_api_search(request):
|
def _qdrant_hits_to_results(hits, include_expanded=False):
|
||||||
"""GET /api/search — semantic search over claims via Qdrant + graph expansion.
|
"""Shape raw Qdrant hits into Ship's chat-API contract."""
|
||||||
|
results = []
|
||||||
|
for h in hits:
|
||||||
|
payload = h.get("payload", {}) or {}
|
||||||
|
path = payload.get("claim_path", "") or ""
|
||||||
|
slug = path.rsplit("/", 1)[-1]
|
||||||
|
if slug.endswith(".md"):
|
||||||
|
slug = slug[:-3]
|
||||||
|
results.append({
|
||||||
|
"slug": slug,
|
||||||
|
"path": path,
|
||||||
|
"title": payload.get("claim_title", ""),
|
||||||
|
"domain": payload.get("domain"),
|
||||||
|
"confidence": payload.get("confidence"),
|
||||||
|
"score": round(float(h.get("score", 0.0) or 0.0), 4),
|
||||||
|
"body_excerpt": payload.get("snippet", "") or "",
|
||||||
|
})
|
||||||
|
return results
|
||||||
|
|
||||||
async def handle_api_search(request):
    """Semantic search over claims via Qdrant.

    POST contract (Ship's chat API):
    body: {"query": str, "limit": int, "min_score": float?, "domain": str?, "confidence": str?, "exclude": [str]?}
    response: {"query": str, "results": [{"slug","path","title","domain","confidence","score","body_excerpt"}], "total": int}

    GET (legacy + hackathon debug):
    q: search query (required)
    limit, domain, confidence, exclude, expand
    min_score: if set, bypasses two-pass lib threshold (default lib behavior otherwise)
    """
    if request.method == "POST":
        try:
            body = await request.json()
        except Exception:
            return web.json_response({"error": "invalid JSON body"}, status=400)

        query = (body.get("query") or "").strip()
        if not query:
            return web.json_response({"error": "query required"}, status=400)

        try:
            limit = min(int(body.get("limit") or 5), 50)
        except (TypeError, ValueError):
            return web.json_response({"error": "limit must be int"}, status=400)
        try:
            min_score = float(body.get("min_score") if body.get("min_score") is not None else 0.25)
        except (TypeError, ValueError):
            return web.json_response({"error": "min_score must be float"}, status=400)

        domain = body.get("domain")
        confidence = body.get("confidence")
        exclude = body.get("exclude") or None

        vector = embed_query(query)
        if vector is None:
            return web.json_response({"error": "embedding failed"}, status=502)

        hits = search_qdrant(vector, limit=limit, domain=domain,
                             confidence=confidence, exclude=exclude,
                             score_threshold=min_score)
        results = _qdrant_hits_to_results(hits)
        return web.json_response({"query": query, "results": results, "total": len(results)})

    # GET path
    query = request.query.get("q", "").strip()
    if not query:
        return web.json_response({"error": "q parameter required"}, status=400)

    domain = request.query.get("domain")
    confidence = request.query.get("confidence")
    try:
        limit = min(int(request.query.get("limit", "10")), 50)
    except ValueError:
        return web.json_response({"error": "limit must be int"}, status=400)
    exclude_raw = request.query.get("exclude", "")
    exclude = [p.strip() for p in exclude_raw.split(",") if p.strip()] if exclude_raw else None
    expand = request.query.get("expand", "true").lower() != "false"
    min_score_raw = request.query.get("min_score")

    # min_score present: direct Qdrant pass, no graph expansion.
    if min_score_raw is not None:
        try:
            min_score = float(min_score_raw)
        except ValueError:
            return web.json_response({"error": "min_score must be float"}, status=400)
        vector = embed_query(query)
        if vector is None:
            return web.json_response({"error": "embedding failed"}, status=502)
        hits = search_qdrant(vector, limit=limit, domain=domain,
                             confidence=confidence, exclude=exclude,
                             score_threshold=min_score)
        direct = _qdrant_hits_to_results(hits)
        return web.json_response({
            "query": query,
            "direct_results": direct,
            "expanded_results": [],
            "total": len(direct),
        })

    # Default GET: Layer 1 + Layer 2 via lib.
    # NOTE(review): `limit` is parsed above but not forwarded to kb_search —
    # looks unintentional; confirm kb_search's signature before wiring it in.
    result = kb_search(query, expand=expand,
                       domain=domain, confidence=confidence, exclude=exclude)

    if "error" in result:
        error = result["error"]
        if error == "embedding_failed":
            return web.json_response({"error": "embedding failed"}, status=502)
        return web.json_response({"error": error}, status=500)

    return web.json_response(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -2268,6 +2345,7 @@ def create_app() -> web.Application:
|
||||||
app.router.add_get("/api/contributors", handle_api_contributors)
|
app.router.add_get("/api/contributors", handle_api_contributors)
|
||||||
app.router.add_get("/api/domains", handle_api_domains)
|
app.router.add_get("/api/domains", handle_api_domains)
|
||||||
app.router.add_get("/api/search", handle_api_search)
|
app.router.add_get("/api/search", handle_api_search)
|
||||||
|
app.router.add_post("/api/search", handle_api_search)
|
||||||
app.router.add_get("/api/audit", handle_api_audit)
|
app.router.add_get("/api/audit", handle_api_audit)
|
||||||
app.router.add_get("/audit", handle_audit_page)
|
app.router.add_get("/audit", handle_audit_page)
|
||||||
app.router.add_post("/api/usage", handle_api_usage)
|
app.router.add_post("/api/usage", handle_api_usage)
|
||||||
|
|
@ -2283,6 +2361,18 @@ def create_app() -> web.Application:
|
||||||
# Response audit - cost tracking + reasoning traces
|
# Response audit - cost tracking + reasoning traces
|
||||||
app["db_path"] = str(DB_PATH)
|
app["db_path"] = str(DB_PATH)
|
||||||
register_response_audit_routes(app)
|
register_response_audit_routes(app)
|
||||||
|
# Timeline activity feed (per-PR + audit_log events for dashboard v2)
|
||||||
|
from activity_endpoint import handle_activity
|
||||||
|
app.router.add_get("/api/activity", handle_activity)
|
||||||
|
# Gamification activity feed (hot/recent/important sort)
|
||||||
|
from activity_feed_api import register as register_activity_feed
|
||||||
|
register_activity_feed(app)
|
||||||
|
# Claims browser + detail
|
||||||
|
from claims_api import register_claims_routes
|
||||||
|
register_claims_routes(app)
|
||||||
|
# Contributor profile (handle lookup, leaderboard with action CI)
|
||||||
|
from contributor_profile_api import register_contributor_routes
|
||||||
|
register_contributor_routes(app)
|
||||||
app.on_cleanup.append(_cleanup)
|
app.on_cleanup.append(_cleanup)
|
||||||
return app
|
return app
|
||||||
|
|
||||||
|
|
|
||||||
161
diagnostics/claims_api.py
Normal file
161
diagnostics/claims_api.py
Normal file
|
|
@ -0,0 +1,161 @@
|
||||||
|
"""Claims API endpoint — serves claim data from the codex filesystem."""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
from aiohttp import web
|
||||||
|
|
||||||
|
CODEX_ROOT = Path("/opt/teleo-eval/workspaces/main/domains")
|
||||||
|
_cache = {"data": None, "ts": 0}
|
||||||
|
CACHE_TTL = 300 # 5 minutes
|
||||||
|
|
||||||
|
def _parse_frontmatter(filepath):
    """Parse a claim markdown file's YAML frontmatter into a summary dict.

    Returns None for files that are not claims: no leading ``---`` fence,
    wrong ``type``, unreadable, or malformed YAML. Best-effort by design —
    one bad file must not break the whole claims listing.
    """
    try:
        text = filepath.read_text(encoding="utf-8")
        if not text.startswith("---"):
            return None
        # Find the closing fence at a line start ("\n---"), not just any
        # "---" substring — a frontmatter value containing "---" must not
        # truncate the block early.
        end = text.index("\n---", 3)
        fm = yaml.safe_load(text[3:end])
        if not fm or fm.get("type") != "claim":
            return None
        body = text[end + 4:].strip()
        # Count wiki-links ([[target]]) as a connectivity signal.
        links = re.findall(r"\[\[([^\]]+)\]\]", body)
        # First non-heading paragraph doubles as the list-view summary.
        paragraphs = [p.strip() for p in body.split("\n\n") if p.strip() and not p.strip().startswith("#")]
        summary = paragraphs[0][:300] if paragraphs else ""
        return {
            "slug": filepath.stem,
            "title": fm.get("title", filepath.stem.replace("-", " ")),
            "domain": fm.get("domain", "unknown"),
            "confidence": fm.get("confidence", "unknown"),
            "agent": fm.get("agent"),
            "scope": fm.get("scope"),
            "created": str(fm.get("created", "")),
            # Non-string sources (e.g. mappings) are flattened to "" for the UI.
            "source": fm.get("source", "") if isinstance(fm.get("source"), str) else "",
            "sourcer": fm.get("sourcer", ""),
            "wiki_link_count": len(links),
            "summary": summary,
            "challenged_by": fm.get("challenged_by"),
            "related_claims": fm.get("related_claims", []),
        }
    except Exception:
        # Missing closing fence (ValueError), YAML errors, decode errors:
        # all treated as "not a parseable claim".
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_all_claims():
    """Load every claim under CODEX_ROOT, with a 5-minute in-process cache.

    Scans each domain directory for ``*.md`` files (skipping ``_map.md``
    index files) and parses their frontmatter.
    """
    now = time.time()
    # Compare against None, not truthiness: an empty claim list is a valid
    # cached result and must not force a full re-scan on every request.
    if _cache["data"] is not None and now - _cache["ts"] < CACHE_TTL:
        return _cache["data"]

    claims = []
    # Tolerate a missing codex checkout (fresh deploy): serve an empty
    # list instead of raising from iterdir().
    if CODEX_ROOT.is_dir():
        for domain_dir in sorted(CODEX_ROOT.iterdir()):
            if not domain_dir.is_dir():
                continue
            for f in sorted(domain_dir.glob("*.md")):
                if f.name == "_map.md":
                    continue
                c = _parse_frontmatter(f)
                if c:
                    claims.append(c)

    _cache["data"] = claims
    _cache["ts"] = now
    return claims
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_claims(request):
    """GET /api/claims — filtered, sorted, paginated claim listing.

    Query params: domain, q (substring search on title/summary),
    confidence, agent, sort (recent|alpha|domain), limit (<=200), offset.
    """
    claims = _load_all_claims()

    # Filters
    domain = request.query.get("domain")
    search = request.query.get("q", "").lower()
    confidence = request.query.get("confidence")
    agent = request.query.get("agent")
    sort = request.query.get("sort", "recent")  # recent, alpha, domain

    filtered = claims
    if domain:
        filtered = [c for c in filtered if c["domain"] == domain]
    if confidence:
        filtered = [c for c in filtered if c["confidence"] == confidence]
    if agent:
        filtered = [c for c in filtered if c["agent"] == agent]
    if search:
        filtered = [c for c in filtered if search in c["title"].lower() or search in c["summary"].lower()]

    # Sort on a copy: with no filters, `filtered` aliases the shared cached
    # list, and sorting it in place would reorder the cache for everyone.
    filtered = list(filtered)
    if sort == "recent":
        filtered.sort(key=lambda c: c["created"], reverse=True)
    elif sort == "alpha":
        filtered.sort(key=lambda c: c["title"].lower())
    elif sort == "domain":
        filtered.sort(key=lambda c: (c["domain"], c["title"].lower()))

    # Pagination — non-numeric limit/offset previously raised ValueError
    # (surfacing as HTTP 500); fall back to defaults and clamp negatives.
    try:
        limit = min(int(request.query.get("limit", "50")), 200)
    except ValueError:
        limit = 50
    try:
        offset = int(request.query.get("offset", "0"))
    except ValueError:
        offset = 0
    limit = max(limit, 0)
    offset = max(offset, 0)
    page = filtered[offset:offset + limit]

    # Domain counts for the sidebar (always over the full corpus).
    domain_counts = {}
    for c in claims:
        domain_counts[c["domain"]] = domain_counts.get(c["domain"], 0) + 1

    return web.json_response({
        "claims": page,
        "total": len(filtered),
        "offset": offset,
        "limit": limit,
        "domains": dict(sorted(domain_counts.items(), key=lambda x: -x[1])),
        "confidence_levels": sorted(set(c["confidence"] for c in claims)),
        "agents": sorted(set(c["agent"] for c in claims if c["agent"])),
    }, headers={"Access-Control-Allow-Origin": "*"})
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_claim_detail(request):
    """GET /api/claims/{slug} — one claim's metadata plus its full body."""
    slug = request.match_info["slug"]
    claims = _load_all_claims()
    for c in claims:
        if c["slug"] != slug:
            continue
        # Work on a copy: attaching "body" to the cached dict would bloat
        # every subsequent /api/claims listing response with full bodies.
        detail = dict(c)
        # Read full body for detail view
        for domain_dir in CODEX_ROOT.iterdir():
            if not domain_dir.is_dir():
                continue
            f = domain_dir / f"{slug}.md"
            if f.exists():
                try:
                    text = f.read_text(encoding="utf-8")
                    end = text.index("---", 3)
                    detail["body"] = text[end + 3:].strip()
                except ValueError:
                    # No closing frontmatter fence — serve metadata only
                    # instead of letting index() raise into a 500.
                    pass
                break
        return web.json_response(detail, headers={"Access-Control-Allow-Origin": "*"})
    return web.json_response({"error": "claim not found"}, status=404)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_domains(request):
    """GET /api/domains — per-domain claim counts, agents, confidence mix."""
    by_name = {}
    for claim in _load_all_claims():
        name = claim["domain"]
        entry = by_name.get(name)
        if entry is None:
            entry = {"name": name, "count": 0, "agents": set(), "confidence_dist": {}}
            by_name[name] = entry
        entry["count"] += 1
        if claim["agent"]:
            entry["agents"].add(claim["agent"])
        level = claim["confidence"]
        entry["confidence_dist"][level] = entry["confidence_dist"].get(level, 0) + 1

    # Largest domains first; sets become sorted lists for JSON.
    result = []
    for entry in sorted(by_name.values(), key=lambda e: -e["count"]):
        entry["agents"] = sorted(entry["agents"])
        result.append(entry)

    return web.json_response(result, headers={"Access-Control-Allow-Origin": "*"})
|
||||||
|
|
||||||
|
|
||||||
|
def register_claims_routes(app):
    """Attach the claims browser endpoints to the aiohttp app."""
    for path, handler in (
        ("/api/claims", handle_claims),
        ("/api/claims/{slug}", handle_claim_detail),
        ("/api/domains", handle_domains),
    ):
        app.router.add_get(path, handler)
|
||||||
365
diagnostics/contributor_profile_api.py
Normal file
365
diagnostics/contributor_profile_api.py
Normal file
|
|
@ -0,0 +1,365 @@
|
||||||
|
"""Contributor profile API — GET /api/contributors/{handle}"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||||
|
SYSTEM_ACCOUNTS = {"pipeline", "unknown", "teleo-agents", "teleo pipeline"}
|
||||||
|
CODEX_PATH = "/opt/teleo-eval/workspaces/main"
|
||||||
|
|
||||||
|
CI_WEIGHTS = {
|
||||||
|
"sourcer": 0.15,
|
||||||
|
"extractor": 0.05,
|
||||||
|
"challenger": 0.35,
|
||||||
|
"synthesizer": 0.25,
|
||||||
|
"reviewer": 0.20,
|
||||||
|
}
|
||||||
|
|
||||||
|
FOUNDING_CUTOFF = "2026-03-15"
|
||||||
|
|
||||||
|
BADGE_DEFS = {
|
||||||
|
"FOUNDING CONTRIBUTOR": {"rarity": "limited", "desc": "Contributed during pre-launch phase"},
|
||||||
|
"BELIEF MOVER": {"rarity": "rare", "desc": "Challenge that led to a claim revision"},
|
||||||
|
"KNOWLEDGE SOURCER": {"rarity": "uncommon", "desc": "Source that generated 3+ claims"},
|
||||||
|
"DOMAIN SPECIALIST": {"rarity": "rare", "desc": "Top 3 CI contributor in a domain"},
|
||||||
|
"VETERAN": {"rarity": "uncommon", "desc": "10+ accepted contributions"},
|
||||||
|
"FIRST BLOOD": {"rarity": "common", "desc": "First contribution of any kind"},
|
||||||
|
"CONTRIBUTOR": {"rarity": "common", "desc": "Account created + first accepted contribution"},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_conn():
    """Open the pipeline DB with dict-style (sqlite3.Row) row access."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_ci(row):
    """Legacy weighted contribution index: role counts times CI_WEIGHTS."""
    weighted = sum(
        (row.get(f"{role}_count", 0) or 0) * weight
        for role, weight in CI_WEIGHTS.items()
    )
    return round(weighted, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_badges(handle, row, domain_breakdown, conn):
    """Derive the list of badge names earned by a contributor.

    Append order is preserved deliberately: the caller picks a hero badge
    by rarity, scanning this list in order. ``domain_breakdown`` and
    ``conn`` are accepted for future rules and currently unused here.
    """
    earned = []

    first = row.get("first_contribution", "")
    if first and first <= FOUNDING_CUTOFF:
        earned.append("FOUNDING CONTRIBUTOR")

    merged = row.get("claims_merged", 0) or 0
    if merged > 0:
        earned.append("CONTRIBUTOR")
        earned.append("FIRST BLOOD")
    if merged >= 10:
        earned.append("VETERAN")

    # Either the role counter or CI-score challenge events qualify.
    challenger_hits = row.get("challenger_count", 0) or 0
    scored_hits = row.get("_challenge_count_from_scores", 0)
    if challenger_hits > 0 or scored_hits > 0:
        earned.append("BELIEF MOVER")

    if (row.get("sourcer_count", 0) or 0) >= 3:
        earned.append("KNOWLEDGE SOURCER")

    return earned
|
||||||
|
|
||||||
|
|
||||||
|
def _get_domain_breakdown(handle, conn):
|
||||||
|
rows = conn.execute("""
|
||||||
|
SELECT domain, COUNT(*) as cnt
|
||||||
|
FROM prs
|
||||||
|
WHERE status='merged' AND (LOWER(agent)=LOWER(?) OR LOWER(submitted_by)=LOWER(?))
|
||||||
|
AND domain IS NOT NULL
|
||||||
|
GROUP BY domain ORDER BY cnt DESC
|
||||||
|
""", (handle, handle)).fetchall()
|
||||||
|
return {r["domain"]: r["cnt"] for r in rows}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_contribution_timeline(handle, conn, limit=20):
    """Most recent merged PRs for a handle, newest first (default 20)."""
    records = conn.execute("""
        SELECT number, domain, status, created_at, description, commit_type, source_path
        FROM prs
        WHERE status='merged' AND (LOWER(agent)=LOWER(?) OR LOWER(submitted_by)=LOWER(?))
        ORDER BY created_at DESC LIMIT ?
    """, (handle, handle, limit)).fetchall()

    entries = []
    for rec in records:
        summary = rec["description"] or ""
        # No description: fall back to a humanized source filename.
        if not summary and rec["source_path"]:
            summary = os.path.basename(rec["source_path"]).replace("-", " ").replace(".md", "")
        created = rec["created_at"]
        entries.append({
            "pr_number": rec["number"],
            "domain": rec["domain"],
            "date": created[:10] if created else None,
            "type": _classify_commit(rec["commit_type"]),
            "summary": summary[:200] if summary else None,
        })
    return entries
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_commit(commit_type):
|
||||||
|
if not commit_type:
|
||||||
|
return "create"
|
||||||
|
ct = commit_type.lower()
|
||||||
|
if "challenge" in ct:
|
||||||
|
return "challenge"
|
||||||
|
if "enrich" in ct or "update" in ct or "reweave" in ct:
|
||||||
|
return "enrich"
|
||||||
|
return "create"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_review_stats(handle, conn):
|
||||||
|
rows = conn.execute("""
|
||||||
|
SELECT outcome, COUNT(*) as cnt
|
||||||
|
FROM review_records
|
||||||
|
WHERE LOWER(agent) = LOWER(?)
|
||||||
|
GROUP BY outcome
|
||||||
|
""", (handle,)).fetchall()
|
||||||
|
stats = {}
|
||||||
|
for r in rows:
|
||||||
|
stats[r["outcome"]] = r["cnt"]
|
||||||
|
return stats
|
||||||
|
|
||||||
|
|
||||||
|
def _get_action_ci(handle, conn):
|
||||||
|
"""Get action-type CI from contribution_scores table.
|
||||||
|
|
||||||
|
Checks both exact handle and common variants (with/without suffix).
|
||||||
|
"""
|
||||||
|
h = handle.lower()
|
||||||
|
base = re.sub(r"[-_]\w+\d+$", "", h)
|
||||||
|
variants = list({h, base}) if base and base != h else [h]
|
||||||
|
try:
|
||||||
|
placeholders = ",".join("?" for _ in variants)
|
||||||
|
rows = conn.execute(f"""
|
||||||
|
SELECT event_type, SUM(ci_earned) as total, COUNT(*) as cnt
|
||||||
|
FROM contribution_scores
|
||||||
|
WHERE LOWER(contributor) IN ({placeholders})
|
||||||
|
GROUP BY event_type
|
||||||
|
""", variants).fetchall()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not rows:
|
||||||
|
return None
|
||||||
|
|
||||||
|
breakdown = {}
|
||||||
|
total = 0.0
|
||||||
|
for r in rows:
|
||||||
|
breakdown[r["event_type"]] = {
|
||||||
|
"count": r["cnt"],
|
||||||
|
"ci": round(r["total"], 4),
|
||||||
|
}
|
||||||
|
total += r["total"]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total": round(total, 4),
|
||||||
|
"breakdown": breakdown,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_git_contributor(handle):
    """Fallback: check git log for contributors not in pipeline.db."""
    try:
        proc = subprocess.run(
            ["git", "log", "--all", "--format=%H|%an|%ae|%aI", "--diff-filter=A", "--", "domains/"],
            capture_output=True, text=True, cwd=CODEX_PATH, timeout=30
        )
        if proc.returncode != 0:
            return None

        needle = handle.lower()
        matches = []
        for raw in proc.stdout.strip().split("\n"):
            if not raw:
                continue
            fields = raw.split("|", 3)
            if len(fields) < 4:
                continue
            sha, name, email, date = fields
            # Substring match on author name or email.
            if needle in name.lower() or needle in email.lower():
                matches.append({"sha": sha, "author": name, "email": email, "date": date[:10]})

        if not matches:
            return None

        dates = [m["date"] for m in matches]
        # Mirror a contributors-table row: git can only fill claim counts,
        # so all role counters stay zero.
        return {
            "handle": handle,
            "display_name": matches[0]["author"],
            "email": matches[0]["email"],
            "first_contribution": min(dates),
            "last_contribution": max(dates),
            "claims_merged": len(matches),
            "sourcer_count": 0,
            "extractor_count": 0,
            "challenger_count": 0,
            "synthesizer_count": 0,
            "reviewer_count": 0,
        }
    except Exception:
        # git missing, timeout, or unexpected output — no fallback profile.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_contributor_profile(handle):
    """Build the full profile dict for one contributor, or None if unknown.

    Looks up the contributors table first; falls back to scanning git log
    for authors that predate the pipeline DB. Combines CI scores, badges,
    role/domain breakdowns, review stats, and a recent timeline.
    """
    conn = _get_conn()
    try:
        row = conn.execute(
            "SELECT * FROM contributors WHERE LOWER(handle) = LOWER(?)", (handle,)
        ).fetchone()

        if row:
            data = dict(row)
        else:
            # Not in the DB — try the git-history fallback before giving up.
            git_data = _get_git_contributor(handle)
            if git_data:
                data = git_data
            else:
                return None

        ci_score = _compute_ci(data)
        action_ci = _get_action_ci(handle, conn)
        domain_breakdown = _get_domain_breakdown(handle, conn)
        timeline = _get_contribution_timeline(handle, conn)
        review_stats = _get_review_stats(handle, conn)
        # Feed challenge activity from contribution_scores into the badge
        # input; must happen BEFORE _compute_badges reads `data`.
        if action_ci and "challenge" in action_ci.get("breakdown", {}):
            data["_challenge_count_from_scores"] = action_ci["breakdown"]["challenge"]["count"]
        badges = _compute_badges(handle, data, domain_breakdown, conn)

        # For git-only contributors, build domain breakdown from git
        if not domain_breakdown and not row:
            domain_breakdown = _git_domain_breakdown(handle)

        # Hero badge: the rarest badge earned; ties broken by badge order.
        hero_badge = None
        rarity_order = ["limited", "rare", "uncommon", "common"]
        for rarity in rarity_order:
            for b in badges:
                if BADGE_DEFS.get(b, {}).get("rarity") == rarity:
                    hero_badge = b
                    break
            if hero_badge:
                break

        # Per-role counts plus each role's percentage of all role activity.
        role_breakdown = {
            "sourcer": data.get("sourcer_count", 0) or 0,
            "extractor": data.get("extractor_count", 0) or 0,
            "challenger": data.get("challenger_count", 0) or 0,
            "synthesizer": data.get("synthesizer_count", 0) or 0,
            "reviewer": data.get("reviewer_count", 0) or 0,
        }
        total_roles = sum(role_breakdown.values())
        role_pct = {}
        for k, v in role_breakdown.items():
            role_pct[k] = round(v / total_roles * 100) if total_roles > 0 else 0

        return {
            "handle": data.get("handle", handle),
            "display_name": data.get("display_name"),
            "ci_score": ci_score,
            "action_ci": action_ci,
            # Prefer event-level CI when present; legacy weighted CI otherwise.
            "primary_ci": action_ci["total"] if action_ci else ci_score,
            "hero_badge": hero_badge,
            "badges": [{"name": b, **BADGE_DEFS.get(b, {})} for b in badges],
            "joined": data.get("first_contribution"),
            "last_active": data.get("last_contribution"),
            "claims_merged": data.get("claims_merged", 0) or 0,
            "principal": data.get("principal"),
            "role_breakdown": role_breakdown,
            "role_percentages": role_pct,
            "domain_breakdown": domain_breakdown,
            "review_stats": review_stats,
            "contribution_timeline": timeline,
            "active_domains": list(domain_breakdown.keys()),
        }
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _git_domain_breakdown(handle):
    """For git-only contributors, count claims by domain from file paths."""
    try:
        proc = subprocess.run(
            ["git", "log", "--all", "--name-only", "--format=COMMIT|%an", "--diff-filter=A", "--", "domains/"],
            capture_output=True, text=True, cwd=CODEX_PATH, timeout=30
        )
        if proc.returncode != 0:
            return {}

        counts = {}
        author_matches = False
        needle = handle.lower()
        for raw in proc.stdout.strip().split("\n"):
            # Commit header lines flip the "does this author match" state
            # for the file lines that follow.
            if raw.startswith("COMMIT|"):
                author_matches = needle in raw.split("|", 1)[1].lower()
                continue
            if not author_matches or not raw.startswith("domains/"):
                continue
            segments = raw.split("/")
            if len(segments) >= 2:
                counts[segments[1]] = counts.get(segments[1], 0) + 1

        return counts
    except Exception:
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_contributor_profile(request):
    """GET /api/contributors/{handle} — full profile JSON, or 404."""
    from aiohttp import web
    handle = request.match_info["handle"]
    profile = get_contributor_profile(handle)
    if profile is not None:
        return web.json_response(profile)
    return web.json_response({"error": f"Contributor '{handle}' not found"}, status=404)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_contributors_list(request):
    """GET /api/contributors/list — leaderboard of non-system contributors.

    Query params: min_claims (default 1). Non-numeric values now fall back
    to the default instead of raising ValueError → HTTP 500.
    """
    from aiohttp import web
    conn = _get_conn()
    try:
        try:
            min_claims = int(request.query.get("min_claims", "1"))
        except ValueError:
            min_claims = 1
        rows = conn.execute("""
            SELECT handle, display_name, first_contribution, last_contribution,
                   sourcer_count, extractor_count, challenger_count, synthesizer_count,
                   reviewer_count, claims_merged, principal
            FROM contributors
            WHERE claims_merged >= ?
            ORDER BY claims_merged DESC
        """, (min_claims,)).fetchall()

        contributors = []
        for r in rows:
            data = dict(r)
            # System/automation accounts never appear on the leaderboard.
            if data["handle"].lower() in SYSTEM_ACCOUNTS:
                continue
            ci = _compute_ci(data)
            # NOTE(review): one contribution_scores query per contributor
            # (N+1) — acceptable at current leaderboard sizes.
            action_ci = _get_action_ci(data["handle"], conn)
            action_total = action_ci["total"] if action_ci else 0.0
            contributors.append({
                "handle": data["handle"],
                "display_name": data["display_name"],
                "ci_score": ci,
                "action_ci": action_total,
                # Event-level CI wins when non-zero; legacy weighted CI otherwise.
                "primary_ci": action_total if action_total > 0 else ci,
                "claims_merged": data["claims_merged"],
                "first_contribution": data["first_contribution"],
                "last_contribution": data["last_contribution"],
                "principal": data["principal"],
            })

        return web.json_response({
            "contributors": contributors,
            "total": len(contributors),
        })
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def register_contributor_routes(app):
    """Attach contributor endpoints.

    The literal /list route is registered before the {handle} wildcard so
    the router matches it first.
    """
    for path, handler in (
        ("/api/contributors/list", handle_contributors_list),
        ("/api/contributors/{handle}", handle_contributor_profile),
    ):
        app.router.add_get(path, handler)
|
||||||
|
|
@ -10,6 +10,7 @@ Endpoints:
|
||||||
Owner: Argus
|
Owner: Argus
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
@ -17,6 +18,7 @@ import sqlite3
|
||||||
import statistics
|
import statistics
|
||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
from collections import defaultdict
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
@ -1182,6 +1184,113 @@ async def handle_telegram_extractions(request):
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── GET /api/contributor-growth ─────────────────────────────────────────
|
||||||
|
|
||||||
|
CODEX_WORKTREE = Path(os.environ.get("MAIN_WORKTREE", "/opt/teleo-eval/workspaces/main"))
|
||||||
|
FOUNDING_CUTOFF = "2026-03-15"
|
||||||
|
CONTRIBUTOR_EXCLUDE = {"Teleo Agents", "Teleo Pipeline"}
|
||||||
|
|
||||||
|
_growth_cache: dict | None = None
|
||||||
|
_growth_cache_ts: float = 0
|
||||||
|
GROWTH_CACHE_TTL = 300
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_contributor_growth(request):
    """Cumulative unique contributors and claims over time from git log.

    Returns time-series data for Chart.js line charts.
    Cached for 5 minutes since git log is expensive.
    """
    global _growth_cache, _growth_cache_ts
    # Module-level cache: monotonic clock avoids wall-clock jumps.
    now = time.monotonic()
    if _growth_cache is not None and (now - _growth_cache_ts) < GROWTH_CACHE_TTL:
        return web.json_response(_growth_cache)

    codex_path = str(CODEX_WORKTREE)
    if not CODEX_WORKTREE.exists():
        return web.json_response(
            {"error": "codex worktree not found", "path": codex_path}, status=404
        )

    # First pass: every commit's date + author, to find each author's first
    # appearance. Run async so the event loop is not blocked by git.
    proc = await asyncio.create_subprocess_exec(
        "git", "log", "--format=%ad|%an", "--date=format:%Y-%m-%d", "--all",
        cwd=codex_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        return web.json_response(
            {"error": "git log failed", "detail": stderr.decode()[:500]}, status=500
        )

    first_seen: dict[str, str] = {}
    daily_commits: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for line in stdout.decode().strip().split("\n"):
        if "|" not in line:
            continue
        date, author = line.split("|", 1)
        # Automation accounts are excluded from contributor counts.
        if author in CONTRIBUTOR_EXCLUDE:
            continue
        daily_commits[date][author] += 1
        # ISO dates compare correctly as strings, so "<" finds the earliest.
        if author not in first_seen or date < first_seen[author]:
            first_seen[author] = date

    # Invert first_seen: date -> authors who debuted that day.
    by_date: dict[str, list[str]] = defaultdict(list)
    for author, date in first_seen.items():
        by_date[date].append(author)

    # Running total of unique contributors, with per-day debut lists.
    contributors_timeline = []
    seen: set[str] = set()
    for date in sorted(by_date.keys()):
        new_authors = by_date[date]
        seen.update(new_authors)
        contributors_timeline.append({
            "date": date,
            "cumulative": len(seen),
            "new": [{"name": a, "founding": date <= FOUNDING_CUTOFF} for a in sorted(new_authors)],
        })

    # Second pass: dates of commits that ADDED claim files (--diff-filter=A),
    # one output line per added file, so counting lines counts claims.
    proc2 = await asyncio.create_subprocess_exec(
        "git", "log", "--format=%ad", "--date=format:%Y-%m-%d",
        "--all", "--diff-filter=A", "--", "domains/*.md",
        cwd=codex_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout2, _ = await proc2.communicate()
    claim_counts: dict[str, int] = defaultdict(int)
    for line in stdout2.decode().strip().split("\n"):
        line = line.strip()
        if line:
            claim_counts[line] += 1

    # Running total of claims by date.
    claims_timeline = []
    cumulative = 0
    for date in sorted(claim_counts.keys()):
        cumulative += claim_counts[date]
        claims_timeline.append({"date": date, "cumulative": cumulative, "added": claim_counts[date]})

    all_contributors = set(first_seen.keys())
    founding = sorted(a for a in all_contributors if first_seen[a] <= FOUNDING_CUTOFF)

    result = {
        "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "summary": {
            "total_contributors": len(all_contributors),
            "founding_contributors": founding,
            "total_claims": cumulative,
            # Days since the project epoch (2026-03-05).
            "days_active": (datetime.now(timezone.utc) - datetime(2026, 3, 5, tzinfo=timezone.utc)).days,
        },
        "cumulative_contributors": contributors_timeline,
        "cumulative_claims": claims_timeline,
    }

    _growth_cache = result
    _growth_cache_ts = now
    return web.json_response(result)
|
||||||
|
|
||||||
|
|
||||||
# ─── Registration ──────────────────────────────────────────────────────────
|
# ─── Registration ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def register_dashboard_routes(app: web.Application, get_conn):
|
def register_dashboard_routes(app: web.Application, get_conn):
|
||||||
|
|
@ -1199,3 +1308,42 @@ def register_dashboard_routes(app: web.Application, get_conn):
|
||||||
app.router.add_get("/api/growth", handle_growth)
|
app.router.add_get("/api/growth", handle_growth)
|
||||||
app.router.add_get("/api/pr-lifecycle", handle_pr_lifecycle)
|
app.router.add_get("/api/pr-lifecycle", handle_pr_lifecycle)
|
||||||
app.router.add_get("/api/telegram-extractions", handle_telegram_extractions)
|
app.router.add_get("/api/telegram-extractions", handle_telegram_extractions)
|
||||||
|
app.router.add_get("/api/contributor-growth", handle_contributor_growth)
|
||||||
|
app.router.add_get("/api/digest/latest", handle_digest_latest)
|
||||||
|
app.router.add_get("/api/contributor-graph", handle_contributor_graph)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_digest_latest(request):
    """GET /api/digest/latest — return the most recent scoring digest."""
    import json as _json
    digest_path = "/opt/teleo-eval/logs/scoring-digest-latest.json"
    try:
        with open(digest_path) as f:
            payload = _json.load(f)
    except FileNotFoundError:
        return web.json_response({"error": "No digest available yet"}, status=404)
    except Exception as e:
        # Malformed JSON, permission errors, etc. — surface as 500.
        return web.json_response({"error": str(e)}, status=500)
    return web.json_response(payload)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_contributor_graph(request):
    """GET /api/contributor-graph — serve the PNG chart.

    Regenerates the chart (best-effort) when the cached file is missing or
    older than one hour; serves whatever PNG exists afterwards.
    """
    import os
    import subprocess
    import time
    png_path = "/opt/teleo-eval/static/contributor-graph.png"
    # Regenerate if older than 1 hour or missing
    regen = not os.path.exists(png_path)
    if not regen:
        # Plain `time.time()` replaces the original __import__('time') hack.
        age = time.time() - os.path.getmtime(png_path)
        regen = age > 3600
    if regen:
        try:
            subprocess.run(
                ['python3', '/opt/teleo-eval/scripts/contributor-graph.py'],
                timeout=30, capture_output=True
            )
        except Exception:
            # Best-effort: fall through and serve a stale chart if one exists.
            pass
    if not os.path.exists(png_path):
        return web.Response(text='Chart not available', status=503)
    return web.FileResponse(png_path, headers={'Content-Type': 'image/png'})
|
||||||
|
|
|
||||||
|
|
@ -90,6 +90,8 @@ def load_ownership_coins():
|
||||||
continue
|
continue
|
||||||
if fm.get("subtype") != "ownership-coin":
|
if fm.get("subtype") != "ownership-coin":
|
||||||
continue
|
continue
|
||||||
|
if fm.get("status") == "liquidated":
|
||||||
|
continue
|
||||||
|
|
||||||
chain = fm.get("chain") or {}
|
chain = fm.get("chain") or {}
|
||||||
if isinstance(chain, str):
|
if isinstance(chain, str):
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,92 @@ logger = logging.getLogger("pipeline.attribution")
|
||||||
|
|
||||||
VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
|
VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
|
||||||
|
|
||||||
|
# Agent-owned branch prefixes — PRs from these branches get Pentagon-Agent trailer
|
||||||
|
# credit for challenger/synthesizer roles. Pipeline-infra branches (extract/ reweave/
|
||||||
|
# fix/ ingestion/) are deliberately excluded: they're automation, not contribution.
|
||||||
|
# Single source of truth; imported by contributor.py and backfill-events.py.
|
||||||
|
AGENT_BRANCH_PREFIXES = (
|
||||||
|
"rio/", "theseus/", "leo/", "vida/", "astra/", "clay/", "oberon/",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Handle sanity: lowercase alphanumerics, hyphens, underscores. 1-39 chars (matches
|
||||||
|
# GitHub's handle rules). Rejects garbage like "governance---meritocratic-voting-+-futarchy"
|
||||||
|
# or "sec-interpretive-release-s7-2026-09-(march-17" that upstream frontmatter hygiene
|
||||||
|
# bugs produce. Apply at parse time so bad handles never reach the contributors table.
|
||||||
|
_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_handle(handle: str) -> bool:
    """Return True if handle matches the handle format (alphanum + _-, ≤39 chars)."""
    if not isinstance(handle, str) or not handle:
        return False
    normalized = handle.strip().lower().lstrip("@")
    # Trailing separators are rejected even though the regex would pass them
    # in the middle of a handle.
    if normalized.endswith(("-", "_")):
        return False
    return _HANDLE_RE.match(normalized) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_valid_handles(result: dict) -> dict:
    """Drop entries with invalid handles from a parsed attribution dict."""
    cleaned: dict[str, list[dict]] = {}
    for role in VALID_ROLES:
        cleaned[role] = []
    for role, entries in result.items():
        kept = [e for e in entries if _valid_handle(e.get("handle", ""))]
        # Only touch cleaned[role] when something survived, matching the
        # original append-only access pattern.
        if kept:
            cleaned[role].extend(kept)
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Handle normalization + kind classification (schema v24) ──────────────
|
||||||
|
|
||||||
|
# Known Pentagon agents. Used to classify contributor kind='agent' so the
|
||||||
|
# leaderboard can filter them out of the default person view.
|
||||||
|
PENTAGON_AGENTS = frozenset({
|
||||||
|
"rio", "leo", "theseus", "vida", "clay", "astra",
|
||||||
|
"oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship",
|
||||||
|
"pipeline", # pipeline-owned commits (extract/*, reweave/*, fix/*)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_handle(handle: str, conn=None) -> str:
    """Canonicalize a handle: lowercase, strip @, resolve alias if conn provided.

    Examples:
        '@thesensatore' → 'thesensatore'
        'Cameron' → 'cameron' → 'cameron-s1' (via alias if seeded)
        'CNBC' → 'cnbc'

    Always lowercases and strips the @ prefix. Alias resolution requires a
    conn argument (not always available at parse time; the merge-time
    writer passes it).
    """
    if not handle:
        return ""
    normalized = handle.strip().lower().lstrip("@")
    if conn is not None:
        try:
            match = conn.execute(
                "SELECT canonical FROM contributor_aliases WHERE alias = ?",
                (normalized,),
            ).fetchone()
            if match:
                # Support both mapping-style rows (sqlite3.Row) and tuples.
                if isinstance(match, dict) or hasattr(match, "keys"):
                    return match["canonical"]
                return match[0]
        except Exception:
            # Alias table might not exist yet on pre-v24 DBs — degrade gracefully.
            logger.debug("normalize_handle: alias lookup failed for %r", normalized, exc_info=True)
    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def classify_kind(handle: str) -> str:
|
||||||
|
"""Return 'agent' for known Pentagon agents, 'person' otherwise.
|
||||||
|
|
||||||
|
The 'org' kind (CNBC, SpaceNews, etc.) is assigned by operator review,
|
||||||
|
not inferred here. Keeping heuristics narrow: we know our own agents;
|
||||||
|
everything else defaults to person until explicitly classified.
|
||||||
|
"""
|
||||||
|
h = handle.strip().lower().lstrip("@")
|
||||||
|
if h in PENTAGON_AGENTS:
|
||||||
|
return "agent"
|
||||||
|
return "person"
|
||||||
|
|
||||||
|
|
||||||
# ─── Parse attribution from claim content ──────────────────────────────────
|
# ─── Parse attribution from claim content ──────────────────────────────────
|
||||||
|
|
||||||
|
|
@ -51,7 +137,11 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
||||||
elif isinstance(entries, str):
|
elif isinstance(entries, str):
|
||||||
# Single entry as string
|
# Single entry as string
|
||||||
result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||||
return result
|
# Fall through to the filter at the end (don't early-return). The nested
|
||||||
|
# block path was skipping the handle sanity filter, letting garbage like
|
||||||
|
# "senator-elissa-slotkin-/-the-hill" through when it was written into
|
||||||
|
# frontmatter during the legacy-fallback era.
|
||||||
|
return _filter_valid_handles(result)
|
||||||
|
|
||||||
# Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
|
# Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
|
||||||
for role in VALID_ROLES:
|
for role in VALID_ROLES:
|
||||||
|
|
@ -64,22 +154,40 @@ def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
||||||
if isinstance(v, str):
|
if isinstance(v, str):
|
||||||
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||||
|
|
||||||
# Legacy fallback: infer from source field
|
# Bare-key flat format: `sourcer: alexastrum`, `extractor: leo`, etc.
|
||||||
if not any(result[r] for r in VALID_ROLES):
|
# This is what extract.py writes (line 290: f'sourcer: "{sourcer}"') — the most
|
||||||
source = fm.get("source", "")
|
# common format in practice (~42% of claim files). The Apr 24 incident traced
|
||||||
if isinstance(source, str) and source:
|
# missing leaderboard entries to this format being silently dropped because the
|
||||||
# Try to extract author handle from source string
|
# parser only checked the `attribution_*` prefix.
|
||||||
# Patterns: "@handle", "Author Name", "org, description"
|
# Only fill if the role wasn't already populated by the prefixed form, to avoid
|
||||||
handle_match = re.search(r"@(\w+)", source)
|
# double-counting when both formats coexist on the same claim.
|
||||||
if handle_match:
|
for role in VALID_ROLES:
|
||||||
result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source})
|
if result[role]:
|
||||||
else:
|
continue
|
||||||
# Use first word/phrase before comma as sourcer handle
|
bare_val = fm.get(role)
|
||||||
author = source.split(",")[0].strip().lower().replace(" ", "-")
|
if isinstance(bare_val, str) and bare_val.strip():
|
||||||
if author and len(author) > 1:
|
result[role].append({"handle": bare_val.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||||
result["sourcer"].append({"handle": author, "agent_id": None, "context": source})
|
elif isinstance(bare_val, list):
|
||||||
|
for v in bare_val:
|
||||||
|
if isinstance(v, str) and v.strip():
|
||||||
|
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
||||||
|
elif isinstance(v, dict) and v.get("handle"):
|
||||||
|
result[role].append({
|
||||||
|
"handle": v["handle"].strip().lower().lstrip("@"),
|
||||||
|
"agent_id": v.get("agent_id"),
|
||||||
|
"context": v.get("context"),
|
||||||
|
})
|
||||||
|
|
||||||
return result
|
# Legacy `source` heuristic REMOVED (Ganymede review, Apr 24). It fabricated
|
||||||
|
# handles from descriptive source strings — "governance---meritocratic-voting-+-
|
||||||
|
# futarchy", "cameron-(contributor)", "sec-interpretive-release-s7-2026-09-
|
||||||
|
# (march-17". Hit rate on real handles was near-zero, false-positive rate was
|
||||||
|
# high. Claims without explicit attribution now return empty (better surface as
|
||||||
|
# data hygiene than invent fake contributors).
|
||||||
|
|
||||||
|
# Filter to valid handles only. Bad handles (garbage from upstream frontmatter
|
||||||
|
# bugs) get dropped rather than written to the contributors table.
|
||||||
|
return _filter_valid_handles(result)
|
||||||
|
|
||||||
|
|
||||||
def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
|
def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
|
||||||
|
|
|
||||||
|
|
@ -156,13 +156,13 @@ CONTRIBUTOR_TIER_RULES = {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Role weights for CI computation (must match schemas/contribution-weights.yaml)
|
# Role weights for CI computation (must match core/contribution-architecture.md)
|
||||||
CONTRIBUTION_ROLE_WEIGHTS = {
|
CONTRIBUTION_ROLE_WEIGHTS = {
|
||||||
|
"challenger": 0.35,
|
||||||
|
"synthesizer": 0.25,
|
||||||
|
"reviewer": 0.20,
|
||||||
"sourcer": 0.15,
|
"sourcer": 0.15,
|
||||||
"extractor": 0.40,
|
"extractor": 0.05,
|
||||||
"challenger": 0.20,
|
|
||||||
"synthesizer": 0.15,
|
|
||||||
"reviewer": 0.10,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- Circuit breakers ---
|
# --- Circuit breakers ---
|
||||||
|
|
@ -200,6 +200,9 @@ MERGE_INTERVAL = 30
|
||||||
FIX_INTERVAL = 60
|
FIX_INTERVAL = 60
|
||||||
HEALTH_CHECK_INTERVAL = 60
|
HEALTH_CHECK_INTERVAL = 60
|
||||||
|
|
||||||
|
# --- Extraction gates ---
|
||||||
|
EXTRACTION_COOLDOWN_HOURS = 4 # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter.
|
||||||
|
|
||||||
# --- Retrieval (Telegram bot) ---
|
# --- Retrieval (Telegram bot) ---
|
||||||
RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source
|
RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source
|
||||||
RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities
|
RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ Extracted from merge.py (Phase 5 decomposition). Functions:
|
||||||
- refine_commit_type: extract → challenge/enrich refinement from diff content
|
- refine_commit_type: extract → challenge/enrich refinement from diff content
|
||||||
- record_contributor_attribution: parse trailers + frontmatter, upsert contributors
|
- record_contributor_attribution: parse trailers + frontmatter, upsert contributors
|
||||||
- upsert_contributor: insert/update contributor record with role counts
|
- upsert_contributor: insert/update contributor record with role counts
|
||||||
|
- insert_contribution_event: event-sourced credit log (schema v24)
|
||||||
- recalculate_tier: tier promotion based on config rules
|
- recalculate_tier: tier promotion based on config rules
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -13,11 +14,69 @@ import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from . import config, db
|
from . import config, db
|
||||||
|
from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle
|
||||||
from .forgejo import get_pr_diff
|
from .forgejo import get_pr_diff
|
||||||
|
|
||||||
logger = logging.getLogger("pipeline.contributor")
|
logger = logging.getLogger("pipeline.contributor")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Event schema (v24) ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Role → CI weight, per Cory's confirmed schema (Apr 24 conversation).
|
||||||
|
# Humans-are-always-author rule: agents never accumulate author credit;
|
||||||
|
# evaluator (0.05) is the only agent-facing role. Internal agents still earn
|
||||||
|
# author/challenger/synthesizer on their own autonomous research PRs but
|
||||||
|
# surface in the kind='agent' leaderboard, not the default person view.
|
||||||
|
ROLE_WEIGHTS = {
|
||||||
|
"author": 0.30,
|
||||||
|
"challenger": 0.25,
|
||||||
|
"synthesizer": 0.20,
|
||||||
|
"originator": 0.15,
|
||||||
|
"evaluator": 0.05,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def insert_contribution_event(
|
||||||
|
conn,
|
||||||
|
handle: str,
|
||||||
|
role: str,
|
||||||
|
pr_number: int,
|
||||||
|
*,
|
||||||
|
claim_path: str | None = None,
|
||||||
|
domain: str | None = None,
|
||||||
|
channel: str | None = None,
|
||||||
|
timestamp: str | None = None,
|
||||||
|
) -> bool:
|
||||||
|
"""Emit a contribution_events row. Idempotent via UNIQUE constraint.
|
||||||
|
|
||||||
|
Returns True if the event was inserted, False if the constraint blocked it
|
||||||
|
(same handle/role/pr/claim_path combo already recorded — safe to replay).
|
||||||
|
|
||||||
|
Canonicalizes handle via alias table. Classifies kind from handle.
|
||||||
|
Falls back silently if contribution_events table doesn't exist yet (pre-v24).
|
||||||
|
"""
|
||||||
|
if role not in ROLE_WEIGHTS:
|
||||||
|
logger.warning("insert_contribution_event: unknown role %r", role)
|
||||||
|
return False
|
||||||
|
weight = ROLE_WEIGHTS[role]
|
||||||
|
canonical = normalize_handle(handle, conn=conn)
|
||||||
|
if not canonical:
|
||||||
|
return False
|
||||||
|
kind = classify_kind(canonical)
|
||||||
|
try:
|
||||||
|
cur = conn.execute(
|
||||||
|
"""INSERT OR IGNORE INTO contribution_events
|
||||||
|
(handle, kind, role, weight, pr_number, claim_path, domain, channel, timestamp)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, COALESCE(?, datetime('now')))""",
|
||||||
|
(canonical, kind, role, weight, pr_number, claim_path, domain, channel, timestamp),
|
||||||
|
)
|
||||||
|
return cur.rowcount > 0
|
||||||
|
except Exception:
|
||||||
|
logger.debug("insert_contribution_event failed for pr=%d handle=%r role=%r",
|
||||||
|
pr_number, canonical, role, exc_info=True)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def is_knowledge_pr(diff: str) -> bool:
|
def is_knowledge_pr(diff: str) -> bool:
|
||||||
"""Check if a PR touches knowledge files (claims, decisions, core, foundations).
|
"""Check if a PR touches knowledge files (claims, decisions, core, foundations).
|
||||||
|
|
||||||
|
|
@ -38,6 +97,22 @@ def is_knowledge_pr(diff: str) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
COMMIT_TYPE_TO_ROLE = {
|
||||||
|
"challenge": "challenger",
|
||||||
|
"enrich": "synthesizer",
|
||||||
|
"extract": "extractor",
|
||||||
|
"research": "synthesizer",
|
||||||
|
"entity": "extractor",
|
||||||
|
"reweave": "synthesizer",
|
||||||
|
"fix": "extractor",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def commit_type_to_role(commit_type: str) -> str:
|
||||||
|
"""Map a refined commit_type to a contributor role."""
|
||||||
|
return COMMIT_TYPE_TO_ROLE.get(commit_type, "extractor")
|
||||||
|
|
||||||
|
|
||||||
def refine_commit_type(diff: str, branch_commit_type: str) -> str:
|
def refine_commit_type(diff: str, branch_commit_type: str) -> str:
|
||||||
"""Refine commit_type from diff content when branch prefix is ambiguous.
|
"""Refine commit_type from diff content when branch prefix is ambiguous.
|
||||||
|
|
||||||
|
|
@ -109,15 +184,98 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
||||||
return
|
return
|
||||||
|
|
||||||
# Refine commit_type from diff content (branch prefix may be too broad)
|
# Refine commit_type from diff content (branch prefix may be too broad)
|
||||||
row = conn.execute("SELECT commit_type FROM prs WHERE number = ?", (pr_number,)).fetchone()
|
row = conn.execute(
|
||||||
|
"SELECT commit_type, submitted_by, domain, source_channel, leo_verdict, "
|
||||||
|
"domain_verdict, domain_agent, merged_at FROM prs WHERE number = ?",
|
||||||
|
(pr_number,),
|
||||||
|
).fetchone()
|
||||||
branch_type = row["commit_type"] if row and row["commit_type"] else "extract"
|
branch_type = row["commit_type"] if row and row["commit_type"] else "extract"
|
||||||
refined_type = refine_commit_type(diff, branch_type)
|
refined_type = refine_commit_type(diff, branch_type)
|
||||||
if refined_type != branch_type:
|
if refined_type != branch_type:
|
||||||
conn.execute("UPDATE prs SET commit_type = ? WHERE number = ?", (refined_type, pr_number))
|
conn.execute("UPDATE prs SET commit_type = ? WHERE number = ?", (refined_type, pr_number))
|
||||||
logger.info("PR #%d: commit_type refined %s → %s", pr_number, branch_type, refined_type)
|
logger.info("PR #%d: commit_type refined %s → %s", pr_number, branch_type, refined_type)
|
||||||
|
|
||||||
|
# Schema v24 event-sourcing context. Fetched once per PR, reused across emit sites.
|
||||||
|
pr_domain = row["domain"] if row else None
|
||||||
|
pr_channel = row["source_channel"] if row else None
|
||||||
|
pr_submitted_by = row["submitted_by"] if row else None
|
||||||
|
# Use the PR's merged_at timestamp so event time matches the actual merge.
|
||||||
|
# If a merge retries after a crash, this keeps forward-emitted and backfilled
|
||||||
|
# events on the same timeline. Falls back to datetime('now') in the writer.
|
||||||
|
pr_merged_at = row["merged_at"] if row and row["merged_at"] else None
|
||||||
|
|
||||||
|
# ── AUTHOR event (schema v24, double-write) ──
|
||||||
|
# Humans-are-always-author rule: the human in the loop gets author credit.
|
||||||
|
# Precedence: prs.submitted_by (set by extract.py from source proposed_by, or
|
||||||
|
# by discover for human PRs) → git author of first commit → branch-prefix agent.
|
||||||
|
# Pentagon-owned infra branches (extract/ reweave/ fix/ ingestion/) don't get
|
||||||
|
# author events from branch prefix; extract/ PRs carry submitted_by from the
|
||||||
|
# source's proposed_by field so the human who submitted gets credit via path 1.
|
||||||
|
author_candidate: str | None = None
|
||||||
|
if pr_submitted_by:
|
||||||
|
author_candidate = pr_submitted_by
|
||||||
|
else:
|
||||||
|
# External GitHub PRs: git author of the FIRST commit on the branch is
|
||||||
|
# the real submitter. `git log -1` would return the latest commit, which
|
||||||
|
# mis-credits multi-commit PRs where a reviewer rebased or force-pushed.
|
||||||
|
# Take the last line of the unreversed log (= oldest commit, since git
|
||||||
|
# log defaults to reverse-chronological). Ganymede review, Apr 24.
|
||||||
|
rc_author_log, author_log = await git_fn(
|
||||||
|
"log", f"origin/main..origin/{branch}", "--no-merges",
|
||||||
|
"--format=%an", timeout=5,
|
||||||
|
)
|
||||||
|
if rc_author_log == 0 and author_log.strip():
|
||||||
|
lines = [line for line in author_log.strip().split("\n") if line.strip()]
|
||||||
|
if lines:
|
||||||
|
candidate = lines[-1].strip().lower()
|
||||||
|
if candidate and candidate not in {"teleo", "teleo-bot", "pipeline",
|
||||||
|
"github-actions[bot]", "forgejo-actions"}:
|
||||||
|
author_candidate = candidate
|
||||||
|
# Agent-owned branches with no submitted_by: theseus/research-*, leo/*, etc.
|
||||||
|
if not author_candidate and branch.startswith(AGENT_BRANCH_PREFIXES):
|
||||||
|
# Autonomous agent PR (theseus/research-*, leo/entity-*, etc.) —
|
||||||
|
# credit goes to the agent as author per Cory's directive.
|
||||||
|
author_candidate = branch.split("/", 1)[0]
|
||||||
|
|
||||||
|
if author_candidate:
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, author_candidate, "author", pr_number,
|
||||||
|
claim_path=None, domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── EVALUATOR events (schema v24) ──
|
||||||
|
# Leo reviews every PR (STANDARD/DEEP tiers). domain_agent is the second
|
||||||
|
# reviewer. Both earn evaluator credit (0.05) per approved PR. Skip when
|
||||||
|
# verdict is 'request_changes' — failed review isn't contribution credit.
|
||||||
|
if row:
|
||||||
|
if row["leo_verdict"] == "approve":
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, "leo", "evaluator", pr_number,
|
||||||
|
claim_path=None, domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
|
if row["domain_verdict"] == "approve" and row["domain_agent"]:
|
||||||
|
dagent = row["domain_agent"].strip().lower()
|
||||||
|
if dagent and dagent != "leo": # don't double-credit leo
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, dagent, "evaluator", pr_number,
|
||||||
|
claim_path=None, domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
|
|
||||||
# Parse Pentagon-Agent trailer from branch commit messages
|
# Parse Pentagon-Agent trailer from branch commit messages
|
||||||
agents_found: set[str] = set()
|
agents_found: set[str] = set()
|
||||||
|
# Agent-owned branches (theseus/*, rio/*, etc.) give the trailer-named agent
|
||||||
|
# challenger/synthesizer credit based on refined commit_type. Pipeline-owned
|
||||||
|
# branches (extract/*, reweave/*, etc.) don't — those are infra, not work.
|
||||||
|
is_agent_branch = branch.startswith(AGENT_BRANCH_PREFIXES)
|
||||||
|
_TRAILER_EVENT_ROLE = {
|
||||||
|
"challenge": "challenger",
|
||||||
|
"enrich": "synthesizer",
|
||||||
|
"research": "synthesizer",
|
||||||
|
"reweave": "synthesizer",
|
||||||
|
}
|
||||||
rc, log_output = await git_fn(
|
rc, log_output = await git_fn(
|
||||||
"log", f"origin/main..origin/{branch}", "--format=%b%n%N",
|
"log", f"origin/main..origin/{branch}", "--format=%b%n%N",
|
||||||
timeout=10,
|
timeout=10,
|
||||||
|
|
@ -126,32 +284,81 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
||||||
for match in re.finditer(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", log_output):
|
for match in re.finditer(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", log_output):
|
||||||
agent_name = match.group(1).lower()
|
agent_name = match.group(1).lower()
|
||||||
agent_uuid = match.group(2)
|
agent_uuid = match.group(2)
|
||||||
|
role = commit_type_to_role(refined_type)
|
||||||
upsert_contributor(
|
upsert_contributor(
|
||||||
conn, agent_name, agent_uuid, "extractor", today,
|
conn, agent_name, agent_uuid, role, today,
|
||||||
)
|
)
|
||||||
|
# Event-emit only for agent-owned branches where the trailer's agent
|
||||||
|
# actually did the substantive work (challenger/synthesizer).
|
||||||
|
event_role = _TRAILER_EVENT_ROLE.get(refined_type)
|
||||||
|
if is_agent_branch and event_role:
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, agent_name, event_role, pr_number,
|
||||||
|
claim_path=None, domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
agents_found.add(agent_name)
|
agents_found.add(agent_name)
|
||||||
|
|
||||||
# Parse attribution blocks from claim frontmatter in diff
|
# Parse attribution from NEWLY ADDED knowledge files via the canonical attribution
|
||||||
# Look for added lines with attribution YAML
|
# parser (lib/attribution.py). The previous diff-line regex parser dropped
|
||||||
current_role = None
|
# both the bare-key flat format (`sourcer: alexastrum`) and the nested
|
||||||
for line in diff.split("\n"):
|
# `attribution:` block format because it only matched `- handle: "X"` lines.
|
||||||
if not line.startswith("+") or line.startswith("+++"):
|
# The Apr 24 incident traced missing leaderboard entries (alexastrum=0,
|
||||||
continue
|
# thesensatore=0, cameron-s1=0) directly to this parser's blind spots.
|
||||||
stripped = line[1:].strip()
|
#
|
||||||
|
# --diff-filter=A restricts to added files only (Ganymede review): enrich and
|
||||||
|
# challenge PRs modify existing claims, and re-crediting the existing sourcer on
|
||||||
|
# every modification would inflate counts. The synthesizer/challenger/reviewer
|
||||||
|
# roles for those PRs are credited via the Pentagon-Agent trailer path above.
|
||||||
|
rc_files, files_output = await git_fn(
|
||||||
|
"diff", "--name-only", "--diff-filter=A",
|
||||||
|
f"origin/main...origin/{branch}", timeout=10,
|
||||||
|
)
|
||||||
|
if rc_files == 0 and files_output:
|
||||||
|
from pathlib import Path
|
||||||
|
from . import config
|
||||||
|
from .attribution import parse_attribution_from_file
|
||||||
|
|
||||||
# Detect role sections in attribution block
|
main_root = Path(config.MAIN_WORKTREE)
|
||||||
for role in ("sourcer", "extractor", "challenger", "synthesizer", "reviewer"):
|
# Match is_knowledge_pr's gate exactly. Entities/convictions are excluded
|
||||||
if stripped.startswith(f"{role}:"):
|
# here because is_knowledge_pr skips entity-only PRs at line 123 — so a
|
||||||
current_role = role
|
# broader list here only matters for mixed PRs where the narrower list
|
||||||
break
|
# already matches via the claim file. Widening requires Cory sign-off
|
||||||
|
# since it would change leaderboard accounting (entity-only PRs → CI credit).
|
||||||
# Extract handle from attribution entries
|
knowledge_prefixes = ("domains/", "core/", "foundations/", "decisions/")
|
||||||
handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped)
|
author_canonical = normalize_handle(author_candidate, conn=conn) if author_candidate else None
|
||||||
if handle_match and current_role:
|
for rel_path in files_output.strip().split("\n"):
|
||||||
handle = handle_match.group(1).strip().lower()
|
rel_path = rel_path.strip()
|
||||||
agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped)
|
if not rel_path.endswith(".md"):
|
||||||
agent_id = agent_id_match.group(1).strip() if agent_id_match else None
|
continue
|
||||||
upsert_contributor(conn, handle, agent_id, current_role, today)
|
if not rel_path.startswith(knowledge_prefixes):
|
||||||
|
continue
|
||||||
|
full = main_root / rel_path
|
||||||
|
if not full.exists():
|
||||||
|
continue # file removed in this PR
|
||||||
|
attribution = parse_attribution_from_file(str(full))
|
||||||
|
for role, entries in attribution.items():
|
||||||
|
for entry in entries:
|
||||||
|
handle = entry.get("handle")
|
||||||
|
if handle:
|
||||||
|
upsert_contributor(
|
||||||
|
conn, handle, entry.get("agent_id"), role, today,
|
||||||
|
)
|
||||||
|
# Event-emit: only 'sourcer' frontmatter entries become
|
||||||
|
# originator events. 'extractor' frontmatter = infrastructure
|
||||||
|
# (the Sonnet extraction agent), no event. challenger/
|
||||||
|
# synthesizer frontmatter is extremely rare at extract time.
|
||||||
|
# Skip originator if same as author — avoids double-credit
|
||||||
|
# when someone submits their own content (self-authored).
|
||||||
|
if role == "sourcer":
|
||||||
|
origin_canonical = normalize_handle(handle, conn=conn)
|
||||||
|
if origin_canonical and origin_canonical != author_canonical:
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, handle, "originator", pr_number,
|
||||||
|
claim_path=rel_path,
|
||||||
|
domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
|
|
||||||
# Fallback: if no Pentagon-Agent trailer found, try git commit authors
|
# Fallback: if no Pentagon-Agent trailer found, try git commit authors
|
||||||
_BOT_AUTHORS = frozenset({
|
_BOT_AUTHORS = frozenset({
|
||||||
|
|
@ -167,13 +374,37 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
|
||||||
for author_line in author_output.strip().split("\n"):
|
for author_line in author_output.strip().split("\n"):
|
||||||
author_name = author_line.strip().lower()
|
author_name = author_line.strip().lower()
|
||||||
if author_name and author_name not in _BOT_AUTHORS:
|
if author_name and author_name not in _BOT_AUTHORS:
|
||||||
upsert_contributor(conn, author_name, None, "extractor", today)
|
role = commit_type_to_role(refined_type)
|
||||||
|
upsert_contributor(conn, author_name, None, role, today)
|
||||||
|
# Event-model parity: emit challenger/synthesizer event when
|
||||||
|
# the fallback credits a human/agent for that kind of work.
|
||||||
|
# Without this, external-contributor challenge/enrich PRs
|
||||||
|
# accumulate legacy counts but disappear from event-sourced
|
||||||
|
# leaderboards when Phase B cuts over. (Ganymede review.)
|
||||||
|
event_role_fb = _TRAILER_EVENT_ROLE.get(refined_type)
|
||||||
|
if event_role_fb:
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, author_name, event_role_fb, pr_number,
|
||||||
|
claim_path=None, domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
agents_found.add(author_name)
|
agents_found.add(author_name)
|
||||||
|
|
||||||
if not agents_found:
|
if not agents_found:
|
||||||
row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone()
|
fb_row = conn.execute(
|
||||||
if row and row["agent"] and row["agent"] != "external":
|
"SELECT agent FROM prs WHERE number = ?", (pr_number,)
|
||||||
upsert_contributor(conn, row["agent"].lower(), None, "extractor", today)
|
).fetchone()
|
||||||
|
if fb_row and fb_row["agent"] and fb_row["agent"] != "external":
|
||||||
|
pr_agent = fb_row["agent"].lower()
|
||||||
|
role = commit_type_to_role(refined_type)
|
||||||
|
upsert_contributor(conn, pr_agent, None, role, today)
|
||||||
|
event_role_fb = _TRAILER_EVENT_ROLE.get(refined_type)
|
||||||
|
if event_role_fb:
|
||||||
|
insert_contribution_event(
|
||||||
|
conn, pr_agent, event_role_fb, pr_number,
|
||||||
|
claim_path=None, domain=pr_domain, channel=pr_channel,
|
||||||
|
timestamp=pr_merged_at,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def upsert_contributor(
|
def upsert_contributor(
|
||||||
|
|
|
||||||
236
lib/db.py
236
lib/db.py
|
|
@ -9,7 +9,7 @@ from . import config
|
||||||
|
|
||||||
logger = logging.getLogger("pipeline.db")
|
logger = logging.getLogger("pipeline.db")
|
||||||
|
|
||||||
SCHEMA_VERSION = 22
|
SCHEMA_VERSION = 26
|
||||||
|
|
||||||
SCHEMA_SQL = """
|
SCHEMA_SQL = """
|
||||||
CREATE TABLE IF NOT EXISTS schema_version (
|
CREATE TABLE IF NOT EXISTS schema_version (
|
||||||
|
|
@ -35,6 +35,15 @@ CREATE TABLE IF NOT EXISTS sources (
|
||||||
feedback TEXT,
|
feedback TEXT,
|
||||||
-- eval feedback for re-extraction (JSON)
|
-- eval feedback for re-extraction (JSON)
|
||||||
cost_usd REAL DEFAULT 0,
|
cost_usd REAL DEFAULT 0,
|
||||||
|
-- v26: provenance — publisher (news org / venue) + content author.
|
||||||
|
-- publisher_id references publishers(id) when source is from a known org.
|
||||||
|
-- original_author_handle references contributors(handle) when author is in our system.
|
||||||
|
-- original_author is free-text fallback ("Kim et al.", "Robin Hanson") — not credit-bearing.
|
||||||
|
publisher_id INTEGER REFERENCES publishers(id),
|
||||||
|
content_type TEXT,
|
||||||
|
-- article | paper | tweet | conversation | self_authored | webpage | podcast
|
||||||
|
original_author TEXT,
|
||||||
|
original_author_handle TEXT REFERENCES contributors(handle),
|
||||||
created_at TEXT DEFAULT (datetime('now')),
|
created_at TEXT DEFAULT (datetime('now')),
|
||||||
updated_at TEXT DEFAULT (datetime('now'))
|
updated_at TEXT DEFAULT (datetime('now'))
|
||||||
);
|
);
|
||||||
|
|
@ -157,11 +166,83 @@ CREATE TABLE IF NOT EXISTS response_audit (
|
||||||
CREATE INDEX IF NOT EXISTS idx_sources_status ON sources(status);
|
CREATE INDEX IF NOT EXISTS idx_sources_status ON sources(status);
|
||||||
CREATE INDEX IF NOT EXISTS idx_prs_status ON prs(status);
|
CREATE INDEX IF NOT EXISTS idx_prs_status ON prs(status);
|
||||||
CREATE INDEX IF NOT EXISTS idx_prs_domain ON prs(domain);
|
CREATE INDEX IF NOT EXISTS idx_prs_domain ON prs(domain);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_prs_source_path ON prs(source_path) WHERE source_path IS NOT NULL;
|
||||||
CREATE INDEX IF NOT EXISTS idx_costs_date ON costs(date);
|
CREATE INDEX IF NOT EXISTS idx_costs_date ON costs(date);
|
||||||
CREATE INDEX IF NOT EXISTS idx_audit_stage ON audit_log(stage);
|
CREATE INDEX IF NOT EXISTS idx_audit_stage ON audit_log(stage);
|
||||||
CREATE INDEX IF NOT EXISTS idx_response_audit_ts ON response_audit(timestamp);
|
CREATE INDEX IF NOT EXISTS idx_response_audit_ts ON response_audit(timestamp);
|
||||||
CREATE INDEX IF NOT EXISTS idx_response_audit_agent ON response_audit(agent);
|
CREATE INDEX IF NOT EXISTS idx_response_audit_agent ON response_audit(agent);
|
||||||
CREATE INDEX IF NOT EXISTS idx_response_audit_chat_ts ON response_audit(chat_id, timestamp);
|
CREATE INDEX IF NOT EXISTS idx_response_audit_chat_ts ON response_audit(chat_id, timestamp);
|
||||||
|
|
||||||
|
-- Event-sourced contributions (schema v24).
|
||||||
|
-- One row per credit-earning event. Idempotent via two partial UNIQUE indexes
|
||||||
|
-- (SQLite treats NULL != NULL in UNIQUE constraints, so a single composite
|
||||||
|
-- UNIQUE with nullable claim_path would allow evaluator-event duplicates).
|
||||||
|
-- Leaderboards are SQL aggregations over this table; contributors becomes a materialized cache.
|
||||||
|
CREATE TABLE IF NOT EXISTS contribution_events (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
handle TEXT NOT NULL,
|
||||||
|
kind TEXT NOT NULL DEFAULT 'person',
|
||||||
|
-- person | org | agent
|
||||||
|
role TEXT NOT NULL,
|
||||||
|
-- author | originator | challenger | synthesizer | evaluator
|
||||||
|
weight REAL NOT NULL,
|
||||||
|
pr_number INTEGER NOT NULL,
|
||||||
|
claim_path TEXT,
|
||||||
|
-- NULL for PR-level events (e.g. evaluator). Set for per-claim events.
|
||||||
|
domain TEXT,
|
||||||
|
channel TEXT,
|
||||||
|
-- telegram | github | agent | web | unknown
|
||||||
|
timestamp TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
-- Per-claim events: unique on (handle, role, pr_number, claim_path) when path IS NOT NULL.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ce_unique_claim ON contribution_events(
|
||||||
|
handle, role, pr_number, claim_path
|
||||||
|
) WHERE claim_path IS NOT NULL;
|
||||||
|
-- PR-level events (evaluator, author, trailer-based): unique on (handle, role, pr_number) when path IS NULL.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ce_unique_pr ON contribution_events(
|
||||||
|
handle, role, pr_number
|
||||||
|
) WHERE claim_path IS NULL;
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_handle_ts ON contribution_events(handle, timestamp);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_domain_ts ON contribution_events(domain, timestamp);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_pr ON contribution_events(pr_number);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_role_ts ON contribution_events(role, timestamp);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_kind_ts ON contribution_events(kind, timestamp);
|
||||||
|
|
||||||
|
-- Handle aliasing. @thesensatore → thesensatore. cameron → cameron-s1.
|
||||||
|
-- Writers call resolve_alias(handle) before inserting events or upserting contributors.
|
||||||
|
CREATE TABLE IF NOT EXISTS contributor_aliases (
|
||||||
|
alias TEXT PRIMARY KEY,
|
||||||
|
canonical TEXT NOT NULL,
|
||||||
|
created_at TEXT DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_aliases_canonical ON contributor_aliases(canonical);
|
||||||
|
|
||||||
|
-- Publishers: news orgs, academic venues, social platforms. NOT contributors — these
|
||||||
|
-- provide metadata/provenance for sources, never earn leaderboard credit. Separating
|
||||||
|
-- these from contributors prevents CNBC/SpaceNews from dominating the leaderboard.
|
||||||
|
-- (Apr 24 Cory directive: "only credit the original source if its on X or tg")
|
||||||
|
CREATE TABLE IF NOT EXISTS publishers (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
name TEXT NOT NULL UNIQUE,
|
||||||
|
kind TEXT CHECK(kind IN ('news', 'academic', 'social_platform', 'podcast', 'self', 'internal', 'legal', 'government', 'research_org', 'commercial', 'other')),
|
||||||
|
url_pattern TEXT,
|
||||||
|
created_at TEXT DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_publishers_name ON publishers(name);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_publishers_kind ON publishers(kind);
|
||||||
|
|
||||||
|
-- Multi-platform identity: one contributor, many handles. Enables the leaderboard to
|
||||||
|
-- unify @thesensatore (X) + thesensatore (TG) + thesensatore@github into one person.
|
||||||
|
-- Writers check this table after resolving aliases to find canonical contributor handle.
|
||||||
|
CREATE TABLE IF NOT EXISTS contributor_identities (
|
||||||
|
contributor_handle TEXT NOT NULL,
|
||||||
|
platform TEXT NOT NULL CHECK(platform IN ('x', 'telegram', 'github', 'email', 'web', 'internal')),
|
||||||
|
platform_handle TEXT NOT NULL,
|
||||||
|
verified INTEGER DEFAULT 0,
|
||||||
|
created_at TEXT DEFAULT (datetime('now')),
|
||||||
|
PRIMARY KEY (platform, platform_handle)
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_identities_contributor ON contributor_identities(contributor_handle);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -231,9 +312,20 @@ def classify_branch(branch: str) -> tuple[str, str]:
|
||||||
|
|
||||||
|
|
||||||
# Keep in sync with BRANCH_PREFIX_MAP above.
|
# Keep in sync with BRANCH_PREFIX_MAP above.
|
||||||
|
#
|
||||||
|
# Valid source_channel values: github | telegram | agent | maintenance | web | unknown
|
||||||
|
# - github: external contributor PR (set via sync-mirror.sh github_pr linking,
|
||||||
|
# or from gh-pr-* branches, or any time github_pr is provided)
|
||||||
|
# - telegram: message captured by telegram bot (must be tagged explicitly by
|
||||||
|
# ingestion — extract/* default is "unknown" because the bare branch prefix
|
||||||
|
# can no longer distinguish telegram-origin from github-origin extractions)
|
||||||
|
# - agent: per-agent research branches (rio/, theseus/, etc.)
|
||||||
|
# - maintenance: pipeline housekeeping (reweave/, epimetheus/, fix/)
|
||||||
|
# - web: future in-app submissions (chat UI or form posts)
|
||||||
|
# - unknown: fallback when provenance cannot be determined
|
||||||
_CHANNEL_MAP = {
|
_CHANNEL_MAP = {
|
||||||
"extract": "telegram",
|
"extract": "unknown",
|
||||||
"ingestion": "telegram",
|
"ingestion": "unknown",
|
||||||
"rio": "agent",
|
"rio": "agent",
|
||||||
"theseus": "agent",
|
"theseus": "agent",
|
||||||
"astra": "agent",
|
"astra": "agent",
|
||||||
|
|
@ -248,7 +340,12 @@ _CHANNEL_MAP = {
|
||||||
|
|
||||||
|
|
||||||
def classify_source_channel(branch: str, *, github_pr: int = None) -> str:
|
def classify_source_channel(branch: str, *, github_pr: int = None) -> str:
|
||||||
"""Derive source_channel from branch prefix and github_pr flag."""
|
"""Derive source_channel from branch prefix and github_pr flag.
|
||||||
|
|
||||||
|
Precedence: github_pr flag > gh-pr- branch prefix > _CHANNEL_MAP lookup.
|
||||||
|
extract/* defaults to "unknown" — callers with better provenance (telegram
|
||||||
|
bot, web submission handler) must override at PR-insert time.
|
||||||
|
"""
|
||||||
if github_pr is not None or branch.startswith("gh-pr-"):
|
if github_pr is not None or branch.startswith("gh-pr-"):
|
||||||
return "github"
|
return "github"
|
||||||
prefix = branch.split("/", 1)[0] if "/" in branch else branch
|
prefix = branch.split("/", 1)[0] if "/" in branch else branch
|
||||||
|
|
@ -617,6 +714,137 @@ def migrate(conn: sqlite3.Connection):
|
||||||
conn.commit()
|
conn.commit()
|
||||||
logger.info("Migration v22: added source_channel to prs + backfilled from branch prefix")
|
logger.info("Migration v22: added source_channel to prs + backfilled from branch prefix")
|
||||||
|
|
||||||
|
if current < 23:
|
||||||
|
conn.execute(
|
||||||
|
"CREATE INDEX IF NOT EXISTS idx_prs_source_path ON prs(source_path) WHERE source_path IS NOT NULL"
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
logger.info("Migration v23: added idx_prs_source_path for auto-close dedup lookup")
|
||||||
|
|
||||||
|
if current < 24:
|
||||||
|
# Event-sourced contributions table + alias table + kind column on contributors.
|
||||||
|
# Non-breaking: contributors table stays; events are written in addition via
|
||||||
|
# double-write in merge.py. Leaderboards switch to events in Phase B.
|
||||||
|
conn.executescript("""
|
||||||
|
CREATE TABLE IF NOT EXISTS contribution_events (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
handle TEXT NOT NULL,
|
||||||
|
kind TEXT NOT NULL DEFAULT 'person',
|
||||||
|
role TEXT NOT NULL,
|
||||||
|
weight REAL NOT NULL,
|
||||||
|
pr_number INTEGER NOT NULL,
|
||||||
|
claim_path TEXT,
|
||||||
|
domain TEXT,
|
||||||
|
channel TEXT,
|
||||||
|
timestamp TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
-- Partial unique indexes handle SQLite's NULL != NULL UNIQUE semantics.
|
||||||
|
-- Per-claim events dedup on 4-tuple; PR-level events dedup on 3-tuple.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ce_unique_claim ON contribution_events(
|
||||||
|
handle, role, pr_number, claim_path
|
||||||
|
) WHERE claim_path IS NOT NULL;
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ce_unique_pr ON contribution_events(
|
||||||
|
handle, role, pr_number
|
||||||
|
) WHERE claim_path IS NULL;
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_handle_ts ON contribution_events(handle, timestamp);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_domain_ts ON contribution_events(domain, timestamp);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_pr ON contribution_events(pr_number);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_role_ts ON contribution_events(role, timestamp);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ce_kind_ts ON contribution_events(kind, timestamp);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS contributor_aliases (
|
||||||
|
alias TEXT PRIMARY KEY,
|
||||||
|
canonical TEXT NOT NULL,
|
||||||
|
created_at TEXT DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_aliases_canonical ON contributor_aliases(canonical);
|
||||||
|
""")
|
||||||
|
try:
|
||||||
|
conn.execute("ALTER TABLE contributors ADD COLUMN kind TEXT DEFAULT 'person'")
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
pass # column already exists
|
||||||
|
# Seed known aliases. @thesensatore → thesensatore catches the zombie row Argus flagged.
|
||||||
|
# cameron → cameron-s1 reconciles the Leo-flagged missing contributor.
|
||||||
|
conn.executemany(
|
||||||
|
"INSERT OR IGNORE INTO contributor_aliases (alias, canonical) VALUES (?, ?)",
|
||||||
|
[
|
||||||
|
("@thesensatore", "thesensatore"),
|
||||||
|
("cameron", "cameron-s1"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# Seed kind='agent' for known Pentagon agents so the events writer picks it up.
|
||||||
|
# Must stay in sync with lib/attribution.PENTAGON_AGENTS — drift causes
|
||||||
|
# contributors.kind to disagree with classify_kind() output for future
|
||||||
|
# inserts. (Ganymede review: "pipeline" was missing until Apr 24.)
|
||||||
|
pentagon_agents = [
|
||||||
|
"rio", "leo", "theseus", "vida", "clay", "astra",
|
||||||
|
"oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship",
|
||||||
|
"pipeline",
|
||||||
|
]
|
||||||
|
for agent in pentagon_agents:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE contributors SET kind = 'agent' WHERE handle = ?",
|
||||||
|
(agent,),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
logger.info("Migration v24: added contribution_events + contributor_aliases tables, kind column")
|
||||||
|
|
||||||
|
if current < 25:
|
||||||
|
# v24 seeded 13 Pentagon agents but missed "pipeline" — classify_kind()
|
||||||
|
# treats it as agent so contributors.kind drifted from event-insert output.
|
||||||
|
# Idempotent corrective UPDATE: fresh installs have no "pipeline" row
|
||||||
|
# (no-op), upgraded envs flip it if it exists. (Ganymede review Apr 24.)
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE contributors SET kind = 'agent' WHERE handle = 'pipeline'"
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
logger.info("Migration v25: patched kind='agent' for pipeline handle")
|
||||||
|
|
||||||
|
if current < 26:
|
||||||
|
# Add publishers + contributor_identities. Non-breaking — new tables only.
|
||||||
|
# No existing data moved. Classification into publishers happens via a
|
||||||
|
# separate script (scripts/reclassify-contributors.py) with Cory-reviewed
|
||||||
|
# seed list. CHECK constraint on contributors.kind deferred to v27 after
|
||||||
|
# classification completes. (Apr 24 Cory directive: "fix schema, don't
|
||||||
|
# filter output" — separate contributors from publishers at the data layer.)
|
||||||
|
conn.executescript("""
|
||||||
|
CREATE TABLE IF NOT EXISTS publishers (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
name TEXT NOT NULL UNIQUE,
|
||||||
|
kind TEXT CHECK(kind IN ('news', 'academic', 'social_platform', 'podcast', 'self', 'internal', 'legal', 'government', 'research_org', 'commercial', 'other')),
|
||||||
|
url_pattern TEXT,
|
||||||
|
created_at TEXT DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_publishers_name ON publishers(name);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_publishers_kind ON publishers(kind);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS contributor_identities (
|
||||||
|
contributor_handle TEXT NOT NULL,
|
||||||
|
platform TEXT NOT NULL CHECK(platform IN ('x', 'telegram', 'github', 'email', 'web', 'internal')),
|
||||||
|
platform_handle TEXT NOT NULL,
|
||||||
|
verified INTEGER DEFAULT 0,
|
||||||
|
created_at TEXT DEFAULT (datetime('now')),
|
||||||
|
PRIMARY KEY (platform, platform_handle)
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_identities_contributor ON contributor_identities(contributor_handle);
|
||||||
|
""")
|
||||||
|
# Extend sources with provenance columns. ALTER TABLE ADD COLUMN is
|
||||||
|
# idempotent-safe via try/except because SQLite doesn't support IF NOT EXISTS
|
||||||
|
# on column adds.
|
||||||
|
for col_sql in (
|
||||||
|
"ALTER TABLE sources ADD COLUMN publisher_id INTEGER REFERENCES publishers(id)",
|
||||||
|
"ALTER TABLE sources ADD COLUMN content_type TEXT",
|
||||||
|
"ALTER TABLE sources ADD COLUMN original_author TEXT",
|
||||||
|
"ALTER TABLE sources ADD COLUMN original_author_handle TEXT REFERENCES contributors(handle)",
|
||||||
|
):
|
||||||
|
try:
|
||||||
|
conn.execute(col_sql)
|
||||||
|
except sqlite3.OperationalError as e:
|
||||||
|
if "duplicate column" not in str(e).lower():
|
||||||
|
raise
|
||||||
|
conn.commit()
|
||||||
|
logger.info("Migration v26: added publishers + contributor_identities tables + sources provenance columns")
|
||||||
|
|
||||||
if current < SCHEMA_VERSION:
|
if current < SCHEMA_VERSION:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT OR REPLACE INTO schema_version (version) VALUES (?)",
|
"INSERT OR REPLACE INTO schema_version (version) VALUES (?)",
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,11 @@ _AGENT_PRIMARY_DOMAIN: dict[str, str] = {
|
||||||
"leo": "grand-strategy",
|
"leo": "grand-strategy",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_INGESTION_SOURCE_DOMAIN: dict[str, str] = {
|
||||||
|
"futardio": "internet-finance",
|
||||||
|
"metadao": "internet-finance",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def agent_for_domain(domain: str | None) -> str:
|
def agent_for_domain(domain: str | None) -> str:
|
||||||
"""Get the reviewing agent for a domain. Falls back to Leo."""
|
"""Get the reviewing agent for a domain. Falls back to Leo."""
|
||||||
|
|
@ -82,6 +87,14 @@ def detect_domain_from_branch(branch: str) -> str | None:
|
||||||
"""Extract domain from branch name like 'rio/claims-futarchy' → 'internet-finance'.
|
"""Extract domain from branch name like 'rio/claims-futarchy' → 'internet-finance'.
|
||||||
|
|
||||||
Uses agent prefix → primary domain mapping for pipeline branches.
|
Uses agent prefix → primary domain mapping for pipeline branches.
|
||||||
|
For ingestion branches, checks the rest of the name for source-type hints.
|
||||||
"""
|
"""
|
||||||
prefix = branch.split("/")[0].lower() if "/" in branch else ""
|
prefix = branch.split("/")[0].lower() if "/" in branch else ""
|
||||||
return _AGENT_PRIMARY_DOMAIN.get(prefix)
|
if prefix in _AGENT_PRIMARY_DOMAIN:
|
||||||
|
return _AGENT_PRIMARY_DOMAIN[prefix]
|
||||||
|
if prefix == "ingestion":
|
||||||
|
rest = branch.split("/", 1)[1].lower() if "/" in branch else ""
|
||||||
|
for source_key, domain in _INGESTION_SOURCE_DOMAIN.items():
|
||||||
|
if source_key in rest:
|
||||||
|
return domain
|
||||||
|
return None
|
||||||
|
|
|
||||||
|
|
@ -11,13 +11,14 @@ All functions are async (Forgejo API calls). Dependencies: forgejo, db, config,
|
||||||
pr_state, feedback, eval_parse.
|
pr_state, feedback, eval_parse.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from . import config, db
|
from . import config, db
|
||||||
from .eval_parse import classify_issues
|
from .eval_parse import classify_issues
|
||||||
from .feedback import format_rejection_comment
|
from .feedback import format_rejection_comment
|
||||||
from .forgejo import api as forgejo_api, get_agent_token, repo_path
|
from .forgejo import api as forgejo_api, get_agent_token, get_pr_diff, repo_path
|
||||||
from .github_feedback import on_closed, on_eval_complete
|
from .github_feedback import on_closed, on_eval_complete
|
||||||
from .pr_state import close_pr
|
from .pr_state import close_pr
|
||||||
|
|
||||||
|
|
@ -114,12 +115,98 @@ async def terminate_pr(conn, pr_number: int, reason: str):
|
||||||
async def dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_issues: list[str]):
|
async def dispose_rejected_pr(conn, pr_number: int, eval_attempts: int, all_issues: list[str]):
|
||||||
"""Disposition logic for rejected PRs on attempt 2+.
|
"""Disposition logic for rejected PRs on attempt 2+.
|
||||||
|
|
||||||
|
Auto-close gate (all attempts): near-duplicate of an already-merged PR for
|
||||||
|
the same source — close immediately. Avoids the Apr 22 runaway-damage
|
||||||
|
pattern where a source extracted 20+ times in a short window produced
|
||||||
|
dozens of open PRs that all had to be closed manually.
|
||||||
|
|
||||||
Attempt 1: normal — back to open, wait for fix.
|
Attempt 1: normal — back to open, wait for fix.
|
||||||
Attempt 2: check issue classification.
|
Attempt 2: check issue classification.
|
||||||
- Mechanical only: keep open for one more attempt (auto-fix future).
|
- Mechanical only: keep open for one more attempt (auto-fix future).
|
||||||
- Substantive or mixed: close PR, requeue source.
|
- Substantive or mixed: close PR, requeue source.
|
||||||
Attempt 3+: terminal.
|
Attempt 3+: terminal.
|
||||||
"""
|
"""
|
||||||
|
# Auto-close near-duplicate when a merged sibling for the same source exists.
|
||||||
|
# Runs before the attempt-count branches so it catches the common runaway
|
||||||
|
# case on attempt 1 instead of waiting for attempt 2's terminate path.
|
||||||
|
#
|
||||||
|
# Exact-match requirement (Ganymede review): compound rejections like
|
||||||
|
# ["near_duplicate", "factual_discrepancy"] carry signal about the merged
|
||||||
|
# sibling being wrong or limited — we want humans to see those. Only the
|
||||||
|
# pure single-issue case is safe to auto-close.
|
||||||
|
if all_issues == ["near_duplicate"]:
|
||||||
|
existing_merged = conn.execute(
|
||||||
|
"""SELECT p2.number, p1.source_path FROM prs p1
|
||||||
|
JOIN prs p2 ON p2.source_path = p1.source_path
|
||||||
|
WHERE p1.number = ?
|
||||||
|
AND p1.source_path IS NOT NULL
|
||||||
|
AND p2.number != p1.number
|
||||||
|
AND p2.status = 'merged'
|
||||||
|
LIMIT 1""",
|
||||||
|
(pr_number,),
|
||||||
|
).fetchone()
|
||||||
|
if existing_merged:
|
||||||
|
sibling = existing_merged[0]
|
||||||
|
source_path = existing_merged[1]
|
||||||
|
|
||||||
|
# Enrichment guard: LLM reviewers can flag enrichment prose as
|
||||||
|
# "redundant" via eval_parse regex, tagging near_duplicate even
|
||||||
|
# though validate.py's structural check only fires on NEW files.
|
||||||
|
# If the PR only MODIFIES existing files (no "new file mode" in
|
||||||
|
# diff), it's an enrichment — skip auto-close so a human reviews.
|
||||||
|
#
|
||||||
|
# 10s timeout bounds damage when Forgejo is wedged (Apr 22 incident:
|
||||||
|
# hung for 2.5h). Conservative fallback: skip auto-close on any
|
||||||
|
# failure — fall through to normal rejection path.
|
||||||
|
try:
|
||||||
|
diff = await asyncio.wait_for(get_pr_diff(pr_number), timeout=10)
|
||||||
|
except (asyncio.TimeoutError, Exception):
|
||||||
|
logger.warning(
|
||||||
|
"PR #%d: diff fetch failed/timed out for near-dup guard — skipping auto-close",
|
||||||
|
pr_number, exc_info=True,
|
||||||
|
)
|
||||||
|
diff = None
|
||||||
|
|
||||||
|
if not diff:
|
||||||
|
# None or empty — conservative fallback, fall through to attempt-count branches
|
||||||
|
pass
|
||||||
|
elif "new file mode" not in diff:
|
||||||
|
logger.info(
|
||||||
|
"PR #%d: near_duplicate but modifies-only (enrichment) — skipping auto-close",
|
||||||
|
pr_number,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"PR #%d: auto-closing near-duplicate of merged PR #%d (same source)",
|
||||||
|
pr_number, sibling,
|
||||||
|
)
|
||||||
|
# Post a brief explanation before closing (best-effort — non-fatal)
|
||||||
|
try:
|
||||||
|
await forgejo_api(
|
||||||
|
"POST",
|
||||||
|
repo_path(f"issues/{pr_number}/comments"),
|
||||||
|
{"body": (
|
||||||
|
f"Auto-closed: near-duplicate of already-merged PR "
|
||||||
|
f"#{sibling} (same source: `{source_path}`)."
|
||||||
|
)},
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.debug("PR #%d: auto-close comment failed (non-fatal)", pr_number, exc_info=True)
|
||||||
|
await close_pr(
|
||||||
|
conn, pr_number,
|
||||||
|
last_error=f"auto_closed_near_duplicate: merged sibling #{sibling}",
|
||||||
|
)
|
||||||
|
db.audit(
|
||||||
|
conn, "evaluate", "auto_closed_near_duplicate",
|
||||||
|
json.dumps({
|
||||||
|
"pr": pr_number,
|
||||||
|
"merged_sibling": sibling,
|
||||||
|
"source_path": source_path,
|
||||||
|
"eval_attempts": eval_attempts,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
if eval_attempts < 2:
|
if eval_attempts < 2:
|
||||||
# Attempt 1: post structured feedback so agent learns, but don't close
|
# Attempt 1: post structured feedback so agent learns, but don't close
|
||||||
if all_issues:
|
if all_issues:
|
||||||
|
|
|
||||||
|
|
@ -261,7 +261,8 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
)
|
)
|
||||||
db.record_review(
|
db.record_review(
|
||||||
conn, pr_number, "rejected",
|
conn, pr_number, "rejected",
|
||||||
domain=domain, agent=agent, reviewer=agent, reviewer_model="gpt-4o",
|
domain=domain, agent=agent, reviewer=agent, reviewer_model=config.EVAL_DOMAIN_MODEL,
|
||||||
|
rejection_reason=",".join(domain_issues) if domain_issues else None,
|
||||||
notes=(domain_review or "")[:4000],
|
notes=(domain_review or "")[:4000],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -398,6 +399,7 @@ async def evaluate_pr(conn, pr_number: int, tier: str = None) -> dict:
|
||||||
conn, pr_number, "approved-with-changes",
|
conn, pr_number, "approved-with-changes",
|
||||||
domain=domain, agent=agent, reviewer="leo",
|
domain=domain, agent=agent, reviewer="leo",
|
||||||
reviewer_model="sonnet" if tier == "STANDARD" else "opus",
|
reviewer_model="sonnet" if tier == "STANDARD" else "opus",
|
||||||
|
rejection_reason=",".join(all_issues) if all_issues else None,
|
||||||
notes=(leo_review or domain_review or "")[:4000],
|
notes=(leo_review or domain_review or "")[:4000],
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
|
||||||
174
lib/extract.py
174
lib/extract.py
|
|
@ -33,6 +33,7 @@ from pathlib import Path
|
||||||
|
|
||||||
from . import config
|
from . import config
|
||||||
from .costs import record_usage
|
from .costs import record_usage
|
||||||
|
from .db import classify_source_channel
|
||||||
from .domains import agent_for_domain
|
from .domains import agent_for_domain
|
||||||
from .extraction_prompt import build_extraction_prompt
|
from .extraction_prompt import build_extraction_prompt
|
||||||
from .forgejo import api as forgejo_api
|
from .forgejo import api as forgejo_api
|
||||||
|
|
@ -229,7 +230,7 @@ def _parse_extraction_json(text: str) -> dict | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _build_claim_content(claim: dict, agent: str, source_format: str | None = None) -> str:
|
def _build_claim_content(claim: dict, agent: str, source_format: str | None = None, source_file: str = "") -> str:
|
||||||
"""Build claim markdown file content from extraction JSON."""
|
"""Build claim markdown file content from extraction JSON."""
|
||||||
today = date.today().isoformat()
|
today = date.today().isoformat()
|
||||||
domain = claim.get("domain", "")
|
domain = claim.get("domain", "")
|
||||||
|
|
@ -281,6 +282,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
|
||||||
f"created: {today}",
|
f"created: {today}",
|
||||||
f"agent: {agent}",
|
f"agent: {agent}",
|
||||||
]
|
]
|
||||||
|
if source_file:
|
||||||
|
lines.append(f"sourced_from: {source_file}")
|
||||||
if scope:
|
if scope:
|
||||||
lines.append(f"scope: {scope}")
|
lines.append(f"scope: {scope}")
|
||||||
if sourcer:
|
if sourcer:
|
||||||
|
|
@ -432,7 +435,7 @@ async def _extract_one_source(
|
||||||
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
|
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
|
||||||
if not filename.endswith(".md"):
|
if not filename.endswith(".md"):
|
||||||
filename += ".md"
|
filename += ".md"
|
||||||
content = _build_claim_content(c, agent_lower, source_format=source_format)
|
content = _build_claim_content(c, agent_lower, source_format=source_format, source_file=f"{domain}/{source_file}" if domain else source_file)
|
||||||
claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
|
claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
|
||||||
|
|
||||||
# Build entity file contents
|
# Build entity file contents
|
||||||
|
|
@ -490,6 +493,17 @@ async def _extract_one_source(
|
||||||
|
|
||||||
if not claim_files and not entity_files and not enrichments:
|
if not claim_files and not entity_files and not enrichments:
|
||||||
logger.info("No valid claims/entities/enrichments after validation for %s — archiving as null-result", source_file)
|
logger.info("No valid claims/entities/enrichments after validation for %s — archiving as null-result", source_file)
|
||||||
|
# Mark DB as null_result so queue scan won't re-extract even if file stays in queue
|
||||||
|
# (the main-worktree push in _archive_source frequently fails — DB is authoritative).
|
||||||
|
try:
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'null_result', datetime('now'))
|
||||||
|
ON CONFLICT(path) DO UPDATE SET status='null_result', updated_at=datetime('now')""",
|
||||||
|
(source_path,),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
except Exception:
|
||||||
|
logger.debug("Failed to mark source as null_result in DB", exc_info=True)
|
||||||
await _archive_source(source_path, domain, "null-result")
|
await _archive_source(source_path, domain, "null-result")
|
||||||
return 0, 0
|
return 0, 0
|
||||||
|
|
||||||
|
|
@ -558,6 +572,18 @@ async def _extract_one_source(
|
||||||
|
|
||||||
if not files_written:
|
if not files_written:
|
||||||
logger.info("No files written for %s — cleaning up", source_file)
|
logger.info("No files written for %s — cleaning up", source_file)
|
||||||
|
# Path B null-result: enrichments existed but all targets missing in worktree.
|
||||||
|
# No PR, no cooldown match — without DB update this re-extracts every 60s.
|
||||||
|
# (Ganymede review, commit 469cb7f follow-up.)
|
||||||
|
try:
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'null_result', datetime('now'))
|
||||||
|
ON CONFLICT(path) DO UPDATE SET status='null_result', updated_at=datetime('now')""",
|
||||||
|
(source_path,),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
except Exception:
|
||||||
|
logger.debug("Failed to mark source as null_result (path B)", exc_info=True)
|
||||||
await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
|
await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
|
||||||
await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
|
await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
|
||||||
await _archive_source(source_path, domain, "null-result")
|
await _archive_source(source_path, domain, "null-result")
|
||||||
|
|
@ -576,6 +602,22 @@ async def _extract_one_source(
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("Extract-connect failed (non-fatal)", exc_info=True)
|
logger.warning("Extract-connect failed (non-fatal)", exc_info=True)
|
||||||
|
|
||||||
|
# Archive the source WITHIN the extract branch (not via separate push on main).
|
||||||
|
# Prevents the runaway-extraction race: when archive-to-main push fails (non-FF,
|
||||||
|
# non-pushable worktree state), file returns to queue and gets re-extracted every
|
||||||
|
# cycle. Moving the archive into the extract branch makes it atomic with the PR
|
||||||
|
# merge — when the PR merges, the source is archived automatically.
|
||||||
|
try:
|
||||||
|
archive_rel = _archive_source_in_worktree(
|
||||||
|
worktree, source_path, domain, "processed", agent_lower, extract_model,
|
||||||
|
)
|
||||||
|
if archive_rel:
|
||||||
|
files_written.append(archive_rel["new"])
|
||||||
|
# The queue file was deleted; git add handles the removal
|
||||||
|
await _git("add", "inbox/queue/", cwd=str(EXTRACT_WORKTREE))
|
||||||
|
except Exception:
|
||||||
|
logger.exception("In-branch archive failed for %s (continuing)", source_file)
|
||||||
|
|
||||||
# Stage and commit
|
# Stage and commit
|
||||||
for f in files_written:
|
for f in files_written:
|
||||||
await _git("add", f, cwd=str(EXTRACT_WORKTREE))
|
await _git("add", f, cwd=str(EXTRACT_WORKTREE))
|
||||||
|
|
@ -658,17 +700,32 @@ async def _extract_one_source(
|
||||||
for c in claims_raw if c.get("title") or c.get("filename")
|
for c in claims_raw if c.get("title") or c.get("filename")
|
||||||
)
|
)
|
||||||
|
|
||||||
# Upsert: if discover_external_prs already created the row, update it;
|
# Success path: mark source as 'extracting' so queue scan's DB-status filter
|
||||||
# if not, create a partial row that discover will complete.
|
# skips it between PR creation and merge. Without this, cooldown is load-bearing
|
||||||
|
# (Ganymede review, commit 469cb7f follow-up).
|
||||||
try:
|
try:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"""INSERT INTO prs (number, branch, status, submitted_by, source_path, description)
|
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'extracting', datetime('now'))
|
||||||
VALUES (?, ?, 'open', ?, ?, ?)
|
ON CONFLICT(path) DO UPDATE SET status='extracting', updated_at=datetime('now')""",
|
||||||
|
(source_path,),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
except Exception:
|
||||||
|
logger.debug("Failed to mark source as extracting", exc_info=True)
|
||||||
|
|
||||||
|
# Upsert: if discover_external_prs already created the row, update it;
|
||||||
|
# if not, create a partial row that discover will complete.
|
||||||
|
source_channel = classify_source_channel(branch)
|
||||||
|
try:
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO prs (number, branch, status, submitted_by, source_path, description, source_channel)
|
||||||
|
VALUES (?, ?, 'open', ?, ?, ?, ?)
|
||||||
ON CONFLICT(number) DO UPDATE SET
|
ON CONFLICT(number) DO UPDATE SET
|
||||||
submitted_by = excluded.submitted_by,
|
submitted_by = excluded.submitted_by,
|
||||||
source_path = excluded.source_path,
|
source_path = excluded.source_path,
|
||||||
description = COALESCE(excluded.description, prs.description)""",
|
description = COALESCE(excluded.description, prs.description),
|
||||||
(pr_num, branch, contributor, source_path, claim_titles),
|
source_channel = COALESCE(prs.source_channel, excluded.source_channel)""",
|
||||||
|
(pr_num, branch, contributor, source_path, claim_titles, source_channel),
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
@ -689,12 +746,69 @@ async def _extract_one_source(
|
||||||
# Clean up extract worktree
|
# Clean up extract worktree
|
||||||
await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
|
await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
|
||||||
|
|
||||||
# 10. Archive source on main
|
# Note: source archival happened in-branch before commit (see _archive_source_in_worktree).
|
||||||
await _archive_source(source_path, domain, "processed", agent_lower)
|
# Do NOT call _archive_source() here — the broken main-worktree-push path caused the
|
||||||
|
# runaway extraction bug. Archive is now atomic with PR merge.
|
||||||
|
|
||||||
return 1, 0
|
return 1, 0
|
||||||
|
|
||||||
|
|
||||||
|
def _archive_source_in_worktree(
|
||||||
|
worktree: Path,
|
||||||
|
source_path: str,
|
||||||
|
domain: str,
|
||||||
|
status: str,
|
||||||
|
agent: str | None,
|
||||||
|
extraction_model: str,
|
||||||
|
) -> dict | None:
|
||||||
|
"""Move source file from inbox/queue/ to inbox/archive/<domain>/ WITHIN extract worktree.
|
||||||
|
|
||||||
|
Updates frontmatter (status, processed_by, processed_date, extraction_model) and
|
||||||
|
returns {"old": old_rel_path, "new": new_rel_path} or None if not found.
|
||||||
|
|
||||||
|
The caller commits this change as part of the extract branch, so the archive lands
|
||||||
|
atomically with the PR merge — no separate push on main required.
|
||||||
|
"""
|
||||||
|
queue_path = worktree / source_path
|
||||||
|
if not queue_path.exists():
|
||||||
|
logger.warning("Source %s not found in worktree queue — skipping in-branch archive", source_path)
|
||||||
|
return None
|
||||||
|
|
||||||
|
if status == "null-result":
|
||||||
|
dest_dir = worktree / "inbox" / "null-result"
|
||||||
|
else:
|
||||||
|
dest_dir = worktree / "inbox" / "archive" / (domain or "unknown")
|
||||||
|
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
dest_path = dest_dir / queue_path.name
|
||||||
|
|
||||||
|
content = queue_path.read_text(encoding="utf-8")
|
||||||
|
today = date.today().isoformat()
|
||||||
|
content = re.sub(r"^status: unprocessed", f"status: {status}", content, flags=re.MULTILINE)
|
||||||
|
if agent and "processed_by:" not in content:
|
||||||
|
content = re.sub(
|
||||||
|
r"(^status: \w+)",
|
||||||
|
rf"\1\nprocessed_by: {agent}\nprocessed_date: {today}",
|
||||||
|
content,
|
||||||
|
count=1,
|
||||||
|
flags=re.MULTILINE,
|
||||||
|
)
|
||||||
|
if "extraction_model:" not in content:
|
||||||
|
content = re.sub(
|
||||||
|
r"(^status: \w+.*?)(\n---)",
|
||||||
|
rf'\1\nextraction_model: "{extraction_model}"\2',
|
||||||
|
content,
|
||||||
|
count=1,
|
||||||
|
flags=re.MULTILINE | re.DOTALL,
|
||||||
|
)
|
||||||
|
|
||||||
|
dest_path.write_text(content, encoding="utf-8")
|
||||||
|
queue_path.unlink()
|
||||||
|
|
||||||
|
old_rel = str(queue_path.relative_to(worktree))
|
||||||
|
new_rel = str(dest_path.relative_to(worktree))
|
||||||
|
return {"old": old_rel, "new": new_rel}
|
||||||
|
|
||||||
|
|
||||||
async def _archive_source(
|
async def _archive_source(
|
||||||
source_path: str,
|
source_path: str,
|
||||||
domain: str,
|
domain: str,
|
||||||
|
|
@ -786,13 +900,26 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
|
||||||
if not queue_dir.exists():
|
if not queue_dir.exists():
|
||||||
return 0, 0
|
return 0, 0
|
||||||
|
|
||||||
|
# DB-authoritative status filter: exclude sources where DB records non-unprocessed state.
|
||||||
|
# File frontmatter alone isn't reliable — archive pushes can fail, leaving stale file state.
|
||||||
|
# The sources table is the authoritative record of whether a source has been processed.
|
||||||
|
db_non_unprocessed = {
|
||||||
|
r["path"] for r in conn.execute(
|
||||||
|
"SELECT path FROM sources WHERE status != 'unprocessed'"
|
||||||
|
).fetchall()
|
||||||
|
}
|
||||||
|
|
||||||
unprocessed = []
|
unprocessed = []
|
||||||
for f in sorted(queue_dir.glob("*.md")):
|
for f in sorted(queue_dir.glob("*.md")):
|
||||||
try:
|
try:
|
||||||
content = f.read_text(encoding="utf-8")
|
content = f.read_text(encoding="utf-8")
|
||||||
fm = _parse_source_frontmatter(content)
|
fm = _parse_source_frontmatter(content)
|
||||||
if fm.get("status") == "unprocessed":
|
if fm.get("status") != "unprocessed":
|
||||||
unprocessed.append((str(f.relative_to(main)), content, fm))
|
continue
|
||||||
|
rel_path = str(f.relative_to(main))
|
||||||
|
if rel_path in db_non_unprocessed:
|
||||||
|
continue
|
||||||
|
unprocessed.append((rel_path, content, fm))
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.debug("Failed to read source %s", f, exc_info=True)
|
logger.debug("Failed to read source %s", f, exc_info=True)
|
||||||
|
|
||||||
|
|
@ -829,6 +956,29 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
|
||||||
if skipped:
|
if skipped:
|
||||||
logger.info("Skipped %d source(s) with existing open PRs", skipped)
|
logger.info("Skipped %d source(s) with existing open PRs", skipped)
|
||||||
|
|
||||||
|
# Cooldown: skip sources with ANY PR in last EXTRACTION_COOLDOWN_HOURS.
|
||||||
|
# Defense-in-depth for DB-status filter — catches the window between PR
|
||||||
|
# creation and DB status update if anything races.
|
||||||
|
if unprocessed:
|
||||||
|
cooldown_hours = config.EXTRACTION_COOLDOWN_HOURS
|
||||||
|
recent_source_paths = {
|
||||||
|
r["source_path"] for r in conn.execute(
|
||||||
|
"""SELECT DISTINCT source_path FROM prs
|
||||||
|
WHERE source_path IS NOT NULL
|
||||||
|
AND created_at > datetime('now', ? || ' hours')""",
|
||||||
|
(f"-{cooldown_hours}",),
|
||||||
|
).fetchall() if r["source_path"]
|
||||||
|
}
|
||||||
|
if recent_source_paths:
|
||||||
|
before = len(unprocessed)
|
||||||
|
unprocessed = [
|
||||||
|
(sp, c, f) for sp, c, f in unprocessed
|
||||||
|
if sp not in recent_source_paths
|
||||||
|
]
|
||||||
|
cooled = before - len(unprocessed)
|
||||||
|
if cooled:
|
||||||
|
logger.info("Cooldown: skipped %d source(s) with PRs in last %dh", cooled, cooldown_hours)
|
||||||
|
|
||||||
# ── Check for re-extraction sources (must run even when queue is empty) ──
|
# ── Check for re-extraction sources (must run even when queue is empty) ──
|
||||||
reextract_rows = conn.execute(
|
reextract_rows = conn.execute(
|
||||||
"""SELECT path, feedback FROM sources
|
"""SELECT path, feedback FROM sources
|
||||||
|
|
|
||||||
16
lib/merge.py
16
lib/merge.py
|
|
@ -308,7 +308,14 @@ async def _cherry_pick_onto_main(branch: str) -> tuple[bool, str]:
|
||||||
rc, merge_base = await _git("merge-base", "origin/main", f"origin/{branch}")
|
rc, merge_base = await _git("merge-base", "origin/main", f"origin/{branch}")
|
||||||
rc2, main_sha = await _git("rev-parse", "origin/main")
|
rc2, main_sha = await _git("rev-parse", "origin/main")
|
||||||
if rc == 0 and rc2 == 0 and merge_base.strip() == main_sha.strip():
|
if rc == 0 and rc2 == 0 and merge_base.strip() == main_sha.strip():
|
||||||
return True, "already up to date"
|
# Branch is descendant of main — but fork workflows (merge main into branch)
|
||||||
|
# create this state while still having new content. Check for actual diff.
|
||||||
|
rc_diff, diff_out = await _git(
|
||||||
|
"diff", "--stat", f"origin/main..origin/{branch}", timeout=10,
|
||||||
|
)
|
||||||
|
if rc_diff != 0 or not diff_out.strip():
|
||||||
|
return True, "already up to date"
|
||||||
|
logger.info("Branch %s is descendant of main but has new content — proceeding", branch)
|
||||||
|
|
||||||
# Get extraction commits (oldest first), skip merge commits from fork workflows
|
# Get extraction commits (oldest first), skip merge commits from fork workflows
|
||||||
rc, commits_out = await _git(
|
rc, commits_out = await _git(
|
||||||
|
|
@ -429,6 +436,7 @@ from .frontmatter import (
|
||||||
serialize_frontmatter,
|
serialize_frontmatter,
|
||||||
)
|
)
|
||||||
from .post_merge import (
|
from .post_merge import (
|
||||||
|
backlink_source_claims,
|
||||||
embed_merged_claims,
|
embed_merged_claims,
|
||||||
reciprocal_edges,
|
reciprocal_edges,
|
||||||
archive_source_for_pr,
|
archive_source_for_pr,
|
||||||
|
|
@ -848,6 +856,12 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
|
||||||
# Archive source file (closes near-duplicate loop — Ganymede review)
|
# Archive source file (closes near-duplicate loop — Ganymede review)
|
||||||
archive_source_for_pr(branch, domain)
|
archive_source_for_pr(branch, domain)
|
||||||
|
|
||||||
|
# Backlink: update source files with claims_extracted refs
|
||||||
|
try:
|
||||||
|
await backlink_source_claims(main_sha, branch_sha, _git)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("PR #%d: backlink_source_claims failed (non-fatal)", pr_num)
|
||||||
|
|
||||||
# Embed new/changed claims into Qdrant (non-fatal)
|
# Embed new/changed claims into Qdrant (non-fatal)
|
||||||
await embed_merged_claims(main_sha, branch_sha, _git)
|
await embed_merged_claims(main_sha, branch_sha, _git)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
from . import config
|
from . import config
|
||||||
|
|
@ -295,6 +296,139 @@ async def reciprocal_edges(main_sha: str, branch_sha: str, git_fn: Callable):
|
||||||
logger.exception("reciprocal_edges: failed (non-fatal)")
|
logger.exception("reciprocal_edges: failed (non-fatal)")
|
||||||
|
|
||||||
|
|
||||||
|
async def backlink_source_claims(main_sha: str, branch_sha: str, git_fn: Callable):
|
||||||
|
"""After merge, update source files with claims_extracted backlinks.
|
||||||
|
|
||||||
|
Reads sourced_from from merged claim frontmatter, finds the source file,
|
||||||
|
and appends the claim filename to its claims_extracted list.
|
||||||
|
Only runs for newly added claims (diff-filter=A).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
rc, diff_out = await git_fn(
|
||||||
|
"diff", "--name-only", "--diff-filter=A",
|
||||||
|
main_sha, branch_sha,
|
||||||
|
cwd=str(config.MAIN_WORKTREE),
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
if rc != 0:
|
||||||
|
logger.warning("backlink_source_claims: diff failed (rc=%d), skipping", rc)
|
||||||
|
return
|
||||||
|
|
||||||
|
claim_dirs = {"domains/", "core/", "foundations/"}
|
||||||
|
new_claims = [
|
||||||
|
f for f in diff_out.strip().split("\n")
|
||||||
|
if f.endswith(".md")
|
||||||
|
and any(f.startswith(d) for d in claim_dirs)
|
||||||
|
and not f.split("/")[-1].startswith("_")
|
||||||
|
and "/entities/" not in f
|
||||||
|
and "/decisions/" not in f
|
||||||
|
]
|
||||||
|
|
||||||
|
if not new_claims:
|
||||||
|
return
|
||||||
|
|
||||||
|
modified_sources = {}
|
||||||
|
for claim_path in new_claims:
|
||||||
|
full_path = config.MAIN_WORKTREE / claim_path
|
||||||
|
if not full_path.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
content = full_path.read_text()
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
fm, raw_fm, body = parse_yaml_frontmatter(content)
|
||||||
|
if fm is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sourced_from = fm.get("sourced_from", "")
|
||||||
|
if not sourced_from:
|
||||||
|
continue
|
||||||
|
|
||||||
|
source_path = config.MAIN_WORKTREE / "inbox" / "archive" / sourced_from
|
||||||
|
if not source_path.exists():
|
||||||
|
logger.debug("backlink_source_claims: source %s not found at %s", sourced_from, source_path)
|
||||||
|
continue
|
||||||
|
|
||||||
|
claim_filename = claim_path.rsplit("/", 1)[-1].replace(".md", "")
|
||||||
|
|
||||||
|
try:
|
||||||
|
source_content = source_path.read_text()
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
source_fm, source_raw_fm, source_body = parse_yaml_frontmatter(source_content)
|
||||||
|
if source_fm is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
existing_claims = source_fm.get("claims_extracted", [])
|
||||||
|
if isinstance(existing_claims, str):
|
||||||
|
existing_claims = [existing_claims]
|
||||||
|
if not isinstance(existing_claims, list):
|
||||||
|
existing_claims = []
|
||||||
|
|
||||||
|
if claim_filename in existing_claims:
|
||||||
|
continue
|
||||||
|
|
||||||
|
existing_claims.append(claim_filename)
|
||||||
|
new_block = "claims_extracted:\n" + "\n".join(f"- {c}" for c in existing_claims)
|
||||||
|
|
||||||
|
lines = source_content.split("\n")
|
||||||
|
if "claims_extracted:" not in source_content:
|
||||||
|
end_idx = None
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if i > 0 and line.strip() == "---":
|
||||||
|
end_idx = i
|
||||||
|
break
|
||||||
|
if end_idx is None:
|
||||||
|
continue
|
||||||
|
lines.insert(end_idx, new_block)
|
||||||
|
else:
|
||||||
|
start_idx = None
|
||||||
|
end_idx = None
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if line.startswith("claims_extracted:"):
|
||||||
|
start_idx = i
|
||||||
|
elif start_idx is not None and not line.startswith("- "):
|
||||||
|
end_idx = i
|
||||||
|
break
|
||||||
|
if start_idx is None:
|
||||||
|
continue
|
||||||
|
if end_idx is None:
|
||||||
|
end_idx = len(lines)
|
||||||
|
lines[start_idx:end_idx] = new_block.split("\n")
|
||||||
|
|
||||||
|
modified_sources[str(source_path)] = "\n".join(lines)
|
||||||
|
logger.info("backlink_source_claims: added %s to %s", claim_filename, sourced_from)
|
||||||
|
|
||||||
|
if modified_sources:
|
||||||
|
async with async_main_worktree_lock():
|
||||||
|
for sp, content in modified_sources.items():
|
||||||
|
Path(sp).write_text(content)
|
||||||
|
await git_fn("add", sp, cwd=str(config.MAIN_WORKTREE))
|
||||||
|
rc, out = await git_fn(
|
||||||
|
"commit", "-m", f"backlink: update claims_extracted on {len(modified_sources)} source(s)",
|
||||||
|
cwd=str(config.MAIN_WORKTREE),
|
||||||
|
timeout=15,
|
||||||
|
)
|
||||||
|
if rc == 0:
|
||||||
|
push_rc, push_out = await git_fn(
|
||||||
|
"push", "origin", "main",
|
||||||
|
cwd=str(config.MAIN_WORKTREE),
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
if push_rc == 0:
|
||||||
|
logger.info("backlink_source_claims: %d source(s) updated and pushed", len(modified_sources))
|
||||||
|
else:
|
||||||
|
logger.warning("backlink_source_claims: push failed: %s", push_out[:200])
|
||||||
|
else:
|
||||||
|
logger.warning("backlink_source_claims: commit failed: %s", out[:200])
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
logger.exception("backlink_source_claims: failed (non-fatal)")
|
||||||
|
|
||||||
|
|
||||||
def archive_source_for_pr(branch: str, domain: str, merged: bool = True):
|
def archive_source_for_pr(branch: str, domain: str, merged: bool = True):
|
||||||
"""Move source from queue/ to archive/{domain}/ after PR merge or close.
|
"""Move source from queue/ to archive/{domain}/ after PR merge or close.
|
||||||
|
|
||||||
|
|
|
||||||
113
ops/backfill-contributor-roles.py
Normal file
113
ops/backfill-contributor-roles.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Backfill contributor role counts from prs.commit_type.
|
||||||
|
|
||||||
|
Resets all role counts to 0, then re-derives them from the prs table's
|
||||||
|
commit_type column using the COMMIT_TYPE_TO_ROLE mapping. This corrects
|
||||||
|
the bug where all contributors were recorded as 'extractor' regardless
|
||||||
|
of their actual commit_type.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 ops/backfill-contributor-roles.py [--dry-run]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
from lib.contributor import COMMIT_TYPE_TO_ROLE, commit_type_to_role
|
||||||
|
|
||||||
|
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||||
|
|
||||||
|
|
||||||
|
def backfill(db_path: str, dry_run: bool = False):
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
|
||||||
|
# Get all merged PRs with commit_type and agent
|
||||||
|
prs = conn.execute("""
|
||||||
|
SELECT number, commit_type, agent, branch
|
||||||
|
FROM prs
|
||||||
|
WHERE status = 'merged' AND agent IS NOT NULL
|
||||||
|
ORDER BY number
|
||||||
|
""").fetchall()
|
||||||
|
|
||||||
|
print(f"Processing {len(prs)} merged PRs...")
|
||||||
|
|
||||||
|
# Reset all role counts
|
||||||
|
if not dry_run:
|
||||||
|
conn.execute("""
|
||||||
|
UPDATE contributors SET
|
||||||
|
extractor_count = 0,
|
||||||
|
challenger_count = 0,
|
||||||
|
synthesizer_count = 0,
|
||||||
|
sourcer_count = 0
|
||||||
|
""")
|
||||||
|
print("Reset all role counts to 0")
|
||||||
|
|
||||||
|
# Tally roles from commit_type
|
||||||
|
role_counts: dict[str, dict[str, int]] = {}
|
||||||
|
for pr in prs:
|
||||||
|
agent = pr["agent"].lower() if pr["agent"] else None
|
||||||
|
if not agent or agent in ("external", "pipeline"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
commit_type = pr["commit_type"] or "extract"
|
||||||
|
role = commit_type_to_role(commit_type)
|
||||||
|
|
||||||
|
if agent not in role_counts:
|
||||||
|
role_counts[agent] = {
|
||||||
|
"extractor_count": 0, "challenger_count": 0,
|
||||||
|
"synthesizer_count": 0, "sourcer_count": 0,
|
||||||
|
"reviewer_count": 0,
|
||||||
|
}
|
||||||
|
role_col = f"{role}_count"
|
||||||
|
if role_col in role_counts[agent]:
|
||||||
|
role_counts[agent][role_col] += 1
|
||||||
|
|
||||||
|
# Apply tallied counts
|
||||||
|
for handle, counts in sorted(role_counts.items()):
|
||||||
|
non_zero = {k: v for k, v in counts.items() if v > 0}
|
||||||
|
print(f" {handle}: {non_zero or '(no knowledge PRs)'}")
|
||||||
|
if not dry_run and non_zero:
|
||||||
|
set_clauses = ", ".join(f"{k} = {v}" for k, v in non_zero.items())
|
||||||
|
conn.execute(
|
||||||
|
f"UPDATE contributors SET {set_clauses}, updated_at = datetime('now') WHERE handle = ?",
|
||||||
|
(handle,),
|
||||||
|
)
|
||||||
|
|
||||||
|
if not dry_run:
|
||||||
|
conn.commit()
|
||||||
|
print("\nBackfill committed.")
|
||||||
|
else:
|
||||||
|
print("\n[DRY RUN] No changes made.")
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print("\nRole distribution across all contributors:")
|
||||||
|
if not dry_run:
|
||||||
|
rows = conn.execute("""
|
||||||
|
SELECT handle, extractor_count, challenger_count, synthesizer_count,
|
||||||
|
sourcer_count, reviewer_count
|
||||||
|
FROM contributors
|
||||||
|
ORDER BY (extractor_count + challenger_count + synthesizer_count) DESC
|
||||||
|
""").fetchall()
|
||||||
|
for r in rows:
|
||||||
|
parts = []
|
||||||
|
if r["extractor_count"]: parts.append(f"extract:{r['extractor_count']}")
|
||||||
|
if r["challenger_count"]: parts.append(f"challenge:{r['challenger_count']}")
|
||||||
|
if r["synthesizer_count"]: parts.append(f"synthesize:{r['synthesizer_count']}")
|
||||||
|
if r["sourcer_count"]: parts.append(f"source:{r['sourcer_count']}")
|
||||||
|
if r["reviewer_count"]: parts.append(f"review:{r['reviewer_count']}")
|
||||||
|
if parts:
|
||||||
|
print(f" {r['handle']}: {', '.join(parts)}")
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--dry-run", action="store_true")
|
||||||
|
parser.add_argument("--db", default=DB_PATH)
|
||||||
|
args = parser.parse_args()
|
||||||
|
backfill(args.db, args.dry_run)
|
||||||
259
scripts/audit-wiki-links.py
Normal file
259
scripts/audit-wiki-links.py
Normal file
|
|
@ -0,0 +1,259 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Audit wiki-links across the teleo-codex knowledge base.
|
||||||
|
|
||||||
|
Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
|
||||||
|
Resolves each link against known claim files, entity files, and _map files.
|
||||||
|
Reports dead links, orphaned claims, and link counts.
|
||||||
|
|
||||||
|
Output: JSON to stdout with dead links, orphans, and per-file link counts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
|
||||||
|
CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
|
||||||
|
ENTITY_DIR = "entities"
|
||||||
|
|
||||||
|
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(title: str) -> str:
|
||||||
|
"""Convert a wiki-link title to the kebab-case slug used for filenames."""
|
||||||
|
s = title.strip().lower()
|
||||||
|
s = unicodedata.normalize("NFKD", s)
|
||||||
|
s = re.sub(r"[^\w\s-]", "", s)
|
||||||
|
s = re.sub(r"[\s_]+", "-", s)
|
||||||
|
s = re.sub(r"-+", "-", s)
|
||||||
|
return s.strip("-")
|
||||||
|
|
||||||
|
|
||||||
|
def build_index(codex: Path) -> dict:
|
||||||
|
"""Build a lookup index of all resolvable targets.
|
||||||
|
|
||||||
|
Returns dict mapping normalized slug -> file path.
|
||||||
|
Also maps raw stem (filename without .md) -> file path.
|
||||||
|
"""
|
||||||
|
index = {}
|
||||||
|
|
||||||
|
# Index claim files across all claim directories
|
||||||
|
for claim_dir in CLAIM_DIRS:
|
||||||
|
d = codex / claim_dir
|
||||||
|
if not d.exists():
|
||||||
|
continue
|
||||||
|
for md in d.rglob("*.md"):
|
||||||
|
stem = md.stem
|
||||||
|
rel = str(md.relative_to(codex))
|
||||||
|
# Map by stem (exact filename match)
|
||||||
|
index[stem.lower()] = rel
|
||||||
|
# Map by slugified stem
|
||||||
|
index[slugify(stem)] = rel
|
||||||
|
|
||||||
|
# Index entity files
|
||||||
|
entity_root = codex / ENTITY_DIR
|
||||||
|
if entity_root.exists():
|
||||||
|
for md in entity_root.rglob("*.md"):
|
||||||
|
stem = md.stem
|
||||||
|
rel = str(md.relative_to(codex))
|
||||||
|
index[stem.lower()] = rel
|
||||||
|
index[slugify(stem)] = rel
|
||||||
|
|
||||||
|
# Index maps/ directory (MOC-style overview docs)
|
||||||
|
maps_root = codex / "maps"
|
||||||
|
if maps_root.exists():
|
||||||
|
for md in maps_root.rglob("*.md"):
|
||||||
|
stem = md.stem
|
||||||
|
rel = str(md.relative_to(codex))
|
||||||
|
index[stem.lower()] = rel
|
||||||
|
index[slugify(stem)] = rel
|
||||||
|
|
||||||
|
# Index top-level docs that might be link targets
|
||||||
|
for special in ["overview.md", "livingip-overview.md"]:
|
||||||
|
p = codex / special
|
||||||
|
if p.exists():
|
||||||
|
index[p.stem.lower()] = str(p.relative_to(codex))
|
||||||
|
|
||||||
|
# Index agents/ beliefs and positions (sometimes linked)
|
||||||
|
agents_dir = codex / "agents"
|
||||||
|
if agents_dir.exists():
|
||||||
|
for md in agents_dir.rglob("*.md"):
|
||||||
|
stem = md.stem
|
||||||
|
rel = str(md.relative_to(codex))
|
||||||
|
index[stem.lower()] = rel
|
||||||
|
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_link(link_text: str, index: dict, source_dir: str) -> str | None:
|
||||||
|
"""Try to resolve a wiki-link target. Returns file path or None."""
|
||||||
|
text = link_text.strip()
|
||||||
|
|
||||||
|
# Special case: [[_map]] resolves to _map.md in the same domain directory
|
||||||
|
if text == "_map":
|
||||||
|
parts = source_dir.split("/")
|
||||||
|
if len(parts) >= 2:
|
||||||
|
candidate = f"{parts[0]}/{parts[1]}/_map.md"
|
||||||
|
if (CODEX_ROOT / candidate).exists():
|
||||||
|
return candidate
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Path-style references like [[domains/health/_map]]
|
||||||
|
if "/" in text:
|
||||||
|
candidate = text.rstrip("/")
|
||||||
|
if not candidate.endswith(".md"):
|
||||||
|
candidate += ".md"
|
||||||
|
if (CODEX_ROOT / candidate).exists():
|
||||||
|
return candidate
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Try exact stem match (lowercased)
|
||||||
|
key = text.lower()
|
||||||
|
if key in index:
|
||||||
|
return index[key]
|
||||||
|
|
||||||
|
# Try slugified version
|
||||||
|
slug = slugify(text)
|
||||||
|
if slug in index:
|
||||||
|
return index[slug]
|
||||||
|
|
||||||
|
# Try with common variations
|
||||||
|
for variant in [
|
||||||
|
slug.replace("metadaos", "metadao"),
|
||||||
|
slug.replace("ais", "ai"),
|
||||||
|
]:
|
||||||
|
if variant in index:
|
||||||
|
return index[variant]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def audit(codex: Path) -> dict:
|
||||||
|
"""Run the full wiki-link audit."""
|
||||||
|
index = build_index(codex)
|
||||||
|
|
||||||
|
dead_links = [] # {file, link, line_number}
|
||||||
|
link_counts = {} # file -> {outbound: N, targets: []}
|
||||||
|
all_targets = set() # files that are linked TO
|
||||||
|
all_files = set() # all claim/foundation files
|
||||||
|
|
||||||
|
# Scan all markdown files in claim directories
|
||||||
|
for claim_dir in CLAIM_DIRS:
|
||||||
|
d = codex / claim_dir
|
||||||
|
if not d.exists():
|
||||||
|
continue
|
||||||
|
for md in d.rglob("*.md"):
|
||||||
|
rel = str(md.relative_to(codex))
|
||||||
|
all_files.add(rel)
|
||||||
|
source_dir = str(md.parent.relative_to(codex))
|
||||||
|
|
||||||
|
try:
|
||||||
|
content = md.read_text(encoding="utf-8")
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
links_in_file = []
|
||||||
|
for i, line in enumerate(content.split("\n"), 1):
|
||||||
|
for match in WIKI_LINK_RE.finditer(line):
|
||||||
|
link_text = match.group(1)
|
||||||
|
# Skip links with | (display text aliases) - take the target part
|
||||||
|
if "|" in link_text:
|
||||||
|
link_text = link_text.split("|")[0].strip()
|
||||||
|
|
||||||
|
resolved = resolve_link(link_text, index, source_dir)
|
||||||
|
if resolved:
|
||||||
|
all_targets.add(resolved)
|
||||||
|
links_in_file.append(resolved)
|
||||||
|
else:
|
||||||
|
dead_links.append({
|
||||||
|
"file": rel,
|
||||||
|
"link": link_text,
|
||||||
|
"line": i,
|
||||||
|
})
|
||||||
|
|
||||||
|
link_counts[rel] = {
|
||||||
|
"outbound": len(links_in_file),
|
||||||
|
"targets": links_in_file,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Find orphaned claims (no inbound links AND no outbound links)
|
||||||
|
files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
|
||||||
|
orphaned = sorted(
|
||||||
|
f for f in all_files
|
||||||
|
if f not in all_targets
|
||||||
|
and f not in files_with_outbound
|
||||||
|
and not f.endswith("_map.md") # MOC files are structural, not orphans
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compute inbound link counts
|
||||||
|
inbound_counts = {}
|
||||||
|
for f, c in link_counts.items():
|
||||||
|
for target in c["targets"]:
|
||||||
|
inbound_counts[target] = inbound_counts.get(target, 0) + 1
|
||||||
|
|
||||||
|
# Claims with high outbound (good connectivity)
|
||||||
|
high_connectivity = sorted(
|
||||||
|
[(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
|
||||||
|
key=lambda x: -x[1],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Summary stats
|
||||||
|
total_links = sum(c["outbound"] for c in link_counts.values())
|
||||||
|
files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)
|
||||||
|
|
||||||
|
# Domain breakdown of dead links
|
||||||
|
dead_by_domain = {}
|
||||||
|
for dl in dead_links:
|
||||||
|
parts = dl["file"].split("/")
|
||||||
|
domain = parts[1] if len(parts) >= 3 else parts[0]
|
||||||
|
dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1
|
||||||
|
|
||||||
|
# Domain breakdown of orphans
|
||||||
|
orphan_by_domain = {}
|
||||||
|
for o in orphaned:
|
||||||
|
parts = o.split("/")
|
||||||
|
domain = parts[1] if len(parts) >= 3 else parts[0]
|
||||||
|
orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1
|
||||||
|
|
||||||
|
return {
|
||||||
|
"summary": {
|
||||||
|
"total_files": len(all_files),
|
||||||
|
"total_links": total_links,
|
||||||
|
"files_with_links": files_with_links,
|
||||||
|
"files_without_links": len(all_files) - files_with_links,
|
||||||
|
"dead_link_count": len(dead_links),
|
||||||
|
"orphan_count": len(orphaned),
|
||||||
|
"avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
|
||||||
|
"high_connectivity_count": len(high_connectivity),
|
||||||
|
},
|
||||||
|
"dead_links": dead_links,
|
||||||
|
"dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
|
||||||
|
"orphaned": orphaned,
|
||||||
|
"orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
|
||||||
|
"high_connectivity": [{"file": f, "outbound_links": n} for f, n in high_connectivity[:20]],
|
||||||
|
"inbound_top20": sorted(
|
||||||
|
[{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
|
||||||
|
key=lambda x: -x["inbound_links"],
|
||||||
|
)[:20],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
|
||||||
|
result = audit(codex)
|
||||||
|
json.dump(result, sys.stdout, indent=2)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Print human-readable summary to stderr
|
||||||
|
s = result["summary"]
|
||||||
|
print(f"\n=== Wiki-Link Audit ===", file=sys.stderr)
|
||||||
|
print(f"Files scanned: {s['total_files']}", file=sys.stderr)
|
||||||
|
print(f"Total links: {s['total_links']}", file=sys.stderr)
|
||||||
|
print(f"Files with links: {s['files_with_links']} ({100*s['files_with_links']//max(s['total_files'],1)}%)", file=sys.stderr)
|
||||||
|
print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
|
||||||
|
print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
|
||||||
|
print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
|
||||||
|
print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)
|
||||||
618
scripts/backfill-events.py
Normal file
618
scripts/backfill-events.py
Normal file
|
|
@ -0,0 +1,618 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Backfill contribution_events by replaying merged PRs from pipeline.db + worktree.
|
||||||
|
|
||||||
|
For each merged PR:
|
||||||
|
- Derive author from prs.submitted_by → git author → branch prefix
|
||||||
|
- Emit author event (role=author, weight=0.30, claim_path=NULL)
|
||||||
|
- For each claim file under a knowledge prefix, parse frontmatter and emit
|
||||||
|
originator events for sourcer entries that differ from the author
|
||||||
|
- Emit evaluator events for Leo (when leo_verdict='approve') and domain_agent
|
||||||
|
(when domain_verdict='approve' and not Leo)
|
||||||
|
- Emit challenger/synthesizer events for Pentagon-Agent trailers on
|
||||||
|
agent-owned branches (theseus/*, rio/*, etc.) based on commit_type
|
||||||
|
|
||||||
|
Idempotent via the partial UNIQUE indexes on contribution_events. Safe to re-run.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/backfill-events.py --dry-run # Count events without writing
|
||||||
|
python3 scripts/backfill-events.py # Apply
|
||||||
|
|
||||||
|
Runs read-only against the git worktree; only writes to pipeline.db.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||||
|
REPO_DIR = os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")
|
||||||
|
|
||||||
|
# Role weights — must match lib/contributor.py ROLE_WEIGHTS.
|
||||||
|
ROLE_WEIGHTS = {
|
||||||
|
"author": 0.30,
|
||||||
|
"challenger": 0.25,
|
||||||
|
"synthesizer": 0.20,
|
||||||
|
"originator": 0.15,
|
||||||
|
"evaluator": 0.05,
|
||||||
|
}
|
||||||
|
|
||||||
|
PENTAGON_AGENTS = frozenset({
|
||||||
|
"rio", "leo", "theseus", "vida", "clay", "astra",
|
||||||
|
"oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship",
|
||||||
|
"pipeline",
|
||||||
|
})
|
||||||
|
|
||||||
|
# Keep in sync with lib/attribution.AGENT_BRANCH_PREFIXES.
|
||||||
|
# Duplicated here because this script runs standalone (no pipeline package import).
|
||||||
|
AGENT_BRANCH_PREFIXES = (
|
||||||
|
"rio/", "theseus/", "leo/", "vida/", "astra/", "clay/", "oberon/",
|
||||||
|
)
|
||||||
|
|
||||||
|
TRAILER_EVENT_ROLE = {
|
||||||
|
"challenge": "challenger",
|
||||||
|
"enrich": "synthesizer",
|
||||||
|
"research": "synthesizer",
|
||||||
|
"reweave": "synthesizer",
|
||||||
|
}
|
||||||
|
|
||||||
|
KNOWLEDGE_PREFIXES = ("domains/", "core/", "foundations/", "decisions/")
|
||||||
|
|
||||||
|
BOT_AUTHORS = frozenset({
|
||||||
|
"teleo", "teleo-bot", "pipeline",
|
||||||
|
"github-actions[bot]", "forgejo-actions",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_handle(conn: sqlite3.Connection, handle: str) -> str:
|
||||||
|
if not handle:
|
||||||
|
return ""
|
||||||
|
h = handle.strip().lower().lstrip("@")
|
||||||
|
row = conn.execute("SELECT canonical FROM contributor_aliases WHERE alias = ?", (h,)).fetchone()
|
||||||
|
if row:
|
||||||
|
return row[0]
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
def classify_kind(handle: str) -> str:
|
||||||
|
h = handle.strip().lower().lstrip("@")
|
||||||
|
return "agent" if h in PENTAGON_AGENTS else "person"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_frontmatter(text: str):
|
||||||
|
"""Minimal YAML frontmatter parser using PyYAML when available."""
|
||||||
|
if not text.startswith("---"):
|
||||||
|
return None
|
||||||
|
end = text.find("---", 3)
|
||||||
|
if end == -1:
|
||||||
|
return None
|
||||||
|
raw = text[3:end]
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
fm = yaml.safe_load(raw)
|
||||||
|
return fm if isinstance(fm, dict) else None
|
||||||
|
except ImportError:
|
||||||
|
return None
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sourcers_from_file(path: Path) -> list[str]:
|
||||||
|
"""Return the sourcer handles from a claim file's frontmatter.
|
||||||
|
|
||||||
|
Matches three formats:
|
||||||
|
1. Block: `attribution: { sourcer: [{handle: "x"}, ...] }`
|
||||||
|
2. Bare-key flat: `sourcer: alexastrum`
|
||||||
|
3. Prefix-keyed: `attribution_sourcer: alexastrum`
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
content = path.read_text(encoding="utf-8")
|
||||||
|
except (FileNotFoundError, PermissionError, UnicodeDecodeError):
|
||||||
|
return []
|
||||||
|
fm = parse_frontmatter(content)
|
||||||
|
if not fm:
|
||||||
|
return []
|
||||||
|
|
||||||
|
handles: list[str] = []
|
||||||
|
|
||||||
|
attr = fm.get("attribution")
|
||||||
|
if isinstance(attr, dict):
|
||||||
|
entries = attr.get("sourcer", [])
|
||||||
|
if isinstance(entries, list):
|
||||||
|
for e in entries:
|
||||||
|
if isinstance(e, dict) and "handle" in e:
|
||||||
|
handles.append(e["handle"])
|
||||||
|
elif isinstance(e, str):
|
||||||
|
handles.append(e)
|
||||||
|
elif isinstance(entries, str):
|
||||||
|
handles.append(entries)
|
||||||
|
return handles
|
||||||
|
|
||||||
|
flat = fm.get("attribution_sourcer")
|
||||||
|
if flat:
|
||||||
|
if isinstance(flat, str):
|
||||||
|
handles.append(flat)
|
||||||
|
elif isinstance(flat, list):
|
||||||
|
handles.extend(v for v in flat if isinstance(v, str))
|
||||||
|
if handles:
|
||||||
|
return handles
|
||||||
|
|
||||||
|
bare = fm.get("sourcer")
|
||||||
|
if bare:
|
||||||
|
if isinstance(bare, str):
|
||||||
|
handles.append(bare)
|
||||||
|
elif isinstance(bare, list):
|
||||||
|
handles.extend(v for v in bare if isinstance(v, str))
|
||||||
|
|
||||||
|
return handles
|
||||||
|
|
||||||
|
|
||||||
|
_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")
|
||||||
|
|
||||||
|
|
||||||
|
def valid_handle(h: str) -> bool:
|
||||||
|
if not h:
|
||||||
|
return False
|
||||||
|
lower = h.strip().lower().lstrip("@")
|
||||||
|
if lower.endswith("-") or lower.endswith("_"):
|
||||||
|
return False
|
||||||
|
return bool(_HANDLE_RE.match(lower))
|
||||||
|
|
||||||
|
|
||||||
|
def git(*args, cwd: str = REPO_DIR, timeout: int = 30) -> str:
|
||||||
|
"""Run a git command, return stdout. Returns empty string on failure."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["git", *args],
|
||||||
|
cwd=cwd, capture_output=True, text=True, timeout=timeout, check=False,
|
||||||
|
)
|
||||||
|
return result.stdout
|
||||||
|
except (subprocess.TimeoutExpired, OSError):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def git_first_commit_author(pr_branch: str, merged_at: str) -> str:
    """Best-effort: recover the git author of the branch's first non-merge commit.

    Intentionally a no-op stub: PR branches are deleted after merge, so the
    branch history is gone by backfill time. Scanning main around *merged_at*
    for branch-slug matches rarely yields anything, so the backfill leans on
    submitted_by plus the branch prefix instead.

    Returns:
        Always the empty string (no author recovered).
    """
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def derive_author(conn: sqlite3.Connection, pr: dict) -> str | None:
    """Author precedence: submitted_by → branch-prefix agent for agent-owned branches."""
    # Prefer the explicit submitter, unless it's a known bot account.
    submitted = pr.get("submitted_by")
    if submitted:
        handle = submitted.strip().lower().lstrip("@")
        if handle and handle not in BOT_AUTHORS:
            return handle

    # Fall back to the branch owner for agent-owned branches (agent/<slug>).
    branch = pr.get("branch") or ""
    prefix, sep, _rest = branch.partition("/")
    if sep:
        agent = prefix.lower()
        if agent in ("rio", "theseus", "leo", "vida", "clay", "astra", "oberon"):
            return agent

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_pr_for_claim(
    conn: sqlite3.Connection,
    repo: Path,
    md: Path,
) -> tuple[int | None, str]:
    """Recover the Forgejo PR number that introduced a claim file.

    Returns (pr_number, strategy) — strategy is one of:
      'sourced_from'       — frontmatter sourced_from matched prs.source_path
      'git_subject'        — git log first-add commit message matched a branch pattern
      'github_pr'          — recovery commit mentioned GitHub PR # → prs.github_pr
      'git_time_proximity' — bare "Extract N claims" subject matched by merge-time window
      'none'               — no strategy found a match

    Order is chosen by reliability:
      1. sourced_from (explicit provenance, most reliable when present)
      2. git_subject (covers Leo research, Cameron challenges, Theseus contrib)
      3. github_pr (recovery commits referencing erased GitHub PRs)
      4. git_time_proximity (last resort for agent-less subjects)

    The caller layers a 'title_desc' fallback (filename stem vs prs.description)
    on top when this helper returns (None, 'none').
    """
    rel = str(md.relative_to(repo))

    # ── Strategy 1: sourced_from frontmatter → prs.source_path ──
    try:
        content = md.read_text(encoding="utf-8")
    except (FileNotFoundError, PermissionError, UnicodeDecodeError):
        content = ""
    fm = parse_frontmatter(content) if content else None
    if fm:
        sourced = fm.get("sourced_from")
        candidate_paths: list[str] = []
        if isinstance(sourced, str) and sourced:
            candidate_paths.append(sourced)
        elif isinstance(sourced, list):
            candidate_paths.extend(s for s in sourced if isinstance(s, str))
        for sp in candidate_paths:
            stem = Path(sp).stem
            if not stem:
                continue
            row = conn.execute(
                """SELECT number FROM prs
                   WHERE source_path LIKE ? AND status='merged'
                   ORDER BY merged_at ASC LIMIT 1""",
                (f"%{stem}.md",),
            ).fetchone()
            if row:
                return row["number"], "sourced_from"

    # ── Strategy 2: git log first-add commit → subject pattern → prs.branch ──
    # Default log order is reverse-chronological; take the last block (oldest)
    # to get the original addition, not later rewrites.
    log_out = git(
        "log", "--diff-filter=A", "--follow",
        "--format=%H|||%s|||%b", "--", rel,
    )
    if log_out.strip():
        # Each commit yields 3 '|||'-separated fields, but %b may span blank
        # lines — group output into commits on lines that start with a SHA.
        blocks: list[tuple[str, str, str]] = []
        current: list[str] = []
        for line in log_out.splitlines():
            if re.match(r"^[a-f0-9]{40}\|\|\|", line):
                if current:
                    parts = "\n".join(current).split("|||", 2)
                    if len(parts) == 3:
                        blocks.append((parts[0], parts[1], parts[2]))
                current = [line]
            else:
                current.append(line)
        if current:
            parts = "\n".join(current).split("|||", 2)
            if len(parts) == 3:
                blocks.append((parts[0], parts[1], parts[2]))

        if blocks:
            # Oldest addition — git log defaults to reverse-chronological.
            _oldest_sha, subject, body = blocks[-1]

            # Pattern: "<agent>: extract claims from <slug>"
            m = re.match(r"^(\w+):\s*extract\s+claims\s+from\s+(\S+)", subject)
            if m:
                # FIX: the previous rstrip(".md") stripped the CHARACTER SET
                # {'.', 'm', 'd'} from the right, mangling slugs that merely
                # end in 'm'/'d' (e.g. "foo-random" → "foo-rando") and
                # broadening the LIKE prefix to false-positive territory.
                # removesuffix() removes only a literal ".md" extension.
                slug = m.group(2).removesuffix(".md").rstrip(".")
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"extract/{slug}%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Pattern: "<agent>: research session <date>"
            m = re.match(r"^(\w+):\s*research\s+session\s+(\d{4}-\d{2}-\d{2})", subject)
            if m:
                agent = m.group(1).lower()
                date = m.group(2)
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"{agent}/research-{date}%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Pattern: "<agent>: challenge" / contrib challenges / entity batches
            m = re.match(r"^(\w+):\s*(?:challenge|contrib|entity|synthesize)", subject)
            if m:
                agent = m.group(1).lower()
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"{agent}/%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Recovery commits referencing erased GitHub PRs (Alex/Cameron).
            # Subject: "Recover <who> contribution from GitHub PR #NN (...)".
            # Match only when a corresponding prs row exists with github_pr=NN —
            # otherwise the claims were direct-to-main without a Forgejo PR
            # record, which requires a synthetic PR row (follow-up, not in
            # this script's scope).
            gh_match = re.search(r"GitHub\s+PR\s+#(\d+)", subject + "\n" + body)
            if gh_match:
                gh_pr = int(gh_match.group(1))
                row = conn.execute(
                    "SELECT number FROM prs WHERE github_pr = ? AND status='merged' LIMIT 1",
                    (gh_pr,),
                ).fetchone()
                if row:
                    return row["number"], "github_pr"

            # Pattern: bare "Extract N claims from <source-fragment>" (no
            # agent prefix). Used in early research PRs like Shaga's claims
            # at PR #2025. Fall back to time-proximity: find the earliest
            # agent-branch PR merged within 24h AFTER this commit's date.
            m = re.match(r"^Extract\s+\d+\s+claims\s+from\b", subject)
            if m:
                # Get commit author date.
                date_out = git(
                    "log", "-1", "--format=%aI", _oldest_sha, timeout=10,
                )
                commit_date = date_out.strip() if date_out.strip() else None
                if commit_date:
                    # git %aI returns ISO 8601 with T-separator; prs.merged_at
                    # uses SQLite's space-separator. Lexicographic comparison
                    # fails across formats (space<T), so normalize commit_date
                    # via datetime() before comparing. Without this, PRs merged
                    # within the same calendar day but earlier than the commit
                    # hour are silently excluded (caught by Ganymede review —
                    # Shaga's #2025 was dropped in favor of later #2032).
                    row = conn.execute(
                        """SELECT number FROM prs
                           WHERE status='merged'
                             AND merged_at >= datetime(?)
                             AND merged_at <= datetime(datetime(?), '+24 hours')
                             AND (branch LIKE 'leo/%' OR branch LIKE 'theseus/%'
                                  OR branch LIKE 'rio/%' OR branch LIKE 'astra/%'
                                  OR branch LIKE 'vida/%' OR branch LIKE 'clay/%')
                           ORDER BY merged_at ASC LIMIT 1""",
                        (commit_date, commit_date),
                    ).fetchone()
                    if row:
                        return row["number"], "git_time_proximity"

    return None, "none"
|
||||||
|
|
||||||
|
|
||||||
|
def emit(conn, counts, dry_run, handle, role, pr_number, claim_path, domain, channel, timestamp):
    """Record one contribution event (or tally it under dry-run).

    Normalizes and validates the handle first; invalid handles are dropped
    silently. Duplicate events are suppressed by INSERT OR IGNORE and
    counted under (role, 'skipped_dup').
    """
    canonical = normalize_handle(conn, handle)
    if not valid_handle(canonical):
        return
    kind = classify_kind(canonical)
    weight = ROLE_WEIGHTS[role]

    counts[(role, "attempt")] += 1
    if dry_run:
        counts[(role, "would_insert")] += 1
        return

    cursor = conn.execute(
        """INSERT OR IGNORE INTO contribution_events
           (handle, kind, role, weight, pr_number, claim_path, domain, channel, timestamp)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, COALESCE(?, datetime('now')))""",
        (canonical, kind, role, weight, pr_number, claim_path, domain, channel, timestamp),
    )
    outcome = "inserted" if cursor.rowcount > 0 else "skipped_dup"
    counts[(role, outcome)] += 1
|
||||||
|
|
||||||
|
|
||||||
|
def files_added_in_pr(pr_number: int, branch: str) -> list[str]:
    """Best-effort: list added .md files in the PR. Currently always [].

    Post-merge PR branches are deleted from Forgejo, so the PR can no longer
    be diffed, and main's history does not record which files a given PR
    touched. prs.source_path only points at the source inbox file for
    extract/* PRs, not the claim files themselves. The backfill therefore
    accepts the loss: it emits only PR-level events (author, evaluator,
    challenger/synthesizer); originator credit comes from parsing claim-file
    attribution instead.
    """
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Replay merged PRs into contribution_events (PR-level + claim-level passes)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=0, help="Process at most N PRs (0 = all)")
    args = parser.parse_args()

    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row

    # Sanity: contribution_events exists (v24 migration applied).
    try:
        conn.execute("SELECT 1 FROM contribution_events LIMIT 1")
    except sqlite3.OperationalError:
        print("ERROR: contribution_events table missing. Run migration v24 first.", file=sys.stderr)
        sys.exit(2)

    # Walk all merged knowledge PRs, oldest first.
    query = """
        SELECT number, branch, domain, source_channel, submitted_by,
               leo_verdict, domain_verdict, domain_agent,
               commit_type, merged_at
        FROM prs
        WHERE status = 'merged'
        ORDER BY merged_at ASC
    """
    if args.limit:
        # --limit is an int from argparse, so the f-string interpolation is safe.
        query += f" LIMIT {args.limit}"
    prs = conn.execute(query).fetchall()
    print(f"Replaying {len(prs)} merged PRs (dry_run={args.dry_run})...")

    counts: Counter = Counter()
    repo = Path(REPO_DIR)

    # ── Pass 1: PR-level events ──
    for pr in prs:
        pr_number = pr["number"]
        branch = pr["branch"] or ""
        domain = pr["domain"]
        channel = pr["source_channel"]
        merged_at = pr["merged_at"]

        # Skip pipeline-only branches for author credit (extract/*, reweave/*,
        # fix/*, ingestion/*, epimetheus/*) — those are infrastructure. But
        # evaluator credit for Leo/domain_agent still applies.
        # NOTE(review): is_pipeline_branch is computed but never read below —
        # confirm whether the author-skip it describes was meant to gate the
        # derive_author() call.
        is_pipeline_branch = branch.startswith((
            "extract/", "reweave/", "fix/", "ingestion/", "epimetheus/",
        ))

        # ── AUTHOR ──
        # For pipeline branches, submitted_by carries the real author (the
        # human who submitted the source via Telegram/etc). For agent branches,
        # the agent is author. For external branches (gh-pr-*), git author is
        # in submitted_by from the sync-mirror pipeline.
        author = derive_author(conn, dict(pr))
        if author:
            emit(conn, counts, args.dry_run, author, "author", pr_number,
                 None, domain, channel, merged_at)

        # ── EVALUATOR ──
        if pr["leo_verdict"] == "approve":
            emit(conn, counts, args.dry_run, "leo", "evaluator", pr_number,
                 None, domain, channel, merged_at)
        if pr["domain_verdict"] == "approve" and pr["domain_agent"]:
            dagent = pr["domain_agent"].strip().lower()
            # Avoid double-crediting Leo when he is also the domain agent.
            if dagent and dagent != "leo":
                emit(conn, counts, args.dry_run, dagent, "evaluator", pr_number,
                     None, domain, channel, merged_at)

        # ── CHALLENGER / SYNTHESIZER from branch+commit_type ──
        # Only fires on agent-owned branches. Pipeline branches aren't creditable
        # work (they're machine extraction, evaluator already captures the review).
        if branch.startswith(AGENT_BRANCH_PREFIXES):
            prefix = branch.split("/", 1)[0].lower()
            event_role = TRAILER_EVENT_ROLE.get(pr["commit_type"] or "")
            if event_role:
                emit(conn, counts, args.dry_run, prefix, event_role, pr_number,
                     None, domain, channel, merged_at)

        # ── ORIGINATOR per claim ──
        # DEFERRED in this loop: per-claim originator events require branch
        # introspection that fails on deleted branches. Backfill emits PR-level
        # events only here; the claim-level pass below recovers originators.
        # Forward traffic (post-deploy) gets per-claim originator events via
        # record_contributor_attribution's added-files walk.

    if not args.dry_run:
        conn.commit()

    # Originator is emitted in the claim-level pass below, not the PR-level pass.
    # Previous summary listed it here with attempted=0 which confused operators.
    print("\n=== PR-level events (author, evaluator, challenger, synthesizer) ===")
    for role in ("author", "challenger", "synthesizer", "evaluator"):
        att = counts[(role, "attempt")]
        if args.dry_run:
            wi = counts[(role, "would_insert")]
            print(f" {role:12s} attempted={att:5d} would_insert={wi:5d}")
        else:
            ins = counts[(role, "inserted")]
            skip = counts[(role, "skipped_dup")]
            print(f" {role:12s} attempted={att:5d} inserted={ins:5d} skipped_dup={skip:5d}")

    # ── Pass 2: per-claim originator ──
    # Walk the knowledge tree, parse sourcer attribution, and attach each claim
    # to its merging PR via find_pr_for_claim's multi-strategy recovery.
    # Apr 24 rewrite (Ganymede-approved): replaces the single-strategy
    # title→description match with four strategies in reliability order.
    # Previous script missed PRs with NULL description (Cameron #3377) and
    # cross-context claims (Shaga's Leo research). Fallback title-match is
    # preserved to recover anything the git-log path misses.
    print("\n=== Claim-level originator pass ===")
    # Build title → pr_number map from prs.description (strategy 3 fallback).
    title_to_pr: dict[str, int] = {}
    for r in conn.execute(
        "SELECT number, description FROM prs WHERE status='merged' AND description IS NOT NULL AND description != ''"
    ).fetchall():
        desc = r["description"] or ""
        for title in desc.split(" | "):
            title = title.strip()
            if title:
                # Last-writer wins. Conflicts are rare (titles unique in practice).
                title_to_pr[title.lower()] = r["number"]

    claim_counts = Counter()
    strategy_counts = Counter()
    claim_count = 0
    originator_count = 0
    for md in sorted(repo.glob("domains/**/*.md")) + \
              sorted(repo.glob("core/**/*.md")) + \
              sorted(repo.glob("foundations/**/*.md")) + \
              sorted(repo.glob("decisions/**/*.md")):
        rel = str(md.relative_to(repo))
        stem = md.stem

        # Strategies 1, 2, 4 via the helper (sourced_from, git_subject, github_pr).
        pr_number, strategy = find_pr_for_claim(conn, repo, md)

        # Strategy 3 (fallback): title-match against prs.description.
        if not pr_number:
            pr_number = title_to_pr.get(stem.lower())
            if not pr_number:
                pr_number = title_to_pr.get(stem.replace("-", " ").lower())
            if pr_number:
                strategy = "title_desc"

        if not pr_number:
            claim_counts["no_pr_match"] += 1
            continue

        sourcers = extract_sourcers_from_file(md)
        if not sourcers:
            claim_counts["no_sourcer"] += 1
            continue

        claim_count += 1
        strategy_counts[strategy] += 1
        # Look up author for this PR to skip self-credit.
        pr_row = conn.execute(
            "SELECT submitted_by, branch, domain, source_channel, merged_at FROM prs WHERE number = ?",
            (pr_number,),
        ).fetchone()
        if not pr_row:
            continue
        author = derive_author(conn, dict(pr_row))
        author_canonical = normalize_handle(conn, author) if author else None

        for src_handle in sourcers:
            src_canonical = normalize_handle(conn, src_handle)
            if not valid_handle(src_canonical):
                claim_counts["invalid_handle"] += 1
                continue
            if src_canonical == author_canonical:
                # The PR author already got 'author' credit — no double-dip.
                claim_counts["skip_self"] += 1
                continue
            emit(conn, counts, args.dry_run, src_handle, "originator", pr_number,
                 rel, pr_row["domain"], pr_row["source_channel"], pr_row["merged_at"])
            originator_count += 1

    if not args.dry_run:
        conn.commit()

    print(f" Claims processed: {claim_count}")
    print(f" Originator events emitted: {originator_count}")
    print(f" Breakdown: {dict(claim_counts)}")
    print(f" Strategy hits: {dict(strategy_counts)}")
    att = counts[("originator", "attempt")]
    if args.dry_run:
        wi = counts[("originator", "would_insert")]
        print(f" {'originator':12s} attempted={att:5d} would_insert={wi:5d}")
    else:
        ins = counts[("originator", "inserted")]
        skip = counts[("originator", "skipped_dup")]
        print(f" {'originator':12s} attempted={att:5d} inserted={ins:5d} skipped_dup={skip:5d}")

    if not args.dry_run:
        total = conn.execute("SELECT COUNT(*) FROM contribution_events").fetchone()[0]
        print(f"\nTotal contribution_events rows: {total}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
261
scripts/backfill-sourcer-attribution.py
Executable file
261
scripts/backfill-sourcer-attribution.py
Executable file
|
|
@ -0,0 +1,261 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Backfill sourcer/extractor/etc. attribution from claim frontmatter.
|
||||||
|
|
||||||
|
Walks every merged knowledge file under domains/, entities/, decisions/,
|
||||||
|
foundations/, convictions/, core/ and re-runs the canonical attribution
|
||||||
|
parser (lib/attribution.py). For each parsed (handle, role) pair, increments
|
||||||
|
the corresponding *_count column on the contributors table.
|
||||||
|
|
||||||
|
Why this is needed (Apr 24 incident):
|
||||||
|
- lib/contributor.py used a diff-line regex parser that handled neither
|
||||||
|
the bare-key flat format (`sourcer: alexastrum`, ~42% of claims) nor
|
||||||
|
the nested `attribution: { sourcer: [...] }` block format used by Leo's
|
||||||
|
manual extractions (Shaga's claims).
|
||||||
|
- Result: alexastrum, thesensatore, cameron-s1, and similar handles were
|
||||||
|
silently dropped at merge time. Their contributor rows either don't
|
||||||
|
exist or are stuck at zero counts.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 backfill-sourcer-attribution.py --dry-run # report deltas, no writes
|
||||||
|
python3 backfill-sourcer-attribution.py # apply (additive: max(db, truth))
|
||||||
|
python3 backfill-sourcer-attribution.py --reset # destructive: set absolute truth
|
||||||
|
|
||||||
|
Default mode is ADDITIVE for safety: per-role count is set to max(current_db, truth).
|
||||||
|
This preserves any existing high counts that came from non-frontmatter sources
|
||||||
|
(e.g., m3taversal.sourcer=1011 reflects Telegram-curator credit accumulated via
|
||||||
|
a different code path; truncating to the file-walk truth would be destructive).
|
||||||
|
|
||||||
|
Use --reset to set absolute truth from the file walk only — this clobbers
|
||||||
|
all existing role counts including legitimate non-frontmatter credit.
|
||||||
|
|
||||||
|
Idempotency: additive mode is safe to re-run. --reset run is gated by an
|
||||||
|
audit_log marker; pass --force to override.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Allow running from anywhere — point at pipeline lib
|
||||||
|
PIPELINE_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
sys.path.insert(0, str(PIPELINE_ROOT))
|
||||||
|
|
||||||
|
from lib.attribution import parse_attribution_from_file, VALID_ROLES # noqa: E402
|
||||||
|
|
||||||
|
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||||
|
REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
|
||||||
|
KNOWLEDGE_PREFIXES = (
|
||||||
|
"domains", "entities", "decisions", "foundations", "convictions", "core",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def collect_attributions(repo_root: Path) -> dict[str, dict[str, int]]:
    """Walk all knowledge files; return {handle: {role: count}}."""
    tally: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    files_scanned = 0
    files_with_attribution = 0

    for prefix in KNOWLEDGE_PREFIXES:
        tree = repo_root / prefix
        if not tree.exists():
            continue
        for md_path in tree.rglob("*.md"):
            # Underscore-prefixed files are templates/indexes, not claims.
            if md_path.name.startswith("_"):
                continue
            files_scanned += 1
            parsed = parse_attribution_from_file(str(md_path))
            found_any = False
            for role, entries in parsed.items():
                for entry in entries:
                    handle = entry.get("handle")
                    if handle:
                        tally[handle][role] += 1
                        found_any = True
            if found_any:
                files_with_attribution += 1

    print(f" Scanned {files_scanned} knowledge files", file=sys.stderr)
    print(f" {files_with_attribution} had parseable attribution", file=sys.stderr)
    return tally
|
||||||
|
|
||||||
|
|
||||||
|
def existing_contributors(conn) -> dict[str, dict[str, int]]:
    """Return {handle: {role: count}} from the current DB.

    NULL counts are coerced to 0 so callers can compare against the
    file-walk truth without None-handling.
    """
    query = (
        "SELECT handle, sourcer_count, extractor_count, challenger_count, "
        "synthesizer_count, reviewer_count, claims_merged FROM contributors"
    )
    snapshot: dict[str, dict[str, int]] = {}
    for row in conn.execute(query).fetchall():
        snapshot[row["handle"]] = {
            "sourcer": row["sourcer_count"] or 0,
            "extractor": row["extractor_count"] or 0,
            "challenger": row["challenger_count"] or 0,
            "synthesizer": row["synthesizer_count"] or 0,
            "reviewer": row["reviewer_count"] or 0,
            "claims_merged": row["claims_merged"] or 0,
        }
    return snapshot
|
||||||
|
|
||||||
|
|
||||||
|
def claims_merged_for(role_counts: dict[str, int]) -> int:
    """Mirror upsert_contributor logic: claims_merged += sourcer + extractor."""
    merge_roles = ("sourcer", "extractor")
    return sum(role_counts.get(role, 0) for role in merge_roles)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Backfill per-role contributor counts from claim-file frontmatter.

    Default mode is ADDITIVE (per-role count = max(db, truth)); --reset
    clobbers counts with the file-walk truth and is gated by an audit_log
    marker unless --force is passed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Report deltas without writing")
    parser.add_argument("--reset", action="store_true",
                        help="Destructive: set absolute truth from file walk "
                             "(default is additive max(db, truth))")
    parser.add_argument("--force", action="store_true",
                        help="Re-run even if a previous --reset marker exists")
    args = parser.parse_args()

    if not REPO.exists():
        print(f"ERROR: repo not found at {REPO}", file=sys.stderr)
        sys.exit(1)

    # Progress goes to stderr so stdout stays a clean delta report.
    print(f"DB: {DB_PATH}", file=sys.stderr)
    print(f"Repo: {REPO}", file=sys.stderr)
    print("", file=sys.stderr)
    print("Walking knowledge tree...", file=sys.stderr)

    truth = collect_attributions(REPO)
    print(f" Found attributions for {len(truth)} unique handles", file=sys.stderr)
    print("", file=sys.stderr)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    current = existing_contributors(conn)

    # Compute deltas: new handles + handles with role-count mismatches.
    new_handles: list[tuple[str, dict[str, int]]] = []
    role_deltas: list[tuple[str, dict[str, int], dict[str, int]]] = []

    for handle, roles in truth.items():
        if handle not in current:
            new_handles.append((handle, dict(roles)))
        else:
            cur = current[handle]
            mismatches = {r: roles.get(r, 0) for r in VALID_ROLES
                          if roles.get(r, 0) != cur.get(r, 0)}
            if mismatches:
                role_deltas.append((handle, dict(roles), cur))

    # Report top-20 of each delta class, largest contributors first.
    print(f"=== {len(new_handles)} NEW contributors to insert ===")
    for handle, roles in sorted(new_handles, key=lambda x: -sum(x[1].values()))[:20]:
        roles_str = ", ".join(f"{r}={c}" for r, c in roles.items() if c > 0)
        print(f" + {handle}: {roles_str} (claims_merged={claims_merged_for(roles)})")
    if len(new_handles) > 20:
        print(f" ... and {len(new_handles) - 20} more")
    print()

    print(f"=== {len(role_deltas)} EXISTING contributors with count drift ===")
    for handle, truth_roles, cur_roles in sorted(
        role_deltas,
        key=lambda x: -sum(x[1].values()),
    )[:20]:
        for role in VALID_ROLES:
            t = truth_roles.get(role, 0)
            c = cur_roles.get(role, 0)
            if t != c:
                print(f" ~ {handle}.{role}: db={c} → truth={t} (Δ{t - c:+d})")
    if len(role_deltas) > 20:
        print(f" ... and {len(role_deltas) - 20} more")
    print()

    if args.dry_run:
        mode = "RESET" if args.reset else "ADDITIVE"
        print(f"Dry run ({mode} mode) — no changes written.")
        if not args.reset:
            print("Default is ADDITIVE: existing high counts (e.g. m3taversal=1011) preserved.")
            print("Pass --reset to clobber existing counts with file-walk truth.")
        return

    # Idempotency: --reset is gated by audit marker. Additive mode is always safe.
    if args.reset:
        marker = conn.execute(
            "SELECT 1 FROM audit_log WHERE event = 'sourcer_attribution_backfill_reset' LIMIT 1"
        ).fetchone()
        if marker and not args.force:
            print("ERROR: --reset has already run (audit marker present).")
            print("Pass --force to re-run.")
            sys.exit(2)

    inserted = 0
    updated = 0
    preserved_higher = 0
    for handle, roles in truth.items():
        truth_counts = {
            "sourcer": roles.get("sourcer", 0),
            "extractor": roles.get("extractor", 0),
            "challenger": roles.get("challenger", 0),
            "synthesizer": roles.get("synthesizer", 0),
            "reviewer": roles.get("reviewer", 0),
        }

        if handle in current:
            cur = current[handle]
            if args.reset:
                # Preserve reviewer_count even on reset (PR-level not file-level).
                final = dict(truth_counts)
                final["reviewer"] = max(truth_counts["reviewer"], cur.get("reviewer", 0))
            else:
                # Additive: max of db vs truth, per role.
                final = {
                    role: max(truth_counts[role], cur.get(role, 0))
                    for role in truth_counts
                }
                if any(cur.get(r, 0) > truth_counts[r] for r in truth_counts):
                    preserved_higher += 1

            # claims_merged mirrors upsert_contributor: sourcer + extractor.
            cm = final["sourcer"] + final["extractor"]
            conn.execute(
                """UPDATE contributors SET
                       sourcer_count = ?,
                       extractor_count = ?,
                       challenger_count = ?,
                       synthesizer_count = ?,
                       reviewer_count = ?,
                       claims_merged = ?,
                       updated_at = datetime('now')
                   WHERE handle = ?""",
                (final["sourcer"], final["extractor"], final["challenger"],
                 final["synthesizer"], final["reviewer"], cm, handle),
            )
            updated += 1
        else:
            cm = truth_counts["sourcer"] + truth_counts["extractor"]
            conn.execute(
                """INSERT INTO contributors (
                       handle, sourcer_count, extractor_count, challenger_count,
                       synthesizer_count, reviewer_count, claims_merged,
                       first_contribution, last_contribution, tier
                   ) VALUES (?, ?, ?, ?, ?, ?, ?, date('now'), date('now'), 'new')""",
                (handle, truth_counts["sourcer"], truth_counts["extractor"],
                 truth_counts["challenger"], truth_counts["synthesizer"],
                 truth_counts["reviewer"], cm),
            )
            inserted += 1

    # Leave an audit trail; the reset-variant event doubles as the re-run gate.
    event = "sourcer_attribution_backfill_reset" if args.reset else "sourcer_attribution_backfill"
    conn.execute(
        "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)",
        ("contributor", event,
         f'{{"inserted": {inserted}, "updated": {updated}, '
         f'"preserved_higher": {preserved_higher}, "mode": '
         f'"{"reset" if args.reset else "additive"}"}}'),
    )
    conn.commit()
    print(f"Done ({'RESET' if args.reset else 'ADDITIVE'}). "
          f"Inserted {inserted} new, updated {updated} existing, "
          f"preserved {preserved_higher} higher-than-truth values.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -104,14 +104,22 @@ def main():
|
||||||
claims_count = 0
|
claims_count = 0
|
||||||
|
|
||||||
if rel_path in existing:
|
if rel_path in existing:
|
||||||
# Update status if different
|
# Update status if different — but never regress from terminal states.
|
||||||
|
# If DB says 'extracted' or 'null_result' and file happens to be in queue/
|
||||||
|
# (e.g., failed archive push, zombie file), the DB is authoritative.
|
||||||
|
# Downgrading to 'unprocessed' triggers the runaway re-extraction loop.
|
||||||
current = conn.execute("SELECT status FROM sources WHERE path = ?", (rel_path,)).fetchone()
|
current = conn.execute("SELECT status FROM sources WHERE path = ?", (rel_path,)).fetchone()
|
||||||
|
TERMINAL_STATUSES = {"extracted", "null_result", "error", "ghost_no_file"}
|
||||||
if current and current["status"] != status:
|
if current and current["status"] != status:
|
||||||
conn.execute(
|
if current["status"] in TERMINAL_STATUSES and status == "unprocessed":
|
||||||
"UPDATE sources SET status = ?, updated_at = datetime('now') WHERE path = ?",
|
# Don't regress terminal → unprocessed. DB wins.
|
||||||
(status, rel_path),
|
pass
|
||||||
)
|
else:
|
||||||
updated += 1
|
conn.execute(
|
||||||
|
"UPDATE sources SET status = ?, updated_at = datetime('now') WHERE path = ?",
|
||||||
|
(status, rel_path),
|
||||||
|
)
|
||||||
|
updated += 1
|
||||||
else:
|
else:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"""INSERT INTO sources (path, status, priority, claims_count, created_at, updated_at)
|
"""INSERT INTO sources (path, status, priority, claims_count, created_at, updated_at)
|
||||||
|
|
|
||||||
148
scripts/backfill-synthetic-recovery-prs.py
Normal file
148
scripts/backfill-synthetic-recovery-prs.py
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Reconstruct synthetic `prs` rows for historical GitHub PRs lost pre-mirror-wiring.
|
||||||
|
|
||||||
|
Two PRs merged on GitHub before our sync-mirror.sh tracked `github_pr`:
|
||||||
|
- GitHub PR #68: alexastrum — 6 claims, merged 2026-03-09 via GitHub squash,
|
||||||
|
recovered to Forgejo via commit dba00a79 (Apr 16, after mirror erased files)
|
||||||
|
- GitHub PR #88: Cameron-S1 — 1 claim, recovered via commit da64f805
|
||||||
|
|
||||||
|
The recovery commits wrote the files directly to main, so our `prs` table has
|
||||||
|
no row to attach originator events to — the backfill-events.py strategies all
|
||||||
|
return NULL. We reconstruct one synthetic `prs` row per historical GitHub PR so
|
||||||
|
the events pipeline (and `github_pr` strategy in backfill-events) can credit
|
||||||
|
Alex and Cameron properly.
|
||||||
|
|
||||||
|
Numbers 900000+ are clearly synthetic and won't collide with real Forgejo PRs.
|
||||||
|
|
||||||
|
Idempotent via INSERT OR IGNORE.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/backfill-synthetic-recovery-prs.py --dry-run
|
||||||
|
python3 scripts/backfill-synthetic-recovery-prs.py
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Pipeline database location; overridable via the PIPELINE_DB env var.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Historical GitHub PRs recovered via direct-to-main commits.
# Original GitHub merge dates come from the recovery commit messages.
# Each dict maps 1:1 onto the columns main() inserts into `prs`; numbers sit
# in the 900000+ synthetic range so they cannot collide with real Forgejo PRs.
RECOVERY_PRS = [
    {
        "number": 900068,
        "github_pr": 68,
        "branch": "gh-pr-68",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": "alexastrum",
        "source_channel": "github",
        # origin='human' matches lib/merge.py convention for external contributors
        # (default is 'pipeline' which misclassifies us as machine-authored).
        "origin": "human",
        "priority": "high",
        "description": "Multi-agent git workflows production maturity | Cryptographic agent trust ratings | Defense in depth for AI agent oversight | Deterministic policy engines below LLM layer | Knowledge validation four-layer architecture | Structurally separating proposer and reviewer agents",
        "merged_at": "2026-03-09 00:00:00",
        "created_at": "2026-03-08 00:00:00",
        "last_error": "synthetic_recovery: GitHub PR #68 pre-mirror-wiring reconstruction (commit dba00a79)",
    },
    {
        "number": 900088,
        "github_pr": 88,
        "branch": "gh-pr-88",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": "cameron-s1",
        "source_channel": "github",
        "origin": "human",
        "priority": "high",
        "description": "Orthogonality is an artefact of specification architectures not a property of intelligence itself",
        "merged_at": "2026-04-01 00:00:00",
        "created_at": "2026-04-01 00:00:00",
        "last_error": "synthetic_recovery: GitHub PR #88 pre-mirror-wiring reconstruction (commit da64f805)",
    },
]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Insert synthetic `prs` rows for the recovered GitHub PRs.

    Dry-run prints what would happen without writing. Idempotent: both the
    pre-check and the per-row existence check skip rows whose number or
    github_pr already exist.

    Exit codes: 1 = DB missing, 2 = foreign row found in the synthetic range.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        # Guard against synthetic-range colonization (Ganymede review): check for
        # any row in the synthetic range that isn't one of ours. INSERT OR IGNORE on
        # the specific numbers is the real collision defense; this is belt-and-suspenders.
        # The "ours" set is derived from RECOVERY_PRS (previously hard-coded as
        # (900068, 900088)) so the guard stays correct when entries are added.
        max_real = conn.execute(
            "SELECT MAX(number) FROM prs WHERE number < 900000"
        ).fetchone()[0] or 0
        print(f"Max real Forgejo PR number: {max_real}")
        ours = tuple(r["number"] for r in RECOVERY_PRS)
        placeholders = ",".join("?" * len(ours))
        synth_conflict = conn.execute(
            f"SELECT number FROM prs WHERE number >= 900000 "
            f"AND number NOT IN ({placeholders}) LIMIT 1",
            ours,
        ).fetchone()
        if synth_conflict:
            print(f"ERROR: PR #{synth_conflict[0]} already exists in synthetic range. "
                  f"Pick a new range before running.", file=sys.stderr)
            sys.exit(2)

        inserted = 0
        skipped = 0
        for row in RECOVERY_PRS:
            # Either column matching means the row was already backfilled.
            existing = conn.execute(
                "SELECT number FROM prs WHERE number = ? OR github_pr = ?",
                (row["number"], row["github_pr"]),
            ).fetchone()
            if existing:
                print(f" PR #{row['number']} (github_pr={row['github_pr']}): already exists — skip")
                skipped += 1
                continue
            print(f" {'(dry-run) ' if args.dry_run else ''}INSERT synthetic PR #{row['number']} "
                  f"(github_pr={row['github_pr']}, submitted_by={row['submitted_by']}, "
                  f"merged_at={row['merged_at']})")
            if not args.dry_run:
                conn.execute(
                    """INSERT INTO prs (
                        number, github_pr, branch, status, domain, commit_type, tier,
                        leo_verdict, domain_verdict, submitted_by, source_channel,
                        origin, priority,
                        description, merged_at, created_at, last_error
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (
                        row["number"], row["github_pr"], row["branch"], row["status"],
                        row["domain"], row["commit_type"], row["tier"],
                        row["leo_verdict"], row["domain_verdict"],
                        row["submitted_by"], row["source_channel"],
                        row["origin"], row["priority"],
                        row["description"], row["merged_at"], row["created_at"],
                        row["last_error"],
                    ),
                )
                inserted += 1

        if not args.dry_run:
            conn.commit()

        print(f"\nInserted {inserted}, skipped {skipped}")
        if not args.dry_run and inserted:
            print("\nNext step: re-run backfill-events.py to attach originator events")
            print(" python3 ops/backfill-events.py")
    finally:
        # Always release the handle, including on the sys.exit paths above.
        conn.close()


if __name__ == "__main__":
    main()
|
||||||
426
scripts/classify-contributors.py
Normal file
426
scripts/classify-contributors.py
Normal file
|
|
@ -0,0 +1,426 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Classify `contributors` rows into {keep_person, keep_agent, move_to_publisher, delete_garbage}.
|
||||||
|
|
||||||
|
Reads current contributors table, proposes reclassification per v26 schema design:
|
||||||
|
- Real humans + Pentagon agents stay in contributors (kind='person'|'agent')
|
||||||
|
- News orgs, publications, venues move to publishers table (new v26)
|
||||||
|
- Multi-word hyphenated garbage (parsing artifacts) gets deleted
|
||||||
|
- Their contribution_events are handled per category:
|
||||||
|
* Publishers: DELETE events (orgs shouldn't have credit)
|
||||||
|
* Garbage: DELETE events (bogus data)
|
||||||
|
* Persons/agents: keep events untouched
|
||||||
|
|
||||||
|
Classification is heuristic — uses explicit allowlists + regex patterns + length gates.
|
||||||
|
Ambiguous cases default to 'review_needed' (human decision).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/classify-contributors.py # dry-run analysis + report
|
||||||
|
python3 scripts/classify-contributors.py --apply # write changes
|
||||||
|
python3 scripts/classify-contributors.py --show <handle> # inspect a single row
|
||||||
|
|
||||||
|
Writes to pipeline.db only. Does NOT modify claim files.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||||
|
|
||||||
|
# Pentagon agents: kind='agent'. Authoritative list.
PENTAGON_AGENTS = frozenset({
    "rio", "leo", "theseus", "vida", "clay", "astra",
    "oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship",
    "pipeline",
})

# Publisher/news-org handles seen in current contributors table.
# Grouped by kind for the publishers row. Classified by inspection.
# NOTE: This list is hand-curated — add to it as new orgs appear.
PUBLISHERS_NEWS = {
    # News outlets / brands
    "cnbc", "al-jazeera", "axios", "bloomberg", "reuters", "bettorsinsider",
    "fortune", "techcrunch", "coindesk", "coindesk-staff", "coindesk-research",
    "coindesk research", "coindesk staff",
    "defense-one", "thedefensepost", "theregister", "the-intercept",
    "the-meridiem", "variety", "variety-staff", "variety staff", "spacenews",
    "nasaspaceflight", "thedonkey", "insidedefense", "techpolicypress",
    "morganlewis", "casinoorg", "deadline", "animationmagazine",
    "defensepost", "casino-org", "casino.org",
    "air & space forces magazine", "ieee spectrum", "techcrunch-staff",
    "blockworks", "blockworks-staff", "decrypt", "ainvest", "banking-dive", "banking dive",
    "cset-georgetown", "cset georgetown",
    "kff", "kff-health-news", "kff health news", "kff-health-news---cbo",
    "kff-health-news-/-cbo", "kff health news / cbo", "kffhealthnews",
    "bloomberg-law",
    "norton-rose-fulbright", "norton rose fulbright",
    "defence-post", "the-defensepost",
    "wilmerhale", "mofo", "sciencedirect",
    "yogonet", "csr", "aisi-uk", "aisi", "aisi_gov", "rand",
    "armscontrol", "eclinmed", "solana-compass", "solana compass",
    "pmc11919318", "pmc11780016",
    "healthverity", "natrium", "form-energy",
    "courtlistener", "curtis-schiff", "curtis-schiff-prediction-markets",
    "prophetx", "techpolicypress-staff",
    "npr", "venturebeat", "geekwire", "payloadspace", "the-ankler",
    "theankler", "tubefilter", "emarketer", "dagster",
    "numerai",  # fund/project brand, not person
    "psl", "multistate",
}
PUBLISHERS_ACADEMIC = {
    # Academic orgs, labs, papers, journals, institutions
    "arxiv", "metr", "metr_evals", "apollo-research", "apollo research", "apolloresearch",
    "jacc-study-authors", "jacc-data-report-authors",
    "anthropic-fellows-program", "anthropic-fellows",
    "anthropic-fellows-/-alignment-science-team", "anthropic-research",
    "jmir-2024", "jmir 2024",
    "oettl-et-al.,-journal-of-experimental-orthopaedics",
    "oettl et al., journal of experimental orthopaedics",
    "jacc", "nct06548490", "pmc",
    "conitzer-et-al.-(2024)", "aquino-michaels-2026", "pan-et-al.",
    "pan-et-al.-'natural-language-agent-harnesses'",
    "stanford", "stanford-meta-harness",
    "hendershot", "annals-im",
    "nellie-liang,-brookings-institution", "nellie liang, brookings institution",
    "penn-state", "american-heart-association", "american heart association",
    "molt_cornelius", "molt-cornelius",
    # Companies / labs / brand-orgs (not specific humans)
    "anthropic", "anthropicai", "openai", "nasa", "icrc", "ecri",
    "epochairesearch", "metadao", "iapam", "icer",
    "who", "ama", "uspstf", "unknown",
    "futard.io",  # protocol/platform
    "oxford-martin-ai-governance-initiative",
    "oxford-martin-ai-governance",
    "u.s.-food-and-drug-administration",
    "jitse-goutbeek,-european-policy-centre",  # cited person+org string → publisher
    "adepoju-et-al.",  # paper citation
    # Formal-citation names (Firstname-Lastname or Lastname-et-al) — classified
    # as academic citations, not reachable contributors. They'd need an @ handle
    # to get CI credit per Cory's growth-loop design.
    "senator-elissa-slotkin",
    "bostrom", "hanson", "kaufmann", "noah-smith", "doug-shapiro",
    "shayon-sengupta", "shayon sengupta",
    "robin-hanson", "robin hanson", "eliezer-yudkowsky",
    "leopold-aschenbrenner", "aschenbrenner",
    "ramstead", "larsson", "heavey",
    "dan-slimmon", "van-leeuwaarden", "ward-whitt", "adams",
    "tamim-ansary", "spizzirri",
    "dario-amodei",  # formal-citation form (real @ is @darioamodei)
    "corless", "oxranga", "vlahakis",
    # Brand/project/DAO tokens — not individuals
    "areal-dao", "areal", "theiaresearch", "futard-io", "dhrumil",
    # Classic formal-citation names — famous academics/economists cited by surname.
    # Reachable via @ handle if/when they join (e.g. Ostrom has no X, Hayek deceased,
    # Friston has an institutional affiliation not an @ handle we'd track).
    "clayton-christensen", "hidalgo", "coase", "wiener", "juarrero",
    "ostrom", "centola", "hayek", "marshall-mcluhan", "blackmore",
    "knuth", "friston", "aquino-michaels", "conitzer", "bak",
}
# NOTE: pseudonymous X handles that MAY be real contributors stay in keep_person:
# karpathy, simonw, swyx, metaproph3t, metanallok, mmdhrumil, sjdedic,
# ceterispar1bus — these are real X accounts and match Cory's growth loop.
# They appear without @ prefix because extraction frontmatter didn't normalize.
# Auto-creating them as contributors tier='cited' is correct (A-path from earlier).
PUBLISHERS_SOCIAL = {
    "x", "twitter", "telegram", "x.com",
}
PUBLISHERS_INTERNAL = {
    "teleohumanity-manifesto", "strategy-session-journal",
    "living-capital-thesis-development", "attractor-state-historical-backtesting",
    "web-research-compilation", "architectural-investing",
    "governance---meritocratic-voting-+-futarchy",  # title artifact
    "sec-interpretive-release-s7-2026-09-(march-17",  # title artifact
    "mindstudio",  # tooling/platform, not contributor
}

# Flatten the four kind-specific sets into one lowercase handle → kind lookup.
# Later groups win on (unexpected) duplicates, mirroring sequential assignment.
PUBLISHER_KIND_MAP = {
    handle.lower(): kind
    for kind, handles in (
        ("news", PUBLISHERS_NEWS),
        ("academic", PUBLISHERS_ACADEMIC),
        ("social_platform", PUBLISHERS_SOCIAL),
        ("internal", PUBLISHERS_INTERNAL),
    )
    for handle in handles
}
|
||||||
|
|
||||||
|
|
||||||
|
# Garbage: handles that are clearly parse artifacts, not real names.
|
||||||
|
# Pattern: contains parens, special chars, or >50 chars.
|
||||||
|
def is_garbage(handle: str) -> bool:
    """Return True when *handle* looks like a parsing artifact, not a real name.

    Three gates, in order: length > 50, special characters (with an exception
    for a single leading '@'), and known title-fragment shapes (triple-hyphen
    runs, truncated '(march' tails).
    """
    cleaned = handle.strip()
    if len(cleaned) > 50:
        return True
    if re.search(r"[()\[\]<>{}\/\\|@#$%^&*=?!:;\"']", cleaned):
        # '@' appears legitimately as a handle prefix (e.g. @thesensatore):
        # tolerate it only when no OTHER special character is present.
        at_prefix_only = (
            cleaned.startswith("@")
            and not re.search(r"[()\[\]<>{}\/\\|#$%^&*=?!:;\"']", cleaned)
        )
        return not at_prefix_only
    # Multi-word hyphenated with very specific artifact shape: 3+ hyphens in a
    # row or trailing '(march' noise from a truncated title.
    if "---" in cleaned or "---meritocratic" in cleaned:
        return True
    if cleaned.endswith("(march") or cleaned.endswith("-(march"):
        return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def classify(handle: str) -> tuple[str, str | None]:
    """Map a contributor handle to (category, publisher_kind).

    category ∈ {'keep_agent', 'keep_person', 'publisher', 'garbage', 'review_needed'}
    publisher_kind ∈ {'news','academic','social_platform','internal', None}
    """
    normalized = handle.strip().lower().lstrip("@")

    # Precedence: agent allowlist → publisher map → garbage filter → person heuristics.
    if normalized in PENTAGON_AGENTS:
        return ("keep_agent", None)

    publisher_kind = PUBLISHER_KIND_MAP.get(normalized)
    if publisher_kind is not None:
        return ("publisher", publisher_kind)

    if is_garbage(handle):
        return ("garbage", None)

    # @-prefixed handles or short-slug real-looking names → keep as person
    # (Auto-create rule from Cory: @ handles auto-join as tier='cited'.)
    if handle.startswith("@"):
        return ("keep_person", None)

    # Plausible handles (<=39 chars, alphanum + underscore/hyphen): treat as person.
    # 39-char ceiling matches GitHub's handle limit and the writer path in
    # contributor.py::_HANDLE_RE, so a valid 21-39 char real handle won't fall
    # through to review_needed and block --apply.
    if re.match(r"^[a-z0-9][a-z0-9_-]{0,38}$", normalized):
        return ("keep_person", None)

    # Everything else: needs human review
    return ("review_needed", None)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Classify contributors and (with --apply) restructure the DB accordingly.

    Default run is a dry-run report; --show inspects one handle; --apply
    executes the reclassification in a single transaction. --delete-events
    additionally removes contribution_events/contributor_aliases rows for
    deleted handles.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--apply", action="store_true", help="Write changes to DB")
    parser.add_argument("--show", type=str, help="Inspect a single handle")
    parser.add_argument("--delete-events", action="store_true",
                        help="DELETE contribution_events for publishers+garbage (default: keep for audit)")
    args = parser.parse_args()

    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row

    # Sanity: publishers table must exist (v26 migration applied)
    try:
        conn.execute("SELECT 1 FROM publishers LIMIT 1")
    except sqlite3.OperationalError:
        print("ERROR: publishers table missing. Run migration v26 first.", file=sys.stderr)
        sys.exit(2)

    rows = conn.execute(
        "SELECT handle, kind, tier, claims_merged FROM contributors ORDER BY claims_merged DESC"
    ).fetchall()

    # --show: dump one handle's current row, event count, and proposed category.
    if args.show:
        target = args.show.strip().lower().lstrip("@")
        for r in rows:
            if r["handle"].lower().lstrip("@") == target:
                category, pkind = classify(r["handle"])
                # Events are keyed on the lowercase @-stripped form of the handle.
                events_count = conn.execute(
                    "SELECT COUNT(*) FROM contribution_events WHERE handle = ?",
                    (r["handle"].lower().lstrip("@"),),
                ).fetchone()[0]
                print(f"handle: {r['handle']}")
                print(f"current_kind: {r['kind']}")
                print(f"current_tier: {r['tier']}")
                print(f"claims_merged: {r['claims_merged']}")
                print(f"events: {events_count}")
                print(f"→ category: {category}")
                if pkind:
                    print(f"→ publisher: kind={pkind}")
                return
        print(f"No match for '{args.show}'")
        return

    # Classify all
    buckets: dict[str, list[dict]] = {
        "keep_agent": [],
        "keep_person": [],
        "publisher": [],
        "garbage": [],
        "review_needed": [],
    }
    for r in rows:
        category, pkind = classify(r["handle"])
        buckets[category].append({
            "handle": r["handle"],
            "kind_now": r["kind"],
            "tier": r["tier"],
            "claims": r["claims_merged"] or 0,  # NULL-safe
            "publisher_kind": pkind,
        })

    print("=== Classification summary ===")
    for cat, items in buckets.items():
        print(f" {cat:18s} {len(items):5d}")

    print("\n=== Sample of each category ===")
    for cat, items in buckets.items():
        print(f"\n--- {cat} (showing up to 10) ---")
        for item in items[:10]:
            tag = f" → {item['publisher_kind']}" if item["publisher_kind"] else ""
            print(f" {item['handle']:50s} claims={item['claims']:5d}{tag}")

    print("\n=== Full review_needed list ===")
    for item in buckets["review_needed"]:
        print(f" {item['handle']:50s} claims={item['claims']:5d}")

    # Diagnostic: orphan alias count for handles we're about to delete.
    # Contributor_aliases has no FK (SQLite FKs require PRAGMA to enforce anyway),
    # so aliases pointing to deleted canonical handles become orphans. Surface
    # the count so the --delete-events decision is informed.
    doomed = [item["handle"].lower().lstrip("@") for item in buckets["garbage"] + buckets["publisher"]]
    if doomed:
        placeholders = ",".join("?" * len(doomed))
        orphan_count = conn.execute(
            f"SELECT COUNT(*) FROM contributor_aliases WHERE canonical IN ({placeholders})",
            doomed,
        ).fetchone()[0]
        print(f"\n=== Alias orphan check ===")
        print(f" contributor_aliases rows pointing to deletable canonicals: {orphan_count}")
        if orphan_count:
            print(f" (cleanup requires --delete-events; without it, aliases stay as orphans)")

    if not args.apply:
        print("\n(dry-run — no writes. Re-run with --apply to execute.)")
        return

    # ── Apply changes ──
    print("\n=== Applying changes ===")
    if buckets["review_needed"]:
        print(f"ABORT: {len(buckets['review_needed'])} rows need human review. Fix classifier before --apply.")
        sys.exit(3)

    inserted_publishers = 0
    reclassified_agents = 0
    deleted_garbage = 0
    deleted_publisher_rows = 0
    deleted_events = 0
    deleted_aliases = 0

    # Single transaction — if any step errors, roll back. This prevents the failure
    # mode where a publisher insert fails silently and we still delete the contributor
    # row, losing data.
    try:
        conn.execute("BEGIN")

        # 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
        # Counter uses cur.rowcount so replay runs (where publishers already exist)
        # report accurate inserted=0 instead of falsely claiming the full set.
        # moved_to_publisher is unconditional — the contributors row still needs to
        # be deleted even when the publishers row was added in a prior run.
        moved_to_publisher = set()
        for item in buckets["publisher"]:
            name = item["handle"].strip().lower().lstrip("@")
            cur = conn.execute(
                "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
                (name, item["publisher_kind"]),
            )
            if cur.rowcount > 0:
                inserted_publishers += 1
            moved_to_publisher.add(item["handle"])

        # 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
        for item in buckets["keep_agent"]:
            conn.execute(
                "UPDATE contributors SET kind = 'agent' WHERE handle = ?",
                (item["handle"].lower().lstrip("@"),),
            )
            reclassified_agents += 1

        # 3. Delete garbage handles from contributors (and their events + aliases)
        for item in buckets["garbage"]:
            canonical_lower = item["handle"].lower().lstrip("@")
            if args.delete_events:
                cur = conn.execute(
                    "DELETE FROM contribution_events WHERE handle = ?",
                    (canonical_lower,),
                )
                deleted_events += cur.rowcount
                cur = conn.execute(
                    "DELETE FROM contributor_aliases WHERE canonical = ?",
                    (canonical_lower,),
                )
                deleted_aliases += cur.rowcount
            cur = conn.execute(
                "DELETE FROM contributors WHERE handle = ?",
                (item["handle"],),
            )
            deleted_garbage += cur.rowcount

        # 4. Delete publisher rows from contributors — ONLY for those successfully
        # inserted into publishers above. Guards against partial failure.
        # Aliases pointing to publisher-classified handles get cleaned under the
        # same --delete-events gate: publishers live in their own table now, any
        # leftover aliases in contributor_aliases are orphans.
        for item in buckets["publisher"]:
            if item["handle"] not in moved_to_publisher:
                continue
            canonical_lower = item["handle"].lower().lstrip("@")
            if args.delete_events:
                cur = conn.execute(
                    "DELETE FROM contribution_events WHERE handle = ?",
                    (canonical_lower,),
                )
                deleted_events += cur.rowcount
                cur = conn.execute(
                    "DELETE FROM contributor_aliases WHERE canonical = ?",
                    (canonical_lower,),
                )
                deleted_aliases += cur.rowcount
            cur = conn.execute(
                "DELETE FROM contributors WHERE handle = ?",
                (item["handle"],),
            )
            deleted_publisher_rows += cur.rowcount

        # 5. Audit log entry for the destructive operation (Ganymede Q5).
        conn.execute(
            "INSERT INTO audit_log (timestamp, stage, event, detail) VALUES (datetime('now'), ?, ?, ?)",
            (
                "schema_v26",
                "classify_contributors",
                json.dumps({
                    "publishers_inserted": inserted_publishers,
                    "agents_updated": reclassified_agents,
                    "garbage_deleted": deleted_garbage,
                    "publisher_rows_deleted": deleted_publisher_rows,
                    "events_deleted": deleted_events,
                    "aliases_deleted": deleted_aliases,
                    "delete_events_flag": bool(args.delete_events),
                }),
            ),
        )

        conn.commit()
    except Exception as e:
        conn.rollback()
        print(f"ERROR: Transaction failed, rolled back. {e}", file=sys.stderr)
        sys.exit(4)

    print(f" publishers inserted: {inserted_publishers}")
    print(f" agents kind='agent' ensured: {reclassified_agents}")
    print(f" garbage rows deleted: {deleted_garbage}")
    print(f" publisher rows removed from contributors: {deleted_publisher_rows}")
    if args.delete_events:
        print(f" contribution_events deleted: {deleted_events}")
        print(f" contributor_aliases deleted: {deleted_aliases}")
    else:
        print(f" (events + aliases kept — re-run with --delete-events to clean them)")


if __name__ == "__main__":
    main()
|
||||||
137
scripts/contributor-graph.py
Normal file
137
scripts/contributor-graph.py
Normal file
|
|
@ -0,0 +1,137 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate cumulative contributor + claims PNG for Twitter embedding."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.dates as mdates
|
||||||
|
from matplotlib.ticker import MaxNLocator
|
||||||
|
|
||||||
|
# Chart palette: ACCENT draws the claims series, PURPLE the contributors series.
ACCENT = "#00d4aa"
PURPLE = "#7c3aed"
BG = "#0a0a0a"      # figure/axes background
TEXT = "#e0e0e0"
SUBTLE = "#555555"
# Destination PNG served from the static site.
OUTPUT = Path("/opt/teleo-eval/static/contributor-graph.png")
|
||||||
|
|
||||||
|
|
||||||
|
def get_data():
    """Fetch contributor-growth JSON from the local dashboard API."""
    import urllib.request
    url = "http://localhost:8081/api/contributor-growth"
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return json.loads(payload)
|
||||||
|
|
||||||
|
|
||||||
|
def build_continuous_series(milestones, start_date, end_date):
    """Expand sparse milestone data into a step-held daily series.

    Each day from start_date to end_date (inclusive) carries the most recent
    milestone's cumulative value; days before the first milestone are 0.
    Returns parallel lists (dates, values).
    """
    by_day = {
        datetime.strptime(m["date"], "%Y-%m-%d").date(): m["cumulative"]
        for m in milestones
    }
    dates, values = [], []
    level = 0
    total_days = (end_date - start_date).days
    for offset in range(total_days + 1):
        day = start_date + timedelta(days=offset)
        # Hold the last seen cumulative level until the next milestone day.
        level = by_day.get(day, level)
        dates.append(day)
        values.append(level)
    return dates, values
|
||||||
|
|
||||||
|
|
||||||
|
def render(data, output_path):
    """Render the dual-axis claims/contributors growth chart to *output_path*.

    Left axis: cumulative claims (filled line). Right axis: cumulative
    contributors (step plot) with per-milestone name annotations. Also
    overlays hero stats and branding, then saves a PNG.

    The unused local ``offsets`` dict from the original was removed; the
    stagger is computed directly from the milestone index.
    """
    fig, ax1 = plt.subplots(figsize=(12, 6.3), dpi=100)
    fig.patch.set_facecolor(BG)
    ax1.set_facecolor(BG)

    claims = data["cumulative_claims"]
    contribs = data["cumulative_contributors"]

    claim_dates = [datetime.strptime(c["date"], "%Y-%m-%d").date() for c in claims]
    claim_values = [c["cumulative"] for c in claims]

    start = min(claim_dates)
    end = max(claim_dates)

    # Contributors arrive on sparse milestone dates; expand to a daily series
    # so the step plot spans the same x-range as the claims line.
    contrib_dates, contrib_values = build_continuous_series(contribs, start, end)

    # Claims line (left y-axis)
    ax1.fill_between(claim_dates, claim_values, alpha=0.15, color=ACCENT)
    ax1.plot(claim_dates, claim_values, color=ACCENT, linewidth=2.5, label="Claims")
    ax1.set_ylabel("Claims", color=ACCENT, fontsize=12, fontweight="bold")
    ax1.tick_params(axis="y", colors=ACCENT, labelsize=10)
    ax1.set_ylim(bottom=0)

    # Contributors line (right y-axis)
    ax2 = ax1.twinx()
    ax2.set_facecolor("none")
    ax2.fill_between(contrib_dates, contrib_values, alpha=0.1, color=PURPLE, step="post")
    ax2.step(contrib_dates, contrib_values, color=PURPLE, linewidth=2.5,
             where="post", label="Contributors")
    ax2.set_ylabel("Contributors", color=PURPLE, fontsize=12, fontweight="bold")
    ax2.tick_params(axis="y", colors=PURPLE, labelsize=10)
    ax2.yaxis.set_major_locator(MaxNLocator(integer=True))
    # Headroom (1.8x) keeps the step line in the lower band, clear of the text.
    ax2.set_ylim(bottom=0, top=max(contrib_values) * 1.8)

    # Annotate contributor milestones with staggered offsets to avoid overlap
    for i, m in enumerate(contribs):
        d = datetime.strptime(m["date"], "%Y-%m-%d").date()
        val = m["cumulative"]
        names = [n["name"] for n in m["new"]]
        if len(names) <= 2:
            label = ", ".join(names)
        else:
            label = f"+{len(names)}"
        # Alternate between two vertical offsets so adjacent labels don't collide.
        y_off = 8 + (i % 2) * 14
        ax2.annotate(label, (d, val),
                     textcoords="offset points", xytext=(5, y_off),
                     fontsize=7, color=PURPLE, alpha=0.8)

    # Hero stats
    total_claims = data["summary"]["total_claims"]
    total_contribs = data["summary"]["total_contributors"]
    days = data["summary"]["days_active"]
    fig.text(0.14, 0.88, f"{total_claims:,} claims", fontsize=22,
             color=ACCENT, fontweight="bold", ha="left")
    fig.text(0.14, 0.82, f"{total_contribs} contributors · {days} days",
             fontsize=13, color=TEXT, ha="left", alpha=0.7)

    # X-axis
    ax1.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))
    ax1.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    ax1.tick_params(axis="x", colors=SUBTLE, labelsize=9, rotation=0)

    # Remove spines
    for ax in [ax1, ax2]:
        for spine in ax.spines.values():
            spine.set_visible(False)

    # Subtle grid on claims axis only
    ax1.grid(axis="y", color=SUBTLE, alpha=0.2, linewidth=0.5)
    ax1.set_axisbelow(True)

    # Branding
    fig.text(0.98, 0.02, "livingip.xyz", fontsize=9, color=SUBTLE,
             ha="right", style="italic")

    # Reserve the top ~22% of the figure for the hero-stats text.
    plt.tight_layout(rect=[0, 0.03, 1, 0.78])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, facecolor=BG, bbox_inches="tight", pad_inches=0.3)
    plt.close(fig)
    print(f"Saved to {output_path} ({output_path.stat().st_size:,} bytes)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Optional positional CLI argument overrides the default output path.
    out = Path(sys.argv[1]) if len(sys.argv) > 1 else OUTPUT
    data = get_data()
    render(data, out)
|
||||||
223
scripts/cumulative-growth.py
Normal file
223
scripts/cumulative-growth.py
Normal file
|
|
@ -0,0 +1,223 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate cumulative growth time-series data for public dashboard.
|
||||||
|
|
||||||
|
Produces JSON with three series:
|
||||||
|
- cumulative_contributors: unique git authors over time
|
||||||
|
- cumulative_claims: domain claim files added over time
|
||||||
|
- github_stars: star count snapshots (requires GitHub API)
|
||||||
|
|
||||||
|
Data sources: git log (codex repo), GitHub API.
|
||||||
|
Output: JSON to stdout or file, suitable for Chart.js line charts.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 cumulative-growth.py --codex-path /path/to/teleo-codex [--output /path/to/output.json]
|
||||||
|
python3 cumulative-growth.py --codex-path /path/to/teleo-codex --format csv
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
# Map bot/service accounts to their human principal or exclude them.
# "Teleo Agents" and "Teleo Pipeline" are bot accounts — attribute to system.
# A value of None means the author is dropped from contributor statistics.
CONTRIBUTOR_ALIASES = {
    "Teleo Agents": None,    # system automation, not a contributor
    "Teleo Pipeline": None,  # pipeline bot
}

# Founding contributors get a badge — anyone who contributed before this date.
# Compared lexicographically against ISO "YYYY-MM-DD" strings, which matches
# chronological order for this format.
FOUNDING_CUTOFF = "2026-03-15"
|
||||||
|
|
||||||
|
|
||||||
|
def git_log_contributors(codex_path: str) -> list[dict]:
    """Extract per-commit author and date from git log.

    Authors mapped to None in CONTRIBUTOR_ALIASES (bot accounts) are
    dropped. Exits the process with status 1 if git fails.
    """
    proc = subprocess.run(
        ["git", "log", "--format=%ad|%an", "--date=format:%Y-%m-%d", "--all"],
        capture_output=True, text=True, cwd=codex_path,
    )
    if proc.returncode != 0:
        print(f"git log failed: {proc.stderr}", file=sys.stderr)
        sys.exit(1)

    commits = []
    for raw in proc.stdout.strip().split("\n"):
        if "|" not in raw:
            continue
        day, author = raw.split("|", 1)
        canonical = CONTRIBUTOR_ALIASES.get(author, author)
        if canonical is not None:
            commits.append({"date": day, "author": canonical})
    return commits
|
||||||
|
|
||||||
|
|
||||||
|
def git_log_claims(codex_path: str) -> list[dict]:
    """Count claim-file additions (domains/*.md) per day from git log.

    Returns [{"date": ..., "count": ...}] sorted by date. Exits the
    process with status 1 if git fails.
    """
    proc = subprocess.run(
        ["git", "log", "--format=%ad", "--date=format:%Y-%m-%d",
         "--all", "--diff-filter=A", "--", "domains/*.md"],
        capture_output=True, text=True, cwd=codex_path,
    )
    if proc.returncode != 0:
        print(f"git log failed: {proc.stderr}", file=sys.stderr)
        sys.exit(1)

    per_day = defaultdict(int)
    for raw in proc.stdout.strip().split("\n"):
        day = raw.strip()
        if day:
            per_day[day] += 1
    return [{"date": day, "count": n} for day, n in sorted(per_day.items())]
|
||||||
|
|
||||||
|
|
||||||
|
def github_stars(repo: str = "living-ip/teleo-codex") -> int | None:
    """Fetch the current star count via the `gh` CLI.

    Returns None on any failure: `gh` missing from PATH, timeout,
    non-zero exit, or unparseable output.
    """
    try:
        result = subprocess.run(
            ["gh", "api", f"repos/{repo}", "--jq", ".stargazers_count"],
            capture_output=True, text=True, timeout=10
        )
        if result.returncode == 0:
            return int(result.stdout.strip())
    # FileNotFoundError added: previously a machine without `gh` installed
    # crashed the whole report instead of degrading to stars=None.
    except (subprocess.TimeoutExpired, ValueError, FileNotFoundError):
        pass
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_cumulative_contributors(entries: list[dict]) -> list[dict]:
    """Build cumulative unique contributor count keyed by first-commit date.

    Each timeline entry lists the newcomers for that date and flags them as
    founding when the date is on or before FOUNDING_CUTOFF.
    """
    # Earliest commit date per author.
    debut: dict[str, str] = {}
    for entry in entries:
        name, day = entry["author"], entry["date"]
        if name not in debut or day < debut[name]:
            debut[name] = day

    newcomers_by_day: dict[str, list[str]] = defaultdict(list)
    for name, day in debut.items():
        newcomers_by_day[day].append(name)

    timeline = []
    running: set[str] = set()
    for day in sorted(newcomers_by_day):
        arrivals = newcomers_by_day[day]
        running.update(arrivals)
        founding = day <= FOUNDING_CUTOFF
        timeline.append({
            "date": day,
            "cumulative": len(running),
            "new": [
                {"name": name, "founding": founding}
                for name in sorted(arrivals)
            ],
        })
    return timeline
|
||||||
|
|
||||||
|
|
||||||
|
def build_cumulative_claims(claim_entries: list[dict]) -> list[dict]:
    """Turn per-day claim addition counts into a running cumulative timeline."""
    timeline = []
    running = 0
    for entry in claim_entries:
        added = entry["count"]
        running += added
        timeline.append({"date": entry["date"], "cumulative": running, "added": added})
    return timeline
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily_commits(entries: list[dict]) -> list[dict]:
    """Build per-day commit volume broken down by contributor."""
    per_day: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for entry in entries:
        per_day[entry["date"]][entry["author"]] += 1

    return [
        {
            "date": day,
            "total": sum(per_day[day].values()),
            "by_contributor": dict(sorted(per_day[day].items())),
        }
        for day in sorted(per_day)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_report(codex_path: str) -> dict:
    """Assemble the full growth report from git history and the GitHub API.

    Returns a dict with a summary block plus three time series
    (cumulative contributors, cumulative claims, daily activity) ready
    for JSON serialization.
    """
    # Local import: timezone-aware replacement for deprecated datetime.utcnow().
    from datetime import timezone

    entries = git_log_contributors(codex_path)
    claim_entries = git_log_claims(codex_path)
    stars = github_stars()

    contributors_timeline = build_cumulative_contributors(entries)
    claims_timeline = build_cumulative_claims(claim_entries)
    commits_timeline = build_daily_commits(entries)

    # Founding = first commit on or before the cutoff. Compute each author's
    # earliest date once, instead of rescanning every entry per author
    # (the previous version was O(authors * commits)).
    first_date: dict[str, str] = {}
    for e in entries:
        author, day = e["author"], e["date"]
        if author not in first_date or day < first_date[author]:
            first_date[author] = day
    founding = [a for a, d in first_date.items() if d <= FOUNDING_CUTOFF]

    now = datetime.now(timezone.utc)
    return {
        "generated_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "summary": {
            "total_contributors": len(first_date),
            "founding_contributors": sorted(founding),
            "total_claims": claims_timeline[-1]["cumulative"] if claims_timeline else 0,
            "github_stars": stars,
            "codex_start_date": "2026-03-05",
            "days_active": (now - datetime(2026, 3, 5, tzinfo=timezone.utc)).days,
        },
        "cumulative_contributors": contributors_timeline,
        "cumulative_claims": claims_timeline,
        "daily_activity": commits_timeline,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def format_csv(report: dict) -> str:
    """Render both cumulative series as one merged CSV with carry-forward fill.

    Dates present in only one series repeat the other series' last value.
    """
    contrib_by_date = {row["date"]: row["cumulative"] for row in report["cumulative_contributors"]}
    claims_by_date = {row["date"]: row["cumulative"] for row in report["cumulative_claims"]}

    rows = ["date,cumulative_contributors,cumulative_claims"]
    contrib = 0
    claims = 0
    for day in sorted(contrib_by_date.keys() | claims_by_date.keys()):
        contrib = contrib_by_date.get(day, contrib)
        claims = claims_by_date.get(day, claims)
        rows.append(f"{day},{contrib},{claims}")
    return "\n".join(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: build the growth report and emit it as JSON or CSV."""
    parser = argparse.ArgumentParser(description="Generate cumulative growth data")
    parser.add_argument("--codex-path", required=True, help="Path to teleo-codex repo")
    parser.add_argument("--output", help="Output file path (default: stdout)")
    parser.add_argument("--format", choices=["json", "csv"], default="json")
    args = parser.parse_args()

    report = generate_report(args.codex_path)
    rendered = format_csv(report) if args.format == "csv" else json.dumps(report, indent=2)

    if not args.output:
        print(rendered)
        return
    with open(args.output, "w") as f:
        f.write(rendered)
    print(f"Written to {args.output}", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||||
561
scripts/scoring_digest.py
Normal file
561
scripts/scoring_digest.py
Normal file
|
|
@ -0,0 +1,561 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Daily scoring digest — classify, score, and broadcast KB contributions.
|
||||||
|
|
||||||
|
Runs daily at 8:07 AM London via cron.
|
||||||
|
Queries pipeline.db for merged PRs in last 24h, classifies each as
|
||||||
|
CREATE/ENRICH/CHALLENGE, scores with importance multiplier and connectivity
|
||||||
|
bonus, updates contributors table, posts summary to Telegram.
|
||||||
|
|
||||||
|
Spec: Pentagon/sprints/contribution-scoring-algorithm.md
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from zoneinfo import ZoneInfo
|
||||||
|
|
||||||
|
# Root logger for the digest run; cron captures stdout/stderr.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
log = logging.getLogger("scoring_digest")

# --- Configuration ---
# All paths hang off PIPELINE_BASE so the script is relocatable for testing.
BASE_DIR = Path(os.environ.get("PIPELINE_BASE", "/opt/teleo-eval"))
DB_PATH = BASE_DIR / "pipeline" / "pipeline.db"      # sqlite pipeline state
CODEX_DIR = BASE_DIR / "workspaces" / "main"         # checked-out KB repo
TELEGRAM_TOKEN_FILE = BASE_DIR / "secrets" / "telegram-bot-token"
TELEGRAM_CHAT_ID = 2091295364                        # destination chat for the digest
DIGEST_JSON_PATH = BASE_DIR / "logs" / "scoring-digest-latest.json"
LONDON_TZ = ZoneInfo("Europe/London")

# --- Action weights (Leo spec Apr 20) ---
ACTION_WEIGHTS = {
    "challenge": 0.40,
    "create": 0.35,
    "enrich": 0.25,
}

# --- Confidence → base importance mapping ---
# Unknown confidence values fall back to 1.0 (see score_contribution).
CONFIDENCE_BASE = {
    "proven": 2.0,
    "likely": 1.5,
    "experimental": 1.0,
    "speculative": 1.0,
    "possible": 1.0,
    "plausible": 1.0,
    "medium": 1.5,
}

# Module-level caches populated once per run by _init_domain_counts and
# _init_link_index before any scoring happens.
DOMAIN_CLAIM_COUNTS: dict[str, int] = {}
ENTITY_SLUGS: set[str] = set()
CLAIM_SLUGS: set[str] = set()
MAP_FILES: set[str] = set()
|
||||||
|
|
||||||
|
|
||||||
|
def _slugify(title: str) -> str:
    """Lower-case *title* and collapse whitespace/underscores into hyphens."""
    text = title.lower().strip()
    text = re.sub(r"[^\w\s-]", "", text)  # drop punctuation
    text = re.sub(r"[\s_]+", "-", text)   # whitespace/underscores -> hyphen
    return text.strip("-")
|
||||||
|
|
||||||
|
|
||||||
|
def _init_link_index():
    """Build slug indexes used by _resolve_link for wiki-link resolution.

    Populates the module-level ENTITY_SLUGS, CLAIM_SLUGS and MAP_FILES sets
    from the checked-out codex working tree.
    """
    global ENTITY_SLUGS, CLAIM_SLUGS, MAP_FILES

    entities_dir = CODEX_DIR / "entities"
    if entities_dir.exists():
        for f in entities_dir.glob("*.md"):
            ENTITY_SLUGS.add(f.stem.lower())

    # Guard against a missing domains/ dir — the original iterdir() call
    # raised FileNotFoundError on a fresh or partial checkout.
    domains_dir = CODEX_DIR / "domains"
    if domains_dir.exists():
        for domain_dir in domains_dir.iterdir():
            if not domain_dir.is_dir():
                continue
            for f in domain_dir.glob("*.md"):
                CLAIM_SLUGS.add(f.stem.lower())
            if (domain_dir / "_map.md").exists():
                MAP_FILES.add("_map")
                MAP_FILES.add(f"domains/{domain_dir.name}/_map")

    # Claims can also live in these top-level sections; the original repeated
    # the same guarded loop three times.
    for section in ("foundations", "core", "decisions"):
        section_dir = CODEX_DIR / section
        if section_dir.exists():
            for f in section_dir.glob("*.md"):
                CLAIM_SLUGS.add(f.stem.lower())
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_link(link_text: str) -> bool:
    """Return True when a [[wiki-link]] target exists as an entity, claim, or map."""
    slug = _slugify(link_text)
    if slug in ENTITY_SLUGS or slug in CLAIM_SLUGS or slug in MAP_FILES:
        return True
    # Map links may be written as full paths ("domains/x/_map") — check the
    # raw lowered text too.
    return link_text.lower() in MAP_FILES
|
||||||
|
|
||||||
|
|
||||||
|
def _count_resolved_wiki_links(file_path: Path) -> int:
    """Count [[wiki-links]] in *file_path* that resolve to known targets.

    Missing or unreadable files count as zero links.
    """
    if not file_path.exists():
        return 0
    try:
        text = file_path.read_text(encoding="utf-8")
    except Exception:
        # best-effort: unreadable file contributes no connectivity
        return 0

    targets = re.findall(r"\[\[([^\]]+)\]\]", text)
    return sum(map(_resolve_link, targets))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_confidence(file_path: Path) -> str:
    """Read the `confidence:` frontmatter field; default to "experimental".

    The default also covers missing/unreadable files and absent fields.
    """
    if not file_path.exists():
        return "experimental"
    try:
        text = file_path.read_text(encoding="utf-8")
    except Exception:
        return "experimental"

    match = re.search(r"^confidence:\s*(\S+)", text, re.MULTILINE)
    if match is None:
        return "experimental"
    return match.group(1).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _has_cross_domain_ref(file_path: Path) -> bool:
    """True when the claim declares a non-empty secondary_domains list or a
    depends_on field in its frontmatter."""
    if not file_path.exists():
        return False
    try:
        text = file_path.read_text(encoding="utf-8")
    except Exception:
        return False

    patterns = (r"^secondary_domains:\s*\[.+\]", r"^depends_on:")
    return any(re.search(p, text, re.MULTILINE) for p in patterns)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_challenged_by(file_path: Path) -> bool:
    """True when the claim carries a `challenged_by:` frontmatter field."""
    if not file_path.exists():
        return False
    try:
        content = file_path.read_text(encoding="utf-8")
    except Exception:
        return False
    return re.search(r"^challenged_by:", content, re.MULTILINE) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_domain_weight(domain: str) -> float:
    """Domain maturity weight: sparse domains get a bonus, mature a discount."""
    count = DOMAIN_CLAIM_COUNTS.get(domain, 0)
    if count < 20:
        return 1.5  # sparse — encourage filling the gap
    if count > 50:
        return 0.8  # mature — diminishing returns
    return 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def _init_domain_counts():
    """Populate DOMAIN_CLAIM_COUNTS with claims per domain (excluding _map.md)."""
    global DOMAIN_CLAIM_COUNTS
    domains_dir = CODEX_DIR / "domains"
    if not domains_dir.exists():
        return
    for domain_dir in domains_dir.iterdir():
        if not domain_dir.is_dir():
            continue
        claims = [f for f in domain_dir.glob("*.md") if f.name != "_map.md"]
        DOMAIN_CLAIM_COUNTS[domain_dir.name] = len(claims)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_contributor(submitted_by: str | None, agent: str | None, branch: str | None = None) -> str:
    """Normalize a contributor handle.

    Fork PRs (branch "contrib/NAME/...") attribute to NAME (lower-cased).
    Otherwise the submitted_by handle is cleaned ("@" prefix and
    " (self-directed)" suffix stripped); pipeline/empty handles fall back
    to the agent name, and finally to "pipeline".
    """
    if branch and branch.startswith("contrib/"):
        segments = branch.split("/")
        if len(segments) >= 2 and segments[1]:
            return segments[1].lower()

    handle = (submitted_by or agent or "unknown").strip()
    handle = handle.removeprefix("@")
    handle = handle.replace(" (self-directed)", "")
    if handle in ("pipeline", ""):
        if agent and agent.strip() not in ("pipeline", ""):
            return agent.strip()
        return "pipeline"
    return handle
|
||||||
|
|
||||||
|
|
||||||
|
def classify_pr(pr: dict) -> str | None:
|
||||||
|
"""Classify a merged PR as create/enrich/challenge or None (skip).
|
||||||
|
|
||||||
|
Uses branch name pattern + commit_type as primary signal.
|
||||||
|
Falls back to file-level analysis for ambiguous cases.
|
||||||
|
"""
|
||||||
|
branch = pr.get("branch", "")
|
||||||
|
commit_type = pr.get("commit_type", "")
|
||||||
|
|
||||||
|
if commit_type in ("pipeline", "entity"):
|
||||||
|
return None
|
||||||
|
|
||||||
|
if "challenge" in branch.lower():
|
||||||
|
return "challenge"
|
||||||
|
|
||||||
|
if branch.startswith("extract/") or branch.startswith("research-"):
|
||||||
|
return "create"
|
||||||
|
|
||||||
|
if "reweave" in branch.lower() or "enrich" in branch.lower():
|
||||||
|
return "enrich"
|
||||||
|
|
||||||
|
if commit_type == "research":
|
||||||
|
return "create"
|
||||||
|
|
||||||
|
if commit_type == "reweave":
|
||||||
|
return "enrich"
|
||||||
|
|
||||||
|
if commit_type == "fix":
|
||||||
|
return "enrich"
|
||||||
|
|
||||||
|
if commit_type == "knowledge":
|
||||||
|
return "create"
|
||||||
|
|
||||||
|
return "create"
|
||||||
|
|
||||||
|
|
||||||
|
def _find_claim_file(pr: dict) -> Path | None:
    """Locate the claim file a merged PR touched, by fuzzy slug match.

    The branch's last path segment (minus a trailing 4-hex-digit suffix)
    is matched as a substring of each claim filename's slug. Returns None
    when the domain is unknown or no file matches.
    """
    domain = pr.get("domain")
    if not domain:
        return None

    domain_dir = CODEX_DIR / "domains" / domain
    if not domain_dir.exists():
        return None

    branch = pr.get("branch", "")
    tail = branch.rsplit("/", 1)[-1]
    tail = re.sub(r"-[a-f0-9]{4}$", "", tail)

    for candidate in domain_dir.glob("*.md"):
        if candidate.name == "_map.md":
            continue
        if tail and tail in _slugify(candidate.stem):
            return candidate
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def score_contribution(action_type: str, claim_file: Path | None, domain: str) -> tuple[float, dict]:
    """Compute CI points for a single contribution.

    score = action_weight * (confidence_base * domain_weight + connectivity)
            * create_multiplier

    Returns (score, breakdown) where the breakdown dict exposes every
    factor for transparency in the digest.
    """
    weight = ACTION_WEIGHTS[action_type]

    confidence = _get_confidence(claim_file) if claim_file else "experimental"
    base = CONFIDENCE_BASE.get(confidence, 1.0)

    # A challenge recorded on the claim (challenged_by) against settled
    # knowledge is worth the most.
    if action_type == "challenge" and claim_file and _has_challenged_by(claim_file):
        base = 3.0 if confidence == "proven" else 2.5

    domain_weight = _get_domain_weight(domain)

    connectivity = 0.2 if claim_file and _has_cross_domain_ref(claim_file) else 0.0

    # Well-linked new claims (>= 3 resolved wiki-links) earn a 1.5x boost.
    resolved_links = 0
    create_multiplier = 1.0
    if action_type == "create" and claim_file:
        resolved_links = _count_resolved_wiki_links(claim_file)
        if resolved_links >= 3:
            create_multiplier = 1.5

    importance = base * domain_weight + connectivity
    score = weight * importance * create_multiplier

    breakdown = {
        "action": action_type,
        "weight": weight,
        "confidence": confidence,
        "base": base,
        "domain_weight": domain_weight,
        "connectivity_bonus": connectivity,
        "create_multiplier": create_multiplier,
        "resolved_links": resolved_links,
        "importance": importance,
        "score": round(score, 4),
    }
    return score, breakdown
|
||||||
|
|
||||||
|
|
||||||
|
def collect_and_score(hours: int = 24) -> dict:
    """Main scoring pipeline: collect merged PRs, classify, score.

    Queries pipeline.db for PRs merged within the last *hours*, classifies
    each via classify_pr, scores it via score_contribution, and returns the
    full digest dict (contributions, per-contributor deltas, domain
    activity, action counts, KB state snapshot).
    """
    # Build the domain-maturity and wiki-link caches once per run.
    _init_domain_counts()
    _init_link_index()

    # ISO-8601 UTC cutoff; compared lexically against merged_at in SQL.
    cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()

    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            """SELECT number, branch, domain, agent, commit_type, merged_at,
                      submitted_by, description
               FROM prs
               WHERE status = 'merged' AND merged_at >= ?
               ORDER BY merged_at DESC""",
            (cutoff,),
        ).fetchall()
    finally:
        conn.close()

    contributions = []
    contributor_deltas: dict[str, float] = {}
    domain_activity: dict[str, int] = {}
    action_counts = {"create": 0, "enrich": 0, "challenge": 0}

    for row in rows:
        pr = dict(row)
        action_type = classify_pr(pr)
        if action_type is None:
            # pipeline/entity housekeeping PRs are not scored
            continue

        claim_file = _find_claim_file(pr)
        domain = pr.get("domain", "unknown")
        score, breakdown = score_contribution(action_type, claim_file, domain)

        contributor = _normalize_contributor(
            pr.get("submitted_by"), pr.get("agent"), pr.get("branch")
        )
        contributor_deltas[contributor] = contributor_deltas.get(contributor, 0) + score
        domain_activity[domain] = domain_activity.get(domain, 0) + 1
        action_counts[action_type] = action_counts.get(action_type, 0) + 1

        contributions.append({
            "pr_number": pr["number"],
            "contributor": contributor,
            "agent": pr.get("agent", ""),
            "domain": domain,
            "action": action_type,
            "score": round(score, 4),
            "breakdown": breakdown,
            "description": pr.get("description", ""),
            "merged_at": pr.get("merged_at", ""),
        })

    total_claims = sum(DOMAIN_CLAIM_COUNTS.values())

    return {
        "period_hours": hours,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        # Human-readable date in London time for the Telegram header.
        "date": datetime.now(LONDON_TZ).strftime("%B %d, %Y"),
        "contributions": contributions,
        # Deltas sorted descending by CI earned.
        "contributor_deltas": {k: round(v, 4) for k, v in sorted(
            contributor_deltas.items(), key=lambda x: -x[1]
        )},
        "domain_activity": dict(sorted(domain_activity.items(), key=lambda x: -x[1])),
        "action_counts": action_counts,
        "total_contributions": len(contributions),
        "total_ci_awarded": round(sum(c["score"] for c in contributions), 4),
        "kb_state": {
            "total_claims": total_claims,
            "domains": len(DOMAIN_CLAIM_COUNTS),
            "domain_breakdown": dict(DOMAIN_CLAIM_COUNTS),
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def update_contributors(digest: dict):
    """Upsert a contributors row for everyone who earned CI this period.

    NOTE(review): despite the name of contributor_deltas, only the handle
    and updated_at timestamp are written here — the numeric CI values are
    persisted by save_scores_to_db. Confirm whether a running-total column
    update was intended.
    """
    deltas = digest["contributor_deltas"]
    if not deltas:
        return

    conn = sqlite3.connect(str(DB_PATH))
    try:
        for handle in deltas:
            conn.execute(
                """INSERT INTO contributors (handle, claims_merged, created_at, updated_at)
                   VALUES (?, 0, datetime('now'), datetime('now'))
                   ON CONFLICT(handle) DO UPDATE SET updated_at = datetime('now')""",
                (handle,),
            )
        conn.commit()
    finally:
        conn.close()

    log.info("Updated %d contributor records", len(deltas))
|
||||||
|
|
||||||
|
|
||||||
|
def save_scores_to_db(digest: dict):
    """Write individual contribution scores to the contribution_scores table.

    Upserts by pr_number so re-running the digest for an overlapping window
    never double-counts a PR.
    """
    conn = sqlite3.connect(str(DB_PATH))
    try:
        # Idempotent schema bootstrap — safe on every invocation.
        conn.execute("""CREATE TABLE IF NOT EXISTS contribution_scores (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pr_number INTEGER UNIQUE,
            contributor TEXT NOT NULL,
            event_type TEXT CHECK(event_type IN ('create','enrich','challenge')),
            ci_earned REAL,
            claim_slug TEXT,
            domain TEXT,
            scored_at TEXT NOT NULL
        )""")
        for c in digest["contributions"]:
            # Truncated description stands in for the claim slug; falls back
            # to the action name when the description is empty.
            slug = (c.get("description") or "")[:200] or c.get("breakdown", {}).get("action", "")
            conn.execute(
                """INSERT INTO contribution_scores (pr_number, contributor, event_type, ci_earned, claim_slug, domain, scored_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?)
                   ON CONFLICT(pr_number) DO UPDATE SET
                       contributor = excluded.contributor,
                       ci_earned = excluded.ci_earned,
                       event_type = excluded.event_type,
                       scored_at = excluded.scored_at""",
                (c["pr_number"], c["contributor"], c["action"], c["score"], slug, c["domain"], c["merged_at"]),
            )
        conn.commit()
        log.info("Wrote %d contribution scores to DB", len(digest["contributions"]))
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def save_digest_json(digest: dict):
    """Persist the latest digest as JSON for the dashboard API to serve."""
    DIGEST_JSON_PATH.parent.mkdir(parents=True, exist_ok=True)
    DIGEST_JSON_PATH.write_text(json.dumps(digest, indent=2, default=str))
    log.info("Saved digest to %s", DIGEST_JSON_PATH)
|
||||||
|
|
||||||
|
|
||||||
|
def send_telegram(digest: dict):
    """Post the digest summary to Telegram.

    Silently returns when no bot token file is present; any send failure is
    logged rather than raised (best-effort broadcast).
    """
    token_file = TELEGRAM_TOKEN_FILE
    if not token_file.exists():
        log.warning("Telegram token not found at %s", token_file)
        return

    token = token_file.read_text().strip()

    # Build the message as a list of Markdown lines, joined at the end.
    lines = [f"📊 *Daily KB Digest — {digest['date']}*", ""]

    if digest["contributions"]:
        lines.append(f"*NEW CONTRIBUTIONS* (last {digest['period_hours']}h):")
        action_emoji = {"challenge": "⚔️", "create": "🆕", "enrich": "📚"}

        # Group contributions per contributor for a compact per-person line.
        by_contributor: dict[str, list] = {}
        for c in digest["contributions"]:
            name = c["contributor"]
            by_contributor.setdefault(name, []).append(c)

        # Contributors sorted by total CI earned, descending.
        for name, contribs in sorted(by_contributor.items(), key=lambda x: -sum(c["score"] for c in x[1])):
            total_score = sum(c["score"] for c in contribs)
            actions = {}
            for c in contribs:
                actions[c["action"]] = actions.get(c["action"], 0) + 1

            action_summary = ", ".join(
                f"{action_emoji.get(a, '•')} {n} {a}" for a, n in sorted(actions.items(), key=lambda x: -x[1])
            )
            lines.append(f" {name}: {action_summary} → +{total_score:.2f} CI")

        lines.append("")

    lines.append("*KB STATE:*")
    kb = digest["kb_state"]
    ac = digest["action_counts"]
    lines.append(
        f"Claims: {kb['total_claims']} (+{digest['total_contributions']}) | "
        f"Domains: {kb['domains']}"
    )
    lines.append(
        f"Creates: {ac.get('create', 0)} | "
        f"Enrichments: {ac.get('enrich', 0)} | "
        f"Challenges: {ac.get('challenge', 0)}"
    )

    if digest["domain_activity"]:
        top_domain = max(digest["domain_activity"], key=digest["domain_activity"].get)
        lines.append(f"Most active: {top_domain} ({digest['domain_activity'][top_domain]} events)")

    if digest["contributor_deltas"]:
        lines.append("")
        lines.append("*LEADERBOARD CHANGE:*")
        # Deltas are pre-sorted descending by collect_and_score; show top 5.
        for i, (name, delta) in enumerate(digest["contributor_deltas"].items(), 1):
            if i > 5:
                break
            lines.append(f" #{i} {name} +{delta:.2f} CI")

    text = "\n".join(lines)

    url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = json.dumps({
        "chat_id": TELEGRAM_CHAT_ID,
        "text": text,
        "parse_mode": "Markdown",
    }).encode("utf-8")

    req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read())
            if result.get("ok"):
                log.info("Telegram digest sent successfully")
            else:
                log.error("Telegram API error: %s", result)
    except Exception as e:
        # Best-effort: a failed broadcast must not fail the digest run.
        log.error("Failed to send Telegram message: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: score recent contributions and publish the digest.

    Usage: ``script.py [HOURS] [--dry-run] [--no-telegram]``

    * ``HOURS`` — lookback window for contributions (default 24).
    * ``--dry-run`` — print the digest JSON and exit without persisting
      or notifying.
    * ``--no-telegram`` — persist results but skip the Telegram message.
    """
    # Flags may precede the positional HOURS argument; taking sys.argv[1]
    # blindly would raise ValueError on e.g. `script.py --dry-run`.
    positional = [a for a in sys.argv[1:] if not a.startswith("--")]
    hours = int(positional[0]) if positional else 24
    dry_run = "--dry-run" in sys.argv
    no_telegram = "--no-telegram" in sys.argv

    log.info("Running scoring digest for last %dh (dry_run=%s)", hours, dry_run)

    digest = collect_and_score(hours)

    log.info(
        "Scored %d contributions: %d create, %d enrich, %d challenge → %.2f total CI",
        digest["total_contributions"],
        digest["action_counts"]["create"],
        digest["action_counts"]["enrich"],
        digest["action_counts"]["challenge"],
        digest["total_ci_awarded"],
    )

    for name, delta in digest["contributor_deltas"].items():
        log.info("  %s: +%.4f CI", name, delta)

    if dry_run:
        # default=str keeps non-JSON-native values (datetimes etc.) printable.
        print(json.dumps(digest, indent=2, default=str))
        return

    save_digest_json(digest)
    save_scores_to_db(digest)
    update_contributors(digest)

    if not no_telegram:
        send_telegram(digest)

    log.info("Digest complete")
|
|
||||||
|
# Script entry point — keeps import side-effect free.
if __name__ == "__main__":
    main()
|
||||||
|
|
@ -34,13 +34,34 @@ class TestParseAttribution:
|
||||||
assert result["extractor"][0]["handle"] == "rio"
|
assert result["extractor"][0]["handle"] == "rio"
|
||||||
assert result["sourcer"][0]["handle"] == "theiaresearch"
|
assert result["sourcer"][0]["handle"] == "theiaresearch"
|
||||||
|
|
||||||
def test_legacy_source_fallback_removed(self):
    """The legacy `source` heuristic is gone (Ganymede review, Apr 24).

    It used to fabricate handles out of free-form description strings
    (garbage like 'sec-interpretive-release-s7-2026-09-(march-17').
    A claim lacking explicit attribution now yields no contributors at
    all — surfacing a data-hygiene gap beats inventing people.
    """
    frontmatter = {
        "type": "claim",
        "source": "@pineanalytics, Q4 2025 report",
    }
    parsed = parse_attribution(frontmatter)
    assert all(not contributors for contributors in parsed.values())
|
||||||
|
|
||||||
|
def test_bad_handles_filtered(self):
    """Malformed handles (spaces, parens, junk characters) must be dropped."""
    frontmatter = {"sourcer": "governance---meritocratic-voting-+-futarchy"}
    parsed = parse_attribution(frontmatter)
    assert not parsed["sourcer"]
|
||||||
|
|
||||||
|
def test_valid_handle_with_hyphen_passes(self):
    """Well-formed hyphenated handles such as 'cameron-s1' survive filtering."""
    parsed = parse_attribution({"sourcer": "cameron-s1"})
    assert parsed["sourcer"][0]["handle"] == "cameron-s1"
|
||||||
|
|
||||||
def test_empty_attribution(self):
|
def test_empty_attribution(self):
|
||||||
fm = {"type": "claim"}
|
fm = {"type": "claim"}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue