teleo-infrastructure/scripts/classify-contributors.py

#!/usr/bin/env python3
"""Classify `contributors` rows into {keep_person, keep_agent, move_to_publisher, delete_garbage}.

Reads current contributors table, proposes reclassification per v26 schema design:
  - Real humans + Pentagon agents stay in contributors (kind='person'|'agent')
  - News orgs, publications, venues move to publishers table (new v26)
  - Multi-word hyphenated garbage (parsing artifacts) gets deleted
  - Their contribution_events are handled per category:
      * Publishers: DELETE events (orgs shouldn't have credit)
      * Garbage: DELETE events (bogus data)
      * Persons/agents: keep events untouched

Classification is heuristic — uses explicit allowlists + regex patterns + length gates.
Ambiguous cases default to 'review_needed' (human decision).

Usage:
  python3 scripts/classify-contributors.py              # dry-run analysis + report
  python3 scripts/classify-contributors.py --apply      # write changes
  python3 scripts/classify-contributors.py --show <handle>  # inspect a single row

Writes to pipeline.db only. Does NOT modify claim files.
"""
import argparse
import json
import os
import re
import sqlite3
import sys
from collections import Counter
from pathlib import Path

DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Pentagon agents: kind='agent'. Authoritative list.
PENTAGON_AGENTS = frozenset({
    "rio", "leo", "theseus", "vida", "clay", "astra",
    "oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship",
    "pipeline",
})

# Publisher/news-org handles seen in current contributors table.
# Grouped by kind for the publishers row. Classified by inspection.
# NOTE: This list is hand-curated — add to it as new orgs appear.
PUBLISHERS_NEWS = {
    # News outlets / brands
    "cnbc", "al-jazeera", "axios", "bloomberg", "reuters", "bettorsinsider",
    "fortune", "techcrunch", "coindesk", "coindesk-staff", "coindesk-research",
    "coindesk research", "coindesk staff",
    "defense-one", "thedefensepost", "theregister", "the-intercept",
    "the-meridiem", "variety", "variety-staff", "variety staff", "spacenews",
    "nasaspaceflight", "thedonkey", "insidedefense", "techpolicypress",
    "morganlewis", "casinoorg", "deadline", "animationmagazine",
    "defensepost", "casino-org", "casino.org",
    "air & space forces magazine", "ieee spectrum", "techcrunch-staff",
    "blockworks", "blockworks-staff", "decrypt", "ainvest", "banking-dive", "banking dive",
    "cset-georgetown", "cset georgetown",
    "kff", "kff-health-news", "kff health news", "kff-health-news---cbo",
    "kff-health-news-/-cbo", "kff health news / cbo", "kffhealthnews",
    "bloomberg-law",
    "norton-rose-fulbright", "norton rose fulbright",
    "defence-post", "the-defensepost",
    "wilmerhale", "mofo", "sciencedirect",
    "yogonet", "csr", "aisi-uk", "aisi", "aisi_gov", "rand",
    "armscontrol", "eclinmed", "solana-compass", "solana compass",
    "pmc11919318", "pmc11780016",
    "healthverity", "natrium", "form-energy",
    "courtlistener", "curtis-schiff", "curtis-schiff-prediction-markets",
    "prophetx", "techpolicypress-staff",
    "npr", "venturebeat", "geekwire", "payloadspace", "the-ankler",
    "theankler", "tubefilter", "emarketer", "dagster",
    "numerai",  # fund/project brand, not person
    "psl", "multistate",
}
PUBLISHERS_ACADEMIC = {
    # Academic orgs, labs, papers, journals, institutions
    "arxiv", "metr", "metr_evals", "apollo-research", "apollo research", "apolloresearch",
    "jacc-study-authors", "jacc-data-report-authors",
    "anthropic-fellows-program", "anthropic-fellows",
    "anthropic-fellows-/-alignment-science-team", "anthropic-research",
    "jmir-2024", "jmir 2024",
    "oettl-et-al.,-journal-of-experimental-orthopaedics",
    "oettl et al., journal of experimental orthopaedics",
    "jacc", "nct06548490", "pmc",
    "conitzer-et-al.-(2024)", "aquino-michaels-2026", "pan-et-al.",
    "pan-et-al.-'natural-language-agent-harnesses'",
    "stanford", "stanford-meta-harness",
    "hendershot", "annals-im",
    "nellie-liang,-brookings-institution", "nellie liang, brookings institution",
    "penn-state", "american-heart-association", "american heart association",
    "molt_cornelius", "molt-cornelius",
    # Companies / labs / brand-orgs (not specific humans)
    "anthropic", "anthropicai", "openai", "nasa", "icrc", "ecri",
    "epochairesearch", "metadao", "iapam", "icer",
    "who", "ama", "uspstf", "unknown",
    "futard.io",  # protocol/platform
    "oxford-martin-ai-governance-initiative",
    "oxford-martin-ai-governance",
    "u.s.-food-and-drug-administration",
    "jitse-goutbeek,-european-policy-centre",  # cited person+org string → publisher
    "adepoju-et-al.",  # paper citation
    # Formal-citation names (Firstname-Lastname or Lastname-et-al) — classified
    # as academic citations, not reachable contributors. They'd need an @ handle
    # to get CI credit per Cory's growth-loop design.
    "senator-elissa-slotkin",
    "bostrom", "hanson", "kaufmann", "noah-smith", "doug-shapiro",
    "shayon-sengupta", "shayon sengupta",
    "robin-hanson", "robin hanson", "eliezer-yudkowsky",
    "leopold-aschenbrenner", "aschenbrenner",
    "ramstead", "larsson", "heavey",
    "dan-slimmon", "van-leeuwaarden", "ward-whitt", "adams",
    "tamim-ansary", "spizzirri",
    "dario-amodei",  # formal-citation form (real @ is @darioamodei)
    "corless", "oxranga", "vlahakis",
    # Brand/project/DAO tokens — not individuals
    "areal-dao", "areal", "theiaresearch", "futard-io", "dhrumil",
    # Classic formal-citation names — famous academics/economists cited by surname.
    # Reachable via @ handle if/when they join (e.g. Ostrom has no X, Hayek deceased,
    # Friston has an institutional affiliation not an @ handle we'd track).
    "clayton-christensen", "hidalgo", "coase", "wiener", "juarrero",
    "ostrom", "centola", "hayek", "marshall-mcluhan", "blackmore",
    "knuth", "friston", "aquino-michaels", "conitzer", "bak",
}
# NOTE: pseudonymous X handles that MAY be real contributors stay in keep_person:
#   karpathy, simonw, swyx, metaproph3t, metanallok, mmdhrumil, sjdedic,
#   ceterispar1bus — these are real X accounts and match Cory's growth loop.
# They appear without @ prefix because extraction frontmatter didn't normalize.
# Auto-creating them as contributors tier='cited' is correct (A-path from earlier).
PUBLISHERS_SOCIAL = {
    "x", "twitter", "telegram", "x.com",
}
PUBLISHERS_INTERNAL = {
    "teleohumanity-manifesto", "strategy-session-journal",
    "living-capital-thesis-development", "attractor-state-historical-backtesting",
    "web-research-compilation", "architectural-investing",
    "governance---meritocratic-voting-+-futarchy",  # title artifact
    "sec-interpretive-release-s7-2026-09-(march-17",  # title artifact
    "mindstudio",  # tooling/platform, not contributor
}
# Merge into one kind→set map for classification
PUBLISHER_KIND_MAP = {}
for h in PUBLISHERS_NEWS:
    PUBLISHER_KIND_MAP[h.lower()] = "news"
for h in PUBLISHERS_ACADEMIC:
    PUBLISHER_KIND_MAP[h.lower()] = "academic"
for h in PUBLISHERS_SOCIAL:
    PUBLISHER_KIND_MAP[h.lower()] = "social_platform"
for h in PUBLISHERS_INTERNAL:
    PUBLISHER_KIND_MAP[h.lower()] = "internal"


# Garbage: handles that are clearly parse artifacts, not real names.
# Pattern: contains parens, special chars, or >50 chars.
def is_garbage(handle: str) -> bool:
    h = handle.strip()
    if len(h) > 50:
        return True
    if re.search(r"[()\[\]<>{}\/\\|@#$%^&*=?!:;\"']", h):
        # But @ can appear legitimately in handles like @thesensatore — allow if @ is only prefix
        if h.startswith("@") and not re.search(r"[()\[\]<>{}\/\\|#$%^&*=?!:;\"']", h):
            return False
        return True
    # Multi-word hyphenated with very specific artifact shape: 3+ hyphens in a row or trailing noise
    if "---" in h or "---meritocratic" in h or h.endswith("(march") or h.endswith("-(march"):
        return True
    return False


def classify(handle: str) -> tuple[str, str | None]:
    """Return (category, publisher_kind).

    category ∈ {'keep_agent', 'keep_person', 'publisher', 'garbage', 'review_needed'}
    publisher_kind ∈ {'news','academic','social_platform','internal', None}
    """
    h = handle.strip().lower().lstrip("@")

    if h in PENTAGON_AGENTS:
        return ("keep_agent", None)

    if h in PUBLISHER_KIND_MAP:
        return ("publisher", PUBLISHER_KIND_MAP[h])

    if is_garbage(handle):
        return ("garbage", None)

    # @-prefixed handles or short-slug real-looking names → keep as person
    # (Auto-create rule from Cory: @ handles auto-join as tier='cited'.)
    if handle.startswith("@"):
        return ("keep_person", None)

    # Plausible handles (<=39 chars, alphanum + underscore/hyphen): treat as person.
    # 39-char ceiling matches GitHub's handle limit and the writer path in
    # contributor.py::_HANDLE_RE, so a valid 21-39 char real handle won't fall
    # through to review_needed and block --apply.
    if re.match(r"^[a-z0-9][a-z0-9_-]{0,38}$", h):
        return ("keep_person", None)

    # Everything else: needs human review
    return ("review_needed", None)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--apply", action="store_true", help="Write changes to DB")
    parser.add_argument("--show", type=str, help="Inspect a single handle")
    parser.add_argument("--delete-events", action="store_true",
                        help="DELETE contribution_events for publishers+garbage (default: keep for audit)")
    args = parser.parse_args()

    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row

    # Sanity: publishers table must exist (v26 migration applied)
    try:
        conn.execute("SELECT 1 FROM publishers LIMIT 1")
    except sqlite3.OperationalError:
        print("ERROR: publishers table missing. Run migration v26 first.", file=sys.stderr)
        sys.exit(2)

    rows = conn.execute(
        "SELECT handle, kind, tier, claims_merged FROM contributors ORDER BY claims_merged DESC"
    ).fetchall()

    if args.show:
        target = args.show.strip().lower().lstrip("@")
        for r in rows:
            if r["handle"].lower().lstrip("@") == target:
                category, pkind = classify(r["handle"])
                events_count = conn.execute(
                    "SELECT COUNT(*) FROM contribution_events WHERE handle = ?",
                    (r["handle"].lower().lstrip("@"),),
                ).fetchone()[0]
                print(f"handle:         {r['handle']}")
                print(f"current_kind:   {r['kind']}")
                print(f"current_tier:   {r['tier']}")
                print(f"claims_merged:  {r['claims_merged']}")
                print(f"events:         {events_count}")
                print(f"→ category:     {category}")
                if pkind:
                    print(f"→ publisher:    kind={pkind}")
                return
        print(f"No match for '{args.show}'")
        return

    # Classify all
    buckets: dict[str, list[dict]] = {
        "keep_agent": [],
        "keep_person": [],
        "publisher": [],
        "garbage": [],
        "review_needed": [],
    }
    for r in rows:
        category, pkind = classify(r["handle"])
        buckets[category].append({
            "handle": r["handle"],
            "kind_now": r["kind"],
            "tier": r["tier"],
            "claims": r["claims_merged"] or 0,
            "publisher_kind": pkind,
        })

    print("=== Classification summary ===")
    for cat, items in buckets.items():
        print(f"  {cat:18s}  {len(items):5d}")

    print("\n=== Sample of each category ===")
    for cat, items in buckets.items():
        print(f"\n--- {cat} (showing up to 10) ---")
        for item in items[:10]:
            tag = f" → {item['publisher_kind']}" if item["publisher_kind"] else ""
            print(f"  {item['handle']:50s} claims={item['claims']:5d}{tag}")

    print("\n=== Full review_needed list ===")
    for item in buckets["review_needed"]:
        print(f"  {item['handle']:50s} claims={item['claims']:5d}")

    # Diagnostic: orphan alias count for handles we're about to delete.
    # Contributor_aliases has no FK (SQLite FKs require PRAGMA to enforce anyway),
    # so aliases pointing to deleted canonical handles become orphans. Surface
    # the count so the --delete-events decision is informed.
    doomed = [item["handle"].lower().lstrip("@") for item in buckets["garbage"] + buckets["publisher"]]
    if doomed:
        placeholders = ",".join("?" * len(doomed))
        orphan_count = conn.execute(
            f"SELECT COUNT(*) FROM contributor_aliases WHERE canonical IN ({placeholders})",
            doomed,
        ).fetchone()[0]
        print(f"\n=== Alias orphan check ===")
        print(f"  contributor_aliases rows pointing to deletable canonicals: {orphan_count}")
        if orphan_count:
            print(f"  (cleanup requires --delete-events; without it, aliases stay as orphans)")

    if not args.apply:
        print("\n(dry-run — no writes. Re-run with --apply to execute.)")
        return

    # ── Apply changes ──
    print("\n=== Applying changes ===")
    if buckets["review_needed"]:
        print(f"ABORT: {len(buckets['review_needed'])} rows need human review. Fix classifier before --apply.")
        sys.exit(3)

    inserted_publishers = 0
    reclassified_agents = 0
    deleted_garbage = 0
    deleted_publisher_rows = 0
    deleted_events = 0
    deleted_aliases = 0

    # Single transaction — if any step errors, roll back. This prevents the failure
    # mode where a publisher insert fails silently and we still delete the contributor
    # row, losing data.
    try:
        conn.execute("BEGIN")

        # 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
        # Counter uses cur.rowcount so replay runs (where publishers already exist)
        # report accurate inserted=0 instead of falsely claiming the full set.
        # moved_to_publisher is unconditional — the contributors row still needs to
        # be deleted even when the publishers row was added in a prior run.
        moved_to_publisher = set()
        for item in buckets["publisher"]:
            name = item["handle"].strip().lower().lstrip("@")
            cur = conn.execute(
                "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
                (name, item["publisher_kind"]),
            )
            if cur.rowcount > 0:
                inserted_publishers += 1
            moved_to_publisher.add(item["handle"])

        # 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
        for item in buckets["keep_agent"]:
            conn.execute(
                "UPDATE contributors SET kind = 'agent' WHERE handle = ?",
                (item["handle"].lower().lstrip("@"),),
            )
            reclassified_agents += 1

        # 3. Delete garbage handles from contributors (and their events + aliases)
        for item in buckets["garbage"]:
            canonical_lower = item["handle"].lower().lstrip("@")
            if args.delete_events:
                cur = conn.execute(
                    "DELETE FROM contribution_events WHERE handle = ?",
                    (canonical_lower,),
                )
                deleted_events += cur.rowcount
                cur = conn.execute(
                    "DELETE FROM contributor_aliases WHERE canonical = ?",
                    (canonical_lower,),
                )
                deleted_aliases += cur.rowcount
            cur = conn.execute(
                "DELETE FROM contributors WHERE handle = ?",
                (item["handle"],),
            )
            deleted_garbage += cur.rowcount

        # 4. Delete publisher rows from contributors — ONLY for those successfully
        # inserted into publishers above. Guards against partial failure.
        # Aliases pointing to publisher-classified handles get cleaned under the
        # same --delete-events gate: publishers live in their own table now, any
        # leftover aliases in contributor_aliases are orphans.
        for item in buckets["publisher"]:
            if item["handle"] not in moved_to_publisher:
                continue
            canonical_lower = item["handle"].lower().lstrip("@")
            if args.delete_events:
                cur = conn.execute(
                    "DELETE FROM contribution_events WHERE handle = ?",
                    (canonical_lower,),
                )
                deleted_events += cur.rowcount
                cur = conn.execute(
                    "DELETE FROM contributor_aliases WHERE canonical = ?",
                    (canonical_lower,),
                )
                deleted_aliases += cur.rowcount
            cur = conn.execute(
                "DELETE FROM contributors WHERE handle = ?",
                (item["handle"],),
            )
            deleted_publisher_rows += cur.rowcount

        # 5. Audit log entry for the destructive operation (Ganymede Q5).
        conn.execute(
            "INSERT INTO audit_log (timestamp, stage, event, detail) VALUES (datetime('now'), ?, ?, ?)",
            (
                "schema_v26",
                "classify_contributors",
                json.dumps({
                    "publishers_inserted": inserted_publishers,
                    "agents_updated": reclassified_agents,
                    "garbage_deleted": deleted_garbage,
                    "publisher_rows_deleted": deleted_publisher_rows,
                    "events_deleted": deleted_events,
                    "aliases_deleted": deleted_aliases,
                    "delete_events_flag": bool(args.delete_events),
                }),
            ),
        )

        conn.commit()
    except Exception as e:
        conn.rollback()
        print(f"ERROR: Transaction failed, rolled back. {e}", file=sys.stderr)
        sys.exit(4)

    print(f"  publishers inserted:          {inserted_publishers}")
    print(f"  agents kind='agent' ensured:  {reclassified_agents}")
    print(f"  garbage rows deleted:         {deleted_garbage}")
    print(f"  publisher rows removed from contributors: {deleted_publisher_rows}")
    if args.delete_events:
        print(f"  contribution_events deleted:  {deleted_events}")
        print(f"  contributor_aliases deleted:  {deleted_aliases}")
    else:
        print(f"  (events + aliases kept — re-run with --delete-events to clean them)")


if __name__ == "__main__":
    main()