diff --git a/lib/db.py b/lib/db.py index f28f295..af06d37 100644 --- a/lib/db.py +++ b/lib/db.py @@ -9,7 +9,7 @@ from . import config logger = logging.getLogger("pipeline.db") -SCHEMA_VERSION = 25 +SCHEMA_VERSION = 26 SCHEMA_SQL = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -35,6 +35,15 @@ CREATE TABLE IF NOT EXISTS sources ( feedback TEXT, -- eval feedback for re-extraction (JSON) cost_usd REAL DEFAULT 0, + -- v26: provenance — publisher (news org / venue) + content author. + -- publisher_id references publishers(id) when source is from a known org. + -- original_author_handle references contributors(handle) when author is in our system. + -- original_author is free-text fallback ("Kim et al.", "Robin Hanson") — not credit-bearing. + publisher_id INTEGER REFERENCES publishers(id), + content_type TEXT, + -- article | paper | tweet | conversation | self_authored | webpage | podcast + original_author TEXT, + original_author_handle TEXT REFERENCES contributors(handle), created_at TEXT DEFAULT (datetime('now')), updated_at TEXT DEFAULT (datetime('now')) ); @@ -207,6 +216,33 @@ CREATE TABLE IF NOT EXISTS contributor_aliases ( created_at TEXT DEFAULT (datetime('now')) ); CREATE INDEX IF NOT EXISTS idx_aliases_canonical ON contributor_aliases(canonical); + +-- Publishers: news orgs, academic venues, social platforms. NOT contributors — these +-- provide metadata/provenance for sources, never earn leaderboard credit. Separating +-- these from contributors prevents CNBC/SpaceNews from dominating the leaderboard. +-- (Apr 24 Cory directive: "only credit the original source if its on X or tg") +CREATE TABLE IF NOT EXISTS publishers ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL UNIQUE, + kind TEXT CHECK(kind IN ('news', 'academic', 'social_platform', 'podcast', 'self', 'internal', 'legal', 'government', 'research_org', 'commercial', 'other')), + url_pattern TEXT, + created_at TEXT DEFAULT (datetime('now')) +); +CREATE INDEX IF NOT EXISTS idx_publishers_name ON publishers(name); +CREATE INDEX IF NOT EXISTS idx_publishers_kind ON publishers(kind); + +-- Multi-platform identity: one contributor, many handles. Enables the leaderboard to +-- unify @thesensatore (X) + thesensatore (TG) + thesensatore@github into one person. +-- Writers check this table after resolving aliases to find canonical contributor handle. +CREATE TABLE IF NOT EXISTS contributor_identities ( + contributor_handle TEXT NOT NULL, + platform TEXT NOT NULL CHECK(platform IN ('x', 'telegram', 'github', 'email', 'web', 'internal')), + platform_handle TEXT NOT NULL, + verified INTEGER DEFAULT 0, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY (platform, platform_handle) +); +CREATE INDEX IF NOT EXISTS idx_identities_contributor ON contributor_identities(contributor_handle); """ @@ -764,6 +800,51 @@ def migrate(conn: sqlite3.Connection): conn.commit() logger.info("Migration v25: patched kind='agent' for pipeline handle") + if current < 26: + # Add publishers + contributor_identities. Non-breaking — new tables only. + # No existing data moved. Classification into publishers happens via a + # separate script (scripts/reclassify-contributors.py) with Cory-reviewed + # seed list. CHECK constraint on contributors.kind deferred to v27 after + # classification completes. (Apr 24 Cory directive: "fix schema, don't + # filter output" — separate contributors from publishers at the data layer.) + conn.executescript(""" + CREATE TABLE IF NOT EXISTS publishers ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL UNIQUE, + kind TEXT CHECK(kind IN ('news', 'academic', 'social_platform', 'podcast', 'self', 'internal', 'legal', 'government', 'research_org', 'commercial', 'other')), + url_pattern TEXT, + created_at TEXT DEFAULT (datetime('now')) + ); + CREATE INDEX IF NOT EXISTS idx_publishers_name ON publishers(name); + CREATE INDEX IF NOT EXISTS idx_publishers_kind ON publishers(kind); + + CREATE TABLE IF NOT EXISTS contributor_identities ( + contributor_handle TEXT NOT NULL, + platform TEXT NOT NULL CHECK(platform IN ('x', 'telegram', 'github', 'email', 'web', 'internal')), + platform_handle TEXT NOT NULL, + verified INTEGER DEFAULT 0, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY (platform, platform_handle) + ); + CREATE INDEX IF NOT EXISTS idx_identities_contributor ON contributor_identities(contributor_handle); + """) + # Extend sources with provenance columns. ALTER TABLE ADD COLUMN is + # idempotent-safe via try/except because SQLite doesn't support IF NOT EXISTS + # on column adds. + for col_sql in ( + "ALTER TABLE sources ADD COLUMN publisher_id INTEGER REFERENCES publishers(id)", + "ALTER TABLE sources ADD COLUMN content_type TEXT", + "ALTER TABLE sources ADD COLUMN original_author TEXT", + "ALTER TABLE sources ADD COLUMN original_author_handle TEXT REFERENCES contributors(handle)", + ): + try: + conn.execute(col_sql) + except sqlite3.OperationalError as e: + if "duplicate column" not in str(e).lower(): + raise + conn.commit() + logger.info("Migration v26: added publishers + contributor_identities tables + sources provenance columns") + if current < SCHEMA_VERSION: conn.execute( "INSERT OR REPLACE INTO schema_version (version) VALUES (?)", diff --git a/scripts/classify-contributors.py b/scripts/classify-contributors.py new file mode 100644 index 0000000..80299ae --- /dev/null +++ b/scripts/classify-contributors.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +"""Classify `contributors` rows into {keep_person, keep_agent, move_to_publisher, delete_garbage}. + +Reads current contributors table, proposes reclassification per v26 schema design: + - Real humans + Pentagon agents stay in contributors (kind='person'|'agent') + - News orgs, publications, venues move to publishers table (new v26) + - Multi-word hyphenated garbage (parsing artifacts) gets deleted + - Their contribution_events are handled per category: + * Publishers: DELETE events (orgs shouldn't have credit) + * Garbage: DELETE events (bogus data) + * Persons/agents: keep events untouched + +Classification is heuristic — uses explicit allowlists + regex patterns + length gates. +Ambiguous cases default to 'review_needed' (human decision). + +Usage: + python3 scripts/classify-contributors.py # dry-run analysis + report + python3 scripts/classify-contributors.py --apply # write changes + python3 scripts/classify-contributors.py --show # inspect a single row + +Writes to pipeline.db only. Does NOT modify claim files. +""" +import argparse +import os +import re +import sqlite3 +import sys +from collections import Counter +from pathlib import Path + +DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db") + +# Pentagon agents: kind='agent'. Authoritative list. +PENTAGON_AGENTS = frozenset({ + "rio", "leo", "theseus", "vida", "clay", "astra", + "oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship", + "pipeline", +}) + +# Publisher/news-org handles seen in current contributors table. +# Grouped by kind for the publishers row. Classified by inspection. +# NOTE: This list is hand-curated — add to it as new orgs appear. +PUBLISHERS_NEWS = { + # News outlets / brands + "cnbc", "al-jazeera", "axios", "bloomberg", "reuters", "bettorsinsider", + "fortune", "techcrunch", "coindesk", "coindesk-staff", "coindesk-research", + "coindesk research", "coindesk staff", + "defense-one", "thedefensepost", "theregister", "the-intercept", + "the-meridiem", "variety", "variety-staff", "variety staff", "spacenews", + "nasaspaceflight", "thedonkey", "insidedefense", "techpolicypress", + "morganlewis", "casinoorg", "deadline", "animationmagazine", + "defensepost", "casino-org", "casino.org", + "air & space forces magazine", "ieee spectrum", "techcrunch-staff", + "blockworks", "blockworks-staff", "decrypt", "ainvest", "banking-dive", "banking dive", + "cset-georgetown", "cset georgetown", + "kff", "kff-health-news", "kff health news", "kff-health-news---cbo", + "kff-health-news-/-cbo", "kff health news / cbo", "kffhealthnews", + "bloomberg-law", + "norton-rose-fulbright", "norton rose fulbright", + "defence-post", "the-defensepost", + "wilmerhale", "mofo", "sciencedirect", + "yogonet", "csr", "aisi-uk", "aisi", "aisi_gov", "rand", + "armscontrol", "eclinmed", "solana-compass", "solana compass", + "pmc11919318", "pmc11780016", + "healthverity", "natrium", "form-energy", + "courtlistener", "curtis-schiff", "curtis-schiff-prediction-markets", + "prophetx", "techpolicypress-staff", + "npr", "venturebeat", "geekwire", "payloadspace", "the-ankler", + "theankler", "tubefilter", "emarketer", "dagster", + "numerai", # fund/project brand, not person + "psl", "multistate", +} +PUBLISHERS_ACADEMIC = { + # Academic orgs, labs, papers, journals, institutions + "arxiv", "metr", "metr_evals", "apollo-research", "apollo research", "apolloresearch", + "jacc-study-authors", "jacc-data-report-authors", + "anthropic-fellows-program", "anthropic-fellows", + "anthropic-fellows-/-alignment-science-team", "anthropic-research", + "jmir-2024", "jmir 2024", + "oettl-et-al.,-journal-of-experimental-orthopaedics", + "oettl et al., journal of experimental orthopaedics", + "jacc", "nct06548490", "pmc", + "conitzer-et-al.-(2024)", "aquino-michaels-2026", "pan-et-al.", + "pan-et-al.-'natural-language-agent-harnesses'", + "stanford", "stanford-meta-harness", + "hendershot", "annals-im", + "nellie-liang,-brookings-institution", "nellie liang, brookings institution", + "penn-state", "american-heart-association", "american heart association", + "molt_cornelius", "molt-cornelius", + # Companies / labs / brand-orgs (not specific humans) + "anthropic", "anthropicai", "openai", "nasa", "icrc", "ecri", + "epochairesearch", "metadao", "iapam", "icer", + "who", "ama", "uspstf", "unknown", + "futard.io", # protocol/platform + "oxford-martin-ai-governance-initiative", + "oxford-martin-ai-governance", + "u.s.-food-and-drug-administration", + "jitse-goutbeek,-european-policy-centre", # cited person+org string → publisher + "adepoju-et-al.", # paper citation + # Formal-citation names (Firstname-Lastname or Lastname-et-al) — classified + # as academic citations, not reachable contributors. They'd need an @ handle + # to get CI credit per Cory's growth-loop design. + "senator-elissa-slotkin", + "bostrom", "hanson", "kaufmann", "noah-smith", "doug-shapiro", + "shayon-sengupta", "shayon sengupta", + "robin-hanson", "robin hanson", "eliezer-yudkowsky", + "leopold-aschenbrenner", "aschenbrenner", + "ramstead", "larsson", "heavey", + "dan-slimmon", "van-leeuwaarden", "ward-whitt", "adams", + "tamim-ansary", "spizzirri", + "dario-amodei", # formal-citation form (real @ is @darioamodei) + "corless", "oxranga", "vlahakis", + # Brand/project/DAO tokens — not individuals + "areal-dao", "areal", "theiaresearch", "futard-io", "dhrumil", + # Classic formal-citation names — famous academics/economists cited by surname. + # Reachable via @ handle if/when they join (e.g. Ostrom has no X, Hayek deceased, + # Friston has an institutional affiliation not an @ handle we'd track). + "clayton-christensen", "hidalgo", "coase", "wiener", "juarrero", + "ostrom", "centola", "hayek", "marshall-mcluhan", "blackmore", + "knuth", "friston", "aquino-michaels", "conitzer", "bak", +} +# NOTE: pseudonymous X handles that MAY be real contributors stay in keep_person: +# karpathy, simonw, swyx, metaproph3t, metanallok, mmdhrumil, sjdedic, +# ceterispar1bus — these are real X accounts and match Cory's growth loop. +# They appear without @ prefix because extraction frontmatter didn't normalize. +# Auto-creating them as contributors tier='cited' is correct (A-path from earlier). +PUBLISHERS_SOCIAL = { + "x", "twitter", "telegram", "x.com", +} +PUBLISHERS_INTERNAL = { + "teleohumanity-manifesto", "strategy-session-journal", + "living-capital-thesis-development", "attractor-state-historical-backtesting", + "web-research-compilation", "architectural-investing", + "governance---meritocratic-voting-+-futarchy", # title artifact + "sec-interpretive-release-s7-2026-09-(march-17", # title artifact + "mindstudio", # tooling/platform, not contributor +} +# Merge into one kind→set map for classification +PUBLISHER_KIND_MAP = {} +for h in PUBLISHERS_NEWS: + PUBLISHER_KIND_MAP[h.lower()] = "news" +for h in PUBLISHERS_ACADEMIC: + PUBLISHER_KIND_MAP[h.lower()] = "academic" +for h in PUBLISHERS_SOCIAL: + PUBLISHER_KIND_MAP[h.lower()] = "social_platform" +for h in PUBLISHERS_INTERNAL: + PUBLISHER_KIND_MAP[h.lower()] = "internal" + + +# Garbage: handles that are clearly parse artifacts, not real names. +# Pattern: contains parens, special chars, or >50 chars. +def is_garbage(handle: str) -> bool: + h = handle.strip() + if len(h) > 50: + return True + if re.search(r"[()\[\]<>{}\/\\|@#$%^&*=?!:;\"']", h): + # But @ can appear legitimately in handles like @thesensatore — allow if @ is only prefix + if h.startswith("@") and not re.search(r"[()\[\]<>{}\/\\|#$%^&*=?!:;\"']", h): + return False + return True + # Multi-word hyphenated with very specific artifact shape: 3+ hyphens in a row or trailing noise + if "---" in h or "---meritocratic" in h or h.endswith("(march") or h.endswith("-(march"): + return True + return False + + +def classify(handle: str) -> tuple[str, str | None]: + """Return (category, publisher_kind). + + category ∈ {'keep_agent', 'keep_person', 'publisher', 'garbage', 'review_needed'} + publisher_kind ∈ {'news','academic','social_platform','internal', None} + """ + h = handle.strip().lower().lstrip("@") + + if h in PENTAGON_AGENTS: + return ("keep_agent", None) + + if h in PUBLISHER_KIND_MAP: + return ("publisher", PUBLISHER_KIND_MAP[h]) + + if is_garbage(handle): + return ("garbage", None) + + # @-prefixed handles or short-slug real-looking names → keep as person + # (Auto-create rule from Cory: @ handles auto-join as tier='cited'.) + if handle.startswith("@"): + return ("keep_person", None) + + # Short plausible handles (<=20 chars, alphanum + underscore/hyphen): treat as person + if re.match(r"^[a-z0-9][a-z0-9_-]{0,19}$", h): + return ("keep_person", None) + + # Everything else: needs human review + return ("review_needed", None) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--apply", action="store_true", help="Write changes to DB") + parser.add_argument("--show", type=str, help="Inspect a single handle") + parser.add_argument("--delete-events", action="store_true", + help="DELETE contribution_events for publishers+garbage (default: keep for audit)") + args = parser.parse_args() + + if not Path(DB_PATH).exists(): + print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr) + sys.exit(1) + + conn = sqlite3.connect(DB_PATH, timeout=30) + conn.row_factory = sqlite3.Row + + # Sanity: publishers table must exist (v26 migration applied) + try: + conn.execute("SELECT 1 FROM publishers LIMIT 1") + except sqlite3.OperationalError: + print("ERROR: publishers table missing. Run migration v26 first.", file=sys.stderr) + sys.exit(2) + + rows = conn.execute( + "SELECT handle, kind, tier, claims_merged FROM contributors ORDER BY claims_merged DESC" + ).fetchall() + + if args.show: + target = args.show.strip().lower().lstrip("@") + for r in rows: + if r["handle"].lower().lstrip("@") == target: + category, pkind = classify(r["handle"]) + events_count = conn.execute( + "SELECT COUNT(*) FROM contribution_events WHERE handle = ?", + (r["handle"].lower().lstrip("@"),), + ).fetchone()[0] + print(f"handle: {r['handle']}") + print(f"current_kind: {r['kind']}") + print(f"current_tier: {r['tier']}") + print(f"claims_merged: {r['claims_merged']}") + print(f"events: {events_count}") + print(f"→ category: {category}") + if pkind: + print(f"→ publisher: kind={pkind}") + return + print(f"No match for '{args.show}'") + return + + # Classify all + buckets: dict[str, list[dict]] = { + "keep_agent": [], + "keep_person": [], + "publisher": [], + "garbage": [], + "review_needed": [], + } + for r in rows: + category, pkind = classify(r["handle"]) + buckets[category].append({ + "handle": r["handle"], + "kind_now": r["kind"], + "tier": r["tier"], + "claims": r["claims_merged"] or 0, + "publisher_kind": pkind, + }) + + print("=== Classification summary ===") + for cat, items in buckets.items(): + print(f" {cat:18s} {len(items):5d}") + + print("\n=== Sample of each category ===") + for cat, items in buckets.items(): + print(f"\n--- {cat} (showing up to 10) ---") + for item in items[:10]: + tag = f" → {item['publisher_kind']}" if item["publisher_kind"] else "" + print(f" {item['handle']:50s} claims={item['claims']:5d}{tag}") + + print("\n=== Full review_needed list ===") + for item in buckets["review_needed"]: + print(f" {item['handle']:50s} claims={item['claims']:5d}") + + if not args.apply: + print("\n(dry-run — no writes. Re-run with --apply to execute.)") + return + + # ── Apply changes ── + print("\n=== Applying changes ===") + if buckets["review_needed"]: + print(f"ABORT: {len(buckets['review_needed'])} rows need human review. Fix classifier before --apply.") + sys.exit(3) + + inserted_publishers = 0 + reclassified_agents = 0 + deleted_garbage = 0 + deleted_publisher_rows = 0 + deleted_events = 0 + + # Single transaction — if any step errors, roll back. This prevents the failure + # mode where a publisher insert fails silently and we still delete the contributor + # row, losing data. + try: + conn.execute("BEGIN") + + # 1. Insert publishers. Track which ones succeeded so step 4 only deletes those. + moved_to_publisher = set() + for item in buckets["publisher"]: + name = item["handle"].strip().lower().lstrip("@") + conn.execute( + "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)", + (name, item["publisher_kind"]), + ) + moved_to_publisher.add(item["handle"]) + inserted_publishers += 1 + + # 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch) + for item in buckets["keep_agent"]: + conn.execute( + "UPDATE contributors SET kind = 'agent' WHERE handle = ?", + (item["handle"].lower().lstrip("@"),), + ) + reclassified_agents += 1 + + # 3. Delete garbage handles from contributors (and their events) + for item in buckets["garbage"]: + if args.delete_events: + cur = conn.execute( + "DELETE FROM contribution_events WHERE handle = ?", + (item["handle"].lower().lstrip("@"),), + ) + deleted_events += cur.rowcount + cur = conn.execute( + "DELETE FROM contributors WHERE handle = ?", + (item["handle"],), + ) + deleted_garbage += cur.rowcount + + # 4. Delete publisher rows from contributors — ONLY for those successfully + # inserted into publishers above. Guards against partial failure. + for item in buckets["publisher"]: + if item["handle"] not in moved_to_publisher: + continue + if args.delete_events: + cur = conn.execute( + "DELETE FROM contribution_events WHERE handle = ?", + (item["handle"].lower().lstrip("@"),), + ) + deleted_events += cur.rowcount + cur = conn.execute( + "DELETE FROM contributors WHERE handle = ?", + (item["handle"],), + ) + deleted_publisher_rows += cur.rowcount + + conn.commit() + except Exception as e: + conn.rollback() + print(f"ERROR: Transaction failed, rolled back. {e}", file=sys.stderr) + sys.exit(4) + + print(f" publishers inserted: {inserted_publishers}") + print(f" agents kind='agent' ensured: {reclassified_agents}") + print(f" garbage rows deleted: {deleted_garbage}") + print(f" publisher rows removed from contributors: {deleted_publisher_rows}") + if args.delete_events: + print(f" contribution_events deleted: {deleted_events}") + else: + print(f" (events kept — re-run with --delete-events to clean them)") + + +if __name__ == "__main__": + main()