1. WARNING — orphan contributor_aliases after publisher/garbage delete:
Added alias cleanup to the transaction (gated on --delete-events, same
audit rationale as events). Both garbage and publisher deletion loops
now DELETE matching contributor_aliases rows. Dry-run adds an orphan
count diagnostic so the --delete-events decision is informed.
2. NIT — inserted_publishers counter over-reports on replay:
INSERT OR IGNORE silently skips name collisions, but the counter was
incremented unconditionally. It now uses cur.rowcount, so a second apply
reports 0 inserts instead of falsely claiming 100 (sketch below). The
moved_to_publisher set remains unconditional — publisher rows already
present still need the matching contributors row deleted.
3. NIT — handle-length gate diverged from writer path:
Widened from {0,19} (20 chars) to {0,38} (39 chars) to match GitHub's
handle limit and contributor.py::_HANDLE_RE. Prevents future long-handle
real contributors from falling through to review_needed and blocking
--apply. Current data has 0 review_needed either way.
Bonus (Q5): Added an audit_log entry inside the transaction. Every --apply
run writes one row with stage='schema_v26', event='classify_contributors',
and the counters as detail JSON. Cheap audit trail for the destructive op.
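(Minimal sketch of the rowcount pattern from fix 2, against a throwaway
in-memory table rather than pipeline.db:)

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE publishers (name TEXT PRIMARY KEY, kind TEXT)")
    inserted = 0
    for _ in range(2):  # second pass simulates a replayed --apply
        cur = conn.execute(
            "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
            ("reuters", "news"),
        )
        inserted += cur.rowcount  # 0 when IGNORE skipped the duplicate
    print(inserted)  # 1, not 2: replay no longer over-reports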
Verified end-to-end on a VPS DB snapshot (counters: publishers inserted /
agents / garbage / publisher rows removed / events deleted):
- First apply: 100/9/9/100/0 (matches pre-fix)
- Second apply: 0/9/0/0/0 (counter fix working)
- With injected aliases + --delete-events: 2 aliases deleted, 1 pre-existing
orphan correctly left alone (outside script scope), audit_log entry
written with accurate counters.
Ganymede msg-3. Protocol closed.
#!/usr/bin/env python3
"""Classify `contributors` rows into {keep_person, keep_agent, move_to_publisher, delete_garbage}.

Reads the current contributors table and proposes reclassification per the v26 schema design:

- Real humans + Pentagon agents stay in contributors (kind='person'|'agent')
- News orgs, publications, venues move to the publishers table (new in v26)
- Multi-word hyphenated garbage (parsing artifacts) gets deleted
- Their contribution_events are handled per category:
    * Publishers: DELETE events (orgs shouldn't have credit)
    * Garbage: DELETE events (bogus data)
    * Persons/agents: keep events untouched

Classification is heuristic — uses explicit allowlists + regex patterns + length gates.
Ambiguous cases default to 'review_needed' (human decision).

Usage:
    python3 scripts/classify-contributors.py                  # dry-run analysis + report
    python3 scripts/classify-contributors.py --apply          # write changes
    python3 scripts/classify-contributors.py --show <handle>  # inspect a single row

Writes to pipeline.db only. Does NOT modify claim files.
"""

import argparse
import json
import os
import re
import sqlite3
import sys
from pathlib import Path

DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Pentagon agents: kind='agent'. Authoritative list.
PENTAGON_AGENTS = frozenset({
    "rio", "leo", "theseus", "vida", "clay", "astra",
    "oberon", "argus", "rhea", "ganymede", "epimetheus", "hermes", "ship",
    "pipeline",
})

# Publisher/news-org handles seen in the current contributors table.
# Grouped by kind for the publishers row. Classified by inspection.
# NOTE: This list is hand-curated — add to it as new orgs appear.
PUBLISHERS_NEWS = {
    # News outlets / brands
    "cnbc", "al-jazeera", "axios", "bloomberg", "reuters", "bettorsinsider",
    "fortune", "techcrunch", "coindesk", "coindesk-staff", "coindesk-research",
    "coindesk research", "coindesk staff",
    "defense-one", "thedefensepost", "theregister", "the-intercept",
    "the-meridiem", "variety", "variety-staff", "variety staff", "spacenews",
    "nasaspaceflight", "thedonkey", "insidedefense", "techpolicypress",
    "morganlewis", "casinoorg", "deadline", "animationmagazine",
    "defensepost", "casino-org", "casino.org",
    "air & space forces magazine", "ieee spectrum", "techcrunch-staff",
    "blockworks", "blockworks-staff", "decrypt", "ainvest", "banking-dive", "banking dive",
    "cset-georgetown", "cset georgetown",
    "kff", "kff-health-news", "kff health news", "kff-health-news---cbo",
    "kff-health-news-/-cbo", "kff health news / cbo", "kffhealthnews",
    "bloomberg-law",
    "norton-rose-fulbright", "norton rose fulbright",
    "defence-post", "the-defensepost",
    "wilmerhale", "mofo", "sciencedirect",
    "yogonet", "csr", "aisi-uk", "aisi", "aisi_gov", "rand",
    "armscontrol", "eclinmed", "solana-compass", "solana compass",
    "pmc11919318", "pmc11780016",
    "healthverity", "natrium", "form-energy",
    "courtlistener", "curtis-schiff", "curtis-schiff-prediction-markets",
    "prophetx", "techpolicypress-staff",
    "npr", "venturebeat", "geekwire", "payloadspace", "the-ankler",
    "theankler", "tubefilter", "emarketer", "dagster",
    "numerai",  # fund/project brand, not person
    "psl", "multistate",
}

PUBLISHERS_ACADEMIC = {
    # Academic orgs, labs, papers, journals, institutions
    "arxiv", "metr", "metr_evals", "apollo-research", "apollo research", "apolloresearch",
    "jacc-study-authors", "jacc-data-report-authors",
    "anthropic-fellows-program", "anthropic-fellows",
    "anthropic-fellows-/-alignment-science-team", "anthropic-research",
    "jmir-2024", "jmir 2024",
    "oettl-et-al.,-journal-of-experimental-orthopaedics",
    "oettl et al., journal of experimental orthopaedics",
    "jacc", "nct06548490", "pmc",
    "conitzer-et-al.-(2024)", "aquino-michaels-2026", "pan-et-al.",
    "pan-et-al.-'natural-language-agent-harnesses'",
    "stanford", "stanford-meta-harness",
    "hendershot", "annals-im",
    "nellie-liang,-brookings-institution", "nellie liang, brookings institution",
    "penn-state", "american-heart-association", "american heart association",
    "molt_cornelius", "molt-cornelius",
    # Companies / labs / brand-orgs (not specific humans)
    "anthropic", "anthropicai", "openai", "nasa", "icrc", "ecri",
    "epochairesearch", "metadao", "iapam", "icer",
    "who", "ama", "uspstf", "unknown",
    "futard.io",  # protocol/platform
    "oxford-martin-ai-governance-initiative",
    "oxford-martin-ai-governance",
    "u.s.-food-and-drug-administration",
    "jitse-goutbeek,-european-policy-centre",  # cited person+org string → publisher
    "adepoju-et-al.",  # paper citation
    # Formal-citation names (Firstname-Lastname or Lastname-et-al) — classified
    # as academic citations, not reachable contributors. They'd need an @ handle
    # to get CI credit per Cory's growth-loop design.
    "senator-elissa-slotkin",
    "bostrom", "hanson", "kaufmann", "noah-smith", "doug-shapiro",
    "shayon-sengupta", "shayon sengupta",
    "robin-hanson", "robin hanson", "eliezer-yudkowsky",
    "leopold-aschenbrenner", "aschenbrenner",
    "ramstead", "larsson", "heavey",
    "dan-slimmon", "van-leeuwaarden", "ward-whitt", "adams",
    "tamim-ansary", "spizzirri",
    "dario-amodei",  # formal-citation form (real @ is @darioamodei)
    "corless", "oxranga", "vlahakis",
    # Brand/project/DAO tokens — not individuals
    "areal-dao", "areal", "theiaresearch", "futard-io", "dhrumil",
    # Classic formal-citation names — famous academics/economists cited by surname.
    # Reachable via @ handle if/when they join (e.g. Ostrom has no X, Hayek deceased,
    # Friston has an institutional affiliation not an @ handle we'd track).
    "clayton-christensen", "hidalgo", "coase", "wiener", "juarrero",
    "ostrom", "centola", "hayek", "marshall-mcluhan", "blackmore",
    "knuth", "friston", "aquino-michaels", "conitzer", "bak",
}

# NOTE: pseudonymous X handles that MAY be real contributors stay in keep_person:
# karpathy, simonw, swyx, metaproph3t, metanallok, mmdhrumil, sjdedic,
# ceterispar1bus — these are real X accounts and match Cory's growth loop.
# They appear without @ prefix because extraction frontmatter didn't normalize.
# Auto-creating them as contributors tier='cited' is correct (A-path from earlier).
PUBLISHERS_SOCIAL = {
    "x", "twitter", "telegram", "x.com",
}

PUBLISHERS_INTERNAL = {
    "teleohumanity-manifesto", "strategy-session-journal",
    "living-capital-thesis-development", "attractor-state-historical-backtesting",
    "web-research-compilation", "architectural-investing",
    "governance---meritocratic-voting-+-futarchy",  # title artifact
    "sec-interpretive-release-s7-2026-09-(march-17",  # title artifact
    "mindstudio",  # tooling/platform, not contributor
}

# Merge into one kind→set map for classification
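# (A handle appearing in more than one set takes the kind from the last loop
# below: internal wins over social_platform wins over academic wins over news.)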
PUBLISHER_KIND_MAP = {}
for h in PUBLISHERS_NEWS:
    PUBLISHER_KIND_MAP[h.lower()] = "news"
for h in PUBLISHERS_ACADEMIC:
    PUBLISHER_KIND_MAP[h.lower()] = "academic"
for h in PUBLISHERS_SOCIAL:
    PUBLISHER_KIND_MAP[h.lower()] = "social_platform"
for h in PUBLISHERS_INTERNAL:
    PUBLISHER_KIND_MAP[h.lower()] = "internal"


# Garbage: handles that are clearly parse artifacts, not real names.
# Pattern: contains parens, special chars, or >50 chars.
def is_garbage(handle: str) -> bool:
    h = handle.strip()
    if len(h) > 50:
        return True
    if re.search(r"[()\[\]<>{}\/\\|@#$%^&*=?!:;\"']", h):
        # But @ can appear legitimately in handles like @thesensatore — allow if @ is only prefix
        if h.startswith("@") and not re.search(r"[()\[\]<>{}\/\\|#$%^&*=?!:;\"']", h):
            return False
        return True
    # Multi-word hyphenated with very specific artifact shape: 3+ hyphens in a row or trailing noise
    if "---" in h or h.endswith("(march"):
        return True
    return False
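
# Illustrative behavior (hypothetical inputs):
#   is_garbage("some-title-(march")  → True   (parens / trailing-noise artifact)
#   is_garbage("@thesensatore")      → False  (bare @ prefix is allowed through)
#   is_garbage("governance---meritocratic-voting-+-futarchy") → True (triple hyphen)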


def classify(handle: str) -> tuple[str, str | None]:
    """Return (category, publisher_kind).

    category ∈ {'keep_agent', 'keep_person', 'publisher', 'garbage', 'review_needed'}
    publisher_kind ∈ {'news','academic','social_platform','internal', None}
    """
    h = handle.strip().lower().lstrip("@")

    if h in PENTAGON_AGENTS:
        return ("keep_agent", None)

    if h in PUBLISHER_KIND_MAP:
        return ("publisher", PUBLISHER_KIND_MAP[h])

    if is_garbage(handle):
        return ("garbage", None)

    # @-prefixed handles or short-slug real-looking names → keep as person
    # (Auto-create rule from Cory: @ handles auto-join as tier='cited'.)
    if handle.startswith("@"):
        return ("keep_person", None)

    # Plausible handles (<=39 chars, alphanum + underscore/hyphen): treat as person.
    # 39-char ceiling matches GitHub's handle limit and the writer path in
    # contributor.py::_HANDLE_RE, so a valid 21-39 char real handle won't fall
    # through to review_needed and block --apply.
    if re.match(r"^[a-z0-9][a-z0-9_-]{0,38}$", h):
        return ("keep_person", None)

    # Everything else: needs human review
    return ("review_needed", None)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--apply", action="store_true", help="Write changes to DB")
    parser.add_argument("--show", type=str, help="Inspect a single handle")
    parser.add_argument("--delete-events", action="store_true",
                        help="DELETE contribution_events for publishers+garbage (default: keep for audit)")
    args = parser.parse_args()

    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH, timeout=30)
    conn.row_factory = sqlite3.Row

    # Sanity: publishers table must exist (v26 migration applied)
    try:
        conn.execute("SELECT 1 FROM publishers LIMIT 1")
    except sqlite3.OperationalError:
        print("ERROR: publishers table missing. Run migration v26 first.", file=sys.stderr)
        sys.exit(2)

    rows = conn.execute(
        "SELECT handle, kind, tier, claims_merged FROM contributors ORDER BY claims_merged DESC"
    ).fetchall()

    if args.show:
        target = args.show.strip().lower().lstrip("@")
        for r in rows:
            if r["handle"].lower().lstrip("@") == target:
                category, pkind = classify(r["handle"])
                events_count = conn.execute(
                    "SELECT COUNT(*) FROM contribution_events WHERE handle = ?",
                    (r["handle"].lower().lstrip("@"),),
                ).fetchone()[0]
                print(f"handle: {r['handle']}")
                print(f"current_kind: {r['kind']}")
                print(f"current_tier: {r['tier']}")
                print(f"claims_merged: {r['claims_merged']}")
                print(f"events: {events_count}")
                print(f"→ category: {category}")
                if pkind:
                    print(f"→ publisher: kind={pkind}")
                return
        print(f"No match for '{args.show}'")
        return

    # Classify all
    buckets: dict[str, list[dict]] = {
        "keep_agent": [],
        "keep_person": [],
        "publisher": [],
        "garbage": [],
        "review_needed": [],
    }
    for r in rows:
        category, pkind = classify(r["handle"])
        buckets[category].append({
            "handle": r["handle"],
            "kind_now": r["kind"],
            "tier": r["tier"],
            "claims": r["claims_merged"] or 0,
            "publisher_kind": pkind,
        })

    print("=== Classification summary ===")
    for cat, items in buckets.items():
        print(f"  {cat:18s} {len(items):5d}")

    print("\n=== Sample of each category ===")
    for cat, items in buckets.items():
        print(f"\n--- {cat} (showing up to 10) ---")
        for item in items[:10]:
            tag = f" → {item['publisher_kind']}" if item["publisher_kind"] else ""
            print(f"  {item['handle']:50s} claims={item['claims']:5d}{tag}")

    print("\n=== Full review_needed list ===")
    for item in buckets["review_needed"]:
        print(f"  {item['handle']:50s} claims={item['claims']:5d}")

    # Diagnostic: orphan alias count for handles we're about to delete.
    # Contributor_aliases has no FK (SQLite FKs require PRAGMA to enforce anyway),
    # so aliases pointing to deleted canonical handles become orphans. Surface
    # the count so the --delete-events decision is informed.
    doomed = [item["handle"].lower().lstrip("@") for item in buckets["garbage"] + buckets["publisher"]]
    if doomed:
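        # One bound parameter per doomed handle: SQLite caps host parameters
        # (999 before 3.32.0, 32766 since), fine at this scale; a much larger
        # purge list would need chunked IN queries.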
        placeholders = ",".join("?" * len(doomed))
        orphan_count = conn.execute(
            f"SELECT COUNT(*) FROM contributor_aliases WHERE canonical IN ({placeholders})",
            doomed,
        ).fetchone()[0]
        print(f"\n=== Alias orphan check ===")
        print(f"  contributor_aliases rows pointing to deletable canonicals: {orphan_count}")
        if orphan_count:
            print(f"  (cleanup requires --delete-events; without it, aliases stay as orphans)")

    if not args.apply:
        print("\n(dry-run — no writes. Re-run with --apply to execute.)")
        return

    # ── Apply changes ──
    print("\n=== Applying changes ===")
    if buckets["review_needed"]:
        print(f"ABORT: {len(buckets['review_needed'])} rows need human review. Fix classifier before --apply.")
        sys.exit(3)

    inserted_publishers = 0
    reclassified_agents = 0
    deleted_garbage = 0
    deleted_publisher_rows = 0
    deleted_events = 0
    deleted_aliases = 0

    # Single transaction — if any step errors, roll back. This prevents the failure
    # mode where a publisher insert fails silently and we still delete the contributor
    # row, losing data.
    try:
        conn.execute("BEGIN")

        # 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
        #    Counter uses cur.rowcount so replay runs (where publishers already exist)
        #    report accurate inserted=0 instead of falsely claiming the full set.
        #    moved_to_publisher is unconditional — the contributors row still needs to
        #    be deleted even when the publishers row was added in a prior run.
        moved_to_publisher = set()
        for item in buckets["publisher"]:
            name = item["handle"].strip().lower().lstrip("@")
            cur = conn.execute(
                "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
                (name, item["publisher_kind"]),
            )
            if cur.rowcount > 0:
                inserted_publishers += 1
            moved_to_publisher.add(item["handle"])

        # 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
        for item in buckets["keep_agent"]:
            conn.execute(
                "UPDATE contributors SET kind = 'agent' WHERE handle = ?",
                (item["handle"].lower().lstrip("@"),),
            )
            reclassified_agents += 1

        # 3. Delete garbage handles from contributors (and their events + aliases)
        for item in buckets["garbage"]:
            canonical_lower = item["handle"].lower().lstrip("@")
            if args.delete_events:
                cur = conn.execute(
                    "DELETE FROM contribution_events WHERE handle = ?",
                    (canonical_lower,),
                )
                deleted_events += cur.rowcount
                cur = conn.execute(
                    "DELETE FROM contributor_aliases WHERE canonical = ?",
                    (canonical_lower,),
                )
                deleted_aliases += cur.rowcount
            cur = conn.execute(
                "DELETE FROM contributors WHERE handle = ?",
                (item["handle"],),
            )
            deleted_garbage += cur.rowcount

        # 4. Delete publisher rows from contributors — ONLY for those successfully
        #    inserted into publishers above. Guards against partial failure.
        #    Aliases pointing to publisher-classified handles get cleaned under the
        #    same --delete-events gate: publishers live in their own table now, any
        #    leftover aliases in contributor_aliases are orphans.
        for item in buckets["publisher"]:
            if item["handle"] not in moved_to_publisher:
                continue
            canonical_lower = item["handle"].lower().lstrip("@")
            if args.delete_events:
                cur = conn.execute(
                    "DELETE FROM contribution_events WHERE handle = ?",
                    (canonical_lower,),
                )
                deleted_events += cur.rowcount
                cur = conn.execute(
                    "DELETE FROM contributor_aliases WHERE canonical = ?",
                    (canonical_lower,),
                )
                deleted_aliases += cur.rowcount
            cur = conn.execute(
                "DELETE FROM contributors WHERE handle = ?",
                (item["handle"],),
            )
            deleted_publisher_rows += cur.rowcount

        # 5. Audit log entry for the destructive operation (Ganymede Q5).
        conn.execute(
            "INSERT INTO audit_log (timestamp, stage, event, detail) VALUES (datetime('now'), ?, ?, ?)",
            (
                "schema_v26",
                "classify_contributors",
                json.dumps({
                    "publishers_inserted": inserted_publishers,
                    "agents_updated": reclassified_agents,
                    "garbage_deleted": deleted_garbage,
                    "publisher_rows_deleted": deleted_publisher_rows,
                    "events_deleted": deleted_events,
                    "aliases_deleted": deleted_aliases,
                    "delete_events_flag": bool(args.delete_events),
                }),
            ),
        )

        conn.commit()
    except Exception as e:
        conn.rollback()
        print(f"ERROR: Transaction failed, rolled back. {e}", file=sys.stderr)
        sys.exit(4)

    print(f"  publishers inserted: {inserted_publishers}")
    print(f"  agents kind='agent' ensured: {reclassified_agents}")
    print(f"  garbage rows deleted: {deleted_garbage}")
    print(f"  publisher rows removed from contributors: {deleted_publisher_rows}")
    if args.delete_events:
        print(f"  contribution_events deleted: {deleted_events}")
        print(f"  contributor_aliases deleted: {deleted_aliases}")
    else:
        print(f"  (events + aliases kept — re-run with --delete-events to clean them)")


if __name__ == "__main__":
    main()
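
For reference, a snapshot run looks like (snapshot path illustrative;
PIPELINE_DB is the env override the script already reads):

    PIPELINE_DB=/tmp/pipeline-snapshot.db python3 scripts/classify-contributors.py
    PIPELINE_DB=/tmp/pipeline-snapshot.db python3 scripts/classify-contributors.py --apply --delete-events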
|