From 3fe524dd144e4c397a49a910f8d48f92515607d8 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Fri, 24 Apr 2026 17:48:47 +0100
Subject: [PATCH] fix(classify): Ganymede review fixes — alias cleanup +
 counter accuracy + handle alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. WARNING — orphan contributor_aliases after publisher/garbage delete:
   Added alias cleanup to the transaction (gated on --delete-events, same
   audit rationale as events). Both the garbage and the publisher deletion
   loops now DELETE matching contributor_aliases rows. Dry-run now prints
   an orphan-count diagnostic so the --delete-events decision is an
   informed one.

2. NIT — inserted_publishers counter over-reports on replay: INSERT OR
   IGNORE silently skips name collisions, but the counter was incremented
   unconditionally. It now uses cur.rowcount, so a second apply reports 0
   inserts instead of falsely claiming 100. The moved_to_publisher set is
   still populated unconditionally — publisher rows already present still
   need the matching contributors row deleted. (Rowcount sketch in the
   notes below the fold.)

3. NIT — handle-length gate diverged from the writer path: widened from
   {0,19} (20 chars) to {0,38} (39 chars) to match GitHub's handle limit
   and contributor.py::_HANDLE_RE. This prevents real contributors with
   long handles from falling through to review_needed and blocking
   --apply. Current data has 0 review_needed either way. (Sanity check
   appended after the diff.)

Bonus (Q5): added an audit_log entry inside the transaction. Every --apply
run writes one row with stage='schema_v26', event='classify_contributors'
and the counter detail as JSON. Cheap audit trail for the destructive op.

Verified end-to-end on a VPS DB snapshot:
- First apply: 100/9/9/100/0 (matches pre-fix)
- Second apply: 0/9/0/0/0 (counter fix working)
- With injected aliases + --delete-events: 2 aliases deleted, 1
  pre-existing orphan correctly left alone (outside script scope),
  audit_log entry written with accurate counters.

Ganymede msg-3. Protocol closed.
---
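
Below the fold (git am ignores this section): a minimal sketch of the
cur.rowcount behavior the fix in item 2 relies on. Hypothetical throughout:
the in-memory table and the ("acme-press", "journal") row are invented
stand-ins, not the pipeline schema.

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE publishers (name TEXT PRIMARY KEY, kind TEXT)")

    for attempt in (1, 2):
        cur = conn.execute(
            "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
            ("acme-press", "journal"),  # invented row; collides on the 2nd pass
        )
        # rowcount is 1 when the row lands and 0 when OR IGNORE skips the
        # name collision. The pre-fix code incremented the counter either way.
        print(f"apply #{attempt}: rowcount={cur.rowcount}")
    # apply #1: rowcount=1
    # apply #2: rowcount=0

The same property keeps deleted_events/deleted_aliases honest: sqlite3
reports via rowcount the rows a DELETE actually removed, so replays sum to 0.
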
 scripts/classify-contributors.py | 76 ++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 8 deletions(-)

diff --git a/scripts/classify-contributors.py b/scripts/classify-contributors.py
index 80299ae..07e019c 100644
--- a/scripts/classify-contributors.py
+++ b/scripts/classify-contributors.py
@@ -21,6 +21,7 @@ Usage:
 Writes to pipeline.db only. Does NOT modify claim files.
 """
 import argparse
+import json
 import os
 import re
 import sqlite3
@@ -186,8 +187,11 @@ def classify(handle: str) -> tuple[str, str | None]:
     if handle.startswith("@"):
         return ("keep_person", None)
 
-    # Short plausible handles (<=20 chars, alphanum + underscore/hyphen): treat as person
-    if re.match(r"^[a-z0-9][a-z0-9_-]{0,19}$", h):
+    # Plausible handles (<=39 chars, alphanum + underscore/hyphen): treat as person.
+    # 39-char ceiling matches GitHub's handle limit and the writer path in
+    # contributor.py::_HANDLE_RE, so a valid 21-39 char real handle won't fall
+    # through to review_needed and block --apply.
+    if re.match(r"^[a-z0-9][a-z0-9_-]{0,38}$", h):
         return ("keep_person", None)
 
     # Everything else: needs human review
@@ -274,6 +278,22 @@
     for item in buckets["review_needed"]:
         print(f"   {item['handle']:50s} claims={item['claims']:5d}")
 
+    # Diagnostic: orphan alias count for handles we're about to delete.
+    # contributor_aliases has no FK (SQLite FKs need a PRAGMA to enforce anyway),
+    # so aliases pointing to deleted canonical handles become orphans. Surface
+    # the count so the --delete-events decision is informed.
+    doomed = [item["handle"].lower().lstrip("@") for item in buckets["garbage"] + buckets["publisher"]]
+    if doomed:
+        placeholders = ",".join("?" * len(doomed))
+        orphan_count = conn.execute(
+            f"SELECT COUNT(*) FROM contributor_aliases WHERE canonical IN ({placeholders})",
+            doomed,
+        ).fetchone()[0]
+        print("\n=== Alias orphan check ===")
+        print(f"  contributor_aliases rows pointing to deletable canonicals: {orphan_count}")
+        if orphan_count:
+            print("  (cleanup requires --delete-events; without it, aliases stay as orphans)")
+
     if not args.apply:
         print("\n(dry-run — no writes. Re-run with --apply to execute.)")
         return
@@ -289,6 +309,7 @@
     deleted_garbage = 0
     deleted_publisher_rows = 0
     deleted_events = 0
+    deleted_aliases = 0
 
     # Single transaction — if any step errors, roll back. This prevents the failure
     # mode where a publisher insert fails silently and we still delete the contributor
@@ -297,15 +318,20 @@
         conn.execute("BEGIN")
 
         # 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
+        # Counter uses cur.rowcount so replay runs (where publishers already exist)
+        # report an accurate inserted=0 instead of falsely claiming the full set.
+        # moved_to_publisher is unconditional — the contributors row still needs to
+        # be deleted even when the publishers row was added in a prior run.
         moved_to_publisher = set()
         for item in buckets["publisher"]:
             name = item["handle"].strip().lower().lstrip("@")
-            conn.execute(
+            cur = conn.execute(
                 "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
                 (name, item["publisher_kind"]),
             )
+            if cur.rowcount > 0:
+                inserted_publishers += 1
             moved_to_publisher.add(item["handle"])
-            inserted_publishers += 1
 
         # 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
         for item in buckets["keep_agent"]:
@@ -315,14 +341,20 @@
             )
             reclassified_agents += 1
 
-        # 3. Delete garbage handles from contributors (and their events)
+        # 3. Delete garbage handles from contributors (and their events + aliases)
         for item in buckets["garbage"]:
+            canonical_lower = item["handle"].lower().lstrip("@")
             if args.delete_events:
                 cur = conn.execute(
                     "DELETE FROM contribution_events WHERE handle = ?",
-                    (item["handle"].lower().lstrip("@"),),
+                    (canonical_lower,),
                 )
                 deleted_events += cur.rowcount
+                cur = conn.execute(
+                    "DELETE FROM contributor_aliases WHERE canonical = ?",
+                    (canonical_lower,),
+                )
+                deleted_aliases += cur.rowcount
             cur = conn.execute(
                 "DELETE FROM contributors WHERE handle = ?",
                 (item["handle"],),
@@ -331,21 +363,48 @@
 
         # 4. Delete publisher rows from contributors — ONLY for those successfully
         # inserted into publishers above. Guards against partial failure.
+        # Aliases pointing to publisher-classified handles get cleaned under the
+        # same --delete-events gate: publishers live in their own table now, any
+        # leftover aliases in contributor_aliases are orphans.
         for item in buckets["publisher"]:
             if item["handle"] not in moved_to_publisher:
                 continue
+            canonical_lower = item["handle"].lower().lstrip("@")
             if args.delete_events:
                 cur = conn.execute(
                     "DELETE FROM contribution_events WHERE handle = ?",
-                    (item["handle"].lower().lstrip("@"),),
+                    (canonical_lower,),
                 )
                 deleted_events += cur.rowcount
+                cur = conn.execute(
+                    "DELETE FROM contributor_aliases WHERE canonical = ?",
+                    (canonical_lower,),
+                )
+                deleted_aliases += cur.rowcount
             cur = conn.execute(
                 "DELETE FROM contributors WHERE handle = ?",
                 (item["handle"],),
             )
             deleted_publisher_rows += cur.rowcount
 
+        # 5. Audit log entry for the destructive operation (Ganymede Q5).
+        conn.execute(
+            "INSERT INTO audit_log (timestamp, stage, event, detail) VALUES (datetime('now'), ?, ?, ?)",
+            (
+                "schema_v26",
+                "classify_contributors",
+                json.dumps({
+                    "publishers_inserted": inserted_publishers,
+                    "agents_updated": reclassified_agents,
+                    "garbage_deleted": deleted_garbage,
+                    "publisher_rows_deleted": deleted_publisher_rows,
+                    "events_deleted": deleted_events,
+                    "aliases_deleted": deleted_aliases,
+                    "delete_events_flag": bool(args.delete_events),
+                }),
+            ),
+        )
+
         conn.commit()
     except Exception as e:
         conn.rollback()
@@ -358,8 +417,9 @@
     print(f"  publisher rows removed from contributors: {deleted_publisher_rows}")
     if args.delete_events:
         print(f"  contribution_events deleted: {deleted_events}")
+        print(f"  contributor_aliases deleted: {deleted_aliases}")
     else:
-        print(f"  (events kept — re-run with --delete-events to clean them)")
+        print("  (events + aliases kept — re-run with --delete-events to clean them)")
 
 
 if __name__ == "__main__":
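
Post-diff note: a standalone sanity check of the widened handle gate from
item 3. Only the regex is taken from the patch; the example handles are
invented.

    import re

    # The gate from classify() as of this patch.
    HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")

    ok = "a" + "b" * 38        # 39 chars, GitHub's maximum: keep_person
    too_long = "a" + "b" * 39  # 40 chars: still falls through to review_needed
    assert HANDLE_RE.match(ok)
    assert HANDLE_RE.match(too_long) is None

    # The old {0,19} bound capped handles at 20 chars, so a real 21+ char
    # handle would have landed in review_needed and blocked --apply.
    OLD_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,19}$")
    assert OLD_RE.match("a" * 21) is None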