fix(classify): Ganymede review fixes — alias cleanup + counter accuracy + handle alignment

1. WARNING — orphan contributor_aliases after publisher/garbage delete:
   Added alias cleanup to the transaction (gated on --delete-events, same
   audit rationale as events). Both garbage and publisher deletion loops
   now DELETE matching contributor_aliases rows. Dry-run adds an orphan
   count diagnostic so the --delete-events decision is informed.
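
   A pre-existing orphan (like the one noted in the verification below) can be
   spotted with a standalone query along these lines. contributor_aliases.canonical
   and contributors.handle come from the diff; the alias column name and the
   handle normalization in the join are assumptions.

       import sqlite3

       conn = sqlite3.connect("pipeline.db")
       # Aliases whose canonical handle no longer has a contributors row are orphans.
       orphans = conn.execute(
           """
           SELECT a.alias, a.canonical
           FROM contributor_aliases AS a
           LEFT JOIN contributors AS c
             ON lower(ltrim(c.handle, '@')) = a.canonical
           WHERE c.handle IS NULL
           """
       ).fetchall()
       print(f"{len(orphans)} orphan contributor_aliases rows")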

2. NIT — inserted_publishers counter over-reports on replay:
   INSERT OR IGNORE silently skips name collisions, but the counter was
   incremented unconditionally. It now uses cur.rowcount, so a second apply
   reports 0 inserts instead of falsely claiming 100. The moved_to_publisher
   set remains unconditional — publisher rows already present still need
   the matching contributors row deleted.
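
   Minimal in-memory demo of the behaviour the fix relies on (the publishers
   schema here is a stand-in, not the real table): INSERT OR IGNORE reports
   cur.rowcount == 1 when the row lands and 0 when the name collides, so only
   genuine inserts bump the counter.

       import sqlite3

       conn = sqlite3.connect(":memory:")
       conn.execute("CREATE TABLE publishers (name TEXT PRIMARY KEY, kind TEXT)")
       for attempt in (1, 2):
           cur = conn.execute(
               "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
               ("example-press", "publisher"),
           )
           print(attempt, cur.rowcount)  # 1 1 on the first pass, 2 0 on the replay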

3. NIT — handle-length gate diverged from writer path:
   Widened from {0,19} (20 chars) to {0,38} (39 chars) to match GitHub's
   handle limit and contributor.py::_HANDLE_RE. This prevents future real
   contributors with long handles from falling through to review_needed and
   blocking --apply. Current data has 0 review_needed entries either way.
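
   Quick illustration of the widened gate (the handle below is made up; at
   26 chars it fails the old pattern and passes the new one):

       import re

       old_gate = re.compile(r"^[a-z0-9][a-z0-9_-]{0,19}$")
       new_gate = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")
       handle = "long-but-legitimate-handle"  # 26 chars
       print(bool(old_gate.match(handle)), bool(new_gate.match(handle)))  # False True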

Bonus (Q5): Added an audit_log entry inside the transaction. Every --apply run
writes one row with stage='schema_v26', event='classify_contributors', and the
counters as JSON in detail. Cheap audit trail for the destructive op.
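
Reading the trail back is a short query against the same audit_log columns the
INSERT uses (the query itself is a sketch, not something this commit adds):

    import json
    import sqlite3

    conn = sqlite3.connect("pipeline.db")
    row = conn.execute(
        "SELECT timestamp, detail FROM audit_log "
        "WHERE stage = 'schema_v26' AND event = 'classify_contributors' "
        "ORDER BY timestamp DESC LIMIT 1"
    ).fetchone()
    if row:
        print(row[0], json.dumps(json.loads(row[1]), indent=2))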

Verified end-to-end on VPS DB snapshot:
- First apply: 100/9/9/100/0 (matches pre-fix)
- Second apply: 0/9/0/0/0 (counter fix working)
- With injected aliases + --delete-events: 2 aliases deleted, 1 pre-existing
  orphan correctly left alone (outside script scope), audit_log entry
  written with accurate counters.
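
Roughly the invocation sequence behind those numbers (script name is assumed;
--apply and --delete-events are the flags this commit references):

    python classify_contributors.py                           # dry-run + orphan diagnostic
    python classify_contributors.py --apply                   # first apply
    python classify_contributors.py --apply                   # replay: 0 inserts reported
    python classify_contributors.py --apply --delete-events   # also deletes events + aliases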

Ganymede msg-3. Protocol closed.
This commit is contained in:
m3taversal 2026-04-24 17:48:47 +01:00
parent 45b2f6de20
commit 3fe524dd14


@@ -21,6 +21,7 @@ Usage:
 Writes to pipeline.db only. Does NOT modify claim files.
 """
 import argparse
+import json
 import os
 import re
 import sqlite3
@@ -186,8 +187,11 @@ def classify(handle: str) -> tuple[str, str | None]:
     if handle.startswith("@"):
         return ("keep_person", None)
-    # Short plausible handles (<=20 chars, alphanum + underscore/hyphen): treat as person
-    if re.match(r"^[a-z0-9][a-z0-9_-]{0,19}$", h):
+    # Plausible handles (<=39 chars, alphanum + underscore/hyphen): treat as person.
+    # 39-char ceiling matches GitHub's handle limit and the writer path in
+    # contributor.py::_HANDLE_RE, so a valid 21-39 char real handle won't fall
+    # through to review_needed and block --apply.
+    if re.match(r"^[a-z0-9][a-z0-9_-]{0,38}$", h):
         return ("keep_person", None)
     # Everything else: needs human review
@@ -274,6 +278,22 @@ def main():
     for item in buckets["review_needed"]:
         print(f" {item['handle']:50s} claims={item['claims']:5d}")

+    # Diagnostic: orphan alias count for handles we're about to delete.
+    # Contributor_aliases has no FK (SQLite FKs require PRAGMA to enforce anyway),
+    # so aliases pointing to deleted canonical handles become orphans. Surface
+    # the count so the --delete-events decision is informed.
+    doomed = [item["handle"].lower().lstrip("@") for item in buckets["garbage"] + buckets["publisher"]]
+    if doomed:
+        placeholders = ",".join("?" * len(doomed))
+        orphan_count = conn.execute(
+            f"SELECT COUNT(*) FROM contributor_aliases WHERE canonical IN ({placeholders})",
+            doomed,
+        ).fetchone()[0]
+        print(f"\n=== Alias orphan check ===")
+        print(f" contributor_aliases rows pointing to deletable canonicals: {orphan_count}")
+        if orphan_count:
+            print(f" (cleanup requires --delete-events; without it, aliases stay as orphans)")
+
     if not args.apply:
         print("\n(dry-run — no writes. Re-run with --apply to execute.)")
         return
@@ -289,6 +309,7 @@ def main():
     deleted_garbage = 0
     deleted_publisher_rows = 0
     deleted_events = 0
+    deleted_aliases = 0

     # Single transaction — if any step errors, roll back. This prevents the failure
     # mode where a publisher insert fails silently and we still delete the contributor
@@ -297,15 +318,20 @@
         conn.execute("BEGIN")

         # 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
+        # Counter uses cur.rowcount so replay runs (where publishers already exist)
+        # report accurate inserted=0 instead of falsely claiming the full set.
+        # moved_to_publisher is unconditional — the contributors row still needs to
+        # be deleted even when the publishers row was added in a prior run.
         moved_to_publisher = set()
         for item in buckets["publisher"]:
             name = item["handle"].strip().lower().lstrip("@")
-            conn.execute(
+            cur = conn.execute(
                 "INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
                 (name, item["publisher_kind"]),
             )
-            moved_to_publisher.add(item["handle"])
-            inserted_publishers += 1
+            if cur.rowcount > 0:
+                inserted_publishers += 1
+            moved_to_publisher.add(item["handle"])

         # 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
         for item in buckets["keep_agent"]:
@@ -315,14 +341,20 @@
             )
             reclassified_agents += 1

-        # 3. Delete garbage handles from contributors (and their events)
+        # 3. Delete garbage handles from contributors (and their events + aliases)
         for item in buckets["garbage"]:
+            canonical_lower = item["handle"].lower().lstrip("@")
             if args.delete_events:
                 cur = conn.execute(
                     "DELETE FROM contribution_events WHERE handle = ?",
-                    (item["handle"].lower().lstrip("@"),),
+                    (canonical_lower,),
                 )
                 deleted_events += cur.rowcount
+                cur = conn.execute(
+                    "DELETE FROM contributor_aliases WHERE canonical = ?",
+                    (canonical_lower,),
+                )
+                deleted_aliases += cur.rowcount
             cur = conn.execute(
                 "DELETE FROM contributors WHERE handle = ?",
                 (item["handle"],),
@@ -331,21 +363,48 @@
         # 4. Delete publisher rows from contributors — ONLY for those successfully
         # inserted into publishers above. Guards against partial failure.
+        # Aliases pointing to publisher-classified handles get cleaned under the
+        # same --delete-events gate: publishers live in their own table now, any
+        # leftover aliases in contributor_aliases are orphans.
         for item in buckets["publisher"]:
             if item["handle"] not in moved_to_publisher:
                 continue
+            canonical_lower = item["handle"].lower().lstrip("@")
             if args.delete_events:
                 cur = conn.execute(
                     "DELETE FROM contribution_events WHERE handle = ?",
-                    (item["handle"].lower().lstrip("@"),),
+                    (canonical_lower,),
                 )
                 deleted_events += cur.rowcount
+                cur = conn.execute(
+                    "DELETE FROM contributor_aliases WHERE canonical = ?",
+                    (canonical_lower,),
+                )
+                deleted_aliases += cur.rowcount
             cur = conn.execute(
                 "DELETE FROM contributors WHERE handle = ?",
                 (item["handle"],),
             )
             deleted_publisher_rows += cur.rowcount

+        # 5. Audit log entry for the destructive operation (Ganymede Q5).
+        conn.execute(
+            "INSERT INTO audit_log (timestamp, stage, event, detail) VALUES (datetime('now'), ?, ?, ?)",
+            (
+                "schema_v26",
+                "classify_contributors",
+                json.dumps({
+                    "publishers_inserted": inserted_publishers,
+                    "agents_updated": reclassified_agents,
+                    "garbage_deleted": deleted_garbage,
+                    "publisher_rows_deleted": deleted_publisher_rows,
+                    "events_deleted": deleted_events,
+                    "aliases_deleted": deleted_aliases,
+                    "delete_events_flag": bool(args.delete_events),
+                }),
+            ),
+        )
         conn.commit()
     except Exception as e:
         conn.rollback()
@@ -358,8 +417,9 @@
     print(f" publisher rows removed from contributors: {deleted_publisher_rows}")
     if args.delete_events:
         print(f" contribution_events deleted: {deleted_events}")
+        print(f" contributor_aliases deleted: {deleted_aliases}")
     else:
-        print(f"    (events kept — re-run with --delete-events to clean them)")
+        print(f"    (events + aliases kept — re-run with --delete-events to clean them)")


 if __name__ == "__main__":