fix(classify): Ganymede review fixes — alias cleanup + counter accuracy + handle alignment
Some checks failed: CI / lint-and-test (push) — has been cancelled
1. WARNING — orphan contributor_aliases after publisher/garbage delete:
Added alias cleanup to the transaction (gated on --delete-events, same
audit rationale as events). Both garbage and publisher deletion loops
now DELETE matching contributor_aliases rows. Dry-run adds an orphan
count diagnostic so the --delete-events decision is informed.
2. NIT — inserted_publishers counter over-reports on replay:
INSERT OR IGNORE silently skips name collisions, but the counter
incremented unconditionally. Now uses cur.rowcount so a second apply
reports 0 inserts instead of falsely claiming 100. The moved_to_publisher
set remains unconditional — publisher rows already present still need
the matching contributors row deleted.
3. NIT — handle-length gate diverged from writer path:
Widened from {0,19} (20 chars) to {0,38} (39 chars) to match GitHub's
handle limit and contributor.py::_HANDLE_RE. Prevents future long-handle
real contributors from falling through to review_needed and blocking
--apply. Current data has 0 review_needed either way.
Bonus (Q5): Added audit_log entry inside the transaction. One row in
audit_log.stage='schema_v26', event='classify_contributors' with counter
detail JSON on every --apply run. Cheap audit trail for the destructive op.
Verified end-to-end on VPS DB snapshot:
- First apply: 100/9/9/100/0 (matches pre-fix)
- Second apply: 0/9/0/0/0 (counter fix working)
- With injected aliases + --delete-events: 2 aliases deleted, 1 pre-existing
orphan correctly left alone (outside script scope), audit_log entry
written with accurate counters.
Ganymede msg-3. Protocol closed.
This commit is contained in:
parent
45b2f6de20
commit
3fe524dd14
1 changed file with 68 additions and 8 deletions
|
|
@ -21,6 +21,7 @@ Usage:
|
||||||
Writes to pipeline.db only. Does NOT modify claim files.
|
Writes to pipeline.db only. Does NOT modify claim files.
|
||||||
"""
|
"""
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
@ -186,8 +187,11 @@ def classify(handle: str) -> tuple[str, str | None]:
|
||||||
if handle.startswith("@"):
|
if handle.startswith("@"):
|
||||||
return ("keep_person", None)
|
return ("keep_person", None)
|
||||||
|
|
||||||
# Short plausible handles (<=20 chars, alphanum + underscore/hyphen): treat as person
|
# Plausible handles (<=39 chars, alphanum + underscore/hyphen): treat as person.
|
||||||
if re.match(r"^[a-z0-9][a-z0-9_-]{0,19}$", h):
|
# 39-char ceiling matches GitHub's handle limit and the writer path in
|
||||||
|
# contributor.py::_HANDLE_RE, so a valid 21-39 char real handle won't fall
|
||||||
|
# through to review_needed and block --apply.
|
||||||
|
if re.match(r"^[a-z0-9][a-z0-9_-]{0,38}$", h):
|
||||||
return ("keep_person", None)
|
return ("keep_person", None)
|
||||||
|
|
||||||
# Everything else: needs human review
|
# Everything else: needs human review
|
||||||
|
|
@ -274,6 +278,22 @@ def main():
|
||||||
for item in buckets["review_needed"]:
|
for item in buckets["review_needed"]:
|
||||||
print(f" {item['handle']:50s} claims={item['claims']:5d}")
|
print(f" {item['handle']:50s} claims={item['claims']:5d}")
|
||||||
|
|
||||||
|
# Diagnostic: orphan alias count for handles we're about to delete.
|
||||||
|
# Contributor_aliases has no FK (SQLite FKs require PRAGMA to enforce anyway),
|
||||||
|
# so aliases pointing to deleted canonical handles become orphans. Surface
|
||||||
|
# the count so the --delete-events decision is informed.
|
||||||
|
doomed = [item["handle"].lower().lstrip("@") for item in buckets["garbage"] + buckets["publisher"]]
|
||||||
|
if doomed:
|
||||||
|
placeholders = ",".join("?" * len(doomed))
|
||||||
|
orphan_count = conn.execute(
|
||||||
|
f"SELECT COUNT(*) FROM contributor_aliases WHERE canonical IN ({placeholders})",
|
||||||
|
doomed,
|
||||||
|
).fetchone()[0]
|
||||||
|
print(f"\n=== Alias orphan check ===")
|
||||||
|
print(f" contributor_aliases rows pointing to deletable canonicals: {orphan_count}")
|
||||||
|
if orphan_count:
|
||||||
|
print(f" (cleanup requires --delete-events; without it, aliases stay as orphans)")
|
||||||
|
|
||||||
if not args.apply:
|
if not args.apply:
|
||||||
print("\n(dry-run — no writes. Re-run with --apply to execute.)")
|
print("\n(dry-run — no writes. Re-run with --apply to execute.)")
|
||||||
return
|
return
|
||||||
|
|
@ -289,6 +309,7 @@ def main():
|
||||||
deleted_garbage = 0
|
deleted_garbage = 0
|
||||||
deleted_publisher_rows = 0
|
deleted_publisher_rows = 0
|
||||||
deleted_events = 0
|
deleted_events = 0
|
||||||
|
deleted_aliases = 0
|
||||||
|
|
||||||
# Single transaction — if any step errors, roll back. This prevents the failure
|
# Single transaction — if any step errors, roll back. This prevents the failure
|
||||||
# mode where a publisher insert fails silently and we still delete the contributor
|
# mode where a publisher insert fails silently and we still delete the contributor
|
||||||
|
|
@ -297,15 +318,20 @@ def main():
|
||||||
conn.execute("BEGIN")
|
conn.execute("BEGIN")
|
||||||
|
|
||||||
# 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
|
# 1. Insert publishers. Track which ones succeeded so step 4 only deletes those.
|
||||||
|
# Counter uses cur.rowcount so replay runs (where publishers already exist)
|
||||||
|
# report accurate inserted=0 instead of falsely claiming the full set.
|
||||||
|
# moved_to_publisher is unconditional — the contributors row still needs to
|
||||||
|
# be deleted even when the publishers row was added in a prior run.
|
||||||
moved_to_publisher = set()
|
moved_to_publisher = set()
|
||||||
for item in buckets["publisher"]:
|
for item in buckets["publisher"]:
|
||||||
name = item["handle"].strip().lower().lstrip("@")
|
name = item["handle"].strip().lower().lstrip("@")
|
||||||
conn.execute(
|
cur = conn.execute(
|
||||||
"INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
|
"INSERT OR IGNORE INTO publishers (name, kind) VALUES (?, ?)",
|
||||||
(name, item["publisher_kind"]),
|
(name, item["publisher_kind"]),
|
||||||
)
|
)
|
||||||
|
if cur.rowcount > 0:
|
||||||
|
inserted_publishers += 1
|
||||||
moved_to_publisher.add(item["handle"])
|
moved_to_publisher.add(item["handle"])
|
||||||
inserted_publishers += 1
|
|
||||||
|
|
||||||
# 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
|
# 2. Ensure Pentagon agents have kind='agent' (idempotent after v25 patch)
|
||||||
for item in buckets["keep_agent"]:
|
for item in buckets["keep_agent"]:
|
||||||
|
|
@ -315,14 +341,20 @@ def main():
|
||||||
)
|
)
|
||||||
reclassified_agents += 1
|
reclassified_agents += 1
|
||||||
|
|
||||||
# 3. Delete garbage handles from contributors (and their events)
|
# 3. Delete garbage handles from contributors (and their events + aliases)
|
||||||
for item in buckets["garbage"]:
|
for item in buckets["garbage"]:
|
||||||
|
canonical_lower = item["handle"].lower().lstrip("@")
|
||||||
if args.delete_events:
|
if args.delete_events:
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"DELETE FROM contribution_events WHERE handle = ?",
|
"DELETE FROM contribution_events WHERE handle = ?",
|
||||||
(item["handle"].lower().lstrip("@"),),
|
(canonical_lower,),
|
||||||
)
|
)
|
||||||
deleted_events += cur.rowcount
|
deleted_events += cur.rowcount
|
||||||
|
cur = conn.execute(
|
||||||
|
"DELETE FROM contributor_aliases WHERE canonical = ?",
|
||||||
|
(canonical_lower,),
|
||||||
|
)
|
||||||
|
deleted_aliases += cur.rowcount
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"DELETE FROM contributors WHERE handle = ?",
|
"DELETE FROM contributors WHERE handle = ?",
|
||||||
(item["handle"],),
|
(item["handle"],),
|
||||||
|
|
@ -331,21 +363,48 @@ def main():
|
||||||
|
|
||||||
# 4. Delete publisher rows from contributors — ONLY for those successfully
|
# 4. Delete publisher rows from contributors — ONLY for those successfully
|
||||||
# inserted into publishers above. Guards against partial failure.
|
# inserted into publishers above. Guards against partial failure.
|
||||||
|
# Aliases pointing to publisher-classified handles get cleaned under the
|
||||||
|
# same --delete-events gate: publishers live in their own table now, any
|
||||||
|
# leftover aliases in contributor_aliases are orphans.
|
||||||
for item in buckets["publisher"]:
|
for item in buckets["publisher"]:
|
||||||
if item["handle"] not in moved_to_publisher:
|
if item["handle"] not in moved_to_publisher:
|
||||||
continue
|
continue
|
||||||
|
canonical_lower = item["handle"].lower().lstrip("@")
|
||||||
if args.delete_events:
|
if args.delete_events:
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"DELETE FROM contribution_events WHERE handle = ?",
|
"DELETE FROM contribution_events WHERE handle = ?",
|
||||||
(item["handle"].lower().lstrip("@"),),
|
(canonical_lower,),
|
||||||
)
|
)
|
||||||
deleted_events += cur.rowcount
|
deleted_events += cur.rowcount
|
||||||
|
cur = conn.execute(
|
||||||
|
"DELETE FROM contributor_aliases WHERE canonical = ?",
|
||||||
|
(canonical_lower,),
|
||||||
|
)
|
||||||
|
deleted_aliases += cur.rowcount
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"DELETE FROM contributors WHERE handle = ?",
|
"DELETE FROM contributors WHERE handle = ?",
|
||||||
(item["handle"],),
|
(item["handle"],),
|
||||||
)
|
)
|
||||||
deleted_publisher_rows += cur.rowcount
|
deleted_publisher_rows += cur.rowcount
|
||||||
|
|
||||||
|
# 5. Audit log entry for the destructive operation (Ganymede Q5).
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO audit_log (timestamp, stage, event, detail) VALUES (datetime('now'), ?, ?, ?)",
|
||||||
|
(
|
||||||
|
"schema_v26",
|
||||||
|
"classify_contributors",
|
||||||
|
json.dumps({
|
||||||
|
"publishers_inserted": inserted_publishers,
|
||||||
|
"agents_updated": reclassified_agents,
|
||||||
|
"garbage_deleted": deleted_garbage,
|
||||||
|
"publisher_rows_deleted": deleted_publisher_rows,
|
||||||
|
"events_deleted": deleted_events,
|
||||||
|
"aliases_deleted": deleted_aliases,
|
||||||
|
"delete_events_flag": bool(args.delete_events),
|
||||||
|
}),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conn.rollback()
|
conn.rollback()
|
||||||
|
|
@ -358,8 +417,9 @@ def main():
|
||||||
print(f" publisher rows removed from contributors: {deleted_publisher_rows}")
|
print(f" publisher rows removed from contributors: {deleted_publisher_rows}")
|
||||||
if args.delete_events:
|
if args.delete_events:
|
||||||
print(f" contribution_events deleted: {deleted_events}")
|
print(f" contribution_events deleted: {deleted_events}")
|
||||||
|
print(f" contributor_aliases deleted: {deleted_aliases}")
|
||||||
else:
|
else:
|
||||||
print(f" (events kept — re-run with --delete-events to clean them)")
|
print(f" (events + aliases kept — re-run with --delete-events to clean them)")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue