fix: vector-gc ID normalization for Qdrant UUID format

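Qdrant auto-formats a 32-char MD5 hex ID as a UUID with dashes. Strip the
dashes from the Qdrant IDs before comparing them with the filesystem IDs,
and keep the original dashed IDs for deletion. Also read the orphan's path
from the payload's claim_path field, falling back to path.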

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-03-30 01:23:10 +01:00
parent 6fb3f2258f
commit d70788a91c

View file

@ -26,7 +26,11 @@ EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
def make_point_id(path: str) -> str:
    """Return the deterministic point ID for a file path.

    The ID is the 32-character MD5 hex digest of ``path`` and must match
    the IDs produced by embed-claims.py.

    Qdrant auto-formats a 32-char hex ID as a UUID with dashes. The value
    returned here contains no dashes, so IDs read back from Qdrant need
    their dashes stripped before being compared with it.
    """
    return hashlib.md5(path.encode()).hexdigest()
@ -99,7 +103,12 @@ def main():
expected = get_expected_ids()
print(f" Filesystem files: {len(expected)}")
qdrant_ids = {p["id"] for p in points}
# Normalize IDs: Qdrant formats 32-char hex as UUID with dashes
def normalize_id(pid: str) -> str:
    """Return a Qdrant point ID with its UUID dashes removed.

    Qdrant formats a 32-char hex ID as a dashed UUID; removing the dashes
    yields the form that can be compared with the dashless hex IDs.
    IDs that already have no dashes are returned unchanged.
    """
    # str() keeps a non-string (e.g. integer) point ID from raising
    # AttributeError; such an ID simply never matches a hex digest.
    return str(pid).replace("-", "")
qdrant_map = {normalize_id(p["id"]): p for p in points}
qdrant_ids = set(qdrant_map.keys())
expected_ids = set(expected.keys())
orphan_ids = qdrant_ids - expected_ids
@ -107,10 +116,11 @@ def main():
# Categorize orphans by their payload path
orphan_details = []
for p in points:
if p["id"] in orphan_ids:
path = p.get("payload", {}).get("path", "unknown")
orphan_details.append({"id": p["id"], "path": path})
for nid in orphan_ids:
p = qdrant_map[nid]
payload = p.get("payload", {})
path = payload.get("claim_path") or payload.get("path", "unknown")
orphan_details.append({"id": p["id"], "path": path})
print(f"\n=== Vector GC Report ===")
print(f"Qdrant vectors: {len(qdrant_ids)}")
@ -130,8 +140,10 @@ def main():
print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")
if args.purge and orphan_ids:
print(f"\nPurging {len(orphan_ids)} orphan vectors...")
result = delete_points(list(orphan_ids))
# Use original Qdrant IDs (with dashes) for deletion
original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids]
print(f"\nPurging {len(original_orphan_ids)} orphan vectors...")
result = delete_points(original_orphan_ids)
print(f" Done: {result}")
elif orphan_ids and not args.purge:
print(f"\nRun with --purge to delete orphan vectors.")