fix: vector-gc ID normalization for Qdrant UUID format
Qdrant auto-formats a 32-character MD5 hex ID as a UUID with dashes, so normalize both sides before comparing. Also read claim_path from the payload correctly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6fb3f2258f
commit
d70788a91c
1 changed file with 20 additions and 8 deletions
|
|
@@ -26,7 +26,11 @@ EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
|
|||
|
||||
|
||||
def make_point_id(path: str) -> str:
|
||||
"""Deterministic UUID from file path (must match embed-claims.py)."""
|
||||
"""Deterministic UUID from file path (must match embed-claims.py).
|
||||
|
||||
Qdrant auto-formats 32-char hex as UUID with dashes, so we normalize
|
||||
by stripping dashes for comparison.
|
||||
"""
|
||||
return hashlib.md5(path.encode()).hexdigest()
|
||||
|
||||
|
||||
|
|
@@ -99,7 +103,12 @@ def main():
|
|||
expected = get_expected_ids()
|
||||
print(f" Filesystem files: {len(expected)}")
|
||||
|
||||
qdrant_ids = {p["id"] for p in points}
|
||||
# Normalize IDs: Qdrant formats 32-char hex as UUID with dashes
|
||||
def normalize_id(pid: str) -> str:
|
||||
return pid.replace("-", "")
|
||||
|
||||
qdrant_map = {normalize_id(p["id"]): p for p in points}
|
||||
qdrant_ids = set(qdrant_map.keys())
|
||||
expected_ids = set(expected.keys())
|
||||
|
||||
orphan_ids = qdrant_ids - expected_ids
|
||||
|
|
@@ -107,10 +116,11 @@ def main():
|
|||
|
||||
# Categorize orphans by their payload path
|
||||
orphan_details = []
|
||||
for p in points:
|
||||
if p["id"] in orphan_ids:
|
||||
path = p.get("payload", {}).get("path", "unknown")
|
||||
orphan_details.append({"id": p["id"], "path": path})
|
||||
for nid in orphan_ids:
|
||||
p = qdrant_map[nid]
|
||||
payload = p.get("payload", {})
|
||||
path = payload.get("claim_path") or payload.get("path", "unknown")
|
||||
orphan_details.append({"id": p["id"], "path": path})
|
||||
|
||||
print(f"\n=== Vector GC Report ===")
|
||||
print(f"Qdrant vectors: {len(qdrant_ids)}")
|
||||
|
|
@@ -130,8 +140,10 @@ def main():
|
|||
print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")
|
||||
|
||||
if args.purge and orphan_ids:
|
||||
print(f"\nPurging {len(orphan_ids)} orphan vectors...")
|
||||
result = delete_points(list(orphan_ids))
|
||||
# Use original Qdrant IDs (with dashes) for deletion
|
||||
original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids]
|
||||
print(f"\nPurging {len(original_orphan_ids)} orphan vectors...")
|
||||
result = delete_points(original_orphan_ids)
|
||||
print(f" Done: {result}")
|
||||
elif orphan_ids and not args.purge:
|
||||
print(f"\nRun with --purge to delete orphan vectors.")
|
||||
|
|
|
|||
Loading…
Reference in a new issue