fix: vector-gc ID normalization for Qdrant UUID format
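Qdrant auto-formats a 32-char MD5 hex ID as a UUID with dashes, so strip the dashes from the IDs Qdrant returns before comparing them with the IDs computed from the filesystem. Purging still uses the original dashed IDs. Also read the orphan's path from the payload's claim_path key, falling back to path.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>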
This commit is contained in:
parent
6fb3f2258f
commit
d70788a91c
1 changed file with 20 additions and 8 deletions
|
|
@@ -26,7 +26,11 @@ EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
|
||||||
|
|
||||||
|
|
||||||
def make_point_id(path: str) -> str:
|
def make_point_id(path: str) -> str:
|
||||||
"""Deterministic UUID from file path (must match embed-claims.py)."""
|
"""Deterministic UUID from file path (must match embed-claims.py).
|
||||||
|
|
||||||
|
Qdrant auto-formats 32-char hex as UUID with dashes, so we normalize
|
||||||
|
by stripping dashes for comparison.
|
||||||
|
"""
|
||||||
return hashlib.md5(path.encode()).hexdigest()
|
return hashlib.md5(path.encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -99,7 +103,12 @@ def main():
|
||||||
expected = get_expected_ids()
|
expected = get_expected_ids()
|
||||||
print(f" Filesystem files: {len(expected)}")
|
print(f" Filesystem files: {len(expected)}")
|
||||||
|
|
||||||
qdrant_ids = {p["id"] for p in points}
|
# Normalize IDs: Qdrant formats 32-char hex as UUID with dashes
|
||||||
|
def normalize_id(pid: str) -> str:
|
||||||
|
return pid.replace("-", "")
|
||||||
|
|
||||||
|
qdrant_map = {normalize_id(p["id"]): p for p in points}
|
||||||
|
qdrant_ids = set(qdrant_map.keys())
|
||||||
expected_ids = set(expected.keys())
|
expected_ids = set(expected.keys())
|
||||||
|
|
||||||
orphan_ids = qdrant_ids - expected_ids
|
orphan_ids = qdrant_ids - expected_ids
|
||||||
|
|
@@ -107,9 +116,10 @@ def main():
|
||||||
|
|
||||||
# Categorize orphans by their payload path
|
# Categorize orphans by their payload path
|
||||||
orphan_details = []
|
orphan_details = []
|
||||||
for p in points:
|
for nid in orphan_ids:
|
||||||
if p["id"] in orphan_ids:
|
p = qdrant_map[nid]
|
||||||
path = p.get("payload", {}).get("path", "unknown")
|
payload = p.get("payload", {})
|
||||||
|
path = payload.get("claim_path") or payload.get("path", "unknown")
|
||||||
orphan_details.append({"id": p["id"], "path": path})
|
orphan_details.append({"id": p["id"], "path": path})
|
||||||
|
|
||||||
print(f"\n=== Vector GC Report ===")
|
print(f"\n=== Vector GC Report ===")
|
||||||
|
|
@@ -130,8 +140,10 @@ def main():
|
||||||
print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")
|
print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")
|
||||||
|
|
||||||
if args.purge and orphan_ids:
|
if args.purge and orphan_ids:
|
||||||
print(f"\nPurging {len(orphan_ids)} orphan vectors...")
|
# Use original Qdrant IDs (with dashes) for deletion
|
||||||
result = delete_points(list(orphan_ids))
|
original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids]
|
||||||
|
print(f"\nPurging {len(original_orphan_ids)} orphan vectors...")
|
||||||
|
result = delete_points(original_orphan_ids)
|
||||||
print(f" Done: {result}")
|
print(f" Done: {result}")
|
||||||
elif orphan_ids and not args.purge:
|
elif orphan_ids and not args.purge:
|
||||||
print(f"\nRun with --purge to delete orphan vectors.")
|
print(f"\nRun with --purge to delete orphan vectors.")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue