diff --git a/ops/vector-gc.py b/ops/vector-gc.py index 27ee82a..5197f58 100644 --- a/ops/vector-gc.py +++ b/ops/vector-gc.py @@ -26,7 +26,11 @@ EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"] def make_point_id(path: str) -> str: - """Deterministic UUID from file path (must match embed-claims.py).""" + """Deterministic UUID from file path (must match embed-claims.py). + + Qdrant auto-formats 32-char hex as UUID with dashes, so we normalize + by stripping dashes for comparison. + """ return hashlib.md5(path.encode()).hexdigest() @@ -99,7 +103,12 @@ def main(): expected = get_expected_ids() print(f" Filesystem files: {len(expected)}") - qdrant_ids = {p["id"] for p in points} + # Normalize IDs: Qdrant formats 32-char hex as UUID with dashes + def normalize_id(pid: str) -> str: + return pid.replace("-", "") + + qdrant_map = {normalize_id(p["id"]): p for p in points} + qdrant_ids = set(qdrant_map.keys()) expected_ids = set(expected.keys()) orphan_ids = qdrant_ids - expected_ids @@ -107,10 +116,11 @@ def main(): # Categorize orphans by their payload path orphan_details = [] - for p in points: - if p["id"] in orphan_ids: - path = p.get("payload", {}).get("path", "unknown") - orphan_details.append({"id": p["id"], "path": path}) + for nid in orphan_ids: + p = qdrant_map[nid] + payload = p.get("payload", {}) + path = payload.get("claim_path") or payload.get("path", "unknown") + orphan_details.append({"id": p["id"], "path": path}) print(f"\n=== Vector GC Report ===") print(f"Qdrant vectors: {len(qdrant_ids)}") @@ -130,8 +140,10 @@ def main(): print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}") if args.purge and orphan_ids: - print(f"\nPurging {len(orphan_ids)} orphan vectors...") - result = delete_points(list(orphan_ids)) + # Use original Qdrant IDs (with dashes) for deletion + original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids] + print(f"\nPurging {len(original_orphan_ids)} orphan vectors...") + result = delete_points(original_orphan_ids) print(f" Done: {result}") elif orphan_ids and not args.purge: print(f"\nRun with --purge to delete orphan vectors.")