#!/usr/bin/env python3
"""Vector GC — reconcile Qdrant vectors against filesystem claims.

Scrolls all Qdrant points, cross-references against current claim files in the
worktree, and reports (or purges) orphan vectors whose source files no longer
exist.

Usage:
    python3 vector-gc.py          # Dry run — report only
    python3 vector-gc.py --purge  # Delete orphan vectors from Qdrant

Pentagon-Agent: Epimetheus <0144398E-4ED3-4FE2-95A3-3D72E1ABF887>
"""
import argparse
import hashlib
import json
import sys
import urllib.request
from pathlib import Path

REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
QDRANT_URL = "http://localhost:6333"
COLLECTION = "teleo-claims"
# Only these subdirectories of REPO_DIR are embedded; files elsewhere are
# intentionally ignored by both the embedder and this GC.
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]


def make_point_id(path: str) -> str:
    """Deterministic point ID from a repo-relative file path.

    Must match embed-claims.py exactly, or every vector looks orphaned.
    Qdrant auto-formats 32-char hex as a UUID with dashes, so callers
    normalize by stripping dashes before comparison.
    """
    return hashlib.md5(path.encode()).hexdigest()


def scroll_all_points() -> list[dict]:
    """Scroll every point out of the Qdrant collection.

    Returns all points with payloads (vectors excluded — we only need IDs
    and paths). Exits the process on any HTTP/JSON failure, since the rest
    of the run is meaningless without a complete inventory.
    """
    points: list[dict] = []
    offset = None
    while True:
        body = {"limit": 100, "with_payload": True, "with_vector": False}
        if offset is not None:
            body["offset"] = offset
        data = json.dumps(body).encode()
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
            data=data,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                result = json.loads(resp.read())["result"]
        except Exception as e:
            print(f"ERROR scrolling Qdrant: {e}", file=sys.stderr)
            sys.exit(1)
        batch = result.get("points", [])
        points.extend(batch)
        offset = result.get("next_page_offset")
        # Compare against None, not truthiness: a numeric cursor of 0 would
        # be falsy but is still a valid "more pages" offset.
        if offset is None or not batch:
            break
    return points


def get_expected_ids() -> dict[str, Path]:
    """Build a map of expected point IDs -> source file from the filesystem.

    Keys are the 32-char hex IDs produced by make_point_id() on the path
    relative to REPO_DIR, mirroring what embed-claims.py stores.
    """
    expected: dict[str, Path] = {}
    for d in EMBED_DIRS:
        dir_path = REPO_DIR / d
        if not dir_path.exists():
            continue
        for f in dir_path.rglob("*.md"):
            rel = str(f.relative_to(REPO_DIR))
            pid = make_point_id(rel)
            expected[pid] = f
    return expected


def delete_points(point_ids: list[str]):
    """Delete the given points from Qdrant by ID; returns the API response.

    Errors propagate to the caller — a failed purge should be loud.
    """
    body = json.dumps({"points": point_ids}).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/delete",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())


def main():
    """Compare Qdrant contents to the filesystem; report and optionally purge."""
    parser = argparse.ArgumentParser(description="Vector GC — reconcile Qdrant vs filesystem")
    parser.add_argument("--purge", action="store_true", help="Delete orphan vectors")
    args = parser.parse_args()

    print("Scrolling all Qdrant points...")
    points = scroll_all_points()
    print(f"  Qdrant vectors: {len(points)}")

    print("Scanning filesystem for expected claims...")
    expected = get_expected_ids()
    print(f"  Filesystem files: {len(expected)}")

    # Normalize IDs: Qdrant formats 32-char hex as UUID with dashes. Coerce
    # to str first — point IDs may legally be integers, which would otherwise
    # crash on .replace().
    def normalize_id(pid) -> str:
        return str(pid).replace("-", "")

    qdrant_map = {normalize_id(p["id"]): p for p in points}
    qdrant_ids = set(qdrant_map.keys())
    expected_ids = set(expected.keys())

    orphan_ids = qdrant_ids - expected_ids    # in Qdrant, file gone
    missing_ids = expected_ids - qdrant_ids   # file exists, never embedded

    # Categorize orphans by the source path recorded in their payload.
    orphan_details = []
    for nid in orphan_ids:
        p = qdrant_map[nid]
        payload = p.get("payload", {})
        path = payload.get("claim_path") or payload.get("path", "unknown")
        orphan_details.append({"id": str(p["id"]), "path": path})

    print("\n=== Vector GC Report ===")
    print(f"Qdrant vectors: {len(qdrant_ids)}")
    print(f"Filesystem claims: {len(expected_ids)}")
    print(f"Orphan vectors: {len(orphan_ids)} (in Qdrant, no file)")
    print(f"Missing vectors: {len(missing_ids)} (file exists, not in Qdrant)")

    if orphan_details:
        print("\nOrphan vectors (source file deleted):")
        for o in sorted(orphan_details, key=lambda x: x["path"]):
            print(f"  {o['id'][:12]}  {o['path']}")

    if missing_ids:
        print("\nMissing from Qdrant (need re-embed):")
        # missing_ids is a subset of expected_ids, so lookup is always safe.
        for mid in sorted(missing_ids):
            print(f"  {mid[:12]}  {expected[mid].relative_to(REPO_DIR)}")

    if args.purge and orphan_ids:
        # Use the original Qdrant IDs (dashes intact) for deletion — the
        # normalized form is only for set comparison.
        original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids]
        print(f"\nPurging {len(original_orphan_ids)} orphan vectors...")
        result = delete_points(original_orphan_ids)
        print(f"  Done: {result}")
    elif orphan_ids and not args.purge:
        print("\nRun with --purge to delete orphan vectors.")

    # Summary JSON on the last line for cron/log scraping.
    summary = {
        "qdrant_count": len(qdrant_ids),
        "filesystem_count": len(expected_ids),
        "orphans": len(orphan_ids),
        "missing": len(missing_ids),
        "orphan_paths": [o["path"] for o in orphan_details],
    }
    print(f"\n{json.dumps(summary)}")


if __name__ == "__main__":
    main()