Add queue_staleness to vital signs (sources >7d unprocessed, bucketed by age). Add ops/vector-gc.py to reconcile Qdrant vectors against filesystem claims — reports orphan vectors and missing embeddings, with --purge flag for cleanup. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
151 lines · 5 KiB · Python
#!/usr/bin/env python3
|
|
"""Vector GC — reconcile Qdrant vectors against filesystem claims.
|
|
|
|
Scrolls all Qdrant points, cross-references against current claim files
|
|
in the worktree, and reports (or purges) orphan vectors whose source files
|
|
no longer exist.
|
|
|
|
Usage:
|
|
python3 vector-gc.py # Dry run — report only
|
|
python3 vector-gc.py --purge # Delete orphan vectors from Qdrant
|
|
|
|
Pentagon-Agent: Epimetheus <0144398E-4ED3-4FE2-95A3-3D72E1ABF887>
|
|
"""
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import sys
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
|
|
QDRANT_URL = "http://localhost:6333"
|
|
COLLECTION = "teleo-claims"
|
|
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
|
|
|
|
|
|
def make_point_id(path: str) -> str:
|
|
"""Deterministic UUID from file path (must match embed-claims.py)."""
|
|
return hashlib.md5(path.encode()).hexdigest()
|
|
|
|
|
|
def scroll_all_points() -> list[dict]:
|
|
"""Scroll all points from Qdrant collection."""
|
|
points = []
|
|
offset = None
|
|
while True:
|
|
body = {"limit": 100, "with_payload": True, "with_vector": False}
|
|
if offset is not None:
|
|
body["offset"] = offset
|
|
data = json.dumps(body).encode()
|
|
req = urllib.request.Request(
|
|
f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
|
|
data=data,
|
|
headers={"Content-Type": "application/json"},
|
|
method="POST",
|
|
)
|
|
try:
|
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
result = json.loads(resp.read())["result"]
|
|
batch = result.get("points", [])
|
|
points.extend(batch)
|
|
offset = result.get("next_page_offset")
|
|
if not offset or not batch:
|
|
break
|
|
except Exception as e:
|
|
print(f"ERROR scrolling Qdrant: {e}", file=sys.stderr)
|
|
sys.exit(1)
|
|
return points
|
|
|
|
|
|
def get_expected_ids() -> dict[str, Path]:
|
|
"""Build map of expected point IDs from filesystem."""
|
|
expected = {}
|
|
for d in EMBED_DIRS:
|
|
dir_path = REPO_DIR / d
|
|
if not dir_path.exists():
|
|
continue
|
|
for f in dir_path.rglob("*.md"):
|
|
rel = str(f.relative_to(REPO_DIR))
|
|
pid = make_point_id(rel)
|
|
expected[pid] = f
|
|
return expected
|
|
|
|
|
|
def delete_points(point_ids: list[str]):
|
|
"""Delete points from Qdrant by ID."""
|
|
body = json.dumps({"points": point_ids}).encode()
|
|
req = urllib.request.Request(
|
|
f"{QDRANT_URL}/collections/{COLLECTION}/points/delete",
|
|
data=body,
|
|
headers={"Content-Type": "application/json"},
|
|
method="POST",
|
|
)
|
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
return json.loads(resp.read())
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Vector GC — reconcile Qdrant vs filesystem")
|
|
parser.add_argument("--purge", action="store_true", help="Delete orphan vectors")
|
|
args = parser.parse_args()
|
|
|
|
print("Scrolling all Qdrant points...")
|
|
points = scroll_all_points()
|
|
print(f" Qdrant vectors: {len(points)}")
|
|
|
|
print("Scanning filesystem for expected claims...")
|
|
expected = get_expected_ids()
|
|
print(f" Filesystem files: {len(expected)}")
|
|
|
|
qdrant_ids = {p["id"] for p in points}
|
|
expected_ids = set(expected.keys())
|
|
|
|
orphan_ids = qdrant_ids - expected_ids
|
|
missing_ids = expected_ids - qdrant_ids
|
|
|
|
# Categorize orphans by their payload path
|
|
orphan_details = []
|
|
for p in points:
|
|
if p["id"] in orphan_ids:
|
|
path = p.get("payload", {}).get("path", "unknown")
|
|
orphan_details.append({"id": p["id"], "path": path})
|
|
|
|
print(f"\n=== Vector GC Report ===")
|
|
print(f"Qdrant vectors: {len(qdrant_ids)}")
|
|
print(f"Filesystem claims: {len(expected_ids)}")
|
|
print(f"Orphan vectors: {len(orphan_ids)} (in Qdrant, no file)")
|
|
print(f"Missing vectors: {len(missing_ids)} (file exists, not in Qdrant)")
|
|
|
|
if orphan_details:
|
|
print(f"\nOrphan vectors (source file deleted):")
|
|
for o in sorted(orphan_details, key=lambda x: x["path"]):
|
|
print(f" {o['id'][:12]} {o['path']}")
|
|
|
|
if missing_ids:
|
|
print(f"\nMissing from Qdrant (need re-embed):")
|
|
for mid in sorted(missing_ids):
|
|
if mid in expected:
|
|
print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")
|
|
|
|
if args.purge and orphan_ids:
|
|
print(f"\nPurging {len(orphan_ids)} orphan vectors...")
|
|
result = delete_points(list(orphan_ids))
|
|
print(f" Done: {result}")
|
|
elif orphan_ids and not args.purge:
|
|
print(f"\nRun with --purge to delete orphan vectors.")
|
|
|
|
# Summary JSON for cron output
|
|
summary = {
|
|
"qdrant_count": len(qdrant_ids),
|
|
"filesystem_count": len(expected_ids),
|
|
"orphans": len(orphan_ids),
|
|
"missing": len(missing_ids),
|
|
"orphan_paths": [o["path"] for o in orphan_details],
|
|
}
|
|
print(f"\n{json.dumps(summary)}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|