teleo-infrastructure/ops/vector-gc.py
m3taversal 681afad506
Some checks failed
CI / lint-and-test (push) Has been cancelled
Consolidate pipeline code from teleo-codex + VPS into single repo
Sources merged:
- teleo-codex/ops/pipeline-v2/ (11 newer lib files, 5 new lib modules)
- teleo-codex/ops/ (agent-state, diagnostics expansion, systemd units, ops scripts)
- VPS /opt/teleo-eval/telegram/ (10 new bot files, agent configs)
- VPS /opt/teleo-eval/pipeline/ops/ (vector-gc, backfill-descriptions)
- VPS /opt/teleo-eval/sync-mirror.sh (Bug 2 + Step 2.5 fixes)

Non-trivial merges:
- connect.py: kept codex threshold (0.65) + added infra domain parameter
- watchdog.py: kept infra version (stale_pr integration, superset of codex)
- deploy.sh: codex rsync version (interim, until VPS git clone migration)
- diagnostics/app.py: codex decomposed dashboard (14 new route modules)

81 files changed, +17105/-200 lines

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 16:52:26 +01:00

163 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""Vector GC — reconcile Qdrant vectors against filesystem claims.
Scrolls all Qdrant points, cross-references against current claim files
in the worktree, and reports (or purges) orphan vectors whose source files
no longer exist.
Usage:
python3 vector-gc.py # Dry run — report only
python3 vector-gc.py --purge # Delete orphan vectors from Qdrant
Pentagon-Agent: Epimetheus <0144398E-4ED3-4FE2-95A3-3D72E1ABF887>
"""
import argparse
import hashlib
import json
import sys
import urllib.request
from pathlib import Path
# Worktree whose claim files are the source of truth for what should be embedded.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
# Local Qdrant instance holding the claim vectors.
QDRANT_URL = "http://localhost:6333"
# Qdrant collection that embed-claims.py writes into.
COLLECTION = "teleo-claims"
# Top-level directories under REPO_DIR whose *.md files are embedded.
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
def make_point_id(path: str) -> str:
    """Deterministic UUID from file path (must match embed-claims.py).

    Qdrant auto-formats 32-char hex as UUID with dashes, so we normalize
    by stripping dashes for comparison.
    """
    digest = hashlib.md5(path.encode())
    return digest.hexdigest()
def scroll_all_points() -> list[dict]:
    """Scroll all points from Qdrant collection.

    Pages through the collection 100 points at a time via Qdrant's scroll
    API, requesting payloads but not the vectors themselves.

    Returns:
        List of raw Qdrant point dicts (each has at least "id"; "payload"
        is included because with_payload=True).

    Exits:
        sys.exit(1) on any request/parse error — a hard failure is
        preferable to silently producing a partial GC report.
    """
    points = []
    offset = None  # Qdrant pagination cursor; None means "start from the beginning"
    while True:
        body = {"limit": 100, "with_payload": True, "with_vector": False}
        if offset is not None:
            body["offset"] = offset
        data = json.dumps(body).encode()
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
            data=data,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                result = json.loads(resp.read())["result"]
            batch = result.get("points", [])
            points.extend(batch)
            offset = result.get("next_page_offset")
            # Qdrant signals the last page with a null next_page_offset;
            # the empty-batch check is a belt-and-braces stop condition.
            if not offset or not batch:
                break
        except Exception as e:
            print(f"ERROR scrolling Qdrant: {e}", file=sys.stderr)
            sys.exit(1)
    return points
def get_expected_ids() -> dict[str, Path]:
    """Build map of expected point IDs from filesystem.

    Walks each embed directory under REPO_DIR and computes the
    deterministic point ID for every markdown file's repo-relative path.
    Directories that do not exist are skipped silently.
    """
    expected: dict[str, Path] = {}
    for subdir in EMBED_DIRS:
        base = REPO_DIR / subdir
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            rel_path = str(md_file.relative_to(REPO_DIR))
            expected[make_point_id(rel_path)] = md_file
    return expected
def delete_points(point_ids: list[str]):
    """Delete points from Qdrant by ID.

    Issues a single POST to the collection's points/delete endpoint and
    returns Qdrant's decoded JSON response.
    """
    payload = json.dumps({"points": point_ids}).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/delete",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as resp:
        return json.loads(resp.read())
def main():
    """Entry point: reconcile Qdrant vectors against filesystem claims.

    Prints a human-readable report, then a trailing one-line JSON summary
    (consumed by cron output scraping). With --purge, deletes any orphan
    vectors whose source files no longer exist.
    """
    parser = argparse.ArgumentParser(description="Vector GC — reconcile Qdrant vs filesystem")
    parser.add_argument("--purge", action="store_true", help="Delete orphan vectors")
    args = parser.parse_args()

    print("Scrolling all Qdrant points...")
    points = scroll_all_points()
    print(f" Qdrant vectors: {len(points)}")
    print("Scanning filesystem for expected claims...")
    expected = get_expected_ids()
    print(f" Filesystem files: {len(expected)}")

    # Normalize IDs: Qdrant formats 32-char hex as UUID with dashes,
    # while make_point_id() yields bare hex — strip dashes to compare.
    def normalize_id(pid: str) -> str:
        return pid.replace("-", "")

    qdrant_map = {normalize_id(p["id"]): p for p in points}
    qdrant_ids = set(qdrant_map)
    expected_ids = set(expected)
    orphan_ids = qdrant_ids - expected_ids    # in Qdrant, source file gone
    missing_ids = expected_ids - qdrant_ids   # file exists, never embedded

    # Attach each orphan's source path (from payload) for reporting.
    orphan_details = []
    for nid in orphan_ids:
        point = qdrant_map[nid]
        payload = point.get("payload", {})
        path = payload.get("claim_path") or payload.get("path", "unknown")
        orphan_details.append({"id": point["id"], "path": path})

    print("\n=== Vector GC Report ===")
    print(f"Qdrant vectors: {len(qdrant_ids)}")
    print(f"Filesystem claims: {len(expected_ids)}")
    print(f"Orphan vectors: {len(orphan_ids)} (in Qdrant, no file)")
    print(f"Missing vectors: {len(missing_ids)} (file exists, not in Qdrant)")

    if orphan_details:
        print("\nOrphan vectors (source file deleted):")
        for o in sorted(orphan_details, key=lambda x: x["path"]):
            print(f" {o['id'][:12]} {o['path']}")

    if missing_ids:
        print("\nMissing from Qdrant (need re-embed):")
        # missing_ids is a subset of expected's keys by construction,
        # so direct lookup is safe (original re-checked membership).
        for mid in sorted(missing_ids):
            print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")

    if args.purge and orphan_ids:
        # Delete using Qdrant's own ID format (dashed UUID), not the
        # normalized hex used for set comparison.
        original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids]
        print(f"\nPurging {len(original_orphan_ids)} orphan vectors...")
        result = delete_points(original_orphan_ids)
        print(f" Done: {result}")
    elif orphan_ids and not args.purge:
        print("\nRun with --purge to delete orphan vectors.")

    # One-line machine-readable summary for cron log scraping.
    summary = {
        "qdrant_count": len(qdrant_ids),
        "filesystem_count": len(expected_ids),
        "orphans": len(orphan_ids),
        "missing": len(missing_ids),
        "orphan_paths": [o["path"] for o in orphan_details],
    }
    print(f"\n{json.dumps(summary)}")
# Run only when executed as a script; importing the module has no side effects.
if __name__ == "__main__":
    main()