teleo-infrastructure/ops/vector-gc.py
m3taversal 681afad506
Some checks failed
CI / lint-and-test (push) Has been cancelled
Consolidate pipeline code from teleo-codex + VPS into single repo
Sources merged:
- teleo-codex/ops/pipeline-v2/ (11 newer lib files, 5 new lib modules)
- teleo-codex/ops/ (agent-state, diagnostics expansion, systemd units, ops scripts)
- VPS /opt/teleo-eval/telegram/ (10 new bot files, agent configs)
- VPS /opt/teleo-eval/pipeline/ops/ (vector-gc, backfill-descriptions)
- VPS /opt/teleo-eval/sync-mirror.sh (Bug 2 + Step 2.5 fixes)

Non-trivial merges:
- connect.py: kept codex threshold (0.65) + added infra domain parameter
- watchdog.py: kept infra version (stale_pr integration, superset of codex)
- deploy.sh: codex rsync version (interim, until VPS git clone migration)
- diagnostics/app.py: codex decomposed dashboard (14 new route modules)

81 files changed, +17105/-200 lines

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 16:52:26 +01:00

163 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""Vector GC — reconcile Qdrant vectors against filesystem claims.
Scrolls all Qdrant points, cross-references against current claim files
in the worktree, and reports (or purges) orphan vectors whose source files
no longer exist.
Usage:
python3 vector-gc.py # Dry run — report only
python3 vector-gc.py --purge # Delete orphan vectors from Qdrant
Pentagon-Agent: Epimetheus <0144398E-4ED3-4FE2-95A3-3D72E1ABF887>
"""
import argparse
import hashlib
import json
import sys
import urllib.request
from pathlib import Path
# Worktree whose claim files are the source of truth for what should be embedded.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
# Local Qdrant instance holding the claim vectors.
QDRANT_URL = "http://localhost:6333"
# Qdrant collection that embed-claims.py writes into.
COLLECTION = "teleo-claims"
# Top-level directories under REPO_DIR whose *.md files are embedded.
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
def make_point_id(path: str) -> str:
    """Deterministic UUID from file path (must match embed-claims.py).

    Qdrant auto-formats 32-char hex as UUID with dashes, so we normalize
    by stripping dashes for comparison.
    """
    digest = hashlib.md5(path.encode())
    return digest.hexdigest()
def scroll_all_points() -> list[dict]:
    """Scroll all points from Qdrant collection.

    Pages through the collection 100 points at a time via Qdrant's scroll
    API, requesting payloads but not the vectors themselves.

    Returns:
        List of raw Qdrant point dicts (each has at least "id"; "payload"
        is included because with_payload=True).

    Exits:
        sys.exit(1) on any request/parse error — a hard failure is
        preferable to silently producing a partial GC report.
    """
    points = []
    offset = None  # Qdrant pagination cursor; None means "start from the beginning"
    while True:
        body = {"limit": 100, "with_payload": True, "with_vector": False}
        if offset is not None:
            body["offset"] = offset
        data = json.dumps(body).encode()
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
            data=data,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                result = json.loads(resp.read())["result"]
            batch = result.get("points", [])
            points.extend(batch)
            offset = result.get("next_page_offset")
            # Qdrant signals the last page with a null next_page_offset;
            # the empty-batch check is a belt-and-braces stop condition.
            if not offset or not batch:
                break
        except Exception as e:
            print(f"ERROR scrolling Qdrant: {e}", file=sys.stderr)
            sys.exit(1)
    return points
def get_expected_ids() -> dict[str, Path]:
    """Build map of expected point IDs from filesystem.

    Walks each embed directory under REPO_DIR and computes the
    deterministic point ID for every markdown file's repo-relative path.
    Directories that do not exist are skipped silently.
    """
    expected: dict[str, Path] = {}
    for subdir in EMBED_DIRS:
        base = REPO_DIR / subdir
        if not base.exists():
            continue
        for md_file in base.rglob("*.md"):
            rel_path = str(md_file.relative_to(REPO_DIR))
            expected[make_point_id(rel_path)] = md_file
    return expected
def delete_points(point_ids: list[str]):
    """Delete points from Qdrant by ID.

    Issues a single POST to the collection's points/delete endpoint and
    returns Qdrant's decoded JSON response.
    """
    payload = json.dumps({"points": point_ids}).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/delete",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as resp:
        return json.loads(resp.read())
def main():
    """Entry point: reconcile Qdrant vectors against filesystem claims.

    Prints a human-readable report, then a trailing one-line JSON summary
    (consumed by cron output scraping). With --purge, deletes any orphan
    vectors whose source files no longer exist.
    """
    parser = argparse.ArgumentParser(description="Vector GC — reconcile Qdrant vs filesystem")
    parser.add_argument("--purge", action="store_true", help="Delete orphan vectors")
    args = parser.parse_args()

    print("Scrolling all Qdrant points...")
    points = scroll_all_points()
    print(f" Qdrant vectors: {len(points)}")
    print("Scanning filesystem for expected claims...")
    expected = get_expected_ids()
    print(f" Filesystem files: {len(expected)}")

    # Normalize IDs: Qdrant formats 32-char hex as UUID with dashes,
    # while make_point_id() yields bare hex — strip dashes to compare.
    def normalize_id(pid: str) -> str:
        return pid.replace("-", "")

    qdrant_map = {normalize_id(p["id"]): p for p in points}
    qdrant_ids = set(qdrant_map)
    expected_ids = set(expected)
    orphan_ids = qdrant_ids - expected_ids    # in Qdrant, source file gone
    missing_ids = expected_ids - qdrant_ids   # file exists, never embedded

    # Attach each orphan's source path (from payload) for reporting.
    orphan_details = []
    for nid in orphan_ids:
        point = qdrant_map[nid]
        payload = point.get("payload", {})
        path = payload.get("claim_path") or payload.get("path", "unknown")
        orphan_details.append({"id": point["id"], "path": path})

    print("\n=== Vector GC Report ===")
    print(f"Qdrant vectors: {len(qdrant_ids)}")
    print(f"Filesystem claims: {len(expected_ids)}")
    print(f"Orphan vectors: {len(orphan_ids)} (in Qdrant, no file)")
    print(f"Missing vectors: {len(missing_ids)} (file exists, not in Qdrant)")

    if orphan_details:
        print("\nOrphan vectors (source file deleted):")
        for o in sorted(orphan_details, key=lambda x: x["path"]):
            print(f" {o['id'][:12]} {o['path']}")

    if missing_ids:
        print("\nMissing from Qdrant (need re-embed):")
        # missing_ids is a subset of expected's keys by construction,
        # so direct lookup is safe (original re-checked membership).
        for mid in sorted(missing_ids):
            print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")

    if args.purge and orphan_ids:
        # Delete using Qdrant's own ID format (dashed UUID), not the
        # normalized hex used for set comparison.
        original_orphan_ids = [qdrant_map[nid]["id"] for nid in orphan_ids]
        print(f"\nPurging {len(original_orphan_ids)} orphan vectors...")
        result = delete_points(original_orphan_ids)
        print(f" Done: {result}")
    elif orphan_ids and not args.purge:
        print("\nRun with --purge to delete orphan vectors.")

    # One-line machine-readable summary for cron log scraping.
    summary = {
        "qdrant_count": len(qdrant_ids),
        "filesystem_count": len(expected_ids),
        "orphans": len(orphan_ids),
        "missing": len(missing_ids),
        "orphan_paths": [o["path"] for o in orphan_details],
    }
    print(f"\n{json.dumps(summary)}")
# Run only when executed as a script; importing the module has no side effects.
if __name__ == "__main__":
    main()