Add queue_staleness to vital signs (sources >7d unprocessed, bucketed by age). Add ops/vector-gc.py to reconcile Qdrant vectors against filesystem claims — reports orphan vectors and missing embeddings, with --purge flag for cleanup. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
151 lines · 5 KiB · Python
#!/usr/bin/env python3
|
|
"""Vector GC — reconcile Qdrant vectors against filesystem claims.
|
|
|
|
Scrolls all Qdrant points, cross-references against current claim files
|
|
in the worktree, and reports (or purges) orphan vectors whose source files
|
|
no longer exist.
|
|
|
|
Usage:
|
|
python3 vector-gc.py # Dry run — report only
|
|
python3 vector-gc.py --purge # Delete orphan vectors from Qdrant
|
|
|
|
Pentagon-Agent: Epimetheus <0144398E-4ED3-4FE2-95A3-3D72E1ABF887>
|
|
"""
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import sys
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
|
|
QDRANT_URL = "http://localhost:6333"
|
|
COLLECTION = "teleo-claims"
|
|
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
|
|
|
|
|
|
def make_point_id(path: str) -> str:
|
|
"""Deterministic UUID from file path (must match embed-claims.py)."""
|
|
return hashlib.md5(path.encode()).hexdigest()
|
|
|
|
|
|
def scroll_all_points() -> list[dict]:
|
|
"""Scroll all points from Qdrant collection."""
|
|
points = []
|
|
offset = None
|
|
while True:
|
|
body = {"limit": 100, "with_payload": True, "with_vector": False}
|
|
if offset is not None:
|
|
body["offset"] = offset
|
|
data = json.dumps(body).encode()
|
|
req = urllib.request.Request(
|
|
f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
|
|
data=data,
|
|
headers={"Content-Type": "application/json"},
|
|
method="POST",
|
|
)
|
|
try:
|
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
result = json.loads(resp.read())["result"]
|
|
batch = result.get("points", [])
|
|
points.extend(batch)
|
|
offset = result.get("next_page_offset")
|
|
if not offset or not batch:
|
|
break
|
|
except Exception as e:
|
|
print(f"ERROR scrolling Qdrant: {e}", file=sys.stderr)
|
|
sys.exit(1)
|
|
return points
|
|
|
|
|
|
def get_expected_ids() -> dict[str, Path]:
|
|
"""Build map of expected point IDs from filesystem."""
|
|
expected = {}
|
|
for d in EMBED_DIRS:
|
|
dir_path = REPO_DIR / d
|
|
if not dir_path.exists():
|
|
continue
|
|
for f in dir_path.rglob("*.md"):
|
|
rel = str(f.relative_to(REPO_DIR))
|
|
pid = make_point_id(rel)
|
|
expected[pid] = f
|
|
return expected
|
|
|
|
|
|
def delete_points(point_ids: list[str]):
|
|
"""Delete points from Qdrant by ID."""
|
|
body = json.dumps({"points": point_ids}).encode()
|
|
req = urllib.request.Request(
|
|
f"{QDRANT_URL}/collections/{COLLECTION}/points/delete",
|
|
data=body,
|
|
headers={"Content-Type": "application/json"},
|
|
method="POST",
|
|
)
|
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
return json.loads(resp.read())
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Vector GC — reconcile Qdrant vs filesystem")
|
|
parser.add_argument("--purge", action="store_true", help="Delete orphan vectors")
|
|
args = parser.parse_args()
|
|
|
|
print("Scrolling all Qdrant points...")
|
|
points = scroll_all_points()
|
|
print(f" Qdrant vectors: {len(points)}")
|
|
|
|
print("Scanning filesystem for expected claims...")
|
|
expected = get_expected_ids()
|
|
print(f" Filesystem files: {len(expected)}")
|
|
|
|
qdrant_ids = {p["id"] for p in points}
|
|
expected_ids = set(expected.keys())
|
|
|
|
orphan_ids = qdrant_ids - expected_ids
|
|
missing_ids = expected_ids - qdrant_ids
|
|
|
|
# Categorize orphans by their payload path
|
|
orphan_details = []
|
|
for p in points:
|
|
if p["id"] in orphan_ids:
|
|
path = p.get("payload", {}).get("path", "unknown")
|
|
orphan_details.append({"id": p["id"], "path": path})
|
|
|
|
print(f"\n=== Vector GC Report ===")
|
|
print(f"Qdrant vectors: {len(qdrant_ids)}")
|
|
print(f"Filesystem claims: {len(expected_ids)}")
|
|
print(f"Orphan vectors: {len(orphan_ids)} (in Qdrant, no file)")
|
|
print(f"Missing vectors: {len(missing_ids)} (file exists, not in Qdrant)")
|
|
|
|
if orphan_details:
|
|
print(f"\nOrphan vectors (source file deleted):")
|
|
for o in sorted(orphan_details, key=lambda x: x["path"]):
|
|
print(f" {o['id'][:12]} {o['path']}")
|
|
|
|
if missing_ids:
|
|
print(f"\nMissing from Qdrant (need re-embed):")
|
|
for mid in sorted(missing_ids):
|
|
if mid in expected:
|
|
print(f" {mid[:12]} {expected[mid].relative_to(REPO_DIR)}")
|
|
|
|
if args.purge and orphan_ids:
|
|
print(f"\nPurging {len(orphan_ids)} orphan vectors...")
|
|
result = delete_points(list(orphan_ids))
|
|
print(f" Done: {result}")
|
|
elif orphan_ids and not args.purge:
|
|
print(f"\nRun with --purge to delete orphan vectors.")
|
|
|
|
# Summary JSON for cron output
|
|
summary = {
|
|
"qdrant_count": len(qdrant_ids),
|
|
"filesystem_count": len(expected_ids),
|
|
"orphans": len(orphan_ids),
|
|
"missing": len(missing_ids),
|
|
"orphan_paths": [o["path"] for o in orphan_details],
|
|
}
|
|
print(f"\n{json.dumps(summary)}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|