teleo-infrastructure/embed-claims.py
m3taversal f5b27ccd73 feat: Qdrant vector search — bulk embed script + OpenRouter embeddings
- embed-claims.py: bulk embeds all claims/decisions/entities into Qdrant
  via OpenRouter (openai/text-embedding-3-small, 1536 dims)
- diagnostics/app.py: search endpoint switched from OpenAI direct to
  OpenRouter (same key as LLM calls, no new credentials)
- Qdrant running on VPS (Docker, port 6333, persistent storage)
- Collection: teleo-claims, cosine distance, 1536 dims

854 files to embed. Bulk backfill running.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-26 17:44:34 +00:00

244 lines
7.2 KiB
Python

#!/usr/bin/env python3
# ONE-SHOT BACKFILL + ongoing embed-on-merge utility.
"""Embed KB claims/decisions/entities into Qdrant for vector search.

Reads markdown files, embeds title+body with text-embedding-3-small
(requested through OpenRouter's OpenAI-compatible embeddings endpoint),
and upserts into Qdrant with minimal metadata (path, title, domain,
confidence, type).

Usage:
    python3 embed-claims.py                 # Bulk embed all
    python3 embed-claims.py --file path.md  # Embed single file
    python3 embed-claims.py --dry-run       # Count without embedding

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""
import argparse
import hashlib
import json
import os
import re
import sys
import time
import urllib.request
from pathlib import Path

import yaml
# Root of the knowledge-base working tree holding the markdown files.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
# Local Qdrant instance (REST API).
QDRANT_URL = "http://localhost:6333"
COLLECTION = "teleo-claims"
# Requested from OpenRouter as f"openai/{EMBEDDING_MODEL}".
EMBEDDING_MODEL = "text-embedding-3-small"
# Directories to embed
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
def _get_api_key() -> str:
    """Load the OpenRouter API key (same key used for LLM calls).

    Checks the on-disk secret first, then the OPENROUTER_API_KEY
    environment variable. Exits with status 1 if neither is present.
    """
    secret = Path("/opt/teleo-eval/secrets/openrouter-key")
    if secret.exists():
        # read_text closes the file; the original open().read() leaked
        # the handle (and the one-element for-loop added nothing).
        return secret.read_text().strip()
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if key:
        return key
    print("ERROR: No OpenRouter API key found")
    sys.exit(1)
def embed_text(text: str, api_key: str) -> list[float] | None:
    """Embed text via OpenRouter (OpenAI-compatible embeddings endpoint).

    Input is truncated to 8000 characters before sending. Returns the
    embedding vector, or None on any failure (logged, never raised).
    """
    body = {"model": f"openai/{EMBEDDING_MODEL}", "input": text[:8000]}
    request = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=json.dumps(body).encode(),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            parsed = json.loads(resp.read())
        return parsed["data"][0]["embedding"]
    except Exception as e:
        print(f" Embedding failed: {e}")
        return None
def parse_frontmatter(path: Path) -> tuple[dict | None, str]:
    """Split a markdown file into (frontmatter dict, body text).

    Returns (None, full_text) when the file lacks a parseable YAML
    frontmatter block delimited by leading/trailing '---' lines.
    """
    raw = path.read_text(errors="replace")
    if not raw.startswith("---"):
        return None, raw
    close = raw.find("\n---", 3)
    if close == -1:
        return None, raw
    try:
        meta = yaml.safe_load(raw[3:close])
    except Exception:
        # Malformed YAML: treat the whole file as body.
        return None, raw
    if not isinstance(meta, dict):
        return None, raw
    return meta, raw[close + 4:].strip()
def upsert_to_qdrant(point_id: str, vector: list[float], payload: dict):
    """PUT a single point into the Qdrant collection.

    Returns Qdrant's parsed JSON response; propagates HTTP/network errors
    to the caller.
    """
    point = {"id": point_id, "vector": vector, "payload": payload}
    body = json.dumps({"points": [point]}).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points",
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(request, timeout=10) as resp:
        return json.loads(resp.read())
def make_point_id(path: str) -> str:
    """Derive a deterministic point ID from a file path.

    Returns the 32-char hex MD5 digest of the path (no dashes), which
    Qdrant accepts as a UUID in "simple" form. Deterministic so that
    re-embedding a file overwrites its point instead of duplicating it.
    """
    # hexdigest() already returns str — the original str() wrapper was
    # redundant. MD5 is fine: this is an identifier, not a security hash.
    return hashlib.md5(path.encode()).hexdigest()
def classify_file(fm: dict, path: Path) -> tuple[str, str, str, str]:
    """Extract (type, domain, confidence, title) from frontmatter + path.

    Falls back to path-based inference for the domain and to the file
    stem for the title when the frontmatter omits them.
    """
    ft = fm.get("type", "")
    if ft == "decision":
        file_type = "decision"
    elif ft == "entity":
        file_type = "entity"
    else:
        # Anything not explicitly a decision/entity counts as a claim.
        file_type = "claim"
    domain = fm.get("domain", "")
    if not domain:
        # Infer from path, e.g. domains/<domain>/... or foundations/<domain>/...
        try:
            parts = path.relative_to(REPO_DIR).parts
        except ValueError:
            # Path outside REPO_DIR (e.g. --file with an external or
            # relative path): leave domain empty instead of crashing.
            parts = ()
        if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
            domain = parts[1]
        elif parts and parts[0] == "core":
            domain = "core"
        elif len(parts) >= 2 and parts[0] == "foundations":
            domain = parts[1]
    # YAML may parse confidence as a number; the contract is a string.
    confidence = str(fm.get("confidence", "unknown"))
    title = fm.get("name", fm.get("title", path.stem.replace("-", " ")))
    return file_type, domain, confidence, str(title)
def embed_file(path: Path, api_key: str, dry_run: bool = False) -> bool:
    """Embed a single markdown file into Qdrant.

    Returns True on success (or on a dry-run match); False when the file
    is skipped (no frontmatter, source/musing type, underscore-prefixed
    name) or when embedding/upserting fails.
    """
    fm, body = parse_frontmatter(path)
    if not fm:
        return False
    # Skip non-knowledge files.
    if fm.get("type", "") in ("source", "musing"):
        return False
    if path.name.startswith("_"):
        return False
    file_type, domain, confidence, title = classify_file(fm, path)
    try:
        rel_path = str(path.relative_to(REPO_DIR))
    except ValueError:
        # --file may point outside REPO_DIR (or be a relative path);
        # fall back to the raw path so single-file mode works instead
        # of crashing on relative_to().
        rel_path = str(path)
    # Embed text: title + first ~2000 chars of body.
    text_to_embed = f"{title}\n\n{body[:2000]}" if body else title
    if dry_run:
        print(f" [{file_type}] {rel_path}: {title[:60]}")
        return True
    vector = embed_text(text_to_embed, api_key)
    if not vector:
        return False
    payload = {
        "claim_path": rel_path,
        "claim_title": title,
        "domain": domain,
        "confidence": confidence,
        "type": file_type,
        "snippet": body[:200] if body else "",
    }
    try:
        upsert_to_qdrant(make_point_id(rel_path), vector, payload)
        return True
    except Exception as e:
        print(f" Qdrant upsert failed for {rel_path}: {e}")
        return False
def main():
    """CLI entry point: bulk embed, single-file embed, or dry-run count."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--file", type=str, help="Embed a single file")
    args = parser.parse_args()
    api_key = _get_api_key()

    # Single-file mode.
    if args.file:
        path = Path(args.file)
        if not path.exists():
            print(f"File not found: {path}")
            sys.exit(1)
        ok = embed_file(path, api_key, dry_run=args.dry_run)
        print("OK" if ok else "SKIP")
        return

    # Bulk mode: collect candidate markdown files. Underscore-prefixed
    # files are internal; embed_file re-checks this anyway.
    files = []
    for d in EMBED_DIRS:
        base = REPO_DIR / d
        if not base.exists():
            continue
        files.extend(md for md in base.rglob("*.md") if not md.name.startswith("_"))
    print(f"Found {len(files)} files to process")

    embedded = 0
    skipped = 0
    for i, path in enumerate(files):
        if i % 50 == 0 and i > 0:
            print(f" Progress: {i}/{len(files)} ({embedded} embedded, {skipped} skipped)")
        if not args.dry_run:
            time.sleep(0.5)  # Rate limit courtesy
        if embed_file(path, api_key, dry_run=args.dry_run):
            embedded += 1
        else:
            # NOTE: embed_file returns False for both "skipped" and
            # "failed", so both outcomes are counted together here.
            # The original printed a `failed` counter that was never
            # incremented (always 0) — dropped as misleading.
            skipped += 1
        if not args.dry_run and embedded % 20 == 0 and embedded > 0:
            time.sleep(1)  # Batch rate limit
    print(f"\nDone: {embedded} embedded, {skipped} skipped")

    if not args.dry_run:
        # Verify against Qdrant's reported collection size.
        try:
            # Context manager ensures the HTTP connection is closed
            # (the original leaked the response object).
            with urllib.request.urlopen(f"{QDRANT_URL}/collections/{COLLECTION}") as resp:
                data = json.loads(resp.read())
            print(f"Qdrant collection: {data['result']['points_count']} vectors")
        except Exception as e:
            print(f"Verification failed: {e}")


if __name__ == "__main__":
    main()