#!/usr/bin/env python3
# ONE-SHOT BACKFILL + ongoing embed-on-merge utility.
"""Embed KB claims/decisions/entities into Qdrant for vector search.

Reads markdown files, embeds title+body via OpenAI text-embedding-3-small,
upserts into Qdrant with minimal metadata (path, title, domain, confidence,
type).

Usage:
    python3 embed-claims.py                 # Bulk embed all
    python3 embed-claims.py --file path.md  # Embed single file
    python3 embed-claims.py --dry-run       # Count without embedding

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""

import argparse
import hashlib
import json
import os
import re
import sys
import time
import urllib.request
from pathlib import Path

import yaml

REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
QDRANT_URL = "http://localhost:6333"
COLLECTION = "teleo-claims"
EMBEDDING_MODEL = "text-embedding-3-small"

# Directories to embed
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]


def _get_api_key() -> str:
    """Load OpenRouter API key (same key used for LLM calls).

    Checks the secrets file first, then the environment; exits the
    process with an error message when neither is available.
    """
    for path in ["/opt/teleo-eval/secrets/openrouter-key"]:
        if os.path.exists(path):
            # Context manager so the handle is not leaked (original used
            # a bare open().read()).
            with open(path) as fh:
                return fh.read().strip()
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if key:
        return key
    print("ERROR: No OpenRouter API key found")
    sys.exit(1)


def embed_text(text: str, api_key: str) -> list[float] | None:
    """Embed text via OpenRouter (OpenAI-compatible embeddings endpoint).

    Returns the embedding vector, or None on any network/API failure.
    The broad except is deliberate: callers treat None as "skip/fail this
    file" and keep going.
    """
    # Input is truncated to 8000 chars as a cheap guard against the
    # model's context limit.
    payload = json.dumps(
        {"model": f"openai/{EMBEDDING_MODEL}", "input": text[:8000]}
    ).encode()
    req = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        return data["data"][0]["embedding"]
    except Exception as e:
        print(f" Embedding failed: {e}")
        return None


def parse_frontmatter(path: Path) -> tuple[dict | None, str]:
    """Parse YAML frontmatter and body.

    Returns (frontmatter, body). frontmatter is None when the file has no
    leading ``---`` block, the block never closes, or it does not parse to
    a mapping; body is then the full file text.
    """
    text = path.read_text(errors="replace")
    if not text.startswith("---"):
        return None, text
    end = text.find("\n---", 3)
    if end == -1:
        return None, text
    try:
        fm = yaml.safe_load(text[3:end])
        if not isinstance(fm, dict):
            return None, text
        # end + 4 skips past the "\n---" delimiter; strip drops the
        # delimiter's trailing newline.
        return fm, text[end + 4:].strip()
    except Exception:
        return None, text


def upsert_to_qdrant(point_id: str, vector: list[float], payload: dict):
    """Upsert a single point to Qdrant.

    Raises on HTTP/network errors; callers decide how to handle failure.
    """
    data = json.dumps({
        "points": [{
            "id": point_id,
            "vector": vector,
            "payload": payload,
        }]
    }).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points",
        data=data,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read())


def make_point_id(path: str) -> str:
    """Create a deterministic point ID from a file path.

    The md5 hex digest is a 32-char dash-less UUID string, which Qdrant
    accepts as a point ID. md5 is fine here: the ID only needs to be
    deterministic, not cryptographically strong.
    """
    return hashlib.md5(path.encode()).hexdigest()


def _repo_relative(path: Path) -> Path | None:
    """Return *path* relative to REPO_DIR, or None when outside the repo.

    Tries the raw path first (matches how bulk mode builds paths), then a
    resolved comparison so --file works with cwd-relative paths.
    """
    try:
        return path.relative_to(REPO_DIR)
    except ValueError:
        try:
            return path.resolve().relative_to(REPO_DIR.resolve())
        except ValueError:
            return None


def classify_file(fm: dict, path: Path) -> tuple[str, str, str, str]:
    """Extract (type, domain, confidence, title) from frontmatter + path."""
    ft = fm.get("type", "")
    if ft == "decision":
        file_type = "decision"
    elif ft == "entity":
        file_type = "entity"
    else:
        # Everything else (including missing type) is treated as a claim.
        file_type = "claim"

    domain = fm.get("domain", "")
    if not domain:
        # Infer from path layout, e.g. domains/<domain>/x.md.
        # FIX: files outside REPO_DIR (reachable via --file) used to crash
        # relative_to(); they now just get an empty domain.
        rel = _repo_relative(path)
        if rel is not None and rel.parts:
            parts = rel.parts
            if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
                domain = parts[1]
            elif parts[0] == "core":
                domain = "core"
            elif parts[0] == "foundations" and len(parts) >= 2:
                domain = parts[1]

    confidence = fm.get("confidence", "unknown")
    title = fm.get("name", fm.get("title", path.stem.replace("-", " ")))
    return file_type, domain, confidence, str(title)


def embed_file(path: Path, api_key: str, dry_run: bool = False) -> str:
    """Embed a single file into Qdrant.

    Returns a status string:
      "embedded" -- vector upserted (or counted, in dry-run)
      "skipped"  -- not embeddable knowledge (no frontmatter, source/musing
                    type, or underscore-prefixed filename)
      "failed"   -- embedding or Qdrant upsert error

    (Originally returned a bool, which conflated skips with failures and
    left main()'s "failed" counter permanently at zero.)
    """
    fm, body = parse_frontmatter(path)
    if not fm:
        return "skipped"
    # Skip non-knowledge files
    if fm.get("type", "") in ("source", "musing"):
        return "skipped"
    if path.name.startswith("_"):
        return "skipped"

    file_type, domain, confidence, title = classify_file(fm, path)
    rel = _repo_relative(path)
    # Fall back to the raw path for out-of-repo files instead of crashing.
    rel_path = str(rel) if rel is not None else str(path)

    # Build embed text: title + first ~6000 chars of body (model handles
    # 8191 tokens).
    text_to_embed = f"{title}\n\n{body[:6000]}" if body else title

    if dry_run:
        print(f" [{file_type}] {rel_path}: {title[:60]}")
        return "embedded"

    vector = embed_text(text_to_embed, api_key)
    if not vector:
        return "failed"

    point_id = make_point_id(rel_path)
    payload = {
        "claim_path": rel_path,
        "claim_title": title,
        "domain": domain,
        "confidence": confidence,
        "type": file_type,
        "snippet": body[:200] if body else "",
    }
    try:
        upsert_to_qdrant(point_id, vector, payload)
        return "embedded"
    except Exception as e:
        print(f" Qdrant upsert failed for {rel_path}: {e}")
        return "failed"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--file", type=str, help="Embed a single file")
    args = parser.parse_args()

    api_key = _get_api_key()

    if args.file:
        path = Path(args.file)
        if not path.exists():
            print(f"File not found: {path}")
            sys.exit(1)
        status = embed_file(path, api_key, dry_run=args.dry_run)
        # FAIL is new output: failures previously printed "SKIP".
        print({"embedded": "OK", "skipped": "SKIP", "failed": "FAIL"}[status])
        return

    # Bulk embed
    files = []
    for d in EMBED_DIRS:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md in base.rglob("*.md"):
            if not md.name.startswith("_"):
                files.append(md)

    print(f"Found {len(files)} files to process")

    embedded = 0
    skipped = 0
    failed = 0
    for i, path in enumerate(files):
        if i % 50 == 0 and i > 0:
            print(f" Progress: {i}/{len(files)} ({embedded} embedded, {skipped} skipped)")
        if not args.dry_run:
            time.sleep(0.5)  # Rate limit courtesy
        status = embed_file(path, api_key, dry_run=args.dry_run)
        if status == "embedded":
            embedded += 1
        elif status == "failed":
            # FIX: was never incremented; failures were reported as skips.
            failed += 1
        else:
            skipped += 1
        # Batch rate limit. Gated on a fresh embed so a run of skipped
        # files no longer sleeps 1s each while `embedded` sits on a
        # multiple of 20.
        if not args.dry_run and status == "embedded" and embedded % 20 == 0:
            time.sleep(1)

    print(f"\nDone: {embedded} embedded, {skipped} skipped, {failed} failed")

    if not args.dry_run:
        # Verify collection size (now with a timeout and a closed response).
        try:
            with urllib.request.urlopen(
                f"{QDRANT_URL}/collections/{COLLECTION}", timeout=10
            ) as resp:
                data = json.loads(resp.read())
            count = data["result"]["points_count"]
            print(f"Qdrant collection: {count} vectors")
        except Exception as e:
            print(f"Verification failed: {e}")


if __name__ == "__main__":
    main()