- embed-claims.py: bulk embeds all claims/decisions/entities into Qdrant via OpenRouter (openai/text-embedding-3-small, 1536 dims)
- diagnostics/app.py: search endpoint switched from OpenAI direct to OpenRouter (same key as LLM calls, no new credentials)
- Qdrant running on VPS (Docker, port 6333, persistent storage)
- Collection: teleo-claims, cosine distance, 1536 dims

854 files to embed. Bulk backfill running.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
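For reference, a minimal sketch of how a collection with that shape can be created against Qdrant's REST API. The name, dimensionality, and distance metric are taken from the summary above; the actual creation step isn't included in this log, so treat this as illustrative rather than the exact call that was used:

import json
import urllib.request

QDRANT_URL = "http://localhost:6333"  # Docker instance on the VPS, per the summary

# PUT /collections/{name} creates the collection with the given vector config.
body = json.dumps({"vectors": {"size": 1536, "distance": "Cosine"}}).encode()
req = urllib.request.Request(
    f"{QDRANT_URL}/collections/teleo-claims",
    data=body,
    headers={"Content-Type": "application/json"},
    method="PUT",
)
with urllib.request.urlopen(req, timeout=10) as resp:
    print(json.loads(resp.read()))  # expect {"result": true, "status": "ok", ...}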
embed-claims.py (244 lines, 7.2 KiB, Python):
#!/usr/bin/env python3
# ONE-SHOT BACKFILL + ongoing embed-on-merge utility.
"""Embed KB claims/decisions/entities into Qdrant for vector search.

Reads markdown files, embeds title+body via OpenAI text-embedding-3-small,
upserts into Qdrant with minimal metadata (path, title, domain, confidence, type).

Usage:
    python3 embed-claims.py                  # Bulk embed all
    python3 embed-claims.py --file path.md   # Embed single file
    python3 embed-claims.py --dry-run        # Count without embedding

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""

import argparse
import hashlib
import json
import os
import sys
import time
import urllib.request
from pathlib import Path

import yaml

REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
QDRANT_URL = "http://localhost:6333"
COLLECTION = "teleo-claims"
EMBEDDING_MODEL = "text-embedding-3-small"

# Directories to embed
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]


def _get_api_key() -> str:
    """Load OpenRouter API key (same key used for LLM calls)."""
    for path in ["/opt/teleo-eval/secrets/openrouter-key"]:
        if os.path.exists(path):
            return open(path).read().strip()
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if key:
        return key
    print("ERROR: No OpenRouter API key found")
    sys.exit(1)


def embed_text(text: str, api_key: str) -> list[float] | None:
    """Embed text via OpenRouter (OpenAI-compatible embeddings endpoint)."""
    payload = json.dumps({"model": f"openai/{EMBEDDING_MODEL}", "input": text[:8000]}).encode()
    req = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=payload,
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
            return data["data"][0]["embedding"]
    except Exception as e:
        print(f" Embedding failed: {e}")
        return None


def parse_frontmatter(path: Path) -> tuple[dict | None, str]:
    """Parse YAML frontmatter and body."""
    text = path.read_text(errors="replace")
    if not text.startswith("---"):
        return None, text
    end = text.find("\n---", 3)
    if end == -1:
        return None, text
    try:
        fm = yaml.safe_load(text[3:end])
        if not isinstance(fm, dict):
            return None, text
        return fm, text[end + 4:].strip()
    except Exception:
        return None, text


def upsert_to_qdrant(point_id: str, vector: list[float], payload: dict):
    """Upsert a single point to Qdrant."""
    data = json.dumps({
        "points": [{
            "id": point_id,
            "vector": vector,
            "payload": payload,
        }]
    }).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points",
        data=data,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read())


def make_point_id(path: str) -> str:
    """Create a deterministic UUID from file path."""
    return hashlib.md5(path.encode()).hexdigest()


def classify_file(fm: dict, path: Path) -> tuple[str, str, str, str]:
    """Extract type, domain, confidence, title from frontmatter + path."""
    ft = fm.get("type", "")
    if ft == "decision":
        file_type = "decision"
    elif ft == "entity":
        file_type = "entity"
    else:
        file_type = "claim"

    domain = fm.get("domain", "")
    if not domain:
        # Infer from path
        rel = path.relative_to(REPO_DIR)
        parts = rel.parts
        if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
            domain = parts[1]
        elif parts[0] == "core":
            domain = "core"
        elif parts[0] == "foundations" and len(parts) >= 2:
            domain = parts[1]

    confidence = fm.get("confidence", "unknown")
    title = fm.get("name", fm.get("title", path.stem.replace("-", " ")))

    return file_type, domain, confidence, str(title)


def embed_file(path: Path, api_key: str, dry_run: bool = False) -> bool:
    """Embed a single file into Qdrant. Returns True if successful."""
    fm, body = parse_frontmatter(path)
    if not fm:
        return False

    # Skip non-knowledge files
    ft = fm.get("type", "")
    if ft in ("source", "musing"):
        return False
    if path.name.startswith("_"):
        return False

    file_type, domain, confidence, title = classify_file(fm, path)
    rel_path = str(path.relative_to(REPO_DIR))

    # Build embed text: title + first ~2000 chars of body
    embed_text_str = f"{title}\n\n{body[:2000]}" if body else title

    if dry_run:
        print(f" [{file_type}] {rel_path}: {title[:60]}")
        return True

    # Embed
    vector = embed_text(embed_text_str, api_key)
    if not vector:
        return False

    # Upsert to Qdrant
    point_id = make_point_id(rel_path)
    payload = {
        "claim_path": rel_path,
        "claim_title": title,
        "domain": domain,
        "confidence": confidence,
        "type": file_type,
        "snippet": body[:200] if body else "",
    }

    try:
        upsert_to_qdrant(point_id, vector, payload)
        return True
    except Exception as e:
        print(f" Qdrant upsert failed for {rel_path}: {e}")
        return False


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--file", type=str, help="Embed a single file")
    args = parser.parse_args()

    api_key = _get_api_key()

    if args.file:
        path = Path(args.file)
        if not path.exists():
            print(f"File not found: {path}")
            sys.exit(1)
        ok = embed_file(path, api_key, dry_run=args.dry_run)
        print("OK" if ok else "SKIP")
        return

    # Bulk embed
    files = []
    for d in EMBED_DIRS:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md in base.rglob("*.md"):
            if not md.name.startswith("_"):
                files.append(md)

    print(f"Found {len(files)} files to process")

    embedded = 0
    skipped = 0
    failed = 0

    for i, path in enumerate(files):
        if i % 50 == 0 and i > 0:
            print(f" Progress: {i}/{len(files)} ({embedded} embedded, {skipped} skipped)")
        if not args.dry_run:
            time.sleep(0.5)  # Rate limit courtesy

        ok = embed_file(path, api_key, dry_run=args.dry_run)
        if ok:
            embedded += 1
        else:
            skipped += 1

        if not args.dry_run and embedded % 20 == 0 and embedded > 0:
            time.sleep(1)  # Batch rate limit

    print(f"\nDone: {embedded} embedded, {skipped} skipped, {failed} failed")

    if not args.dry_run:
        # Verify
        try:
            resp = urllib.request.urlopen(f"{QDRANT_URL}/collections/{COLLECTION}")
            data = json.loads(resp.read())
            count = data["result"]["points_count"]
            print(f"Qdrant collection: {count} vectors")
        except Exception as e:
            print(f"Verification failed: {e}")


if __name__ == "__main__":
    main()
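For completeness, a rough sketch of the query side that pairs with this backfill, roughly what the diagnostics/app.py search endpoint does after the switch to OpenRouter. search_claims and its parameters are illustrative names, not the actual app.py code:

import json
import urllib.request

QDRANT_URL = "http://localhost:6333"
COLLECTION = "teleo-claims"


def search_claims(query: str, api_key: str, limit: int = 5) -> list[dict]:
    """Embed a query via OpenRouter, then vector-search Qdrant. Illustrative sketch."""
    # Embed the query with the same model/dims used by the backfill.
    emb_req = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=json.dumps({"model": "openai/text-embedding-3-small", "input": query}).encode(),
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
    )
    with urllib.request.urlopen(emb_req, timeout=15) as resp:
        vector = json.loads(resp.read())["data"][0]["embedding"]

    # Cosine-similarity search against the teleo-claims collection, payload included.
    search_req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/search",
        data=json.dumps({"vector": vector, "limit": limit, "with_payload": True}).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(search_req, timeout=10) as resp:
        return json.loads(resp.read())["result"]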