teleo-infrastructure/embed-claims.py
m3taversal 5f554bc2de
Some checks failed
CI / lint-and-test (pull_request) Has been cancelled
feat: atomic extract-and-connect + stale PR monitor + response audit
Atomic extract-and-connect (lib/connect.py):
- After extraction writes claim files, each new claim is embedded via
  OpenRouter, searched against Qdrant, and top-5 neighbors (cosine > 0.55)
  are added as `related` edges in the claim's frontmatter
- Edges written on NEW claim only — avoids merge conflicts
- Cross-domain connections enabled, non-fatal on Qdrant failure
- Wired into openrouter-extract-v2.py post-extraction step

Stale PR monitor (lib/stale_pr.py):
- Every watchdog cycle checks open extract/* PRs
- If open >30 min AND 0 claim files → auto-close with comment
- After 2 stale closures → marks source as extraction_failed
- Wired into watchdog.py as check #6

Response audit system:
- response_audit table (migration v8), persistent audit conn in bot.py
- 90-day retention cleanup, tool_calls JSON column
- Confidence tag stripping, systemd ReadWritePaths for pipeline.db

Supporting infrastructure:
- reweave.py: nightly edge reconnection for orphan claims
- reconcile-sources.py: source status reconciliation
- backfill-domains.py: domain classification backfill
- ops/reconcile-source-status.sh: operational reconciliation script
- Attribution improvements, post-extract enrichments, merge improvements

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 22:34:20 +00:00

244 lines
7.2 KiB
Python

#!/usr/bin/env python3
# ONE-SHOT BACKFILL + ongoing embed-on-merge utility.
"""Embed KB claims/decisions/entities into Qdrant for vector search.
Reads markdown files, embeds title+body via OpenAI text-embedding-3-small,
upserts into Qdrant with minimal metadata (path, title, domain, confidence, type).
Usage:
python3 embed-claims.py # Bulk embed all
python3 embed-claims.py --file path.md # Embed single file
python3 embed-claims.py --dry-run # Count without embedding
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.request
from pathlib import Path
import yaml
# Root of the knowledge-base git checkout that is scanned for markdown files.
REPO_DIR = Path("/opt/teleo-eval/workspaces/main")
# Local Qdrant HTTP endpoint and the collection that holds claim vectors.
QDRANT_URL = "http://localhost:6333"
COLLECTION = "teleo-claims"
# Embedding model name (called through OpenRouter's OpenAI-compatible API).
EMBEDDING_MODEL = "text-embedding-3-small"
# Top-level repo directories whose markdown files are embedded
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
def _get_api_key() -> str:
"""Load OpenRouter API key (same key used for LLM calls)."""
for path in ["/opt/teleo-eval/secrets/openrouter-key"]:
if os.path.exists(path):
return open(path).read().strip()
key = os.environ.get("OPENROUTER_API_KEY", "")
if key:
return key
print("ERROR: No OpenRouter API key found")
sys.exit(1)
def embed_text(text: str, api_key: str) -> list[float] | None:
    """Embed *text* via OpenRouter's OpenAI-compatible embeddings endpoint.

    Truncates input to 8000 characters to stay within the model limit.
    Returns the embedding vector, or None when the request or response
    parsing fails (best-effort: the failure is printed, not raised).
    """
    body = {"model": f"openai/{EMBEDDING_MODEL}", "input": text[:8000]}
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    request = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=json.dumps(body).encode(),
        headers=headers,
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            parsed = json.loads(resp.read())
            return parsed["data"][0]["embedding"]
    except Exception as exc:
        print(f" Embedding failed: {exc}")
        return None
def parse_frontmatter(path: Path) -> tuple[dict | None, str]:
    """Split a markdown file into (frontmatter dict, body text).

    Returns (None, full_text) when the file has no parseable YAML
    frontmatter block delimited by leading/trailing `---` lines.
    """
    raw = path.read_text(errors="replace")
    if not raw.startswith("---"):
        return None, raw
    close = raw.find("\n---", 3)
    if close == -1:
        return None, raw
    try:
        meta = yaml.safe_load(raw[3:close])
    except Exception:
        # Malformed YAML: treat the whole file as body.
        return None, raw
    if not isinstance(meta, dict):
        return None, raw
    return meta, raw[close + 4:].strip()
def upsert_to_qdrant(point_id: str, vector: list[float], payload: dict):
    """PUT a single point (id, vector, payload) into the Qdrant collection.

    Raises on HTTP/connection errors; caller is expected to handle them.
    Returns Qdrant's parsed JSON response.
    """
    point = {"id": point_id, "vector": vector, "payload": payload}
    body = json.dumps({"points": [point]}).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points",
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(request, timeout=10) as resp:
        return json.loads(resp.read())
def make_point_id(path: str) -> str:
    """Create a deterministic UUID string from a file path.

    Qdrant accepts only unsigned integers or RFC-4122 UUID strings as
    point IDs, so the MD5 digest (non-cryptographic use: stable content
    addressing) is formatted as a canonical UUID rather than returned as
    a bare 32-char hex digest, which Qdrant would reject.
    """
    import hashlib
    import uuid
    digest = hashlib.md5(path.encode()).hexdigest()
    return str(uuid.UUID(digest))
def classify_file(fm: dict, path: Path) -> tuple[str, str, str, str]:
    """Extract (type, domain, confidence, title) from frontmatter + path.

    Domain falls back to path-based inference when the frontmatter has
    none, and stays "" for paths outside REPO_DIR (reachable via the
    --file CLI mode, which previously raised ValueError here).
    """
    ft = fm.get("type", "")
    if ft == "decision":
        file_type = "decision"
    elif ft == "entity":
        file_type = "entity"
    else:
        # Anything not explicitly a decision/entity is treated as a claim.
        file_type = "claim"
    domain = fm.get("domain", "")
    if not domain:
        # Infer from path layout: domains/<d>/..., entities/<d>/..., etc.
        try:
            parts = path.relative_to(REPO_DIR).parts
        except ValueError:
            # Path is outside the repo checkout; leave domain empty.
            parts = ()
        if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
            domain = parts[1]
        elif parts and parts[0] == "core":
            domain = "core"
        elif len(parts) >= 2 and parts[0] == "foundations":
            domain = parts[1]
    confidence = fm.get("confidence", "unknown")
    # Prefer explicit name/title; otherwise derive a title from the filename.
    title = fm.get("name", fm.get("title", path.stem.replace("-", " ")))
    return file_type, domain, confidence, str(title)
def embed_file(path: Path, api_key: str, dry_run: bool = False) -> bool:
    """Embed a single markdown file into Qdrant. Returns True on success.

    Skips (returns False for) files without frontmatter, files typed
    source/musing, and underscore-prefixed files. In dry-run mode the
    file is only listed, never embedded.
    """
    fm, body = parse_frontmatter(path)
    if not fm:
        return False
    # Skip non-knowledge files.
    if fm.get("type", "") in ("source", "musing"):
        return False
    if path.name.startswith("_"):
        return False
    file_type, domain, confidence, title = classify_file(fm, path)
    rel_path = str(path.relative_to(REPO_DIR))
    # Embed title + first ~6000 chars of body (model handles 8191 tokens).
    text_to_embed = title if not body else f"{title}\n\n{body[:6000]}"
    if dry_run:
        print(f" [{file_type}] {rel_path}: {title[:60]}")
        return True
    vector = embed_text(text_to_embed, api_key)
    if not vector:
        return False
    payload = {
        "claim_path": rel_path,
        "claim_title": title,
        "domain": domain,
        "confidence": confidence,
        "type": file_type,
        "snippet": body[:200] if body else "",
    }
    try:
        upsert_to_qdrant(make_point_id(rel_path), vector, payload)
        return True
    except Exception as e:
        print(f" Qdrant upsert failed for {rel_path}: {e}")
        return False
def main():
    """CLI entry point: bulk-embed the repo, or a single file with --file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--file", type=str, help="Embed a single file")
    args = parser.parse_args()
    api_key = _get_api_key()

    # Single-file mode.
    if args.file:
        path = Path(args.file)
        if not path.exists():
            print(f"File not found: {path}")
            sys.exit(1)
        ok = embed_file(path, api_key, dry_run=args.dry_run)
        print("OK" if ok else "SKIP")
        return

    # Bulk mode: collect candidate markdown files (skip _-prefixed ones).
    files = []
    for d in EMBED_DIRS:
        base = REPO_DIR / d
        if not base.exists():
            continue
        for md in base.rglob("*.md"):
            if not md.name.startswith("_"):
                files.append(md)
    print(f"Found {len(files)} files to process")

    embedded = 0
    skipped = 0
    for i, path in enumerate(files):
        if i % 50 == 0 and i > 0:
            print(f" Progress: {i}/{len(files)} ({embedded} embedded, {skipped} skipped)")
        if not args.dry_run:
            time.sleep(0.5)  # Rate limit courtesy
        ok = embed_file(path, api_key, dry_run=args.dry_run)
        if ok:
            embedded += 1
        else:
            skipped += 1
        if not args.dry_run and embedded % 20 == 0 and embedded > 0:
            time.sleep(1)  # Batch rate limit
    # NOTE: embed_file reports skips and failures identically (False), so
    # failures are folded into "skipped". The old "failed" counter was dead
    # code that always printed 0 and has been removed.
    print(f"\nDone: {embedded} embedded, {skipped} skipped")

    if not args.dry_run:
        # Verify by asking Qdrant for the collection's current point count.
        try:
            with urllib.request.urlopen(
                f"{QDRANT_URL}/collections/{COLLECTION}", timeout=10
            ) as resp:
                data = json.loads(resp.read())
            count = data["result"]["points_count"]
            print(f"Qdrant collection: {count} vectors")
        except Exception as e:
            print(f"Verification failed: {e}")
if __name__ == "__main__":
    main()