Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
480 lines
17 KiB
Python
480 lines
17 KiB
Python
"""Shared Qdrant vector search library for the Teleo knowledge base.
|
|
|
|
Provides embed + search + graph expansion as a reusable library.
|
|
Any consumer (Argus dashboard, Telegram bot, agent research) imports from here.
|
|
|
|
Layer 1: Qdrant vector search (semantic similarity)
|
|
Layer 2: Graph expansion (1-hop via frontmatter edges)
|
|
Layer 3: Left to the caller (agent context, domain filtering)
|
|
|
|
Owner: Epimetheus
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import urllib.request
|
|
|
|
from . import config
|
|
|
|
logger = logging.getLogger("pipeline.search")
|
|
|
|
# --- Config (all from environment or config.py defaults) ---
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")  # Qdrant HTTP endpoint
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")  # collection to search
EMBEDDING_MODEL = "text-embedding-3-small"  # OpenAI embedding model id, routed via OpenRouter

# Process-lifetime cache for the OpenRouter API key; filled lazily by _get_api_key().
_OPENROUTER_KEY: str | None = None

# Matches [[wiki links]] in claim bodies; capture group is the link target text.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Structural files that should never be included in graph expansion results.
# These are indexes/MOCs, not claims — expanding them pulls entire domains.
STRUCTURAL_FILES = {"_map.md", "_overview.md"}
|
|
|
|
|
|
def _get_api_key() -> str | None:
    """Return the OpenRouter API key, memoizing it after the first lookup.

    The secrets file wins over the environment variable. Returns None when
    neither source provides a key (a falsy cache is re-checked next call).
    """
    global _OPENROUTER_KEY
    if not _OPENROUTER_KEY:
        key_file = config.SECRETS_DIR / "openrouter-key"
        if key_file.exists():
            _OPENROUTER_KEY = key_file.read_text().strip()
        else:
            _OPENROUTER_KEY = os.environ.get("OPENROUTER_API_KEY")
    return _OPENROUTER_KEY
|
|
|
|
|
|
# --- Layer 1: Vector search ---
|
|
|
|
|
|
def embed_query(text: str) -> list[float] | None:
    """Embed a query string via OpenRouter (OpenAI-compatible endpoint).

    Returns 1536-dim vector or None on failure.
    """
    api_key = _get_api_key()
    if not api_key:
        logger.error("No OpenRouter API key available for embedding")
        return None

    body = {
        "model": f"openai/{EMBEDDING_MODEL}",
        # Truncate long inputs before sending to the embedding endpoint.
        "input": text[:8000],
    }
    request = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=json.dumps(body).encode(),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            parsed = json.loads(resp.read())
        return parsed["data"][0]["embedding"]
    except Exception as e:
        # Best-effort: callers treat None as "embedding unavailable".
        logger.error("Embedding failed: %s", e)
        return None
|
|
|
|
|
|
def search_qdrant(vector: list[float], limit: int = 10,
                  domain: str | None = None, confidence: str | None = None,
                  exclude: list[str] | None = None,
                  score_threshold: float = 0.3,
                  offset: int = 0) -> list[dict]:
    """Query the Qdrant collection for the nearest claims to *vector*.

    Args:
        offset: Skip first N results (Qdrant native offset for pagination).

    Returns list of hits: [{id, score, payload: {claim_path, claim_title, ...}}]
    """
    body: dict = {
        "vector": vector,
        "limit": limit,
        "with_payload": True,
        "score_threshold": score_threshold,
    }
    if offset > 0:
        body["offset"] = offset

    # Optional payload filters: must-match on domain/confidence,
    # must-not-match on excluded claim paths.
    must = []
    if domain:
        must.append({"key": "domain", "match": {"value": domain}})
    if confidence:
        must.append({"key": "confidence", "match": {"value": confidence}})
    must_not = [
        {"key": "claim_path", "match": {"value": path}}
        for path in (exclude or [])
    ]
    if must or must_not:
        qfilter: dict = {}
        if must:
            qfilter["must"] = must
        if must_not:
            qfilter["must_not"] = must_not
        body["filter"] = qfilter

    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            parsed = json.loads(resp.read())
        return parsed.get("result", [])
    except Exception as e:
        # Fail soft: an unreachable Qdrant yields zero hits, not a crash.
        logger.error("Qdrant search failed: %s", e)
        return []
|
|
|
|
|
|
# --- Layer 2: Graph expansion ---
|
|
|
|
|
|
def _parse_frontmatter_edges(path: Path) -> dict:
|
|
"""Extract relationship edges from a claim's frontmatter.
|
|
|
|
Handles both YAML formats:
|
|
depends_on: ["item1", "item2"] (inline list)
|
|
depends_on: (multi-line list)
|
|
- item1
|
|
- item2
|
|
|
|
Returns {supports: [...], challenges: [...], depends_on: [...], related: [...], wiki_links: [...]}.
|
|
wiki_links are separated from explicit related edges for differential weighting.
|
|
"""
|
|
edges = {"supports": [], "challenges": [], "depends_on": [], "related": [], "wiki_links": []}
|
|
try:
|
|
text = path.read_text(errors="replace")
|
|
except Exception:
|
|
return edges
|
|
|
|
if not text.startswith("---"):
|
|
return edges
|
|
end = text.find("\n---", 3)
|
|
if end == -1:
|
|
return edges
|
|
|
|
fm_text = text[3:end]
|
|
|
|
# Use YAML parser for reliable edge extraction
|
|
try:
|
|
import yaml
|
|
fm = yaml.safe_load(fm_text)
|
|
if isinstance(fm, dict):
|
|
for field in ("supports", "challenges", "depends_on", "related"):
|
|
val = fm.get(field)
|
|
if isinstance(val, list):
|
|
edges[field] = [str(v).strip() for v in val if v]
|
|
elif isinstance(val, str) and val.strip():
|
|
edges[field] = [val.strip()]
|
|
except Exception:
|
|
pass
|
|
|
|
# Extract wiki links from body as separate edge type (lower weight)
|
|
body = text[end + 4:]
|
|
all_explicit = set()
|
|
for field in ("supports", "challenges", "depends_on", "related"):
|
|
all_explicit.update(edges[field])
|
|
|
|
wiki_links = WIKI_LINK_RE.findall(body)
|
|
for link in wiki_links:
|
|
link = link.strip()
|
|
if link and link not in all_explicit and link not in edges["wiki_links"]:
|
|
edges["wiki_links"].append(link)
|
|
|
|
return edges
|
|
|
|
|
|
def _resolve_claim_path(name: str, repo_root: Path) -> Path | None:
|
|
"""Resolve a claim name (from frontmatter edge or wiki link) to a file path.
|
|
|
|
Handles both naming conventions:
|
|
- "GLP-1 receptor agonists are..." → "GLP-1 receptor agonists are....md" (spaces)
|
|
- "glp-1-persistence-drops..." → "glp-1-persistence-drops....md" (slugified)
|
|
|
|
Checks domains/, core/, foundations/, decisions/ subdirectories.
|
|
"""
|
|
# Try exact name first (spaces in filename), then slugified
|
|
candidates = [name]
|
|
slug = name.lower().replace(" ", "-").replace("_", "-")
|
|
if slug != name:
|
|
candidates.append(slug)
|
|
|
|
for subdir in ["domains", "core", "foundations", "decisions"]:
|
|
base = repo_root / subdir
|
|
if not base.is_dir():
|
|
continue
|
|
for candidate_name in candidates:
|
|
for md in base.rglob(f"{candidate_name}.md"):
|
|
return md
|
|
return None
|
|
|
|
|
|
def _frontmatter_title(path: Path, default: str) -> str:
    """Best-effort read of the `name` (or `title`) frontmatter field of a claim.

    Falls back to *default* when the file has no parseable YAML frontmatter
    (or when the yaml package is unavailable).
    """
    try:
        text = path.read_text(errors="replace")
        if text.startswith("---"):
            end = text.find("\n---", 3)
            if end > 0:
                import yaml  # optional; any failure falls through to default
                fm = yaml.safe_load(text[3:end])
                if isinstance(fm, dict):
                    return str(fm.get("name", fm.get("title", default)))
    except Exception:
        pass
    return str(default)


def graph_expand(seed_paths: list[str], repo_root: Path | None = None,
                 max_expanded: int = 30,
                 challenge_weight: float = 1.5,
                 seen: set[str] | None = None) -> list[dict]:
    """Layer 2: Expand seed claims 1-hop through knowledge graph edges.

    Traverses supports/challenges/depends_on/related/wiki_links edges in frontmatter.
    Edge weights: challenges 1.5x, depends_on 1.25x, supports/related 1.0x, wiki_links 0.5x.
    Results sorted by weight descending so cap cuts low-value edges first.

    Args:
        seed_paths: Repo-relative claim paths to expand from.
        repo_root: Repo root to resolve paths against; defaults to
            config.MAIN_WORKTREE.
        max_expanded: Cap on the number of expansion results returned.
        challenge_weight: Unused; weights come from EDGE_WEIGHTS below.
            Kept for backward-compatible call signatures.
            # NOTE(review): confirm no caller relies on overriding this.
        seen: Optional set of paths already matched (e.g. from keyword search)
            to exclude.

    Returns list of {claim_path, claim_title, edge_type, edge_weight, from_claim}.
    Excludes claims already in seed_paths or seen set.
    """
    # "challenged_by" has no producer in _parse_frontmatter_edges today; the
    # entry is kept so a future inverse-edge type gets the intended weight.
    EDGE_WEIGHTS = {
        "challenges": 1.5,
        "challenged_by": 1.5,
        "depends_on": 1.25,
        "supports": 1.0,
        "related": 1.0,
        "wiki_links": 0.5,
    }

    root = repo_root or config.MAIN_WORKTREE
    all_expanded: list[dict] = []
    visited = set(seed_paths)
    if seen:
        visited.update(seen)

    for seed_path in seed_paths:
        full_path = root / seed_path
        if not full_path.exists():
            continue

        edges = _parse_frontmatter_edges(full_path)

        for edge_type, targets in edges.items():
            weight = EDGE_WEIGHTS.get(edge_type, 1.0)

            for target_name in targets:
                target_path = _resolve_claim_path(target_name, root)
                if target_path is None:
                    continue

                rel_path = str(target_path.relative_to(root))
                if rel_path in visited:
                    continue
                # Skip structural files (MOCs/indexes) — they pull entire domains
                if target_path.name in STRUCTURAL_FILES:
                    continue
                visited.add(rel_path)

                all_expanded.append({
                    "claim_path": rel_path,
                    "claim_title": _frontmatter_title(target_path, target_name),
                    "edge_type": edge_type,
                    "edge_weight": weight,
                    "from_claim": seed_path,
                })

    # Sort by weight descending so cap cuts lowest-value edges first
    all_expanded.sort(key=lambda x: x["edge_weight"], reverse=True)
    return all_expanded[:max_expanded]
|
|
|
|
|
|
# --- Combined search (Layer 1 + Layer 2) ---

# Default thresholds — lowered Apr 5 after production audit showed 0 vector hits.
# text-embedding-3-small scores 0.50-0.60 on conceptual matches (e.g. "risks in
# investing" vs specific claims). 0.70 rejected every result. 0.50/0.40 lets
# relevant claims through while still filtering noise.
PASS1_LIMIT = 5          # max direct hits returned on the first (default) pass
PASS1_THRESHOLD = 0.50   # minimum cosine score for pass-1 hits
PASS2_LIMIT = 5          # extra hits fetched when expand=True (offset past pass 1)
PASS2_THRESHOLD = 0.40   # looser score floor for pass-2 hits
HARD_CAP = 10            # absolute cap on direct + expanded results combined
|
|
|
|
|
|
def _dedup_hits(hits: list[dict], seen: set[str]) -> list[dict]:
    """Filter Qdrant hits: dedup by claim_path, exclude structural files.

    Mutates *seen* in place so later passes skip already-returned paths.
    """
    kept = []
    for hit in hits:
        payload = hit.get("payload", {})
        claim_path = payload.get("claim_path", "")
        basename = claim_path.split("/")[-1]
        if claim_path in seen or basename in STRUCTURAL_FILES:
            continue
        seen.add(claim_path)
        kept.append({
            "claim_title": payload.get("claim_title", ""),
            "claim_path": claim_path,
            "score": round(hit.get("score", 0), 4),
            "domain": payload.get("domain", ""),
            "confidence": payload.get("confidence", ""),
            "snippet": payload.get("snippet", "")[:200],
            "type": payload.get("type", "claim"),
        })
    return kept
|
|
|
|
|
|
def _sort_results(direct: list[dict], expanded: list[dict]) -> list[dict]:
|
|
"""Sort combined results: similarity desc → challenged_by → other expansion.
|
|
|
|
Sort order is load-bearing: LLMs have primacy bias, so best claims first.
|
|
"""
|
|
# Direct results already sorted by Qdrant (cosine desc)
|
|
sorted_direct = sorted(direct, key=lambda x: x.get("score", 0), reverse=True)
|
|
|
|
# Expansion: challenged_by first (counterpoints), then rest by weight
|
|
challenged = [e for e in expanded if e.get("edge_type") == "challenges"]
|
|
other_expanded = [e for e in expanded if e.get("edge_type") != "challenges"]
|
|
challenged.sort(key=lambda x: x.get("edge_weight", 0), reverse=True)
|
|
other_expanded.sort(key=lambda x: x.get("edge_weight", 0), reverse=True)
|
|
|
|
return sorted_direct + challenged + other_expanded
|
|
|
|
|
|
def search(query: str, expand: bool = False,
           domain: str | None = None, confidence: str | None = None,
           exclude: list[str] | None = None) -> dict:
    """Two-pass semantic search: embed query, search Qdrant, optionally expand.

    Pass 1 (expand=False, default): top PASS1_LIMIT claims from Qdrant with
    score >= PASS1_THRESHOLD (currently 0.50). Fast and focused; sufficient
    for most queries.

    Pass 2 (expand=True): next PASS2_LIMIT claims (offset past pass 1,
    score >= PASS2_THRESHOLD, currently 0.40) plus graph-expanded claims
    (challenges/related edges). Hard cap of HARD_CAP results total. The agent
    calls this only when pass 1 didn't answer the question.

    Args:
        query: Natural-language query to embed and search with.
        expand: Whether to run pass 2 and graph expansion.
        domain: Optional Qdrant payload filter on claim domain.
        confidence: Optional Qdrant payload filter on claim confidence.
        exclude: Claim paths to exclude from all results.

    Returns {
        "query": str,
        "direct_results": [...],   # Layer 1 Qdrant hits (sorted by score desc)
        "expanded_results": [...], # Layer 2 graph expansion (challenges first)
        "total": int,
    }
    plus an "error" key when embedding fails.
    """
    vector = embed_query(query)
    if vector is None:
        return {"query": query, "direct_results": [], "expanded_results": [],
                "total": 0, "error": "embedding_failed"}

    # --- Pass 1: top hits, high threshold ---
    hits = search_qdrant(vector, limit=PASS1_LIMIT, domain=domain,
                         confidence=confidence, exclude=exclude,
                         score_threshold=PASS1_THRESHOLD)

    # seen_paths accumulates every returned/excluded path across both passes.
    seen_paths: set[str] = set()
    if exclude:
        seen_paths.update(exclude)
    direct = _dedup_hits(hits, seen_paths)

    expanded: list[dict] = []
    if expand:
        # --- Pass 2: next batch from Qdrant (lower threshold, offset) ---
        pass2_hits = search_qdrant(vector, limit=PASS2_LIMIT, domain=domain,
                                   confidence=confidence, exclude=exclude,
                                   score_threshold=PASS2_THRESHOLD,
                                   offset=PASS1_LIMIT)
        direct.extend(_dedup_hits(pass2_hits, seen_paths))

        # Graph expansion seeded by all direct results (pass 1 + pass 2)
        seed_paths = [r["claim_path"] for r in direct]
        remaining_cap = HARD_CAP - len(direct)
        if remaining_cap > 0:
            expanded = graph_expand(seed_paths, max_expanded=remaining_cap,
                                    seen=seen_paths)

    # Enforce hard cap across all results
    all_sorted = _sort_results(direct, expanded)[:HARD_CAP]

    # Split back into direct vs expanded for backward compat
    direct_paths = {r["claim_path"] for r in direct}
    final_direct = [r for r in all_sorted if r.get("claim_path") in direct_paths]
    final_expanded = [r for r in all_sorted if r.get("claim_path") not in direct_paths]

    return {
        "query": query,
        "direct_results": final_direct,
        "expanded_results": final_expanded,
        "total": len(all_sorted),
    }
|
|
|
|
|
|
# --- Duplicate detection ---
|
|
|
|
|
|
def check_duplicate(text: str, threshold: float = 0.85,
                    domain: str | None = None) -> dict:
    """Check whether *text* is a near-duplicate of existing KB content.

    Embeds the text, searches Qdrant, and reports the top-3 matches.
    Verdict bands: >= threshold "duplicate", >= 0.70 "check_manually",
    below that "novel".

    Args:
        text: The claim text to check.
        threshold: Minimum score to flag as potential duplicate (default 0.85).
        domain: Optional domain filter.

    Returns:
        {
          "query": str,
          "is_duplicate": bool,    # True if any match >= threshold
          "highest_score": float,  # Best match score
          "verdict": str,          # "duplicate" | "check_manually" | "novel"
          "matches": [             # Top 3 matches
            {"score": float, "claim_path": str, "claim_title": str, "domain": str}
          ]
        }
        with "verdict": "error" and an "error" key when embedding fails.
    """
    vector = embed_query(text)
    if vector is None:
        return {"query": text[:100], "is_duplicate": False, "highest_score": 0,
                "verdict": "error", "matches": [], "error": "embedding_failed"}

    matches = [
        {
            "score": round(hit.get("score", 0), 4),
            "claim_path": hit.get("payload", {}).get("claim_path", ""),
            "claim_title": hit.get("payload", {}).get("claim_title", ""),
            "domain": hit.get("payload", {}).get("domain", ""),
        }
        for hit in search_qdrant(vector, limit=3, domain=domain, score_threshold=0.3)
    ]

    # Qdrant returns hits sorted by score descending, so the first is the best.
    highest = matches[0]["score"] if matches else 0.0

    if highest >= threshold:
        verdict = "duplicate"
    elif highest >= 0.70:
        verdict = "check_manually"
    else:
        verdict = "novel"

    return {
        "query": text[:100],
        "is_duplicate": highest >= threshold,
        "highest_score": highest,
        "verdict": verdict,
        "matches": matches,
    }
|