teleo-codex/ops/pipeline-v2/lib/search.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

480 lines
17 KiB
Python

"""Shared Qdrant vector search library for the Teleo knowledge base.
Provides embed + search + graph expansion as a reusable library.
Any consumer (Argus dashboard, Telegram bot, agent research) imports from here.
Layer 1: Qdrant vector search (semantic similarity)
Layer 2: Graph expansion (1-hop via frontmatter edges)
Layer 3: Left to the caller (agent context, domain filtering)
Owner: Epimetheus
"""
import json
import logging
import os
import re
from pathlib import Path
import urllib.request
from . import config

logger = logging.getLogger("pipeline.search")

# --- Config (all from environment or config.py defaults) ---
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")
# OpenAI embedding model requested via the OpenRouter proxy; embed_query()
# documents the expected 1536-dim output.
EMBEDDING_MODEL = "text-embedding-3-small"
# Process-wide cache for the OpenRouter API key, populated lazily by
# _get_api_key().
_OPENROUTER_KEY: str | None = None
# Matches Obsidian-style [[wiki links]]; group 1 is the link target.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# Structural files that should never be included in graph expansion results.
# These are indexes/MOCs, not claims — expanding them pulls entire domains.
STRUCTURAL_FILES = {"_map.md", "_overview.md"}
def _get_api_key() -> str | None:
    """Load the OpenRouter API key, caching it for the process lifetime.

    Resolution order:
      1. Previously cached value.
      2. The secrets file at config.SECRETS_DIR / "openrouter-key".
      3. The OPENROUTER_API_KEY environment variable.

    Returns:
        The key string, or None when no key can be found.
    """
    global _OPENROUTER_KEY
    if _OPENROUTER_KEY:
        return _OPENROUTER_KEY
    key_file = config.SECRETS_DIR / "openrouter-key"
    try:
        if key_file.exists():
            key = key_file.read_text().strip()
            # Fix: an empty secrets file previously returned "" and masked
            # the environment-variable fallback.
            if key:
                _OPENROUTER_KEY = key
                return _OPENROUTER_KEY
    except OSError as e:
        # e.g. permission error — fall through to the env var instead of raising
        logger.warning("Could not read key file %s: %s", key_file, e)
    _OPENROUTER_KEY = os.environ.get("OPENROUTER_API_KEY")
    return _OPENROUTER_KEY
# --- Layer 1: Vector search ---
def embed_query(text: str) -> list[float] | None:
    """Embed a query string via OpenRouter (OpenAI-compatible endpoint).

    The input is truncated to 8000 characters before embedding.

    Returns:
        A 1536-dim embedding vector, or None on any failure (missing key,
        network error, malformed response).
    """
    api_key = _get_api_key()
    if not api_key:
        logger.error("No OpenRouter API key available for embedding")
        return None

    request = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=json.dumps(
            {"model": f"openai/{EMBEDDING_MODEL}", "input": text[:8000]}
        ).encode(),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            payload = json.loads(resp.read())
            return payload["data"][0]["embedding"]
    except Exception as e:
        # Covers HTTP errors, timeouts, JSON decode errors and an
        # unexpectedly shaped response body alike.
        logger.error("Embedding failed: %s", e)
        return None
def search_qdrant(vector: list[float], limit: int = 10,
                  domain: str | None = None, confidence: str | None = None,
                  exclude: list[str] | None = None,
                  score_threshold: float = 0.3,
                  offset: int = 0) -> list[dict]:
    """Query the Qdrant collection for the nearest claims to *vector*.

    Args:
        vector: Query embedding.
        limit: Maximum number of hits to return.
        domain: Optional payload filter on the "domain" field.
        confidence: Optional payload filter on the "confidence" field.
        exclude: Claim paths to exclude from the results.
        score_threshold: Minimum similarity score for a hit.
        offset: Skip first N results (Qdrant native offset for pagination).

    Returns:
        List of hits: [{id, score, payload: {claim_path, claim_title, ...}}];
        empty list on any error.
    """
    body: dict = {
        "vector": vector,
        "limit": limit,
        "with_payload": True,
        "score_threshold": score_threshold,
    }
    if offset > 0:
        body["offset"] = offset

    must = []
    if domain:
        must.append({"key": "domain", "match": {"value": domain}})
    if confidence:
        must.append({"key": "confidence", "match": {"value": confidence}})
    must_not = [
        {"key": "claim_path", "match": {"value": path}}
        for path in (exclude or [])
    ]
    qdrant_filter: dict = {}
    if must:
        qdrant_filter["must"] = must
    if must_not:
        qdrant_filter["must_not"] = must_not
    if qdrant_filter:
        body["filter"] = qdrant_filter

    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            return json.loads(resp.read()).get("result", [])
    except Exception as e:
        logger.error("Qdrant search failed: %s", e)
        return []
# --- Layer 2: Graph expansion ---
def _parse_frontmatter_edges(path: Path) -> dict:
    """Extract relationship edges from a claim's YAML frontmatter.

    Handles inline lists (`depends_on: ["a", "b"]`), multi-line lists
    (`- a` items), and single scalar values.

    Returns:
        {supports: [...], challenges: [...], depends_on: [...],
         related: [...], wiki_links: [...]}.
        Wiki links found in the body are kept separate from explicit
        frontmatter edges so callers can weight them differently. All lists
        are empty when the file is unreadable or has no frontmatter.
    """
    edges: dict = {"supports": [], "challenges": [], "depends_on": [],
                   "related": [], "wiki_links": []}
    try:
        text = path.read_text(errors="replace")
    except Exception:
        return edges
    if not text.startswith("---"):
        return edges
    end = text.find("\n---", 3)
    if end == -1:
        return edges

    # Use a YAML parser for reliable edge extraction; any parse failure
    # simply leaves the explicit-edge lists empty.
    try:
        import yaml
        parsed = yaml.safe_load(text[3:end])
        if isinstance(parsed, dict):
            for field in ("supports", "challenges", "depends_on", "related"):
                value = parsed.get(field)
                if isinstance(value, list):
                    edges[field] = [str(item).strip() for item in value if item]
                elif isinstance(value, str) and value.strip():
                    edges[field] = [value.strip()]
    except Exception:
        pass

    # Wiki links from the body become a separate (lower-weight) edge type,
    # excluding anything already present as an explicit edge.
    explicit = {
        name
        for field in ("supports", "challenges", "depends_on", "related")
        for name in edges[field]
    }
    for raw in WIKI_LINK_RE.findall(text[end + 4:]):
        link = raw.strip()
        if link and link not in explicit and link not in edges["wiki_links"]:
            edges["wiki_links"].append(link)
    return edges
def _resolve_claim_path(name: str, repo_root: Path) -> Path | None:
"""Resolve a claim name (from frontmatter edge or wiki link) to a file path.
Handles both naming conventions:
- "GLP-1 receptor agonists are...""GLP-1 receptor agonists are....md" (spaces)
- "glp-1-persistence-drops...""glp-1-persistence-drops....md" (slugified)
Checks domains/, core/, foundations/, decisions/ subdirectories.
"""
# Try exact name first (spaces in filename), then slugified
candidates = [name]
slug = name.lower().replace(" ", "-").replace("_", "-")
if slug != name:
candidates.append(slug)
for subdir in ["domains", "core", "foundations", "decisions"]:
base = repo_root / subdir
if not base.is_dir():
continue
for candidate_name in candidates:
for md in base.rglob(f"{candidate_name}.md"):
return md
return None
def graph_expand(seed_paths: list[str], repo_root: Path | None = None,
                 max_expanded: int = 30,
                 challenge_weight: float = 1.5,
                 seen: set[str] | None = None) -> list[dict]:
    """Layer 2: Expand seed claims 1-hop through knowledge graph edges.

    Traverses supports/challenges/depends_on/related/wiki_links edges found
    in each seed's frontmatter. Edge weights: challenges = challenge_weight
    (default 1.5), depends_on 1.25, supports/related 1.0, wiki_links 0.5.
    Results are sorted by weight descending so the cap cuts low-value edges
    first.

    Args:
        seed_paths: Repo-relative claim paths to expand from.
        repo_root: Knowledge base root (defaults to config.MAIN_WORKTREE).
        max_expanded: Hard cap on the number of expansions returned.
        challenge_weight: Weight applied to "challenges" edges.
        seen: Optional set of paths already matched (e.g. from keyword
            search) to exclude.

    Returns:
        List of {claim_path, claim_title, edge_type, edge_weight, from_claim},
        excluding claims already in seed_paths or seen.
    """
    # Fix: challenge_weight was previously dead — the weight table
    # hard-coded 1.5 regardless of the argument.
    edge_weights = {
        "challenges": challenge_weight,
        "challenged_by": challenge_weight,
        "depends_on": 1.25,
        "supports": 1.0,
        "related": 1.0,
        "wiki_links": 0.5,
    }
    root = repo_root or config.MAIN_WORKTREE
    all_expanded = []
    visited = set(seed_paths)
    if seen:
        visited.update(seen)
    for seed_path in seed_paths:
        full_path = root / seed_path
        if not full_path.exists():
            continue
        edges = _parse_frontmatter_edges(full_path)
        for edge_type, targets in edges.items():
            weight = edge_weights.get(edge_type, 1.0)
            for target_name in targets:
                target_path = _resolve_claim_path(target_name, root)
                if target_path is None:
                    continue
                rel_path = str(target_path.relative_to(root))
                if rel_path in visited:
                    continue
                # Skip structural files (MOCs/indexes) — they pull entire domains
                if target_path.name in STRUCTURAL_FILES:
                    continue
                visited.add(rel_path)
                all_expanded.append({
                    "claim_path": rel_path,
                    "claim_title": str(_claim_title(target_path, target_name)),
                    "edge_type": edge_type,
                    "edge_weight": weight,
                    "from_claim": seed_path,
                })
    # Sort by weight descending so the cap cuts lowest-value edges first
    all_expanded.sort(key=lambda x: x["edge_weight"], reverse=True)
    return all_expanded[:max_expanded]


def _claim_title(path: Path, fallback: str):
    """Read a claim's display title from frontmatter ("name", then "title").

    Returns *fallback* (the raw edge target name) when the file is
    unreadable or has no parseable frontmatter.
    """
    try:
        text = path.read_text(errors="replace")
        if text.startswith("---"):
            end = text.find("\n---", 3)
            if end > 0:
                import yaml
                fm = yaml.safe_load(text[3:end])
                if isinstance(fm, dict):
                    return fm.get("name", fm.get("title", fallback))
    except Exception:
        pass
    return fallback
# --- Combined search (Layer 1 + Layer 2) ---
# Default thresholds — lowered Apr 5 after production audit showed 0 vector hits.
# text-embedding-3-small scores 0.50-0.60 on conceptual matches (e.g. "risks in
# investing" vs specific claims). 0.70 rejected every result. 0.50/0.40 lets
# relevant claims through while still filtering noise.
PASS1_LIMIT = 5          # max Qdrant hits in pass 1
PASS1_THRESHOLD = 0.50   # min similarity for pass-1 hits
PASS2_LIMIT = 5          # max additional Qdrant hits in pass 2 (expand=True)
PASS2_THRESHOLD = 0.40   # lower similarity bar for pass-2 hits
HARD_CAP = 10            # absolute cap on combined direct + expanded results
def _dedup_hits(hits: list[dict], seen: set[str]) -> list[dict]:
    """Filter Qdrant hits: dedup by claim_path, exclude structural files.

    Mutates *seen* in place by adding each accepted claim_path, so repeated
    calls (pass 1 then pass 2 of search()) deduplicate across passes.

    Returns:
        Simplified result dicts with the score rounded to 4 decimals and
        the snippet truncated to 200 chars.
    """
    results = []
    for hit in hits:
        payload = hit.get("payload", {})
        claim_path = payload.get("claim_path", "")
        if claim_path in seen:
            continue
        if claim_path.split("/")[-1] in STRUCTURAL_FILES:
            continue
        seen.add(claim_path)
        results.append({
            "claim_title": payload.get("claim_title", ""),
            "claim_path": claim_path,
            "score": round(hit.get("score", 0), 4),
            "domain": payload.get("domain", ""),
            "confidence": payload.get("confidence", ""),
            # Fix: `or ""` guards against an explicit null snippet in the
            # payload, which previously raised TypeError on the slice.
            "snippet": (payload.get("snippet") or "")[:200],
            "type": payload.get("type", "claim"),
        })
    return results
def _sort_results(direct: list[dict], expanded: list[dict]) -> list[dict]:
    """Order combined results: similarity desc, then challenges, then the rest.

    The ordering is load-bearing: LLM consumers have primacy bias, so the
    strongest claims must come first and counterpoints before other
    expansions.
    """
    # Direct hits ranked by similarity score, best first.
    by_score = sorted(direct, key=lambda r: r.get("score", 0), reverse=True)
    # Expansion hits: challenges edges (counterpoints) lead, each group
    # ranked by edge weight.
    counterpoints = sorted(
        (r for r in expanded if r.get("edge_type") == "challenges"),
        key=lambda r: r.get("edge_weight", 0),
        reverse=True,
    )
    remainder = sorted(
        (r for r in expanded if r.get("edge_type") != "challenges"),
        key=lambda r: r.get("edge_weight", 0),
        reverse=True,
    )
    return by_score + counterpoints + remainder
def search(query: str, expand: bool = False,
           domain: str | None = None, confidence: str | None = None,
           exclude: list[str] | None = None) -> dict:
    """Two-pass semantic search: embed query, search Qdrant, optionally expand.

    Pass 1 (expand=False, default): top PASS1_LIMIT (5) claims from Qdrant
        with score >= PASS1_THRESHOLD (0.50). Sufficient for ~80% of
        queries. Fast and focused.
    Pass 2 (expand=True): next PASS2_LIMIT (5) claims (offset past pass 1,
        score >= PASS2_THRESHOLD, 0.40) plus graph-expanded claims
        (challenges and related edges). Hard cap HARD_CAP (10) total.
        Agent calls this only when pass 1 didn't answer the question.

    Args:
        query: Natural-language search string.
        expand: Run pass 2 and graph expansion when True.
        domain: Optional Qdrant payload filter on "domain".
        confidence: Optional Qdrant payload filter on "confidence".
        exclude: Claim paths to omit from all results.

    Returns {
        "query": str,
        "direct_results": [...],   # Layer 1 Qdrant hits (sorted by score desc)
        "expanded_results": [...], # Layer 2 graph expansion (challenges first)
        "total": int,
    }
    On embedding failure returns empty results plus "error": "embedding_failed".
    """
    vector = embed_query(query)
    if vector is None:
        return {"query": query, "direct_results": [], "expanded_results": [],
                "total": 0, "error": "embedding_failed"}
    # --- Pass 1: Top 5, high threshold ---
    hits = search_qdrant(vector, limit=PASS1_LIMIT, domain=domain,
                         confidence=confidence, exclude=exclude,
                         score_threshold=PASS1_THRESHOLD)
    # seen_paths is shared across passes: _dedup_hits adds every accepted
    # path to it, so pass 2 and graph expansion cannot re-surface a claim.
    seen_paths: set[str] = set()
    if exclude:
        seen_paths.update(exclude)
    direct = _dedup_hits(hits, seen_paths)
    expanded = []
    if expand:
        # --- Pass 2: Next 5 from Qdrant (lower threshold, offset) ---
        pass2_hits = search_qdrant(vector, limit=PASS2_LIMIT, domain=domain,
                                   confidence=confidence, exclude=exclude,
                                   score_threshold=PASS2_THRESHOLD,
                                   offset=PASS1_LIMIT)
        pass2_direct = _dedup_hits(pass2_hits, seen_paths)
        direct.extend(pass2_direct)
        # Graph expansion on all direct results (pass 1 + pass 2 seeds)
        seed_paths = [r["claim_path"] for r in direct]
        remaining_cap = HARD_CAP - len(direct)
        if remaining_cap > 0:
            expanded = graph_expand(seed_paths, max_expanded=remaining_cap,
                                    seen=seen_paths)
    # Enforce hard cap across all results
    all_sorted = _sort_results(direct, expanded)[:HARD_CAP]
    # Split back into direct vs expanded for backward compat
    direct_paths = {r["claim_path"] for r in direct}
    final_direct = [r for r in all_sorted if r.get("claim_path") in direct_paths]
    final_expanded = [r for r in all_sorted if r.get("claim_path") not in direct_paths]
    return {
        "query": query,
        "direct_results": final_direct,
        "expanded_results": final_expanded,
        "total": len(all_sorted),
    }
# --- Duplicate detection ---
def check_duplicate(text: str, threshold: float = 0.85,
                    domain: str | None = None) -> dict:
    """Check if a claim/text is a near-duplicate of existing KB content.

    Embeds the text, searches Qdrant, and returns the top-3 matches with
    scores. Thresholds: >=0.85 likely duplicate, 0.70-0.85 check manually,
    <0.70 novel.

    Args:
        text: The claim text to check.
        threshold: Minimum score to flag as potential duplicate (default 0.85).
        domain: Optional domain filter.

    Returns:
        {
            "query": str,
            "is_duplicate": bool,    # True if any match >= threshold
            "highest_score": float,  # Best match score
            "verdict": str,          # "duplicate" | "check_manually" | "novel"
            "matches": [             # Top 3 matches
                {"score": float, "claim_path": str,
                 "claim_title": str, "domain": str}
            ]
        }
        (verdict is "error" and "error": "embedding_failed" is added when
        the text cannot be embedded.)
    """
    vector = embed_query(text)
    if vector is None:
        return {"query": text[:100], "is_duplicate": False, "highest_score": 0,
                "verdict": "error", "matches": [], "error": "embedding_failed"}

    matches = [
        {
            "score": round(hit.get("score", 0), 4),
            "claim_path": hit.get("payload", {}).get("claim_path", ""),
            "claim_title": hit.get("payload", {}).get("claim_title", ""),
            "domain": hit.get("payload", {}).get("domain", ""),
        }
        for hit in search_qdrant(vector, limit=3, domain=domain,
                                 score_threshold=0.3)
    ]
    # Qdrant returns hits sorted by score descending, so the first match
    # carries the best score.
    highest = matches[0]["score"] if matches else 0.0
    if highest >= threshold:
        verdict = "duplicate"
    elif highest >= 0.70:
        verdict = "check_manually"
    else:
        verdict = "novel"
    return {
        "query": text[:100],
        "is_duplicate": highest >= threshold,
        "highest_score": highest,
        "verdict": verdict,
        "matches": matches,
    }